From ad12fa3c3b1de81fca3aa0b5aa5e0465a6f83990 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Wed, 19 Oct 2022 11:17:23 +0800 Subject: [PATCH 01/19] update finetune doc --- doc/doc_ch/finetune.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/doc_ch/finetune.md b/doc/doc_ch/finetune.md index e8f146aad..7270afc2b 100644 --- a/doc/doc_ch/finetune.md +++ b/doc/doc_ch/finetune.md @@ -100,6 +100,10 @@ PaddleOCR提供的配置文件是在8卡训练(相当于总的batch size是`8* * 数据分布:建议分布与实测场景尽量一致。如果实测场景包含大量短文本,则训练数据中建议也包含较多短文本,如果实测场景对于空格识别效果要求较高,则训练数据中建议也包含较多带空格的文本内容。 +* 数据合成:针对部分字符识别有误的情况,建议获取一批特定字符数据,加入到原数据中使用小学习率微调。其中原始数据与新增数据比例可尝试 10:1 ~ 5:1, 避免新增数据过多导致模型学偏,同时尽量平衡语料词频,确保常用字的出现频率不会过低。 + + 特定字符生成可以使用 TextRenderer 工具,合成例子可参考 [数码管数据合成](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/applications/%E5%85%89%E5%8A%9F%E7%8E%87%E8%AE%A1%E6%95%B0%E7%A0%81%E7%AE%A1%E5%AD%97%E7%AC%A6%E8%AF%86%E5%88%AB/%E5%85%89%E5%8A%9F%E7%8E%87%E8%AE%A1%E6%95%B0%E7%A0%81%E7%AE%A1%E5%AD%97%E7%AC%A6%E8%AF%86%E5%88%AB.md#31-%E6%95%B0%E6%8D%AE%E5%87%86%E5%A4%87) + 合成数据语料尽量来自真实使用场景,字体、背景在贴近真实场景的基础上保持丰富,有助于提升模型效果。 * 通用中英文数据:在训练的时候,可以在训练集中添加通用真实数据(如在不更换字典的微调场景中,建议添加LSVT、RCTW、MTWI等真实数据),进一步提升模型的泛化性能。 @@ -168,3 +172,8 @@ Train: - general.txt ratio_list: [1.0, 0.1] ``` + +### 3.4 训练调优 + +训练过程并非一蹴而就的,完成一个阶段的训练评估后,建议收集分析当前模型在真实场景中的 badcase,有针对性的调整训练数据比例,或者进一步新增合成数据。 +通过多次迭代训练,不断优化模型效果。 From a7ac5e3fa6259356e236f3500c785aa006efdc08 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Wed, 19 Oct 2022 11:21:03 +0800 Subject: [PATCH 02/19] update finetune doc --- doc/doc_ch/finetune.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/doc_ch/finetune.md b/doc/doc_ch/finetune.md index 7270afc2b..973c4cb10 100644 --- a/doc/doc_ch/finetune.md +++ b/doc/doc_ch/finetune.md @@ -100,10 +100,10 @@ PaddleOCR提供的配置文件是在8卡训练(相当于总的batch size是`8* * 数据分布:建议分布与实测场景尽量一致。如果实测场景包含大量短文本,则训练数据中建议也包含较多短文本,如果实测场景对于空格识别效果要求较高,则训练数据中建议也包含较多带空格的文本内容。 -* 数据合成:针对部分字符识别有误的情况,建议获取一批特定字符数据,加入到原数据中使用小学习率微调。其中原始数据与新增数据比例可尝试 10:1 ~ 
5:1, 避免新增数据过多导致模型学偏,同时尽量平衡语料词频,确保常用字的出现频率不会过低。 +* 数据合成:针对部分字符识别有误的情况,建议获取一批特定字符数据,加入到原数据中使用小学习率微调。其中原始数据与新增数据比例可尝试 10:1 ~ 5:1, 避免单一场景数据过多导致模型过拟合,同时尽量平衡语料词频,确保常用字的出现频率不会过低。 特定字符生成可以使用 TextRenderer 工具,合成例子可参考 [数码管数据合成](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/applications/%E5%85%89%E5%8A%9F%E7%8E%87%E8%AE%A1%E6%95%B0%E7%A0%81%E7%AE%A1%E5%AD%97%E7%AC%A6%E8%AF%86%E5%88%AB/%E5%85%89%E5%8A%9F%E7%8E%87%E8%AE%A1%E6%95%B0%E7%A0%81%E7%AE%A1%E5%AD%97%E7%AC%A6%E8%AF%86%E5%88%AB.md#31-%E6%95%B0%E6%8D%AE%E5%87%86%E5%A4%87) - 合成数据语料尽量来自真实使用场景,字体、背景在贴近真实场景的基础上保持丰富,有助于提升模型效果。 + ,合成数据语料尽量来自真实使用场景,在贴近真实场景的基础上保持字体、背景的丰富性,有助于提升模型效果。 * 通用中英文数据:在训练的时候,可以在训练集中添加通用真实数据(如在不更换字典的微调场景中,建议添加LSVT、RCTW、MTWI等真实数据),进一步提升模型的泛化性能。 From 2145d8c4ec755d9ba315630cd8f337a8acd54d06 Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Thu, 20 Oct 2022 17:03:47 +0800 Subject: [PATCH 03/19] add recovery requirements to whl --- paddleocr.py | 2 +- ppstructure/docs/quickstart.md | 8 +------- ppstructure/docs/quickstart_en.md | 8 +------- setup.py | 15 +++++++++++---- 4 files changed, 14 insertions(+), 19 deletions(-) diff --git a/paddleocr.py b/paddleocr.py index 44308a823..887feb96a 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -47,7 +47,7 @@ __all__ = [ ] SUPPORT_DET_MODEL = ['DB'] -VERSION = '2.6.0.2' +VERSION = '2.6.0.3' SUPPORT_REC_MODEL = ['CRNN', 'SVTR_LCNet'] BASE_DIR = os.path.expanduser("~/.paddleocr/") diff --git a/ppstructure/docs/quickstart.md b/ppstructure/docs/quickstart.md index 6fbd31c3c..9909f7950 100644 --- a/ppstructure/docs/quickstart.md +++ b/ppstructure/docs/quickstart.md @@ -45,16 +45,10 @@ ```bash # 安装 paddleocr,推荐使用2.6版本 -pip3 install "paddleocr>=2.6" +pip3 install "paddleocr>=2.6.0.3" # 安装 图像方向分类依赖包paddleclas(如不需要图像方向分类功能,可跳过) pip3 install paddleclas>=2.4.3 - -# 安装 关键信息抽取 依赖包(如不需要KIE功能,可跳过) -pip3 install -r ppstructure/kie/requirements.txt - -# 安装 版面恢复 依赖包(如不需要版面恢复功能,可跳过) -pip3 install -r ppstructure/recovery/requirements.txt ``` diff --git 
a/ppstructure/docs/quickstart_en.md b/ppstructure/docs/quickstart_en.md index 446f9d2ee..c990088a2 100644 --- a/ppstructure/docs/quickstart_en.md +++ b/ppstructure/docs/quickstart_en.md @@ -47,16 +47,10 @@ For more software version requirements, please refer to the instructions in [Ins ```bash # Install paddleocr, version 2.6 is recommended -pip3 install "paddleocr>=2.6" +pip3 install "paddleocr>=2.6.0.3" # Install the image direction classification dependency package paddleclas (if you do not use the image direction classification, you can skip it) pip3 install paddleclas>=2.4.3 - -# Install the KIE dependency packages (if you do not use the KIE, you can skip it) -pip3 install -r kie/requirements.txt - -# Install the layout recovery dependency packages (if you do not use the layout recovery, you can skip it) -pip3 install -r recovery/requirements.txt ``` diff --git a/setup.py b/setup.py index 7d4d871d8..3aa0a1701 100644 --- a/setup.py +++ b/setup.py @@ -16,9 +16,16 @@ from setuptools import setup from io import open from paddleocr import VERSION -with open('requirements.txt', encoding="utf-8-sig") as f: - requirements = f.readlines() - requirements.append('tqdm') +def load_requirements(file_list=None): + if file_list is None: + file_list = ['requirements.txt'] + if isinstance(file_list,str): + file_list = [file_list] + requirements = [] + for file in file_list: + with open(file, encoding="utf-8-sig") as f: + requirements.extend(f.readlines()) + return requirements def readme(): @@ -34,7 +41,7 @@ setup( include_package_data=True, entry_points={"console_scripts": ["paddleocr= paddleocr.paddleocr:main"]}, version=VERSION, - install_requires=requirements, + install_requires=load_requirements(['requirements.txt', 'ppstructure/recovery/requirements.txt']), license='Apache License 2.0', description='Awesome OCR toolkits based on PaddlePaddle (8.6M ultra-lightweight pre-trained model, support training and deployment among server, mobile, embeded and IoT devices', 
long_description=readme(), From 8fbc08f176a17eec9b2eefad54179da4ab3003e9 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Thu, 20 Oct 2022 17:05:36 +0800 Subject: [PATCH 04/19] fix typo --- doc/doc_ch/inference_ppocr.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/doc_ch/inference_ppocr.md b/doc/doc_ch/inference_ppocr.md index 01c5efd26..2061f059d 100644 --- a/doc/doc_ch/inference_ppocr.md +++ b/doc/doc_ch/inference_ppocr.md @@ -87,9 +87,9 @@ Predicts of ./doc/imgs_words/ch/word_4.jpg:('实力活力', 0.9956803321838379) ``` # 下载英文数字识别模型: -wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar -tar xf en_PP-OCRv3_det_infer.tar -python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./en_PP-OCRv3_det_infer/" --rec_char_dict_path="ppocr/utils/en_dict.txt" +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar +tar xf en_PP-OCRv3_rec_infer.tar +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./en_PP-OCRv3_rec_infer/" --rec_char_dict_path="ppocr/utils/en_dict.txt" ``` ![](../imgs_words/en/word_1.png) From 14efc410501acb82ca114f96c3fcb2ec36481e31 Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Thu, 20 Oct 2022 17:05:53 +0800 Subject: [PATCH 05/19] update model link --- doc/doc_ch/algorithm_det_drrg.md | 2 +- doc/doc_ch/algorithm_overview.md | 2 +- doc/doc_ch/algorithm_rec_can.md | 2 +- doc/doc_ch/algorithm_sr_telescope.md | 6 +++--- doc/doc_en/algorithm_det_drrg_en.md | 2 +- doc/doc_en/algorithm_overview_en.md | 2 +- doc/doc_en/algorithm_rec_can_en.md | 2 +- doc/doc_en/algorithm_sr_telescope_en.md | 6 +++--- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/doc/doc_ch/algorithm_det_drrg.md b/doc/doc_ch/algorithm_det_drrg.md index d89a16ae6..8e08d01d1 100644 --- a/doc/doc_ch/algorithm_det_drrg.md +++ b/doc/doc_ch/algorithm_det_drrg.md @@ -23,7 +23,7 @@ | 模型 
|骨干网络|配置文件|precision|recall|Hmean|下载链接| |-----| --- | --- | --- | --- | --- | --- | -| DRRG | ResNet50_vd | [configs/det/det_r50_drrg_ctw.yml](../../configs/det/det_r50_drrg_ctw.yml)| 89.92%|80.91%|85.18%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw.tar)| +| DRRG | ResNet50_vd | [configs/det/det_r50_drrg_ctw.yml](../../configs/det/det_r50_drrg_ctw.yml)| 89.92%|80.91%|85.18%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw_train.tar)| ## 2. 环境配置 diff --git a/doc/doc_ch/algorithm_overview.md b/doc/doc_ch/algorithm_overview.md index 44c1e117e..7de581c27 100755 --- a/doc/doc_ch/algorithm_overview.md +++ b/doc/doc_ch/algorithm_overview.md @@ -55,7 +55,7 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广 |模型|骨干网络|precision|recall|Hmean|下载链接| | --- | --- | --- | --- | --- | --- | |FCE|ResNet50_dcn|88.39%|82.18%|85.27%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/det_r50_dcn_fce_ctw_v2.0_train.tar)| -|DRRG|ResNet50_vd|89.92%|80.91%|85.18%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw.tar)| +|DRRG|ResNet50_vd|89.92%|80.91%|85.18%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw_train.tar)| **说明:** SAST模型训练额外加入了icdar2013、icdar2017、COCO-Text、ArT等公开数据集进行调优。PaddleOCR用到的经过整理格式的英文公开数据集下载: * [百度云地址](https://pan.baidu.com/s/12cPnZcVuV1zn5DOd4mqjVw) (提取码: 2bpi) diff --git a/doc/doc_ch/algorithm_rec_can.md b/doc/doc_ch/algorithm_rec_can.md index 4f266cb33..e4f4ba6f3 100644 --- a/doc/doc_ch/algorithm_rec_can.md +++ b/doc/doc_ch/algorithm_rec_can.md @@ -27,7 +27,7 @@ |模型 |骨干网络|配置文件|ExpRate|下载链接| | ----- | ----- | ----- | ----- | ----- | -|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72|[训练模型](https://paddleocr.bj.bcebos.com/contribution/can_train.tar)| +|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)| ## 2. 
环境配置 diff --git a/doc/doc_ch/algorithm_sr_telescope.md b/doc/doc_ch/algorithm_sr_telescope.md index 9a21734b6..e2351be72 100644 --- a/doc/doc_ch/algorithm_sr_telescope.md +++ b/doc/doc_ch/algorithm_sr_telescope.md @@ -27,7 +27,7 @@ |模型|骨干网络|PSNR_Avg|SSIM_Avg|配置文件|下载链接| |---|---|---|---|---|---| -|Text Telescope|tbsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[训练模型](https://paddleocr.bj.bcebos.com/contribution/Telescope_train.tar.gz)| +|Text Telescope|tbsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[训练模型](https://paddleocr.bj.bcebos.com/contribution/sr_telescope_train.tar)| [TextZoom数据集](https://paddleocr.bj.bcebos.com/dataset/TextZoom.tar) 来自两个超分数据集RealSR和SR-RAW,两个数据集都包含LR-HR对,TextZoom有17367对训数据和4373对测试数据。 @@ -118,8 +118,8 @@ python3 tools/infer/predict_sr.py --sr_model_dir=./inference/sr_out --image_dir= ```bibtex @INPROCEEDINGS{9578891, author={Chen, Jingye and Li, Bin and Xue, Xiangyang}, - booktitle={2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, - title={Scene Text Telescope: Text-Focused Scene Image Super-Resolution}, + booktitle={2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title={Scene Text Telescope: Text-Focused Scene Image Super-Resolution}, year={2021}, volume={}, number={}, diff --git a/doc/doc_en/algorithm_det_drrg_en.md b/doc/doc_en/algorithm_det_drrg_en.md index 2bb7b5703..8d6538a02 100644 --- a/doc/doc_en/algorithm_det_drrg_en.md +++ b/doc/doc_en/algorithm_det_drrg_en.md @@ -25,7 +25,7 @@ On the CTW1500 dataset, the text detection result is as follows: |Model|Backbone|Configuration|Precision|Recall|Hmean|Download| | --- | --- | --- | --- | --- | --- | --- | -| DRRG | ResNet50_vd | [configs/det/det_r50_drrg_ctw.yml](../../configs/det/det_r50_drrg_ctw.yml)| 89.92%|80.91%|85.18%|[trained model](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw.tar)| +| DRRG | ResNet50_vd | 
[configs/det/det_r50_drrg_ctw.yml](../../configs/det/det_r50_drrg_ctw.yml)| 89.92%|80.91%|85.18%|[trained model](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw_train.tar)| ## 2. Environment diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md index 2614226e0..09ff40791 100755 --- a/doc/doc_en/algorithm_overview_en.md +++ b/doc/doc_en/algorithm_overview_en.md @@ -53,7 +53,7 @@ On CTW1500 dataset, the text detection result is as follows: |Model|Backbone|Precision|Recall|Hmean| Download link| | --- | --- | --- | --- | --- |---| |FCE|ResNet50_dcn|88.39%|82.18%|85.27%| [trained model](https://paddleocr.bj.bcebos.com/contribution/det_r50_dcn_fce_ctw_v2.0_train.tar) | -|DRRG|ResNet50_vd|89.92%|80.91%|85.18%|[trained model](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw.tar)| +|DRRG|ResNet50_vd|89.92%|80.91%|85.18%|[trained model](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw_train.tar)| **Note:** Additional data, like icdar2013, icdar2017, COCO-Text, ArT, was added to the model training of SAST. Download English public dataset in organized format used by PaddleOCR from: * [Baidu Drive](https://pan.baidu.com/s/12cPnZcVuV1zn5DOd4mqjVw) (download code: 2bpi). diff --git a/doc/doc_en/algorithm_rec_can_en.md b/doc/doc_en/algorithm_rec_can_en.md index da6c9c609..e65bb2aa8 100644 --- a/doc/doc_en/algorithm_rec_can_en.md +++ b/doc/doc_en/algorithm_rec_can_en.md @@ -25,7 +25,7 @@ Using CROHME handwrittem mathematical expression recognition datasets for traini |Model|Backbone|config|exprate|Download link| | --- | --- | --- | --- | --- | -|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72|[trained model](https://paddleocr.bj.bcebos.com/contribution/can_train.tar)| +|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)| ## 2. 
Environment diff --git a/doc/doc_en/algorithm_sr_telescope_en.md b/doc/doc_en/algorithm_sr_telescope_en.md index 89f3b373e..9acb52431 100644 --- a/doc/doc_en/algorithm_sr_telescope_en.md +++ b/doc/doc_en/algorithm_sr_telescope_en.md @@ -28,7 +28,7 @@ Referring to the [FudanOCR](https://github.com/FudanVI/FudanOCR/tree/main/scene- |Model|Backbone|config|Acc|Download link| |---|---|---|---|---|---| -|Text Gestalt|tsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[train model](https://paddleocr.bj.bcebos.com/contribution/Telescope_train.tar.gz)| +|Text Gestalt|tsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[train model](https://paddleocr.bj.bcebos.com/contribution/sr_telescope_train.tar)| The [TextZoom dataset](https://paddleocr.bj.bcebos.com/dataset/TextZoom.tar) comes from two superfraction data sets, RealSR and SR-RAW, both of which contain LR-HR pairs. TextZoom has 17367 pairs of training data and 4373 pairs of test data. 
@@ -127,8 +127,8 @@ Not supported ```bibtex @INPROCEEDINGS{9578891, author={Chen, Jingye and Li, Bin and Xue, Xiangyang}, - booktitle={2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, - title={Scene Text Telescope: Text-Focused Scene Image Super-Resolution}, + booktitle={2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title={Scene Text Telescope: Text-Focused Scene Image Super-Resolution}, year={2021}, volume={}, number={}, From edd4bc1b5abe54393d672a2e67cb3c3e8b7a63f7 Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Fri, 21 Oct 2022 01:48:43 +0000 Subject: [PATCH 06/19] fix prepare.sh syntax error --- test_tipc/prepare.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh index b76332af9..452ec31c4 100644 --- a/test_tipc/prepare.sh +++ b/test_tipc/prepare.sh @@ -290,6 +290,7 @@ if [ ${MODE} = "lite_train_lite_infer" ];then if [ ${model_name} == "sr_telescope" ]; then wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/TextZoom.tar --no-check-certificate cd ./train_data/ && tar xf TextZoom.tar && cd ../ + fi if [ ${model_name} == "rec_d28_can" ]; then wget -nc -P ./train_data/ https://paddleocr.bj.bcebos.com/dataset/CROHME_lite.tar --no-check-certificate cd ./train_data/ && tar xf CROHME_lite.tar && cd ../ From 4241dd06e9ff178db5996946ea8c84f106e18ae6 Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Mon, 24 Oct 2022 04:14:51 +0000 Subject: [PATCH 07/19] update PP-Structurev to PP-StructureV --- StyleText/README.md | 7 +-- StyleText/README_ch.md | 4 +- applications/PCB字符识别/PCB字符识别.md | 24 ++++---- applications/中文表格识别.md | 8 +-- .../光功率计数码管字符识别.md | 18 +++--- applications/印章弯曲文字识别.md | 6 +- applications/发票关键信息抽取.md | 8 +-- applications/液晶屏读数识别.md | 36 +++++------ applications/轻量级车牌识别.md | 32 +++++----- applications/高精度中文识别模型.md | 6 +- doc/doc_ch/PP-OCRv3_introduction.md | 56 ++++++++--------- 
doc/doc_ch/algorithm_det_east.md | 2 +- doc/doc_ch/algorithm_e2e_pgnet.md | 2 +- doc/doc_ch/algorithm_kie_sdmgr.md | 2 +- doc/doc_ch/algorithm_overview.md | 4 +- doc/doc_ch/algorithm_rec_can.md | 4 +- doc/doc_ch/algorithm_rec_rare.md | 4 +- doc/doc_ch/algorithm_rec_seed.md | 2 +- doc/doc_ch/algorithm_rec_spin.md | 2 +- doc/doc_ch/algorithm_rec_visionlan.md | 2 +- doc/doc_ch/models_list.md | 30 +++++----- doc/doc_en/PP-OCRv3_introduction_en.md | 60 +++++++++---------- doc/doc_en/algorithm_det_east_en.md | 2 +- doc/doc_en/algorithm_e2e_pgnet_en.md | 2 +- doc/doc_en/algorithm_kie_sdmgr_en.md | 2 +- doc/doc_en/algorithm_overview_en.md | 4 +- doc/doc_en/algorithm_rec_can_en.md | 2 +- doc/doc_en/algorithm_rec_rare_en.md | 4 +- doc/doc_en/algorithm_rec_seed_en.md | 2 +- doc/doc_en/algorithm_rec_spin_en.md | 2 +- doc/doc_en/algorithm_rec_visionlan_en.md | 2 +- doc/doc_en/distributed_training_en.md | 6 +- doc/doc_en/models_list_en.md | 22 +++---- .../docs/PP-StructureV2_introduction.md | 52 ++++++++-------- ppstructure/docs/models_list.md | 14 ++--- ppstructure/docs/models_list_en.md | 12 ++-- ppstructure/table/README.md | 2 +- ppstructure/table/README_ch.md | 2 +- 38 files changed, 225 insertions(+), 226 deletions(-) diff --git a/StyleText/README.md b/StyleText/README.md index 65a72ac80..609c90539 100644 --- a/StyleText/README.md +++ b/StyleText/README.md @@ -120,7 +120,7 @@ In actual application scenarios, it is often necessary to synthesize pictures in * `with_label`:Whether the `label_file` is label file list. * `CorpusGenerator`: * `method`:Method of CorpusGenerator,supports `FileCorpus` and `EnNumCorpus`. If `EnNumCorpus` is used,No other configuration is needed,otherwise you need to set `corpus_file` and `language`. - * `language`:Language of the corpus. Currently, the tool only supports English(en), Simplified Chinese(ch) and Korean(ko). + * `language`:Language of the corpus. Currently, the tool only supports English(en), Simplified Chinese(ch) and Korean(ko). 
* `corpus_file`: Filepath of the corpus. Corpus file should be a text file which will be split by line-endings('\n'). Corpus generator samples one line each time. @@ -171,9 +171,8 @@ After adding the above synthetic data for training, the accuracy of the recognit | Scenario | Characters | Raw Data | Test Data | Only Use Raw Data
Recognition Accuracy | New Synthetic Data | Simultaneous Use of Synthetic Data
Recognition Accuracy | Index Improvement | | -------- | ---------- | -------- | -------- | -------------------------- | ------------ | ---------------------- | -------- | -| Metal surface | English and numbers | 2203 | 650 | 0.5938 | 20000 | 0.7546 | 16% | -| Random background | Korean | 5631 | 1230 | 0.3012 | 100000 | 0.5057 | 20% | - +| Metal surface | English and numbers | 2203 | 650 | 59.38% | 20000 | 75.46% | 16.00% | +| Random background | Korean | 5,631 | 1230 | 30.12% | 100000 | 50.57% | 20.00% | ### Code Structure diff --git a/StyleText/README_ch.md b/StyleText/README_ch.md index ccd1efaf1..b35967f4a 100644 --- a/StyleText/README_ch.md +++ b/StyleText/README_ch.md @@ -156,8 +156,8 @@ python3 tools/synth_image.py -c configs/config.yml --style_image examples/style_ | 场景 | 字符 | 原始数据 | 测试数据 | 只使用原始数据
识别准确率 | 新增合成数据 | 同时使用合成数据
识别准确率 | 指标提升 | | -------- | ---------- | -------- | -------- | -------------------------- | ------------ | ---------------------- | -------- | -| 金属表面 | 英文和数字 | 2203 | 650 | 0.5938 | 20000 | 0.7546 | 16% | -| 随机背景 | 韩语 | 5631 | 1230 | 0.3012 | 100000 | 0.5057 | 20% | +| 金属表面 | 英文和数字 | 2203 | 650 | 59.38% | 20000 | 75.46% | 16.00% | +| 随机背景 | 韩语 | 5631 | 1,230 | 30.12% | 100000 | 50.57% | 20.00% | diff --git a/applications/PCB字符识别/PCB字符识别.md b/applications/PCB字符识别/PCB字符识别.md index ee13bacff..804d57e3b 100644 --- a/applications/PCB字符识别/PCB字符识别.md +++ b/applications/PCB字符识别/PCB字符识别.md @@ -266,8 +266,8 @@ python3 tools/eval.py \ | 序号 | 方案 | hmean | 效果提升 | 实验分析 | | -------- | -------- | -------- | -------- | -------- | | 1 | PP-OCRv3英文超轻量检测预训练模型 | 64.64% | - | 提供的预训练模型具有泛化能力 | -| 2 | PP-OCRv3英文超轻量检测预训练模型 + 验证集padding | 72.13% |+7.5% | padding可以提升尺寸较小图片的检测效果| -| 3 | PP-OCRv3英文超轻量检测预训练模型 + fine-tune | 100% | +27.9% | fine-tune会提升垂类场景效果 | +| 2 | PP-OCRv3英文超轻量检测预训练模型 + 验证集padding | 72.13% |+7.50% | padding可以提升尺寸较小图片的检测效果| +| 3 | PP-OCRv3英文超轻量检测预训练模型 + fine-tune | 100.00% | +27.90% | fine-tune会提升垂类场景效果 | ``` @@ -420,12 +420,12 @@ python3 tools/eval.py \ | 序号 | 方案 | acc | 效果提升 | 实验分析 | | -------- | -------- | -------- | -------- | -------- | | 1 | PP-OCRv3中英文超轻量识别预训练模型直接评估 | 46.67% | - | 提供的预训练模型具有泛化能力 | -| 2 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune | 42.02% |-4.6% | 在数据量不足的情况,反而比预训练模型效果低(也可以通过调整超参数再试试)| -| 3 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 公开通用识别数据集 | 77% | +30% | 在数据量不足的情况下,可以考虑补充公开数据训练 | -| 4 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 99.99% | +23% | 如果能获取更多数据量的情况,可以通过增加数据量提升效果 | +| 2 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune | 42.02% |-4.60% | 在数据量不足的情况,反而比预训练模型效果低(也可以通过调整超参数再试试)| +| 3 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 公开通用识别数据集 | 77.00% | +30.00% | 在数据量不足的情况下,可以考虑补充公开数据训练 | +| 4 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 99.99% | +23.00% | 如果能获取更多数据量的情况,可以通过增加数据量提升效果 | ``` 
-注:上述实验结果均是在1500张图片(1200张训练集,300张测试集)、2W张图片、添加公开通用识别数据集上训练、评估的得到,AIstudio只提供了100张数据,所以指标有所差异属于正常,只要策略有效、规律相同即可。 +注:上述实验结果均是在1,500张图片(1,200张训练集,300张测试集)、2W张图片、添加公开通用识别数据集上训练、评估的得到,AIstudio只提供了100张数据,所以指标有所差异属于正常,只要策略有效、规律相同即可。 ``` # 6. 模型导出 @@ -614,23 +614,23 @@ python3 tools/end2end/eval_end2end.py ./save_gt_label/ ./save_PPOCRV2_infer/ | 序号 | 方案 | hmean | 效果提升 | 实验分析 | | ---- | -------------------------------------------------------- | ------ | -------- | ------------------------------------- | | 1 | PP-OCRv3英文超轻量检测预训练模型直接评估 | 64.64% | - | 提供的预训练模型具有泛化能力 | -| 2 | PP-OCRv3英文超轻量检测预训练模型 + 验证集padding直接评估 | 72.13% | +7.5% | padding可以提升尺寸较小图片的检测效果 | -| 3 | PP-OCRv3英文超轻量检测预训练模型 + fine-tune | 100% | +27.9% | fine-tune会提升垂类场景效果 | +| 2 | PP-OCRv3英文超轻量检测预训练模型 + 验证集padding直接评估 | 72.13% | +7.50% | padding可以提升尺寸较小图片的检测效果 | +| 3 | PP-OCRv3英文超轻量检测预训练模型 + fine-tune | 100.00% | +27.90% | fine-tune会提升垂类场景效果 | * 识别 | 序号 | 方案 | acc | 效果提升 | 实验分析 | | ---- | ------------------------------------------------------------ | ------ | -------- | ------------------------------------------------------------ | | 1 | PP-OCRv3中英文超轻量识别预训练模型直接评估 | 46.67% | - | 提供的预训练模型具有泛化能力 | -| 2 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune | 42.02% | -4.6% | 在数据量不足的情况,反而比预训练模型效果低(也可以通过调整超参数再试试) | -| 3 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 公开通用识别数据集 | 77% | +30% | 在数据量不足的情况下,可以考虑补充公开数据训练 | -| 4 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 99.99% | +23% | 如果能获取更多数据量的情况,可以通过增加数据量提升效果 | +| 2 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune | 42.02% | -4.60% | 在数据量不足的情况,反而比预训练模型效果低(也可以通过调整超参数再试试) | +| 3 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 公开通用识别数据集 | 77.00% | +30.00% | 在数据量不足的情况下,可以考虑补充公开数据训练 | +| 4 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 99.99% | +23.00% | 如果能获取更多数据量的情况,可以通过增加数据量提升效果 | * 端到端 | det | rec | fmeasure | | --------------------------------------------- | ------------------------------------------------------------ | -------- | -| PP-OCRv3英文超轻量检测预训练模型 + fine-tune | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 
93.3% | +| PP-OCRv3英文超轻量检测预训练模型 + fine-tune | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 93.30% | *结论* diff --git a/applications/中文表格识别.md b/applications/中文表格识别.md index af7cc96b7..d61514ff2 100644 --- a/applications/中文表格识别.md +++ b/applications/中文表格识别.md @@ -34,7 +34,7 @@ ![](https://ai-studio-static-online.cdn.bcebos.com/5ffff2093a144a6993a75eef71634a52276015ee43a04566b9c89d353198c746) -当前的表格识别算法不能很好的处理这些场景下的表格图像。在本例中,我们使用PP-Structurev2最新发布的表格识别模型SLANet来演示如何进行中文表格是识别。同时,为了方便作业流程,我们使用表格属性识别模型对表格图像的属性进行识别,对表格的难易程度进行判断,加快人工进行校对速度。 +当前的表格识别算法不能很好的处理这些场景下的表格图像。在本例中,我们使用PP-StructureV2最新发布的表格识别模型SLANet来演示如何进行中文表格是识别。同时,为了方便作业流程,我们使用表格属性识别模型对表格图像的属性进行识别,对表格的难易程度进行判断,加快人工进行校对速度。 本项目AI Studio链接:https://aistudio.baidu.com/aistudio/projectdetail/4588067 @@ -192,14 +192,14 @@ plt.show() ### 2.3 训练 -这里选用PP-Structurev2中的表格识别模型[SLANet](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/configs/table/SLANet.yml) +这里选用PP-StructureV2中的表格识别模型[SLANet](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/configs/table/SLANet.yml) -SLANet是PP-Structurev2全新推出的表格识别模型,相比PP-Structurev1中TableRec-RARE,在速度不变的情况下精度提升4.7%。TEDS提升2% +SLANet是PP-StructureV2全新推出的表格识别模型,相比PP-StructureV1中TableRec-RARE,在速度不变的情况下精度提升4.7%。TEDS提升2% |算法|Acc|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)|Speed| | --- | --- | --- | ---| -| EDD[2] |x| 88.3% |x| +| EDD[2] |x| 88.30% |x| | TableRec-RARE(ours) | 71.73%| 93.88% |779ms| | SLANet(ours) | 76.31%| 95.89%|766ms| diff --git a/applications/光功率计数码管字符识别/光功率计数码管字符识别.md b/applications/光功率计数码管字符识别/光功率计数码管字符识别.md index 2a35cb170..25e32cfad 100644 --- a/applications/光功率计数码管字符识别/光功率计数码管字符识别.md +++ b/applications/光功率计数码管字符识别/光功率计数码管字符识别.md @@ -182,15 +182,15 @@ PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置P | ID | 策略 | 模型大小 | 精度 | 预测耗时(CPU + MKLDNN)| |-----|-----|--------|----| --- | -| 01 | PP-OCRv2 | 8M | 74.8% | 8.54ms | -| 02 | SVTR_Tiny | 21M | 80.1% | 97ms | -| 03 | SVTR_LCNet(h32) | 12M | 71.9% | 6.6ms | -| 04 | 
SVTR_LCNet(h48) | 12M | 73.98% | 7.6ms | -| 05 | + GTC | 12M | 75.8% | 7.6ms | -| 06 | + TextConAug | 12M | 76.3% | 7.6ms | -| 07 | + TextRotNet | 12M | 76.9% | 7.6ms | -| 08 | + UDML | 12M | 78.4% | 7.6ms | -| 09 | + UIM | 12M | 79.4% | 7.6ms | +| 01 | PP-OCRv2 | 8M | 74.80% | 8.54ms | +| 02 | SVTR_Tiny | 21M | 80.10% | 97.00ms | +| 03 | SVTR_LCNet(h32) | 12M | 71.90% | 6.60ms | +| 04 | SVTR_LCNet(h48) | 12M | 73.98% | 7.60ms | +| 05 | + GTC | 12M | 75.80% | 7.60ms | +| 06 | + TextConAug | 12M | 76.30% | 7.60ms | +| 07 | + TextRotNet | 12M | 76.90% | 7.60ms | +| 08 | + UDML | 12M | 78.40% | 7.60ms | +| 09 | + UIM | 12M | 79.40% | 7.60ms | ### 3.3 开始训练 diff --git a/applications/印章弯曲文字识别.md b/applications/印章弯曲文字识别.md index fce9ea772..702561cef 100644 --- a/applications/印章弯曲文字识别.md +++ b/applications/印章弯曲文字识别.md @@ -30,9 +30,9 @@ | 任务 | 训练数据数量 | 精度 | | -------- | - | -------- | -| 印章检测 | 1000 | 95% | -| 印章文字识别-端对端OCR方法 | 700 | 47% | -| 印章文字识别-两阶段OCR方法 | 700 | 55% | +| 印章检测 | 1000 | 95.00% | +| 印章文字识别-端对端OCR方法 | 700 | 47.00% | +| 印章文字识别-两阶段OCR方法 | 700 | 55.00% | 点击进入 [AI Studio 项目](https://aistudio.baidu.com/aistudio/projectdetail/4586113) diff --git a/applications/发票关键信息抽取.md b/applications/发票关键信息抽取.md index 82f5b8d48..b8a8ee216 100644 --- a/applications/发票关键信息抽取.md +++ b/applications/发票关键信息抽取.md @@ -145,8 +145,8 @@ LayoutXLM与VI-LayoutXLM针对该场景的训练结果如下所示。 | 模型 | 迭代轮数 | Hmean | | :---: | :---: | :---: | -| LayoutXLM | 50 | 100% | -| VI-LayoutXLM | 50 | 100% | +| LayoutXLM | 50 | 100.00% | +| VI-LayoutXLM | 50 | 100.00% | 可以看出,由于当前数据量较少,场景比较简单,因此2个模型的Hmean均达到了100%。 @@ -274,8 +274,8 @@ LayoutXLM与VI-LayoutXLM针对该场景的训练结果如下所示。 | 模型 | 迭代轮数 | Hmean | | :---: | :---: | :---: | -| LayoutXLM | 50 | 98.0% | -| VI-LayoutXLM | 50 | 99.3% | +| LayoutXLM | 50 | 98.00% | +| VI-LayoutXLM | 50 | 99.30% | 可以看出,对于VI-LayoutXLM相比LayoutXLM的Hmean高了1.3%。 diff --git a/applications/液晶屏读数识别.md b/applications/液晶屏读数识别.md index ff2fb2cb4..f70fa06d8 100644 --- a/applications/液晶屏读数识别.md +++ 
b/applications/液晶屏读数识别.md @@ -110,7 +110,7 @@ python tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Globa | | 方案 |hmeans| |---|---------------------------|---| -| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.5%| +| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.50%| #### 4.3.2 预训练模型直接finetune ##### 修改配置文件 @@ -143,8 +143,8 @@ python tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Globa 结果如下: | | 方案 |hmeans| |---|---------------------------|---| -| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.5%| -| 1 | PP-OCRv3中英文超轻量检测预训练模型fintune |65.2%| +| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.50%| +| 1 | PP-OCRv3中英文超轻量检测预训练模型fintune |65.20%| #### 4.3.3 基于预训练模型Finetune_student模型 @@ -175,9 +175,9 @@ python tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o G 结果如下: | | 方案 |hmeans| |---|---------------------------|---| -| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.5%| -| 1 | PP-OCRv3中英文超轻量检测预训练模型fintune |65.2%| -| 2 | PP-OCRv3中英文超轻量检测预训练模型fintune学生模型 |80.0%| +| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.50%| +| 1 | PP-OCRv3中英文超轻量检测预训练模型fintune |65.20%| +| 2 | PP-OCRv3中英文超轻量检测预训练模型fintune学生模型 |80.00%| #### 4.3.4 基于预训练模型Finetune_teacher模型 @@ -233,10 +233,10 @@ python tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml -o Globa 结果如下: | | 方案 |hmeans| |---|---------------------------|---| -| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.5%| -| 1 | PP-OCRv3中英文超轻量检测预训练模型fintune |65.2%| -| 2 | PP-OCRv3中英文超轻量检测预训练模型fintune学生模型 |80.0%| -| 3 | PP-OCRv3中英文超轻量检测预训练模型fintune教师模型 |84.8%| +| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.50%| +| 1 | PP-OCRv3中英文超轻量检测预训练模型fintune |65.20%| +| 2 | PP-OCRv3中英文超轻量检测预训练模型fintune学生模型 |80.00%| +| 3 | PP-OCRv3中英文超轻量检测预训练模型fintune教师模型 |84.80%| #### 4.3.5 采用CML蒸馏进一步提升student模型精度 @@ -294,11 +294,11 @@ python tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Globa 结果如下: | | 方案 |hmeans| |---|---------------------------|---| -| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.5%| -| 1 | PP-OCRv3中英文超轻量检测预训练模型fintune |65.2%| -| 2 | PP-OCRv3中英文超轻量检测预训练模型fintune学生模型 
|80.0%| -| 3 | PP-OCRv3中英文超轻量检测预训练模型fintune教师模型 |84.8%| -| 4 | 基于2和3训练好的模型fintune |82.7%| +| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.50%| +| 1 | PP-OCRv3中英文超轻量检测预训练模型fintune |65.20%| +| 2 | PP-OCRv3中英文超轻量检测预训练模型fintune学生模型 |80.00%| +| 3 | PP-OCRv3中英文超轻量检测预训练模型fintune教师模型 |84.80%| +| 4 | 基于2和3训练好的模型fintune |82.70%| 如需获取已训练模型,请扫码填写问卷,加入PaddleOCR官方交流群获取全部OCR垂类模型下载链接、《动手学OCR》电子书等全套OCR学习资料🎁
@@ -445,7 +445,7 @@ python tools/eval.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml -o 结果如下: | | 方案 |accuracy| |---|---------------------------|---| -| 0 | PP-OCRv3中英文超轻量识别预训练模型直接预测 |70.4%| +| 0 | PP-OCRv3中英文超轻量识别预训练模型直接预测 |70.40%| #### 开始训练 我们使用上面修改好的配置文件configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml,预训练模型,数据集路径,学习率,训练轮数等都已经设置完毕后,可以使用下面命令开始训练。 @@ -465,8 +465,8 @@ python tools/eval.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml -o 结果如下: | | 方案 |accuracy| |---|---------------------------|---| -| 0 | PP-OCRv3中英文超轻量识别预训练模型直接预测 |70.4%| -| 1 | PP-OCRv3中英文超轻量识别预训练模型finetune |82.2%| +| 0 | PP-OCRv3中英文超轻量识别预训练模型直接预测 |70.40%| +| 1 | PP-OCRv3中英文超轻量识别预训练模型finetune |82.20%| 如需获取已训练模型,请扫码填写问卷,加入PaddleOCR官方交流群获取全部OCR垂类模型下载链接、《动手学OCR》电子书等全套OCR学习资料🎁
diff --git a/applications/轻量级车牌识别.md b/applications/轻量级车牌识别.md index 1a63091b9..c9b76ee61 100644 --- a/applications/轻量级车牌识别.md +++ b/applications/轻量级车牌识别.md @@ -329,7 +329,7 @@ python tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o \ |方案|hmeans| |---|---| |PP-OCRv3中英文超轻量检测预训练模型直接预测|76.12%| -|PP-OCRv3中英文超轻量检测预训练模型 fine-tune|99%| +|PP-OCRv3中英文超轻量检测预训练模型 fine-tune|99.00%| 可以看到进行fine-tune能显著提升车牌检测的效果。 @@ -357,8 +357,8 @@ python3.7 deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv3/ch_PP-OCR |方案|hmeans| 模型大小 | 预测速度(lite) | |---|---|------|------------| -|PP-OCRv3中英文超轻量检测预训练模型 fine-tune|99%| 2.5M | 223ms | -|PP-OCRv3中英文超轻量检测预训练模型 fine-tune+量化|98.91%| 1M | 189ms | +|PP-OCRv3中英文超轻量检测预训练模型 fine-tune|99.00%| 2.5M | 223ms | +|PP-OCRv3中英文超轻量检测预训练模型 fine-tune+量化|98.91%| 1.0M | 189ms | 可以看到通过量化训练在精度几乎无损的情况下,降低模型体积60%并且推理速度提升15%。 @@ -492,7 +492,7 @@ text = text.replace('·','') |方案|acc| |---|---| -|PP-OCRv3中英文超轻量识别预训练模型直接预测|0.2%| +|PP-OCRv3中英文超轻量识别预训练模型直接预测|0.20%| |PP-OCRv3中英文超轻量识别预训练模型直接预测+后处理去掉多识别的`·`|90.97%| 可以看到,去掉多余的`·`能大幅提高精度。 @@ -547,7 +547,7 @@ python tools/eval.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml -o \ |方案| acc | |---|--------| -|PP-OCRv3中英文超轻量识别预训练模型直接预测| 0% | +|PP-OCRv3中英文超轻量识别预训练模型直接预测| 0.00% | |PP-OCRv3中英文超轻量识别预训练模型直接预测+后处理去掉多识别的`·`| 90.97% | |PP-OCRv3中英文超轻量识别预训练模型 fine-tune| 94.54% | @@ -578,7 +578,7 @@ python3.7 deploy/slim/quantization/quant.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_ |方案| acc | 模型大小 | 预测速度(lite) | |---|--------|-------|------------| |PP-OCRv3中英文超轻量识别预训练模型 fine-tune| 94.54% | 10.3M | 4.2ms | -|PP-OCRv3中英文超轻量识别预训练模型 fine-tune + 量化| 93.4% | 4.8M | 1.8ms | +|PP-OCRv3中英文超轻量识别预训练模型 fine-tune + 量化| 93.40% | 4.8M | 1.8ms | 可以看到量化后能降低模型体积53%并且推理速度提升57%,但是由于识别数据过少,量化带来了1%的精度下降。 @@ -738,7 +738,7 @@ fmeasure: 87.36% |PP-OCRv3中英文超轻量检测预训练模型
PP-OCRv3中英文超轻量识别预训练模型| 0.04% | |PP-OCRv3中英文超轻量检测预训练模型
PP-OCRv3中英文超轻量识别预训练模型 + 后处理去掉多识别的`·`| 78.27% | |PP-OCRv3中英文超轻量检测预训练模型+fine-tune
PP-OCRv3中英文超轻量识别预训练模型+fine-tune| 87.14% | -|PP-OCRv3中英文超轻量检测预训练模型+fine-tune+量化
PP-OCRv3中英文超轻量识别预训练模型+fine-tune+量化| 88% | +|PP-OCRv3中英文超轻量检测预训练模型+fine-tune+量化
PP-OCRv3中英文超轻量识别预训练模型+fine-tune+量化| 88.00% | 从结果中可以看到对预训练模型不做修改,只根据场景下的具体情况进行后处理的修改就能大幅提升端到端指标到78.27%,在CCPD数据集上进行 fine-tune 后指标进一步提升到87.14%, 在经过量化训练之后,由于检测模型的recall变高,指标进一步提升到88%。但是这个结果仍旧不符合检测模型+识别模型的真实性能(99%*94%=93%),因此我们需要对 base case 进行具体分析。 @@ -763,8 +763,8 @@ if len(txt) != 8: # 车牌字符串长度为8 |---|---|---|---|---|---| |PP-OCRv3中英文超轻量检测预训练模型
PP-OCRv3中英文超轻量识别预训练模型|0.04%|0.08%|0.02%|0.05%|0.00%(A)| |PP-OCRv3中英文超轻量检测预训练模型
PP-OCRv3中英文超轻量识别预训练模型 + 后处理去掉多识别的`·`|78.27%|90.84%|78.61%|79.43%|91.66%(A+B+C)| -|PP-OCRv3中英文超轻量检测预训练模型+fine-tune
PP-OCRv3中英文超轻量识别预训练模型+fine-tune|87.14%|90.40%|87.66%|89.98|92.5%(A+B+C)| -|PP-OCRv3中英文超轻量检测预训练模型+fine-tune+量化
PP-OCRv3中英文超轻量识别预训练模型+fine-tune+量化|88%|90.54%|88.5%|89.46%|92.02%(A+B+C)| +|PP-OCRv3中英文超轻量检测预训练模型+fine-tune
PP-OCRv3中英文超轻量识别预训练模型+fine-tune|87.14%|90.40%|87.66%|89.98%|92.50%(A+B+C)| +|PP-OCRv3中英文超轻量检测预训练模型+fine-tune+量化
PP-OCRv3中英文超轻量识别预训练模型+fine-tune+量化|88.00%|90.54%|88.50%|89.46%|92.02%(A+B+C)| 从结果中可以看到对预训练模型不做修改,只根据场景下的具体情况进行后处理的修改就能大幅提升端到端指标到91.66%,在CCPD数据集上进行 fine-tune 后指标进一步提升到92.5%, 在经过量化训练之后,指标变为92.02%。 @@ -800,17 +800,17 @@ python tools/infer/predict_system.py \ |方案|hmeans| 模型大小 | 预测速度(lite) | |---|---|------|------------| |PP-OCRv3中英文超轻量检测预训练模型直接预测|76.12%|2.5M| 233ms | -|PP-OCRv3中英文超轻量检测预训练模型 fine-tune|99%| 2.5M | 233ms | -|PP-OCRv3中英文超轻量检测预训练模型 fine-tune + 量化|98.91%| 1M | 189ms |fine-tune +|PP-OCRv3中英文超轻量检测预训练模型 fine-tune|99.00%| 2.5M | 233ms | +|PP-OCRv3中英文超轻量检测预训练模型 fine-tune + 量化|98.91%| 1.0M | 189ms | - 识别 |方案| acc | 模型大小 | 预测速度(lite) | |---|--------|-------|------------| -|PP-OCRv3中英文超轻量识别预训练模型直接预测| 0% |10.3M| 4.2ms | +|PP-OCRv3中英文超轻量识别预训练模型直接预测| 0.00% |10.3M| 4.2ms | |PP-OCRv3中英文超轻量识别预训练模型直接预测+后处理去掉多识别的`·`| 90.97% |10.3M| 4.2ms | -|PP-OCRv3中英文超轻量识别预训练模型 fine-tune| 94.54% | 10.3M | 4,2ms | -|PP-OCRv3中英文超轻量识别预训练模型 fine-tune + 量化| 93.4% | 4.8M | 1.8ms | +|PP-OCRv3中英文超轻量识别预训练模型 fine-tune| 94.54% | 10.3M | 4.2ms | +|PP-OCRv3中英文超轻量识别预训练模型 fine-tune + 量化| 93.40% | 4.8M | 1.8ms | - 端到端指标如下: @@ -819,8 +819,8 @@ python tools/infer/predict_system.py \ |---|---|---|---| |PP-OCRv3中英文超轻量检测预训练模型
PP-OCRv3中英文超轻量识别预训练模型|0.08%|12.8M|298ms| |PP-OCRv3中英文超轻量检测预训练模型
PP-OCRv3中英文超轻量识别预训练模型 + 后处理去掉多识别的`·`|91.66%|12.8M|298ms| -|PP-OCRv3中英文超轻量检测预训练模型+fine-tune
PP-OCRv3中英文超轻量识别预训练模型+fine-tune|92.5%|12.8M|298ms| -|PP-OCRv3中英文超轻量检测预训练模型+fine-tune+量化
PP-OCRv3中英文超轻量识别预训练模型+fine-tune+量化|92.02%|5.8M|224ms| +|PP-OCRv3中英文超轻量检测预训练模型+fine-tune
PP-OCRv3中英文超轻量识别预训练模型+fine-tune|92.50%|12.8M|298ms| +|PP-OCRv3中英文超轻量检测预训练模型+fine-tune+量化
PP-OCRv3中英文超轻量识别预训练模型+fine-tune+量化|92.02%|5.80M|224ms| **结论** diff --git a/applications/高精度中文识别模型.md b/applications/高精度中文识别模型.md index 4e71e2330..b233855f4 100644 --- a/applications/高精度中文识别模型.md +++ b/applications/高精度中文识别模型.md @@ -13,9 +13,9 @@ PP-OCRv3是百度开源的超轻量级场景文本检测识别模型库,其中 |中文识别算法|模型|UIM|精度| | --- | --- | --- |--- | -|PP-OCRv3|SVTR_LCNet| w/o |78.4%| -|PP-OCRv3|SVTR_LCNet| w |79.4%| -|SVTR|SVTR-Tiny|-|82.5%| +|PP-OCRv3|SVTR_LCNet| w/o |78.40%| +|PP-OCRv3|SVTR_LCNet| w |79.40%| +|SVTR|SVTR-Tiny|-|82.50%| aistudio项目链接: [高精度中文场景文本识别模型SVTR](https://aistudio.baidu.com/aistudio/projectdetail/4263032) diff --git a/doc/doc_ch/PP-OCRv3_introduction.md b/doc/doc_ch/PP-OCRv3_introduction.md index ddeb78d74..446af23e4 100644 --- a/doc/doc_ch/PP-OCRv3_introduction.md +++ b/doc/doc_ch/PP-OCRv3_introduction.md @@ -53,13 +53,13 @@ PP-OCRv3检测模型是对PP-OCRv2中的[CML](https://arxiv.org/pdf/2109.03144.p |序号|策略|模型大小|hmean|速度(cpu + mkldnn)| |-|-|-|-|-| -|baseline teacher|PP-OCR server|49M|83.2%|171ms| -|teacher1|DB-R50-LK-PAN|124M|85.0%|396ms| -|teacher2|DB-R50-LK-PAN-DML|124M|86.0%|396ms| -|baseline student|PP-OCRv2|3M|83.2%|117ms| -|student0|DB-MV3-RSE-FPN|3.6M|84.5%|124ms| -|student1|DB-MV3-CML(teacher2)|3M|84.3%|117ms| -|student2|DB-MV3-RSE-FPN-CML(teacher2)|3.6M|85.4%|124ms| +|baseline teacher|PP-OCR server|49.0M|83.20%|171ms| +|teacher1|DB-R50-LK-PAN|124.0M|85.00%|396ms| +|teacher2|DB-R50-LK-PAN-DML|124.0M|86.00%|396ms| +|baseline student|PP-OCRv2|3.0M|83.20%|117ms| +|student0|DB-MV3-RSE-FPN|3.6M|84.50%|124ms| +|student1|DB-MV3-CML(teacher2)|3.0M|84.30%|117ms| +|student2|DB-MV3-RSE-FPN-CML(teacher2)|3.60M|85.40%|124ms| 测试环境: Intel Gold 6148 CPU,预测时开启MKLDNN加速。 @@ -101,15 +101,15 @@ PP-OCRv3的识别模块是基于文本识别算法[SVTR](https://arxiv.org/abs/2 | ID | 策略 | 模型大小 | 精度 | 预测耗时(CPU + MKLDNN)| |-----|-----|--------|----| --- | -| 01 | PP-OCRv2 | 8M | 74.8% | 8.54ms | -| 02 | SVTR_Tiny | 21M | 80.1% | 97ms | -| 03 | SVTR_LCNet(h32) | 12M | 71.9% | 6.6ms | -| 04 | SVTR_LCNet(h48) | 12M | 73.98% | 
7.6ms | -| 05 | + GTC | 12M | 75.8% | 7.6ms | -| 06 | + TextConAug | 12M | 76.3% | 7.6ms | -| 07 | + TextRotNet | 12M | 76.9% | 7.6ms | -| 08 | + UDML | 12M | 78.4% | 7.6ms | -| 09 | + UIM | 12M | 79.4% | 7.6ms | +| 01 | PP-OCRv2 | 8.0M | 74.80% | 8.54ms | +| 02 | SVTR_Tiny | 21.0M | 80.10% | 97.00ms | +| 03 | SVTR_LCNet(h32) | 12.0M | 71.90% | 6.60ms | +| 04 | SVTR_LCNet(h48) | 12.0M | 73.98% | 7.60ms | +| 05 | + GTC | 12.0M | 75.80% | 7.60ms | +| 06 | + TextConAug | 12.0M | 76.30% | 7.60ms | +| 07 | + TextRotNet | 12.0M | 76.90% | 7.60ms | +| 08 | + UDML | 12.0M | 78.40% | 7.60ms | +| 09 | + UIM | 12.0M | 79.40% | 7.60ms | 注: 测试速度时,实验01-03输入图片尺寸均为(3,32,320),04-08输入图片尺寸均为(3,48,320)。在实际预测时,图像为变长输入,速度会有所变化。测试环境: Intel Gold 6148 CPU,预测时开启MKLDNN加速。 @@ -144,12 +144,12 @@ SVTR_Tiny 网络结构如下所示: | ID | 策略 | 模型大小 | 精度 | 速度(CPU + MKLDNN)| |-----|-----|--------|----| --- | -| 01 | PP-OCRv2-baseline | 8M | 69.3% | 8.54ms | -| 02 | SVTR_Tiny | 21M | 80.1% | 97ms | -| 03 | SVTR_LCNet(G4) | 9.2M | 76% | 30ms | -| 04 | SVTR_LCNet(G2) | 13M | 72.98% | 9.37ms | -| 05 | SVTR_LCNet(h32) | 12M | 71.9% | 6.6ms | -| 06 | SVTR_LCNet(h48) | 12M | 73.98% | 7.6ms | +| 01 | PP-OCRv2-baseline | 8.0M | 69.30% | 8.54ms | +| 02 | SVTR_Tiny | 21.0M | 80.10% | 97.00ms | +| 03 | SVTR_LCNet(G4) | 9.2M | 76.00% | 30.00ms | +| 04 | SVTR_LCNet(G2) | 13.0M | 72.98% | 9.37ms | +| 05 | SVTR_LCNet(h32) | 12.0M | 71.90% | 6.60ms | +| 06 | SVTR_LCNet(h48) | 12.0M | 73.98% | 7.60ms | 注: 测试速度时,01-05输入图片尺寸均为(3,32,320); PP-OCRv2-baseline 代表没有借助蒸馏方法训练得到的模型 @@ -199,10 +199,10 @@ UIM(Unlabeled Images Mining)是一种非常简单的无标注数据挖掘方 | Model | Hmean | Model Size (M) | Time Cost (CPU, ms) | Time Cost (T4 GPU, ms) | |-----|-----|--------|----| --- | -| PP-OCR mobile | 50.3% | 8.1 | 356 | 116 | -| PP-OCR server | 57.0% | 155.1 | 1056 | 200 | -| PP-OCRv2 | 57.6% | 11.6 | 330 | 111 | -| PP-OCRv3 | 62.9% | 15.6 | 331 | 86.64 | +| PP-OCR mobile | 50.30% | 8.1 | 356.00 | 116.00 | +| PP-OCR server | 57.00% | 155.1 | 1056.00 | 200.00 | 
+| PP-OCRv2 | 57.60% | 11.6 | 330.00 | 111.00 | +| PP-OCRv3 | 62.90% | 15.6 | 331.00 | 86.64 | 测试环境:CPU型号为Intel Gold 6148,CPU预测时开启MKLDNN加速。 @@ -218,5 +218,5 @@ UIM(Unlabeled Images Mining)是一种非常简单的无标注数据挖掘方 | Model | 拉丁语系 | 阿拉伯语系 | 日语 | 韩语 | |-----|-----|--------|----| --- | -| PP-OCR_mul | 69.6% | 40.5% | 38.5% | 55.4% | -| PP-OCRv3_mul | 75.2%| 45.37% | 45.8% | 60.1% | +| PP-OCR_mul | 69.60% | 40.50% | 38.50% | 55.40% | +| PP-OCRv3_mul | 75.20%| 45.37% | 45.80% | 60.10% | diff --git a/doc/doc_ch/algorithm_det_east.md b/doc/doc_ch/algorithm_det_east.md index b89018e34..94a0d097d 100644 --- a/doc/doc_ch/algorithm_det_east.md +++ b/doc/doc_ch/algorithm_det_east.md @@ -27,7 +27,7 @@ |模型|骨干网络|配置文件|precision|recall|Hmean|下载链接| | --- | --- | --- | --- | --- | --- | --- | |EAST|ResNet50_vd|88.71%| 81.36%| 84.88%| [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| -|EAST| MobileNetV3| 78.2%| 79.1%| 78.65%| [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| +|EAST| MobileNetV3| 78.20%| 79.10%| 78.65%| [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| diff --git a/doc/doc_ch/algorithm_e2e_pgnet.md b/doc/doc_ch/algorithm_e2e_pgnet.md index 83c1114e5..934328106 100644 --- a/doc/doc_ch/algorithm_e2e_pgnet.md +++ b/doc/doc_ch/algorithm_e2e_pgnet.md @@ -34,7 +34,7 @@ PGNet算法细节详见[论文](https://www.aaai.org/AAAI21Papers/AAAI-2885.Wang |PGNetA|det_precision|det_recall|det_f_score|e2e_precision|e2e_recall|e2e_f_score|FPS|下载| | --- | --- | --- | --- | --- | --- | --- | --- | --- | -|Paper|85.30|86.80|86.1|-|-|61.7|38.20 (size=640)|-| +|Paper|85.30|86.80|86.10|-|-|61.70|38.20 (size=640)|-| |Ours|87.03|82.48|84.69|61.71|58.43|60.03|48.73 (size=768)|[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/en_server_pgnetA.tar)| *note:PaddleOCR里的PGNet实现针对预测速度做了优化,在精度下降可接受范围内,可以显著提升端对端预测速度* diff --git a/doc/doc_ch/algorithm_kie_sdmgr.md b/doc/doc_ch/algorithm_kie_sdmgr.md index 
10f3ca063..86b44f6d4 100644 --- a/doc/doc_ch/algorithm_kie_sdmgr.md +++ b/doc/doc_ch/algorithm_kie_sdmgr.md @@ -33,7 +33,7 @@ |模型|骨干网络|配置文件|hmean|下载链接| | --- | --- | --- | --- | --- | -|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.7%|[训练模型]( https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)/[推理模型(coming soon)]()| +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.70%|[训练模型]( https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)/[推理模型(coming soon)]()| diff --git a/doc/doc_ch/algorithm_overview.md b/doc/doc_ch/algorithm_overview.md index 7de581c27..7f6919c13 100755 --- a/doc/doc_ch/algorithm_overview.md +++ b/doc/doc_ch/algorithm_overview.md @@ -36,7 +36,7 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广 |模型|骨干网络|precision|recall|Hmean|下载链接| | --- | --- | --- | --- | --- | --- | |EAST|ResNet50_vd|88.71%|81.36%|84.88%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| -|EAST|MobileNetV3|78.2%|79.1%|78.65%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)| +|EAST|MobileNetV3|78.20%|79.10%|78.65%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)| |DB|ResNet50_vd|86.41%|78.72%|82.38%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)| |DB|MobileNetV3|77.29%|73.08%|75.12%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar)| |SAST|ResNet50_vd|91.39%|83.77%|87.42%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)| @@ -143,7 +143,7 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广 |模型|骨干网络|配置文件|hmean|下载链接| | --- | --- | --- | --- | --- | -|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.7%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| 
+|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.70%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| 在XFUND_zh公开数据集上,算法效果如下: diff --git a/doc/doc_ch/algorithm_rec_can.md b/doc/doc_ch/algorithm_rec_can.md index e4f4ba6f3..816a255ef 100644 --- a/doc/doc_ch/algorithm_rec_can.md +++ b/doc/doc_ch/algorithm_rec_can.md @@ -27,7 +27,7 @@ |模型 |骨干网络|配置文件|ExpRate|下载链接| | ----- | ----- | ----- | ----- | ----- | -|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)| +|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)| ## 2. 环境配置 @@ -60,7 +60,7 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs python3 tools/train.py -c configs/rec/rec_d28_can.yml -o Train.dataset.transforms.GrayImageChannelFormat.inverse=False ``` -- 默认每训练1个epoch(1105次iteration)进行1次评估,若您更改训练的batch_size,或更换数据集,请在训练时作出如下修改 +- 默认每训练1个epoch(1,105次iteration)进行1次评估,若您更改训练的batch_size,或更换数据集,请在训练时作出如下修改 ``` python3 tools/train.py -c configs/rec/rec_d28_can.yml -o Global.eval_batch_step=[0, {length_of_dataset//batch_size}] diff --git a/doc/doc_ch/algorithm_rec_rare.md b/doc/doc_ch/algorithm_rec_rare.md index dddd27ef9..9476c2e69 100644 --- a/doc/doc_ch/algorithm_rec_rare.md +++ b/doc/doc_ch/algorithm_rec_rare.md @@ -25,8 +25,8 @@ |模型|骨干网络|配置文件|Avg Accuracy|下载链接| | --- | --- | --- | --- | --- | -|RARE|Resnet34_vd|[configs/rec/rec_r34_vd_tps_bilstm_att.yml](../../configs/rec/rec_r34_vd_tps_bilstm_att.yml)|83.6%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| -|RARE|MobileNetV3|[configs/rec/rec_mv3_tps_bilstm_att.yml](../../configs/rec/rec_mv3_tps_bilstm_att.yml)|82.5%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| 
+|RARE|Resnet34_vd|[configs/rec/rec_r34_vd_tps_bilstm_att.yml](../../configs/rec/rec_r34_vd_tps_bilstm_att.yml)|83.60%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| +|RARE|MobileNetV3|[configs/rec/rec_mv3_tps_bilstm_att.yml](../../configs/rec/rec_mv3_tps_bilstm_att.yml)|82.50%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| diff --git a/doc/doc_ch/algorithm_rec_seed.md b/doc/doc_ch/algorithm_rec_seed.md index 710e92272..6d59c9fee 100644 --- a/doc/doc_ch/algorithm_rec_seed.md +++ b/doc/doc_ch/algorithm_rec_seed.md @@ -27,7 +27,7 @@ |模型|骨干网络|Avg Accuracy|配置文件|下载链接| |---|---|---|---|---| -|SEED|Aster_Resnet| 85.2% | [configs/rec/rec_resnet_stn_bilstm_att.yml](../../configs/rec/rec_resnet_stn_bilstm_att.yml) | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) | +|SEED|Aster_Resnet| 85.20% | [configs/rec/rec_resnet_stn_bilstm_att.yml](../../configs/rec/rec_resnet_stn_bilstm_att.yml) | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) | ## 2. 
环境配置 diff --git a/doc/doc_ch/algorithm_rec_spin.md b/doc/doc_ch/algorithm_rec_spin.md index 908a85a41..2b9c04abc 100644 --- a/doc/doc_ch/algorithm_rec_spin.md +++ b/doc/doc_ch/algorithm_rec_spin.md @@ -26,7 +26,7 @@ SPIN收录于AAAI2020。主要用于OCR识别任务。在任意形状文本识 |模型|骨干网络|配置文件|Acc|下载链接| | --- | --- | --- | --- | --- | -|SPIN|ResNet32|[rec_r32_gaspin_bilstm_att.yml](../../configs/rec/rec_r32_gaspin_bilstm_att.yml)|90.0%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar)| +|SPIN|ResNet32|[rec_r32_gaspin_bilstm_att.yml](../../configs/rec/rec_r32_gaspin_bilstm_att.yml)|90.00%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar)| diff --git a/doc/doc_ch/algorithm_rec_visionlan.md b/doc/doc_ch/algorithm_rec_visionlan.md index b4474c29f..eb58942c2 100644 --- a/doc/doc_ch/algorithm_rec_visionlan.md +++ b/doc/doc_ch/algorithm_rec_visionlan.md @@ -27,7 +27,7 @@ |模型|骨干网络|配置文件|Acc|下载链接| | --- | --- | --- | --- | --- | -|VisionLAN|ResNet45|[rec_r45_visionlan.yml](../../configs/rec/rec_r45_visionlan.yml)|90.3%|[预训练、训练模型](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar)| +|VisionLAN|ResNet45|[rec_r45_visionlan.yml](../../configs/rec/rec_r45_visionlan.yml)|90.30%|[预训练、训练模型](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar)| ## 2. 
环境配置 diff --git a/doc/doc_ch/models_list.md b/doc/doc_ch/models_list.md index c36cac037..c6cbd6873 100644 --- a/doc/doc_ch/models_list.md +++ b/doc/doc_ch/models_list.md @@ -42,12 +42,12 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- | --- | |ch_PP-OCRv3_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)| -|ch_PP-OCRv3_det| 【最新】原始超轻量模型,支持中英文、多语种文本检测 |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)| -|ch_PP-OCRv2_det_slim| slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| -|ch_PP-OCRv2_det| 原始超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| -|ch_ppocr_mobile_slim_v2.0_det|slim裁剪版超轻量模型,支持中英文、多语种文本检测|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)| 2.6M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar)| 
-|ch_ppocr_mobile_v2.0_det|原始超轻量模型,支持中英文、多语种文本检测|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|3M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)| -|ch_ppocr_server_v2.0_det|通用模型,支持中英文、多语种文本检测,比超轻量模型更大,但效果更好|[ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml)|47M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)| +|ch_PP-OCRv3_det| 【最新】原始超轻量模型,支持中英文、多语种文本检测 |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.80M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)| +|ch_PP-OCRv2_det_slim| slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3.0M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| +|ch_PP-OCRv2_det| 原始超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3.0M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| +|ch_ppocr_mobile_slim_v2.0_det|slim裁剪版超轻量模型,支持中英文、多语种文本检测|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)| 2.60M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar)| +|ch_ppocr_mobile_v2.0_det|原始超轻量模型,支持中英文、多语种文本检测|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|3.0M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / 
[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)| +|ch_ppocr_server_v2.0_det|通用模型,支持中英文、多语种文本检测,比超轻量模型更大,但效果更好|[ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml)|47.0M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)| @@ -83,10 +83,10 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 | --- | --- | --- | --- | --- | |ch_PP-OCRv3_rec_slim |【最新】slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) | |ch_PP-OCRv3_rec|【最新】原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | -|ch_PP-OCRv2_rec_slim| slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | -|ch_PP-OCRv2_rec| 原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | 
-|ch_ppocr_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持中英文、数字识别|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) | -|ch_ppocr_mobile_v2.0_rec|原始超轻量模型,支持中英文、数字识别|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)|5.2M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | +|ch_PP-OCRv2_rec_slim| slim量化版超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9.0M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | +|ch_PP-OCRv2_rec| 原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.50M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | +|ch_ppocr_mobile_slim_v2.0_rec|slim裁剪量化版超轻量模型,支持中英文、数字识别|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6.0M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) | 
+|ch_ppocr_mobile_v2.0_rec|原始超轻量模型,支持中英文、数字识别|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)|5.20M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | |ch_ppocr_server_v2.0_rec|通用模型,支持中英文、数字识别|[rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml)|94.8M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | **说明:** `训练模型`是基于预训练模型在真实数据与竖排合成文本数据上finetune得到的模型,在真实应用场景中有着更好的表现,`预训练模型`则是直接基于全量真实数据与合成数据训练得到,更适合用于在自己的数据集上finetune。 @@ -107,9 +107,9 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训 |模型名称|字典文件|模型简介|配置文件|推理模型大小|下载地址| | --- | --- | --- | --- |--- | --- | -| korean_PP-OCRv3_rec | ppocr/utils/dict/korean_dict.txt |韩文识别|[korean_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_train.tar) | -| japan_PP-OCRv3_rec | ppocr/utils/dict/japan_dict.txt |日文识别|[japan_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml)|11M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_train.tar) | -| chinese_cht_PP-OCRv3_rec | ppocr/utils/dict/chinese_cht_dict.txt | 
中文繁体识别|[chinese_cht_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml)|12M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_train.tar) | +| korean_PP-OCRv3_rec | ppocr/utils/dict/korean_dict.txt |韩文识别|[korean_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml)|11.0M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_train.tar) | +| japan_PP-OCRv3_rec | ppocr/utils/dict/japan_dict.txt |日文识别|[japan_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml)|11.0M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_train.tar) | +| chinese_cht_PP-OCRv3_rec | ppocr/utils/dict/chinese_cht_dict.txt | 中文繁体识别|[chinese_cht_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml)|12.0M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_train.tar) | | te_PP-OCRv3_rec | ppocr/utils/dict/te_dict.txt | 泰卢固文识别|[te_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_train.tar) | | ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt |卡纳达文识别|[ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml)|9.9M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar) / 
[训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_train.tar) | | ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt |泰米尔文识别|[ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml)|9.6M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_train.tar) | @@ -140,9 +140,9 @@ Paddle-Lite 是一个高性能、轻量级、灵活性强且易于扩展的深 |模型版本|模型简介|模型大小|检测模型|文本方向分类模型|识别模型|Paddle-Lite版本| |---|---|---|---|---|---|---| -|PP-OCRv2|蒸馏版超轻量中文OCR移动端模型|11M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10| +|PP-OCRv2|蒸馏版超轻量中文OCR移动端模型|11.0M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10| |PP-OCRv2(slim)|蒸馏版超轻量中文OCR移动端模型|4.6M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_slim_opt.nb)|v2.10| -|PP-OCRv2|蒸馏版超轻量中文OCR移动端模型|11M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer_opt.nb)|v2.9| 
+|PP-OCRv2|蒸馏版超轻量中文OCR移动端模型|11.0M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer_opt.nb)|v2.9| |PP-OCRv2(slim)|蒸馏版超轻量中文OCR移动端模型|4.9M|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_opt.nb)|v2.9| |V2.0|ppocr_v2.0超轻量中文OCR移动端模型|7.8M|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_opt.nb)|v2.9| |V2.0(slim)|ppocr_v2.0超轻量中文OCR移动端模型|3.3M|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_slim_opt.nb)|v2.9| diff --git a/doc/doc_en/PP-OCRv3_introduction_en.md b/doc/doc_en/PP-OCRv3_introduction_en.md index 815ad9b0e..8d5a36edf 100644 --- a/doc/doc_en/PP-OCRv3_introduction_en.md +++ b/doc/doc_en/PP-OCRv3_introduction_en.md @@ -55,13 +55,13 @@ The ablation experiments are as follows: |ID|Strategy|Model Size|Hmean|The Inference Time(cpu + mkldnn)| |-|-|-|-|-| -|baseline teacher|PP-OCR server|49M|83.2%|171ms| -|teacher1|DB-R50-LK-PAN|124M|85.0%|396ms| -|teacher2|DB-R50-LK-PAN-DML|124M|86.0%|396ms| -|baseline student|PP-OCRv2|3M|83.2%|117ms| -|student0|DB-MV3-RSE-FPN|3.6M|84.5%|124ms| -|student1|DB-MV3-CML(teacher2)|3M|84.3%|117ms| -|student2|DB-MV3-RSE-FPN-CML(teacher2)|3.6M|85.4%|124ms| +|baseline teacher|PP-OCR server|49.0M|83.20%|171ms| 
+|teacher1|DB-R50-LK-PAN|124.0M|85.00%|396ms| +|teacher2|DB-R50-LK-PAN-DML|124.0M|86.00%|396ms| +|baseline student|PP-OCRv2|3.0M|83.20%|117ms| +|student0|DB-MV3-RSE-FPN|3.6M|84.50%|124ms| +|student1|DB-MV3-CML(teacher2)|3.0M|84.30%|117ms| +|student2|DB-MV3-RSE-FPN-CML(teacher2)|3.6M|85.40%|124ms| Testing environment: Intel Gold 6148 CPU, with MKLDNN acceleration enabled during inference. @@ -111,15 +111,15 @@ Based on the above strategy, compared with PP-OCRv2, the PP-OCRv3 recognition mo | ID | strategy | Model size | accuracy | prediction speed(CPU + MKLDNN)| |-----|-----|--------|----| --- | -| 01 | PP-OCRv2 | 8M | 74.8% | 8.54ms | -| 02 | SVTR_Tiny | 21M | 80.1% | 97ms | -| 03 | SVTR_LCNet(h32) | 12M | 71.9% | 6.6ms | -| 04 | SVTR_LCNet(h48) | 12M | 73.98% | 7.6ms | -| 05 | + GTC | 12M | 75.8% | 7.6ms | -| 06 | + TextConAug | 12M | 76.3% | 7.6ms | -| 07 | + TextRotNet | 12M | 76.9% | 7.6ms | -| 08 | + UDML | 12M | 78.4% | 7.6ms | -| 09 | + UIM | 12M | 79.4% | 7.6ms | +| 01 | PP-OCRv2 | 8.0M | 74.80% | 8.54ms | +| 02 | SVTR_Tiny | 21.0M | 80.10% | 97.00ms | +| 03 | SVTR_LCNet(h32) | 12.0M | 71.90% | 6.60ms | +| 04 | SVTR_LCNet(h48) | 12.0M | 73.98% | 7.60ms | +| 05 | + GTC | 12.0M | 75.80% | 7.60ms | +| 06 | + TextConAug | 12.0M | 76.30% | 7.60ms | +| 07 | + TextRotNet | 12.0M | 76.90% | 7.60ms | +| 08 | + UDML | 12.0M | 78.40% | 7.60ms | +| 09 | + UIM | 12.0M | 79.40% | 7.60ms | Note: When testing the speed, the input image shape of Experiment 01-03 is (3, 32, 320), and the input image shape of 04-08 is (3, 48, 320). In the actual prediction, the image is a variable-length input, and the speed will vary. Testing environment: Intel Gold 6148 CPU, with MKLDNN acceleration enabled during prediction. 
@@ -158,12 +158,12 @@ The ablation experiments are as follows: | ID | strategy | Model size | accuracy | prediction speed(CPU + MKLDNN)| |-----|-----|--------|----| --- | -| 01 | PP-OCRv2-baseline | 8M | 69.3% | 8.54ms | -| 02 | SVTR_Tiny | 21M | 80.1% | 97ms | -| 03 | SVTR_LCNet(G4) | 9.2M | 76% | 30ms | -| 04 | SVTR_LCNet(G2) | 13M | 72.98% | 9.37ms | -| 05 | SVTR_LCNet(h32) | 12M | 71.9% | 6.6ms | -| 06 | SVTR_LCNet(h48) | 12M | 73.98% | 7.6ms | +| 01 | PP-OCRv2-baseline | 8.0M | 69.30% | 8.54ms | +| 02 | SVTR_Tiny | 21.0M | 80.10% | 97.00ms | +| 03 | SVTR_LCNet(G4) | 9.2M | 76.00% | 30.00ms | +| 04 | SVTR_LCNet(G2) | 13.0M | 72.98% | 9.37ms | +| 05 | SVTR_LCNet(h32) | 12.0M | 71.90% | 6.60ms | +| 06 | SVTR_LCNet(h48) | 12.0M | 73.98% | 7.60ms | Note: When testing the speed, the input image shape of 01-05 are all (3, 32, 320); PP-OCRv2-baseline represents the model trained without distillation method @@ -210,21 +210,21 @@ UIM (Unlabeled Images Mining) is a very simple unlabeled data mining strategy. T ## 4. End-to-end Evaluation -With the optimization strategies mentioned above, PP-OCRv3 outperforms PP-OCRv2 by 5% in terms of end-to-end Hmean for Chinese scenarios with comparable speed. The specific metrics are shown as follows. +With the optimization strategies mentioned above, PP-OCRv3 outperforms PP-OCRv2 by 5.00% in terms of end-to-end Hmean for Chinese scenarios with comparable speed. The specific metrics are shown as follows. 
| Model | Hmean | Model Size (M) | Time Cost (CPU, ms) | Time Cost (T4 GPU, ms) | |-----|-----|--------|----| --- | -| PP-OCR mobile | 50.3% | 8.1 | 356 | 116 | -| PP-OCR server | 57.0% | 155.1 | 1056 | 200 | -| PP-OCRv2 | 57.6% | 11.6 | 330 | 111 | -| PP-OCRv3 | 62.9% | 15.6 | 331 | 86.64 | +| PP-OCR mobile | 50.30% | 8.1 | 356.00 | 116.00 | +| PP-OCR server | 57.00% | 155.1 | 1056.00 | 200.00 | +| PP-OCRv2 | 57.60% | 11.6 | 330.00 | 111.00 | +| PP-OCRv3 | 62.90% | 15.6 | 331.00 | 86.64 | Testing environment: - CPU: Intel Gold 6148, and MKLDNN acceleration is enabled during CPU inference. -In addition to Chinese scenarios, the recognition model for English is also optimized with an increasement of 11% for end-to-end Hmean, which is shown as follows. +In addition to Chinese scenarios, the recognition model for English is also optimized with an improvement of 11.00% for end-to-end Hmean, which is shown as follows. | Model | Recall | Precision | Hmean | |-----|-----|--------|----| @@ -235,5 +235,5 @@ At the same time, recognition models for more than 80 language are also upgraded | Model | Latin | Arabic | Japanese | Korean | |-----|-----|--------|----| --- | -| PP-OCR_mul | 69.6% | 40.5% | 38.5% | 55.4% | -| PP-OCRv3_mul | 75.2% | 45.37% | 45.8% | 60.1% | +| PP-OCR_mul | 69.60% | 40.50% | 38.50% | 55.40% | +| PP-OCRv3_mul | 75.20% | 45.37% | 45.80% | 60.10% | diff --git a/doc/doc_en/algorithm_det_east_en.md b/doc/doc_en/algorithm_det_east_en.md index 07c434a9b..3848464ab 100644 --- a/doc/doc_en/algorithm_det_east_en.md +++ b/doc/doc_en/algorithm_det_east_en.md @@ -27,7 +27,7 @@ On the ICDAR2015 dataset, the text detection result is as follows: |Model|Backbone|Configuration|Precision|Recall|Hmean|Download| | --- | --- | --- | --- | --- | --- | --- | |EAST|ResNet50_vd|88.71%| 81.36%| 84.88%| [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| -|EAST| MobileNetV3| 78.2%| 79.1%| 78.65%|
[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| +|EAST| MobileNetV3| 78.20%| 79.10%| 78.65%| [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)| diff --git a/doc/doc_en/algorithm_e2e_pgnet_en.md b/doc/doc_en/algorithm_e2e_pgnet_en.md index ab74c57bc..ccb5e6c07 100644 --- a/doc/doc_en/algorithm_e2e_pgnet_en.md +++ b/doc/doc_en/algorithm_e2e_pgnet_en.md @@ -29,7 +29,7 @@ The results of detection and recognition are as follows: #### Test environment: NVIDIA Tesla V100-SXM2-16GB |PGNetA|det_precision|det_recall|det_f_score|e2e_precision|e2e_recall|e2e_f_score|FPS|download| | --- | --- | --- | --- | --- | --- | --- | --- | --- | -|Paper|85.30|86.80|86.1|-|-|61.7|38.20 (size=640)|-| +|Paper|85.30|86.80|86.10|-|-|61.70|38.20 (size=640)|-| |Ours|87.03|82.48|84.69|61.71|58.43|60.03|48.73 (size=768)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/en_server_pgnetA.tar)| *note:PGNet in PaddleOCR optimizes the prediction speed, and can significantly improve the end-to-end prediction speed within the acceptable range of accuracy reduction* diff --git a/doc/doc_en/algorithm_kie_sdmgr_en.md b/doc/doc_en/algorithm_kie_sdmgr_en.md index 5b12b8c95..ce52ef135 100644 --- a/doc/doc_en/algorithm_kie_sdmgr_en.md +++ b/doc/doc_en/algorithm_kie_sdmgr_en.md @@ -26,7 +26,7 @@ On wildreceipt dataset, the algorithm reproduction Hmean is as follows.
|Model|Backbone |Cnnfig|Hmean|Download link| | --- | --- | --- | --- | --- | -|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.7%|[trained model]( https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)/[inference model(coming soon)]()| +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.70%|[trained model]( https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)/[inference model(coming soon)]()| diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md index 09ff40791..309d074ed 100755 --- a/doc/doc_en/algorithm_overview_en.md +++ b/doc/doc_en/algorithm_overview_en.md @@ -34,7 +34,7 @@ On the ICDAR2015 dataset, the text detection result is as follows: |Model|Backbone|Precision|Recall|Hmean|Download link| | --- | --- | --- | --- | --- | --- | |EAST|ResNet50_vd|88.71%|81.36%|84.88%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| -|EAST|MobileNetV3|78.2%|79.1%|78.65%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)| +|EAST|MobileNetV3|78.20%|79.10%|78.65%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)| |DB|ResNet50_vd|86.41%|78.72%|82.38%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)| |DB|MobileNetV3|77.29%|73.08%|75.12%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar)| |SAST|ResNet50_vd|91.39%|83.77%|87.42%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)| @@ -141,7 +141,7 @@ On wildreceipt dataset, the algorithm result is as follows: |Model|Backbone|Config|Hmean|Download link| | --- | --- | --- | --- | --- | -|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.7%|[trained 
model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.70%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| On XFUND_zh dataset, the algorithm result is as follows: diff --git a/doc/doc_en/algorithm_rec_can_en.md b/doc/doc_en/algorithm_rec_can_en.md index e65bb2aa8..5cc7038f6 100644 --- a/doc/doc_en/algorithm_rec_can_en.md +++ b/doc/doc_en/algorithm_rec_can_en.md @@ -25,7 +25,7 @@ Using CROHME handwrittem mathematical expression recognition datasets for traini |Model|Backbone|config|exprate|Download link| | --- | --- | --- | --- | --- | -|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)| +|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72%|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)| ## 2. 
Environment diff --git a/doc/doc_en/algorithm_rec_rare_en.md b/doc/doc_en/algorithm_rec_rare_en.md index 3aeb1e3ad..a756ac75b 100644 --- a/doc/doc_en/algorithm_rec_rare_en.md +++ b/doc/doc_en/algorithm_rec_rare_en.md @@ -25,8 +25,8 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval |Models|Backbone Networks|Configuration Files|Avg Accuracy|Download Links| | --- | --- | --- | --- | --- | -|RARE|Resnet34_vd|[configs/rec/rec_r34_vd_tps_bilstm_att.yml](../../configs/rec/rec_r34_vd_tps_bilstm_att.yml)|83.6%|[training model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| -|RARE|MobileNetV3|[configs/rec/rec_mv3_tps_bilstm_att.yml](../../configs/rec/rec_mv3_tps_bilstm_att.yml)|82.5%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| +|RARE|Resnet34_vd|[configs/rec/rec_r34_vd_tps_bilstm_att.yml](../../configs/rec/rec_r34_vd_tps_bilstm_att.yml)|83.60%|[training model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| +|RARE|MobileNetV3|[configs/rec/rec_mv3_tps_bilstm_att.yml](../../configs/rec/rec_mv3_tps_bilstm_att.yml)|82.50%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| diff --git a/doc/doc_en/algorithm_rec_seed_en.md b/doc/doc_en/algorithm_rec_seed_en.md index f8d7ae6d3..83cadfcea 100644 --- a/doc/doc_en/algorithm_rec_seed_en.md +++ b/doc/doc_en/algorithm_rec_seed_en.md @@ -27,7 +27,7 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval |Model|Backbone|ACC|config|Download link| | --- | --- | --- | --- | --- | -|SEED|Aster_Resnet| 85.2% | [configs/rec/rec_resnet_stn_bilstm_att.yml](../../configs/rec/rec_resnet_stn_bilstm_att.yml) | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) | +|SEED|Aster_Resnet| 85.20% | 
[configs/rec/rec_resnet_stn_bilstm_att.yml](../../configs/rec/rec_resnet_stn_bilstm_att.yml) | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) | ## 2. Environment diff --git a/doc/doc_en/algorithm_rec_spin_en.md b/doc/doc_en/algorithm_rec_spin_en.md index 03f8d8f69..3aea58097 100644 --- a/doc/doc_en/algorithm_rec_spin_en.md +++ b/doc/doc_en/algorithm_rec_spin_en.md @@ -25,7 +25,7 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval |Model|Backbone|config|Acc|Download link| | --- | --- | --- | --- | --- | -|SPIN|ResNet32|[rec_r32_gaspin_bilstm_att.yml](../../configs/rec/rec_r32_gaspin_bilstm_att.yml)|90.0%|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar) | +|SPIN|ResNet32|[rec_r32_gaspin_bilstm_att.yml](../../configs/rec/rec_r32_gaspin_bilstm_att.yml)|90.00%|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar) | diff --git a/doc/doc_en/algorithm_rec_visionlan_en.md b/doc/doc_en/algorithm_rec_visionlan_en.md index f67aa3c62..585e85391 100644 --- a/doc/doc_en/algorithm_rec_visionlan_en.md +++ b/doc/doc_en/algorithm_rec_visionlan_en.md @@ -25,7 +25,7 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval |Model|Backbone|config|Acc|Download link| | --- | --- | --- | --- | --- | -|VisionLAN|ResNet45|[rec_r45_visionlan.yml](../../configs/rec/rec_r45_visionlan.yml)|90.3%|[预训练、训练模型](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar)| +|VisionLAN|ResNet45|[rec_r45_visionlan.yml](../../configs/rec/rec_r45_visionlan.yml)|90.30%|[pretrained and trained model](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar)| ## 2.
Environment diff --git a/doc/doc_en/distributed_training_en.md b/doc/doc_en/distributed_training_en.md index a9db354ad..947fb139b 100644 --- a/doc/doc_en/distributed_training_en.md +++ b/doc/doc_en/distributed_training_en.md @@ -47,14 +47,14 @@ python3 -m paddle.distributed.launch \ | Model | Configuration | Configuration | 8 GPU training time / Accuracy | 3x8 GPU training time / Accuracy | Acceleration ratio | |:------:|:-----:|:--------:|:--------:|:--------:|:-----:| -| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 260k Chinese dataset | 2.50d/66.7% | 1.67d/67.0% | **1.5** | +| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 260k Chinese dataset | 2.50d/66.70% | 1.67d/67.00% | **1.5** | * We conducted model training on 3x8 V100 GPUs. Accuracy, training time, and multi machine acceleration ratio of different models are shown below. | Model | Configuration | Configuration | 8 GPU training time / Accuracy | 3x8 GPU training time / Accuracy | Acceleration ratio | |:------:|:-----:|:--------:|:--------:|:--------:|:-----:| -| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.8h/76.2% | 19.75h/74.77% | **2.52** | +| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.80h/76.20% | 19.75h/74.77% | **2.52** | > Note: when training with 3x8 GPUs, the single card batch size is unchanged compared with the 1x8 GPUs' training process, and the learning rate is multiplied by 2 (if it is multiplied by 3 by default, the accuracy is only 73.42%). 
@@ -65,4 +65,4 @@ python3 -m paddle.distributed.launch \ | Model | Configuration | Configuration | 8 GPU training time / Accuracy | 4x8 GPU training time / Accuracy | Acceleration ratio | |:------:|:-----:|:--------:|:--------:|:--------:|:-----:| -| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d/- | 2.84d/74.0% | **3.5** | +| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d/- | 2.84d/74.00% | **3.5** | diff --git a/doc/doc_en/models_list_en.md b/doc/doc_en/models_list_en.md index c52f71dfe..3ec5013cf 100644 --- a/doc/doc_en/models_list_en.md +++ b/doc/doc_en/models_list_en.md @@ -39,11 +39,11 @@ Relationship of the above models is as follows. | --- | --- | --- | --- | --- | |ch_PP-OCRv3_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)| |ch_PP-OCRv3_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)| -|ch_PP-OCRv2_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M 
|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| -|ch_PP-OCRv2_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| +|ch_PP-OCRv2_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3.0M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| +|ch_PP-OCRv2_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3.0M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| |ch_ppocr_mobile_slim_v2.0_det|Slim pruned lightweight model, supporting Chinese, English, multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|2.6M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar)| -|ch_ppocr_mobile_v2.0_det|Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)| -|ch_ppocr_server_v2.0_det|General model, which is 
larger than the lightweight model, but achieved better performance|[ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml)|47M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)| +|ch_ppocr_mobile_v2.0_det|Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|3.0M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)| +|ch_ppocr_server_v2.0_det|General model, which is larger than the lightweight model, but achieved better performance|[ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml)|47.0M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)| @@ -77,9 +77,9 @@ Relationship of the above models is as follows. 
| --- | --- | --- | --- | --- | |ch_PP-OCRv3_rec_slim | [New] Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) | |ch_PP-OCRv3_rec| [New] Original lightweight model, supporting Chinese, English, multilingual text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | -|ch_PP-OCRv2_rec_slim| Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | +|ch_PP-OCRv2_rec_slim| Slim qunatization with distillation lightweight model, supporting Chinese, English text recognition|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9.0M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | |ch_PP-OCRv2_rec| Original lightweight model, supporting Chinese, English, multilingual text recognition 
|[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | -|ch_ppocr_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) | +|ch_ppocr_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6.0M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) | |ch_ppocr_mobile_v2.0_rec|Original lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)|5.2M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | |ch_ppocr_server_v2.0_rec|General model, supporting Chinese, English and number recognition|[rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml)|94.8M|[inference 
model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | @@ -101,9 +101,9 @@ Relationship of the above models is as follows. |model name| dict file | description|config|model size|download| | --- | --- | --- |--- | --- | --- | -| korean_PP-OCRv3_rec | ppocr/utils/dict/korean_dict.txt |Lightweight model for Korean recognition|[korean_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml)|11M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_train.tar) | -| japan_PP-OCRv3_rec | ppocr/utils/dict/japan_dict.txt |Lightweight model for Japanese recognition|[japan_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml)|11M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_train.tar) | -| chinese_cht_PP-OCRv3_rec | ppocr/utils/dict/chinese_cht_dict.txt | Lightweight model for chinese cht|[chinese_cht_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml)|12M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_train.tar) | +| korean_PP-OCRv3_rec | ppocr/utils/dict/korean_dict.txt |Lightweight model for Korean recognition|[korean_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml)|11.0M|[inference 
model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_train.tar) | +| japan_PP-OCRv3_rec | ppocr/utils/dict/japan_dict.txt |Lightweight model for Japanese recognition|[japan_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml)|11.0M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_train.tar) | +| chinese_cht_PP-OCRv3_rec | ppocr/utils/dict/chinese_cht_dict.txt | Lightweight model for chinese cht|[chinese_cht_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml)|12.0M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_train.tar) | | te_PP-OCRv3_rec | ppocr/utils/dict/te_dict.txt | Lightweight model for Telugu recognition |[te_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml)|9.6M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_train.tar) | | ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt | Lightweight model for Kannada recognition |[ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml)|9.9M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_train.tar) | | ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt |Lightweight model for Tamil recognition|[ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml)|9.6M|[inference 
model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_train.tar) | @@ -131,9 +131,9 @@ This chapter lists OCR nb models with PP-OCRv2 or earlier versions. You can acce |Version|Introduction|Model size|Detection model|Text Direction model|Recognition model|Paddle-Lite branch| |---|---|---|---|---|---|---| -|PP-OCRv2|extra-lightweight chinese OCR optimized model|11M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10| +|PP-OCRv2|extra-lightweight chinese OCR optimized model|11.0M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10| |PP-OCRv2(slim)|extra-lightweight chinese OCR optimized model|4.6M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_slim_opt.nb)|v2.10| -|PP-OCRv2|extra-lightweight chinese OCR optimized model|11M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer_opt.nb)|v2.9| +|PP-OCRv2|extra-lightweight chinese OCR optimized model|11.0M|[download 
link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer_opt.nb)|v2.9| |PP-OCRv2(slim)|extra-lightweight chinese OCR optimized model|4.9M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_opt.nb)|v2.9| |V2.0|ppocr_v2.0 extra-lightweight chinese OCR optimized model|7.8M|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_opt.nb)|v2.9| |V2.0(slim)|ppovr_v2.0 extra-lightweight chinese OCR optimized model|3.3M|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_slim_opt.nb)|v2.9| diff --git a/ppstructure/docs/PP-StructureV2_introduction.md b/ppstructure/docs/PP-StructureV2_introduction.md index efaf35f2b..555fc4560 100644 --- a/ppstructure/docs/PP-StructureV2_introduction.md +++ b/ppstructure/docs/PP-StructureV2_introduction.md @@ -16,11 +16,11 @@ 现实场景中包含大量的文档图像,它们以图片等非结构化形式存储。基于文档图像的结构化分析与信息抽取对于数据的数字化存储以及产业的数字化转型至关重要。基于该考虑,PaddleOCR自研并发布了PP-Structure智能文档分析系统,旨在帮助开发者更好的完成版面分析、表格识别、关键信息抽取等文档理解相关任务。 -近期,PaddleOCR团队针对PP-Structurev1的版面分析、表格识别、关键信息抽取模块,进行了共计8个方面的升级,同时新增整图方向矫正、文档复原等功能,打造出一个全新的、效果更优的文档分析系统:PP-StructureV2。 
+近期,PaddleOCR团队针对PP-StructureV1的版面分析、表格识别、关键信息抽取模块,进行了共计8个方面的升级,同时新增整图方向矫正、文档复原等功能,打造出一个全新的、效果更优的文档分析系统:PP-StructureV2。 ## 2. 简介 -PP-StructureV2在PP-Structurev1的基础上进一步改进,主要有以下3个方面升级: +PP-StructureV2在PP-StructureV1的基础上进一步改进,主要有以下3个方面升级: * **系统功能升级** :新增图像矫正和版面复原模块,图像转word/pdf、关键信息抽取能力全覆盖! * **系统性能优化** : @@ -52,7 +52,7 @@ PP-StructureV2系统流程图如下所示,文档图像首先经过图像矫正 * TB-YX:考虑阅读顺序的文本行排序逻辑 * UDML:联合互学习知识蒸馏策略 -最终,与PP-Structurev1相比: +最终,与PP-StructureV1相比: - 版面分析模型参数量减少95.6%,推理速度提升11倍,精度提升0.4%; - 表格识别预测耗时不变,模型精度提升6%,端到端TEDS提升2%; @@ -74,17 +74,17 @@ PP-StructureV2系统流程图如下所示,文档图像首先经过图像矫正 ### 4.1 版面分析 -版面分析指的是对图片形式的文档进行区域划分,定位其中的关键区域,如文字、标题、表格、图片等,PP-Structurev1使用了PaddleDetection中开源的高效检测算法PP-YOLOv2完成版面分析的任务。 +版面分析指的是对图片形式的文档进行区域划分,定位其中的关键区域,如文字、标题、表格、图片等,PP-StructureV1使用了PaddleDetection中开源的高效检测算法PP-YOLOv2完成版面分析的任务。 在PP-StructureV2中,我们发布基于PP-PicoDet的轻量级版面分析模型,并针对版面分析场景定制图像尺度,同时使用FGD知识蒸馏算法,进一步提升模型精度。最终CPU上`41ms`即可完成版面分析过程(仅包含模型推理时间,数据预处理耗时大约50ms左右)。在公开数据集PubLayNet 上,消融实验如下: | 实验序号 | 策略 | 模型存储(M) | mAP | CPU预测耗时(ms) | |:------:|:------:|:------:|:------:|:------:| -| 1 | PP-YOLOv2(640*640) | 221 | 93.6% | 512 | -| 2 | PP-PicoDet-LCNet2.5x(640*640) | 29.7 | 92.5% |53.2| -| 3 | PP-PicoDet-LCNet2.5x(800*608) | 29.7 | 94.2% |83.1 | -| 4 | PP-PicoDet-LCNet1.0x(800*608) | 9.7 | 93.5% | 41.2| -| 5 | PP-PicoDet-LCNet1.0x(800*608) + FGD | 9.7 | 94% |41.2| +| 1 | PP-YOLOv2(640*640) | 221.0 | 93.60% | 512.00 | +| 2 | PP-PicoDet-LCNet2.5x(640*640) | 29.7 | 92.50% |53.20| +| 3 | PP-PicoDet-LCNet2.5x(800*608) | 29.7 | 94.20% |83.10 | +| 4 | PP-PicoDet-LCNet1.0x(800*608) | 9.7 | 93.50% | 41.20| +| 5 | PP-PicoDet-LCNet1.0x(800*608) + FGD | 9.7 | 94.00% |41.20| * 测试条件 * paddle版本:2.3.0 @@ -94,8 +94,8 @@ PP-StructureV2系统流程图如下所示,文档图像首先经过图像矫正 | 模型 | mAP | CPU预测耗时 | |-------------------|-----------|------------| -| layoutparser (Detectron2) | 88.98% | 2.9s | -| PP-StructureV2 (PP-PicoDet) | **94%** | 41.2ms | +| layoutparser (Detectron2) | 88.98% | 2.90s | +| PP-StructureV2 (PP-PicoDet) | **94.00%** | 
41.20ms | [PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet)数据集是一个大型的文档图像数据集,包含Text、Title、Tale、Figure、List,共5个类别。数据集中包含335,703张训练集、11,245张验证集和11,405张测试集。训练数据与标注示例图如下所示: @@ -108,7 +108,7 @@ PP-StructureV2系统流程图如下所示,文档图像首先经过图像矫正 **(1)轻量级版面分析模型PP-PicoDet** -`PP-PicoDet`是PaddleDetection中提出的轻量级目标检测模型,通过使用PP-LCNet骨干网络、CSP-PAN特征融合模块、SimOTA标签分配方法等优化策略,最终在CPU与移动端具有卓越的性能。我们将PP-Structurev1中采用的PP-YOLOv2模型替换为`PP-PicoDet`,同时针对版面分析场景优化预测尺度,从针对目标检测设计的`640*640`调整为更适配文档图像的`800*608`,在`1.0x`配置下,模型精度与PP-YOLOv2相当,CPU平均预测速度可提升11倍。 +`PP-PicoDet`是PaddleDetection中提出的轻量级目标检测模型,通过使用PP-LCNet骨干网络、CSP-PAN特征融合模块、SimOTA标签分配方法等优化策略,最终在CPU与移动端具有卓越的性能。我们将PP-StructureV1中采用的PP-YOLOv2模型替换为`PP-PicoDet`,同时针对版面分析场景优化预测尺度,从针对目标检测设计的`640*640`调整为更适配文档图像的`800*608`,在`1.0x`配置下,模型精度与PP-YOLOv2相当,CPU平均预测速度可提升11倍。 **(1)FGD知识蒸馏** @@ -130,10 +130,10 @@ FGD(Focal and Global Knowledge Distillation for Detectors),是一种兼顾 | 实验序号 | 策略 | mAP | |:------:|:------:|:------:| -| 1 | PP-YOLOv2 | 84.7% | -| 2 | PP-PicoDet-LCNet2.5x(800*608) | 87.8% | -| 3 | PP-PicoDet-LCNet1.0x(800*608) | 84.5% | -| 4 | PP-PicoDet-LCNet1.0x(800*608) + FGD | 86.8% | +| 1 | PP-YOLOv2 | 84.70% | +| 2 | PP-PicoDet-LCNet2.5x(800*608) | 87.80% | +| 3 | PP-PicoDet-LCNet1.0x(800*608) | 84.50% | +| 4 | PP-PicoDet-LCNet1.0x(800*608) + FGD | 86.80% | **(2)表格版面分析** @@ -144,10 +144,10 @@ FGD(Focal and Global Knowledge Distillation for Detectors),是一种兼顾 | 实验序号 | 策略 | mAP | |:------:|:------:|:------:| -| 1 | PP-YOLOv2 |91.3% | -| 2 | PP-PicoDet-LCNet2.5x(800*608) | 95.9% | -| 3 | PP-PicoDet-LCNet1.0x(800*608) | 95.2% | -| 4 | PP-PicoDet-LCNet1.0x(800*608) + FGD | 95.7% | +| 1 | PP-YOLOv2 |91.30% | +| 2 | PP-PicoDet-LCNet2.5x(800*608) | 95.90% | +| 3 | PP-PicoDet-LCNet1.0x(800*608) | 95.20% | +| 4 | PP-PicoDet-LCNet1.0x(800*608) + FGD | 95.70% | 表格检测效果示意图如下: @@ -157,7 +157,7 @@ FGD(Focal and Global Knowledge Distillation for Detectors),是一种兼顾 ### 4.2 表格识别 
-基于深度学习的表格识别算法种类丰富,PP-Structurev1中,我们基于文本识别算法RARE研发了端到端表格识别算法TableRec-RARE,模型输出为表格结构的HTML表示,进而可以方便地转化为Excel文件。PP-StructureV2中,我们对模型结构和损失函数等5个方面进行升级,提出了 SLANet (Structure Location Alignment Network) ,模型结构如下图所示: +基于深度学习的表格识别算法种类丰富,PP-StructureV1中,我们基于文本识别算法RARE研发了端到端表格识别算法TableRec-RARE,模型输出为表格结构的HTML表示,进而可以方便地转化为Excel文件。PP-StructureV2中,我们对模型结构和损失函数等5个方面进行升级,提出了 SLANet (Structure Location Alignment Network) ,模型结构如下图所示:
@@ -170,7 +170,7 @@ FGD(Focal and Global Knowledge Distillation for Detectors),是一种兼顾 |TableRec-RARE| 71.73% | 93.88% |779ms |6.8M| |+PP-LCNet| 74.71% |94.37% |778ms| 8.7M| |+CSP-PAN| 75.68%| 94.72% |708ms| 9.3M| -|+SLAHead| 77.7%|94.85%| 766ms| 9.2M| +|+SLAHead| 77.70%|94.85%| 766ms| 9.2M| |+MergeToken| 76.31%| 95.89%|766ms| 9.2M| * 测试环境 @@ -181,7 +181,7 @@ FGD(Focal and Global Knowledge Distillation for Detectors),是一种兼顾 |策略|Acc|TEDS|推理速度(CPU+MKLDNN)|模型大小| |---|---|---|---|---| -|TableMaster|77.9%|96.12%|2144ms|253M| +|TableMaster|77.90%|96.12%|2144ms|253.0M| |TableRec-RARE| 71.73% | 93.88% |779ms |6.8M| |SLANet|76.31%| 95.89%|766ms|9.2M| @@ -218,7 +218,7 @@ PP-StructureV2中,我们参考TableMaster中的token处理方法,将`` 除了上述模型策略的升级外,本次升级还开源了中文表格识别模型。在实际应用场景中,表格图像存在着各种各样的倾斜角度(PubTabNet数据集不存在该问题),因此在中文模型中,我们将单元格坐标回归的点数从2个(左上,右下)增加到4个(左上,右上,右下,左下)。在内部测试集上,模型升级前后指标如下: |模型|acc| |---|---| -|TableRec-RARE|44.3%| +|TableRec-RARE|44.30%| |SLANet|59.35%| 可视化结果如下,左为输入图像,右为识别的html表格 @@ -307,8 +307,8 @@ LayoutLMv2以及LayoutXLM中引入视觉骨干网络,用于提取视觉特征 |-----------------|----------|---------|--------| | LayoutLMv2 | 0.76 | 84.20% | - | | VI-LayoutLMv2 | 0.42 | 82.10% | -2.10% | -| LayoutXLM | 1.4 | 89.50% | - | -| VI-LayouXLM | 1.1 | 90.46% | +0.96% | +| LayoutXLM | 1.40 | 89.50% | - | +| VI-LayouXLM | 1.10 | 90.46% | +0.96% | 同时,基于XFUND数据集,VI-LayoutXLM在RE任务上的精度也进一步提升了`1.06%`。 diff --git a/ppstructure/docs/models_list.md b/ppstructure/docs/models_list.md index afed95600..a5b9549a7 100644 --- a/ppstructure/docs/models_list.md +++ b/ppstructure/docs/models_list.md @@ -13,11 +13,11 @@ |模型名称|模型简介|推理模型大小|下载地址|dict path| | --- | --- | --- | --- | --- | | picodet_lcnet_x1_0_fgd_layout | 基于PicoDet LCNet_x1_0和FGD蒸馏在PubLayNet 数据集训练的英文版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) | [PubLayNet 
dict](../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt) | -| ppyolov2_r50vd_dcn_365e_publaynet | 基于PP-YOLOv2在PubLayNet数据集上训练的英文版面分析模型 | 221M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [训练模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | 同上 | +| ppyolov2_r50vd_dcn_365e_publaynet | 基于PP-YOLOv2在PubLayNet数据集上训练的英文版面分析模型 | 221.0M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [训练模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | 同上 | | picodet_lcnet_x1_0_fgd_layout_cdla | CDLA数据集训练的中文版面分析模型,可以划分为**表格、图片、图片标题、表格、表格标题、页眉、脚本、引用、公式**10类区域 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) | | picodet_lcnet_x1_0_fgd_layout_table | 表格数据集训练的版面分析模型,支持中英文文档表格区域的检测 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) | -| ppyolov2_r50vd_dcn_365e_tableBank_word | 基于PP-YOLOv2在TableBank Word 数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | 同上 | -| ppyolov2_r50vd_dcn_365e_tableBank_latex | 基于PP-YOLOv2在TableBank Latex数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | 同上 | +| 
ppyolov2_r50vd_dcn_365e_tableBank_word | 基于PP-YOLOv2在TableBank Word 数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221.0M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | 同上 | +| ppyolov2_r50vd_dcn_365e_tableBank_latex | 基于PP-YOLOv2在TableBank Latex数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221.0M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | 同上 | @@ -54,9 +54,9 @@ |re_VI-LayoutXLM_xfund_zh|基于VI-LayoutXLM在xfund中文数据集上训练的RE模型|1.1G| 83.92% | 15.49 |[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar) | |ser_LayoutXLM_xfund_zh|基于LayoutXLM在xfund中文数据集上训练的SER模型|1.4G| 90.38% | 19.49 |[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | |re_LayoutXLM_xfund_zh|基于LayoutXLM在xfund中文数据集上训练的RE模型|1.4G| 74.83% | 19.49 |[推理模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | -|ser_LayoutLMv2_xfund_zh|基于LayoutLMv2在xfund中文数据集上训练的SER模型|778M| 85.44% | 31.46 |[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) | -|re_LayoutLMv2_xfund_zh|基于LayoutLMv2在xfun中文数据集上训练的RE模型|765M| 67.77% | 31.46 |[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) | -|ser_LayoutLM_xfund_zh|基于LayoutLM在xfund中文数据集上训练的SER模型|430M| 77.31% | - |[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) | +|ser_LayoutLMv2_xfund_zh|基于LayoutLMv2在xfund中文数据集上训练的SER模型|778.0M| 85.44% | 31.46 
|[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) | +|re_LayoutLMv2_xfund_zh|基于LayoutLMv2在xfun中文数据集上训练的RE模型|765.0M| 67.77% | 31.46 |[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) | +|ser_LayoutLM_xfund_zh|基于LayoutLM在xfund中文数据集上训练的SER模型|430.0M| 77.31% | - |[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) | * 注:上述预测耗时信息仅包含了inference模型的推理耗时,没有统计预处理与后处理耗时,测试环境为`V100 GPU + CUDA 10.2 + CUDNN 8.1.1 + TRT 7.2.3.4`。 @@ -65,4 +65,4 @@ |模型名称|模型简介|模型大小|精度|下载地址| | --- | --- | --- |--- | --- | -|SDMGR|关键信息提取模型|78M| 86.70% | [推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| +|SDMGR|关键信息提取模型|78.0M| 86.70% | [推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| diff --git a/ppstructure/docs/models_list_en.md b/ppstructure/docs/models_list_en.md index 291d42f99..5908f45e8 100644 --- a/ppstructure/docs/models_list_en.md +++ b/ppstructure/docs/models_list_en.md @@ -13,11 +13,11 @@ |model name| description | inference model size |download|dict path| | --- |---------------------------------------------------------------------------------------------------------------------------------------------------------| --- | --- | --- | | picodet_lcnet_x1_0_fgd_layout | The layout analysis English model trained on the PubLayNet dataset based on PicoDet LCNet_x1_0 and FGD . 
the model can recognition 5 types of areas such as **Text, Title, Table, Picture and List** | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) | [PubLayNet dict](../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt) | -| ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis English model trained on the PubLayNet dataset based on PP-YOLOv2 | 221M | [inference_moel](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [trained model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | same as above | -| picodet_lcnet_x1_0_fgd_layout_cdla | The layout analysis Chinese model trained on the CDLA dataset, the model can recognition 10 types of areas such as **Table、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation** | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) | -| picodet_lcnet_x1_0_fgd_layout_table | The layout analysis model trained on the table dataset, the model can detect tables in Chinese and English documents | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) | -| ppyolov2_r50vd_dcn_365e_tableBank_word | The layout analysis model trained on the TableBank Word dataset based on 
PP-YOLOv2, the model can detect tables in English documents | 221M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | same as above | -| ppyolov2_r50vd_dcn_365e_tableBank_latex | The layout analysis model trained on the TableBank Latex dataset based on PP-YOLOv2, the model can detect tables in English documents | 221M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | same as above | +| ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis English model trained on the PubLayNet dataset based on PP-YOLOv2 | 221.0M | [inference_moel](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [trained model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | same as above | +| picodet_lcnet_x1_0_fgd_layout_cdla | The layout analysis Chinese model trained on the CDLA dataset, the model can recognition 10 types of areas such as **Table、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation** | 9.70M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) | +| picodet_lcnet_x1_0_fgd_layout_table | The layout analysis model trained on the table dataset, the model can detect tables in Chinese and English documents | 9.70M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table 
dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) | +| ppyolov2_r50vd_dcn_365e_tableBank_word | The layout analysis model trained on the TableBank Word dataset based on PP-YOLOv2, the model can detect tables in English documents | 221.0M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | same as above | +| ppyolov2_r50vd_dcn_365e_tableBank_latex | The layout analysis model trained on the TableBank Latex dataset based on PP-YOLOv2, the model can detect tables in English documents | 221.0M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | same as above | ## 2. OCR and Table Recognition @@ -63,4 +63,4 @@ On wildreceipt dataset, the algorithm result is as follows: |Model|Backbone|Config|Hmean|Download link| | --- | --- | --- | --- | --- | -|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.7%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.70%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| diff --git a/ppstructure/table/README.md b/ppstructure/table/README.md index cebbd1cca..bacd9ff5d 100644 --- a/ppstructure/table/README.md +++ b/ppstructure/table/README.md @@ -32,7 +32,7 @@ We evaluated the algorithm on the PubTabNet[1] eval dataset, and the |Method|Acc|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)|Speed| | --- | --- | --- | ---| -| EDD[2] |x| 88.3 |x| +| EDD[2] |x| 88.30 |x| | TableRec-RARE(ours) | 71.73%| 93.88% |779ms| | SLANet(ours) | 76.31%| 95.89%|766ms| diff --git a/ppstructure/table/README_ch.md b/ppstructure/table/README_ch.md index 72b7f5cbe..b8817523c 100644 --- a/ppstructure/table/README_ch.md +++ 
b/ppstructure/table/README_ch.md @@ -38,7 +38,7 @@ |算法|Acc|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)|Speed| | --- | --- | --- | ---| -| EDD[2] |x| 88.3% |x| +| EDD[2] |x| 88.30% |x| | TableRec-RARE(ours) | 71.73%| 93.88% |779ms| | SLANet(ours) |76.31%| 95.89%|766ms| From aaf077e687fe78de950d89e2c3c4439a66b063a5 Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Mon, 24 Oct 2022 04:17:29 +0000 Subject: [PATCH 08/19] update doc --- StyleText/README.md | 2 +- StyleText/README_ch.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/StyleText/README.md b/StyleText/README.md index 609c90539..30f9eaba5 100644 --- a/StyleText/README.md +++ b/StyleText/README.md @@ -172,7 +172,7 @@ After adding the above synthetic data for training, the accuracy of the recognit | Scenario | Characters | Raw Data | Test Data | Only Use Raw Data
Recognition Accuracy | New Synthetic Data | Simultaneous Use of Synthetic Data
Recognition Accuracy | Index Improvement | | -------- | ---------- | -------- | -------- | -------------------------- | ------------ | ---------------------- | -------- | | Metal surface | English and numbers | 2203 | 650 | 59.38% | 20000 | 75.46% | 16.00% | -| Random background | Korean | 5,631 | 1230 | 30.12% | 100000 | 50.57% | 20.00% | +| Random background | Korean | 5631 | 1230 | 30.12% | 100000 | 50.57% | 20.00% | ### Code Structure diff --git a/StyleText/README_ch.md b/StyleText/README_ch.md index b35967f4a..110a6c63f 100644 --- a/StyleText/README_ch.md +++ b/StyleText/README_ch.md @@ -157,7 +157,7 @@ python3 tools/synth_image.py -c configs/config.yml --style_image examples/style_ | 场景 | 字符 | 原始数据 | 测试数据 | 只使用原始数据
识别准确率 | 新增合成数据 | 同时使用合成数据
识别准确率 | 指标提升 | | -------- | ---------- | -------- | -------- | -------------------------- | ------------ | ---------------------- | -------- | | 金属表面 | 英文和数字 | 2203 | 650 | 59.38% | 20000 | 75.46% | 16.00% | -| 随机背景 | 韩语 | 5631 | 1,230 | 30.12% | 100000 | 50.57% | 20.00% | +| 随机背景 | 韩语 | 5631 | 1230 | 30.12% | 100000 | 50.57% | 20.00% | From 40541bde9688db2c4fcdab95344e854e1c63de05 Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Mon, 24 Oct 2022 05:34:35 +0000 Subject: [PATCH 09/19] update can doc --- doc/doc_ch/algorithm_rec_can.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/doc_ch/algorithm_rec_can.md b/doc/doc_ch/algorithm_rec_can.md index 816a255ef..2cc5a72bf 100644 --- a/doc/doc_ch/algorithm_rec_can.md +++ b/doc/doc_ch/algorithm_rec_can.md @@ -60,7 +60,7 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs python3 tools/train.py -c configs/rec/rec_d28_can.yml -o Train.dataset.transforms.GrayImageChannelFormat.inverse=False ``` -- 默认每训练1个epoch(1,105次iteration)进行1次评估,若您更改训练的batch_size,或更换数据集,请在训练时作出如下修改 +- 默认每训练1个epoch(1105次iteration)进行1次评估,若您更改训练的batch_size,或更换数据集,请在训练时作出如下修改 ``` python3 tools/train.py -c configs/rec/rec_d28_can.yml -o Global.eval_batch_step=[0, {length_of_dataset//batch_size}] From b2bc6b746c21238d83079eb2db7f6838af126fd8 Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Mon, 24 Oct 2022 05:36:34 +0000 Subject: [PATCH 10/19] update doc --- applications/PCB字符识别/PCB字符识别.md | 2 +- paddleocr.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/applications/PCB字符识别/PCB字符识别.md b/applications/PCB字符识别/PCB字符识别.md index 804d57e3b..d4596335a 100644 --- a/applications/PCB字符识别/PCB字符识别.md +++ b/applications/PCB字符识别/PCB字符识别.md @@ -425,7 +425,7 @@ python3 tools/eval.py \ | 4 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 99.99% | +23.00% | 如果能获取更多数据量的情况,可以通过增加数据量提升效果 | ``` 
-注:上述实验结果均是在1,500张图片(1,200张训练集,300张测试集)、2W张图片、添加公开通用识别数据集上训练、评估的得到,AIstudio只提供了100张数据,所以指标有所差异属于正常,只要策略有效、规律相同即可。 +注:上述实验结果均是在1500张图片(1200张训练集,300张测试集)、2W张图片、添加公开通用识别数据集上训练、评估的得到,AIstudio只提供了100张数据,所以指标有所差异属于正常,只要策略有效、规律相同即可。 ``` # 6. 模型导出 diff --git a/paddleocr.py b/paddleocr.py index 887feb96a..d552474ce 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -53,8 +53,8 @@ BASE_DIR = os.path.expanduser("~/.paddleocr/") DEFAULT_OCR_MODEL_VERSION = 'PP-OCRv3' SUPPORT_OCR_MODEL_VERSION = ['PP-OCR', 'PP-OCRv2', 'PP-OCRv3'] -DEFAULT_STRUCTURE_MODEL_VERSION = 'PP-Structurev2' -SUPPORT_STRUCTURE_MODEL_VERSION = ['PP-Structure', 'PP-Structurev2'] +DEFAULT_STRUCTURE_MODEL_VERSION = 'PP-StructureV2' +SUPPORT_STRUCTURE_MODEL_VERSION = ['PP-Structure', 'PP-StructureV2'] MODEL_URLS = { 'OCR': { 'PP-OCRv3': { @@ -272,7 +272,7 @@ MODEL_URLS = { } } }, - 'PP-Structurev2': { + 'PP-StructureV2': { 'table': { 'en': { 'url': @@ -326,10 +326,10 @@ def parse_args(mMain=True): "--structure_version", type=str, choices=SUPPORT_STRUCTURE_MODEL_VERSION, - default='PP-Structurev2', + default='PP-StructureV2', help='Model version, the current model support list is as follows:' ' 1. PP-Structure Support en table structure model.' - ' 2. PP-Structurev2 Support ch and en table structure model.') + ' 2. 
PP-StructureV2 Support ch and en table structure model.') for action in parser._actions: if action.dest in [ From 18171accd4a874468331da21e1ec0beb5c007beb Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Mon, 24 Oct 2022 07:40:44 +0000 Subject: [PATCH 11/19] update style text doc --- StyleText/README.md | 4 ++-- StyleText/README_ch.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/StyleText/README.md b/StyleText/README.md index 30f9eaba5..eddedbd7e 100644 --- a/StyleText/README.md +++ b/StyleText/README.md @@ -171,8 +171,8 @@ After adding the above synthetic data for training, the accuracy of the recognit | Scenario | Characters | Raw Data | Test Data | Only Use Raw Data
Recognition Accuracy | New Synthetic Data | Simultaneous Use of Synthetic Data
Recognition Accuracy | Index Improvement | | -------- | ---------- | -------- | -------- | -------------------------- | ------------ | ---------------------- | -------- | -| Metal surface | English and numbers | 2203 | 650 | 59.38% | 20000 | 75.46% | 16.00% | -| Random background | Korean | 5631 | 1230 | 30.12% | 100000 | 50.57% | 20.00% | +| Metal surface | English and numbers | 2203 | 650 | 59.38% | 20000 | 75.46% | 16.08% | +| Random background | Korean | 5631 | 1230 | 30.12% | 100000 | 50.57% | 20.45% | ### Code Structure diff --git a/StyleText/README_ch.md b/StyleText/README_ch.md index 110a6c63f..7818f2df0 100644 --- a/StyleText/README_ch.md +++ b/StyleText/README_ch.md @@ -156,8 +156,8 @@ python3 tools/synth_image.py -c configs/config.yml --style_image examples/style_ | 场景 | 字符 | 原始数据 | 测试数据 | 只使用原始数据
识别准确率 | 新增合成数据 | 同时使用合成数据
识别准确率 | 指标提升 | | -------- | ---------- | -------- | -------- | -------------------------- | ------------ | ---------------------- | -------- | -| 金属表面 | 英文和数字 | 2203 | 650 | 59.38% | 20000 | 75.46% | 16.00% | -| 随机背景 | 韩语 | 5631 | 1230 | 30.12% | 100000 | 50.57% | 20.00% | +| 金属表面 | 英文和数字 | 2203 | 650 | 59.38% | 20000 | 75.46% | 16.08% | +| 随机背景 | 韩语 | 5631 | 1230 | 30.12% | 100000 | 50.57% | 20.45% | From b92501faf62fa6f35e7d200f36720d303c681f63 Mon Sep 17 00:00:00 2001 From: littletomatodonkey Date: Mon, 24 Oct 2022 15:43:01 +0800 Subject: [PATCH 12/19] fix pic (#8067) --- README.md | 2 +- README_ch.md | 2 +- ppocr/utils/visual.py | 8 +++++--- ppstructure/README.md | 2 +- ppstructure/README_ch.md | 2 +- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index b8996346a..5f85aaaa1 100644 --- a/README.md +++ b/README.md @@ -190,7 +190,7 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel - SER (Semantic entity recognition)
- +
diff --git a/README_ch.md b/README_ch.md index f7338c072..8e1cfeacd 100755 --- a/README_ch.md +++ b/README_ch.md @@ -222,7 +222,7 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
- +
- RE(关系提取) diff --git a/ppocr/utils/visual.py b/ppocr/utils/visual.py index 5bd805ea6..b6de44659 100644 --- a/ppocr/utils/visual.py +++ b/ppocr/utils/visual.py @@ -51,20 +51,22 @@ def draw_ser_results(image, bbox = trans_poly_to_bbox(ocr_info["points"]) draw_box_txt(bbox, text, draw, font, font_size, color) - img_new = Image.blend(image, img_new, 0.5) + img_new = Image.blend(image, img_new, 0.7) return np.array(img_new) def draw_box_txt(bbox, text, draw, font, font_size, color): + # draw ocr results outline bbox = ((bbox[0], bbox[1]), (bbox[2], bbox[3])) draw.rectangle(bbox, fill=color) # draw ocr results - start_y = max(0, bbox[0][1] - font_size) tw = font.getsize(text)[0] + th = font.getsize(text)[1] + start_y = max(0, bbox[0][1] - th) draw.rectangle( - [(bbox[0][0] + 1, start_y), (bbox[0][0] + tw + 1, start_y + font_size)], + [(bbox[0][0] + 1, start_y), (bbox[0][0] + tw + 1, start_y + th)], fill=(0, 0, 255)) draw.text((bbox[0][0] + 1, start_y), text, fill=(255, 255, 255), font=font) diff --git a/ppstructure/README.md b/ppstructure/README.md index 9d503ca8e..e44ba5886 100644 --- a/ppstructure/README.md +++ b/ppstructure/README.md @@ -62,7 +62,7 @@ The following figure shows the effect of layout recovery based on the results of Different colored boxes in the figure represent different categories.
- +
diff --git a/ppstructure/README_ch.md b/ppstructure/README_ch.md index 050740b3b..53c251d15 100644 --- a/ppstructure/README_ch.md +++ b/ppstructure/README_ch.md @@ -78,7 +78,7 @@ PP-StructureV2支持各个模块独立使用或灵活搭配,如,可以单独
- +
From fd8039f9275e6123617df41b6d4c6505b1d304c3 Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Mon, 24 Oct 2022 07:53:05 +0000 Subject: [PATCH 13/19] update pdb doc --- applications/PCB字符识别/PCB字符识别.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/applications/PCB字符识别/PCB字符识别.md b/applications/PCB字符识别/PCB字符识别.md index d4596335a..c695e8297 100644 --- a/applications/PCB字符识别/PCB字符识别.md +++ b/applications/PCB字符识别/PCB字符识别.md @@ -266,8 +266,8 @@ python3 tools/eval.py \ | 序号 | 方案 | hmean | 效果提升 | 实验分析 | | -------- | -------- | -------- | -------- | -------- | | 1 | PP-OCRv3英文超轻量检测预训练模型 | 64.64% | - | 提供的预训练模型具有泛化能力 | -| 2 | PP-OCRv3英文超轻量检测预训练模型 + 验证集padding | 72.13% |+7.50% | padding可以提升尺寸较小图片的检测效果| -| 3 | PP-OCRv3英文超轻量检测预训练模型 + fine-tune | 100.00% | +27.90% | fine-tune会提升垂类场景效果 | +| 2 | PP-OCRv3英文超轻量检测预训练模型 + 验证集padding | 72.13% |+7.49% | padding可以提升尺寸较小图片的检测效果| +| 3 | PP-OCRv3英文超轻量检测预训练模型 + fine-tune | 100.00% | +27.87% | fine-tune会提升垂类场景效果 | ``` @@ -420,9 +420,9 @@ python3 tools/eval.py \ | 序号 | 方案 | acc | 效果提升 | 实验分析 | | -------- | -------- | -------- | -------- | -------- | | 1 | PP-OCRv3中英文超轻量识别预训练模型直接评估 | 46.67% | - | 提供的预训练模型具有泛化能力 | -| 2 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune | 42.02% |-4.60% | 在数据量不足的情况,反而比预训练模型效果低(也可以通过调整超参数再试试)| -| 3 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 公开通用识别数据集 | 77.00% | +30.00% | 在数据量不足的情况下,可以考虑补充公开数据训练 | -| 4 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 99.99% | +23.00% | 如果能获取更多数据量的情况,可以通过增加数据量提升效果 | +| 2 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune | 42.02% |-4.65% | 在数据量不足的情况,反而比预训练模型效果低(也可以通过调整超参数再试试)| +| 3 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 公开通用识别数据集 | 77.00% | +30.33% | 在数据量不足的情况下,可以考虑补充公开数据训练 | +| 4 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 99.99% | +22.99% | 如果能获取更多数据量的情况,可以通过增加数据量提升效果 | ``` 注:上述实验结果均是在1500张图片(1200张训练集,300张测试集)、2W张图片、添加公开通用识别数据集上训练、评估的得到,AIstudio只提供了100张数据,所以指标有所差异属于正常,只要策略有效、规律相同即可。 @@ -614,17 +614,17 @@ python3 tools/end2end/eval_end2end.py 
./save_gt_label/ ./save_PPOCRV2_infer/ | 序号 | 方案 | hmean | 效果提升 | 实验分析 | | ---- | -------------------------------------------------------- | ------ | -------- | ------------------------------------- | | 1 | PP-OCRv3英文超轻量检测预训练模型直接评估 | 64.64% | - | 提供的预训练模型具有泛化能力 | -| 2 | PP-OCRv3英文超轻量检测预训练模型 + 验证集padding直接评估 | 72.13% | +7.50% | padding可以提升尺寸较小图片的检测效果 | -| 3 | PP-OCRv3英文超轻量检测预训练模型 + fine-tune | 100.00% | +27.90% | fine-tune会提升垂类场景效果 | +| 2 | PP-OCRv3英文超轻量检测预训练模型 + 验证集padding直接评估 | 72.13% | +7.49% | padding可以提升尺寸较小图片的检测效果 | +| 3 | PP-OCRv3英文超轻量检测预训练模型 + fine-tune | 100.00% | +27.87% | fine-tune会提升垂类场景效果 | * 识别 | 序号 | 方案 | acc | 效果提升 | 实验分析 | | ---- | ------------------------------------------------------------ | ------ | -------- | ------------------------------------------------------------ | | 1 | PP-OCRv3中英文超轻量识别预训练模型直接评估 | 46.67% | - | 提供的预训练模型具有泛化能力 | -| 2 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune | 42.02% | -4.60% | 在数据量不足的情况,反而比预训练模型效果低(也可以通过调整超参数再试试) | -| 3 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 公开通用识别数据集 | 77.00% | +30.00% | 在数据量不足的情况下,可以考虑补充公开数据训练 | -| 4 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 99.99% | +23.00% | 如果能获取更多数据量的情况,可以通过增加数据量提升效果 | +| 2 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune | 42.02% | -4.65% | 在数据量不足的情况,反而比预训练模型效果低(也可以通过调整超参数再试试) | +| 3 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 公开通用识别数据集 | 77.00% | +30.33% | 在数据量不足的情况下,可以考虑补充公开数据训练 | +| 4 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 99.99% | +22.99% | 如果能获取更多数据量的情况,可以通过增加数据量提升效果 | * 端到端 From cad701d4114514f2a20d3841dfa9008adc3d13a6 Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Mon, 24 Oct 2022 17:10:05 +0800 Subject: [PATCH 14/19] fix benckmark error when benckmark=false --- paddleocr.py | 2 +- ppstructure/pdf2word/pdf2word.py | 200 ++++++++++++++------------ ppstructure/recovery/requirements.txt | 3 +- ppstructure/table/predict_table.py | 8 +- 4 files changed, 120 insertions(+), 93 deletions(-) diff --git a/paddleocr.py b/paddleocr.py index 887feb96a..c3da5c1ba 100644 --- 
a/paddleocr.py +++ b/paddleocr.py @@ -47,7 +47,7 @@ __all__ = [ ] SUPPORT_DET_MODEL = ['DB'] -VERSION = '2.6.0.3' +VERSION = '2.6.1.0' SUPPORT_REC_MODEL = ['CRNN', 'SVTR_LCNet'] BASE_DIR = os.path.expanduser("~/.paddleocr/") diff --git a/ppstructure/pdf2word/pdf2word.py b/ppstructure/pdf2word/pdf2word.py index a287fb248..83a0c8892 100644 --- a/ppstructure/pdf2word/pdf2word.py +++ b/ppstructure/pdf2word/pdf2word.py @@ -1,9 +1,23 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import tarfile import os import time import datetime -import functools +import functools import cv2 import platform import numpy as np @@ -20,7 +34,6 @@ root = os.path.abspath(os.path.join(file, '../../')) sys.path.append(file) sys.path.insert(0, root) - from ppstructure.predict_system import StructureSystem, save_structure_res from ppstructure.utility import parse_args, draw_structure_result from ppocr.utils.network import download_with_progressbar @@ -32,13 +45,17 @@ __VERSION__ = "0.2.2" URLs_EN = { # 下载超英文轻量级PP-OCRv3模型的检测模型并解压 - "en_PP-OCRv3_det_infer": "https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar", + "en_PP-OCRv3_det_infer": + "https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar", # 下载英文轻量级PP-OCRv3模型的识别模型并解压 - "en_PP-OCRv3_rec_infer": "https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar", + "en_PP-OCRv3_rec_infer": + "https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar", # 下载超轻量级英文表格英文模型并解压 - "en_ppstructure_mobile_v2.0_SLANet_infer": "https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar", + "en_ppstructure_mobile_v2.0_SLANet_infer": + "https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar", # 英文版面分析模型 - "picodet_lcnet_x1_0_fgd_layout_infer": "https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar", + "picodet_lcnet_x1_0_fgd_layout_infer": + "https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar", } DICT_EN = { "rec_char_dict_path": "en_dict.txt", @@ -47,21 +64,24 @@ DICT_EN = { URLs_CN = { # 下载超中文轻量级PP-OCRv3模型的检测模型并解压 - "cn_PP-OCRv3_det_infer": "https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar", + "cn_PP-OCRv3_det_infer": + "https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar", # 下载中文轻量级PP-OCRv3模型的识别模型并解压 - 
"cn_PP-OCRv3_rec_infer": "https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar", + "cn_PP-OCRv3_rec_infer": + "https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar", # 下载超轻量级英文表格英文模型并解压 - "cn_ppstructure_mobile_v2.0_SLANet_infer": "https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar", + "cn_ppstructure_mobile_v2.0_SLANet_infer": + "https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar", # 中文版面分析模型 - "picodet_lcnet_x1_0_fgd_layout_cdla_infer": "https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar", + "picodet_lcnet_x1_0_fgd_layout_cdla_infer": + "https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar", } DICT_CN = { - "rec_char_dict_path": "ppocr_keys_v1.txt", + "rec_char_dict_path": "ppocr_keys_v1.txt", "layout_dict_path": "layout_cdla_dict.txt", } - def QImageToCvMat(incomingImage) -> np.array: ''' Converts a QImage into an opencv MAT format @@ -98,7 +118,7 @@ def readImage(image_file) -> list: img = cv2.imread(image_file, cv2.IMREAD_COLOR) if img is not None: imgs = [img] - + return imgs @@ -106,7 +126,7 @@ class Worker(QThread): progressBarValue = Signal(int) progressBarRange = Signal(int) endsignal = Signal() - exceptedsignal = Signal(str) #发送一个异常信号 + exceptedsignal = Signal(str) #发送一个异常信号 loopFlag = True def __init__(self, predictors, save_pdf, vis_font_path, use_pdf2docx_api): @@ -120,7 +140,7 @@ class Worker(QThread): self.outputDir = None self.totalPageCnt = 0 self.pageCnt = 0 - self.setStackSize(1024*1024) + self.setStackSize(1024 * 1024) def setImagePath(self, imagePaths): self.imagePaths = imagePaths @@ -130,7 +150,7 @@ class Worker(QThread): def setOutputDir(self, outputDir): self.outputDir = outputDir - + def setPDFParser(self, enabled): self.use_pdf2docx_api = enabled @@ -167,10 +187,10 @@ class Worker(QThread): 
try: convert_info_docx(imgs, all_res, self.outputDir, img_name) except Exception as ex: - print("error in layout recovery image:{}, err msg: {}". - format(img_name, ex)) + print("error in layout recovery image:{}, err msg: {}".format( + img_name, ex)) print("Predict time : {:.3f}s".format(time_dict['all'])) - print('result save to {}'.format(self.outputDir)) + print('result save to {}'.format(self.outputDir)) def run(self): self.resetPageCnt() @@ -185,10 +205,11 @@ class Worker(QThread): and os.path.basename(image_file)[-3:] == 'pdf': self.totalPageCnt += 1 self.progressBarRange.emit(self.totalPageCnt) - print('===============using use_pdf2docx_api===============') + print( + '===============using use_pdf2docx_api===============') img_name = os.path.basename(image_file).split('.')[0] - docx_file = os.path.join( - self.outputDir, '{}.docx'.format(img_name)) + docx_file = os.path.join(self.outputDir, + '{}.docx'.format(img_name)) cv = Converter(image_file) cv.convert(docx_file) cv.close() @@ -201,13 +222,14 @@ class Worker(QThread): if len(imgs) == 0: continue img_name = os.path.basename(image_file).split('.')[0] - os.makedirs(os.path.join(self.outputDir, img_name), exist_ok=True) + os.makedirs( + os.path.join(self.outputDir, img_name), exist_ok=True) self.ppocrPrecitor(imgs, img_name) # file processed self.endsignal.emit() # self.exec() except Exception as e: - self.exceptedsignal.emit(str(e)) # 将异常发送给UI进程 + self.exceptedsignal.emit(str(e)) # 将异常发送给UI进程 class APP_Image2Doc(QWidget): @@ -222,8 +244,7 @@ class APP_Image2Doc(QWidget): self.screenShot = None self.save_pdf = False self.output_dir = None - self.vis_font_path = os.path.join(root, - "doc", "fonts", "simfang.ttf") + self.vis_font_path = os.path.join(root, "doc", "fonts", "simfang.ttf") self.use_pdf2docx_api = False # ProgressBar @@ -239,14 +260,16 @@ class APP_Image2Doc(QWidget): self.downloadModels(URLs_CN) # 初始化模型 - predictors = { + predictors = { 'EN': self.initPredictor('EN'), 'CN': 
self.initPredictor('CN'), } # 设置工作进程 - self._thread = Worker(predictors, self.save_pdf, self.vis_font_path, self.use_pdf2docx_api) - self._thread.progressBarValue.connect(self.handleProgressBarUpdateSingal) + self._thread = Worker(predictors, self.save_pdf, self.vis_font_path, + self.use_pdf2docx_api) + self._thread.progressBarValue.connect( + self.handleProgressBarUpdateSingal) self._thread.endsignal.connect(self.handleEndsignalSignal) # self._thread.finished.connect(QObject.deleteLater) self._thread.progressBarRange.connect(self.handleProgressBarRangeSingal) @@ -285,7 +308,7 @@ class APP_Image2Doc(QWidget): layout.addWidget(self.PDFParserButton, 0, 3, 1, 1) self.PDFParserButton.clicked.connect( functools.partial(self.handleStartSignal, 'CN', True)) - + self.showResultButton = QPushButton("显示结果") self.showResultButton.setIcon(QIcon(QPixmap("./icons/folder-open.png"))) layout.addWidget(self.showResultButton, 0, 4, 1, 1) @@ -294,8 +317,7 @@ class APP_Image2Doc(QWidget): # ProgressBar layout.addWidget(self.pb, 2, 0, 1, 5) # time estimate label - self.timeEstLabel = QLabel( - ("Time Left: --")) + self.timeEstLabel = QLabel(("Time Left: --")) layout.addWidget(self.timeEstLabel, 3, 0, 1, 5) self.setLayout(layout) @@ -303,11 +325,8 @@ class APP_Image2Doc(QWidget): def downloadModels(self, URLs): # using custom model tar_file_name_list = [ - 'inference.pdiparams', - 'inference.pdiparams.info', - 'inference.pdmodel', - 'model.pdiparams', - 'model.pdiparams.info', + 'inference.pdiparams', 'inference.pdiparams.info', + 'inference.pdmodel', 'model.pdiparams', 'model.pdiparams.info', 'model.pdmodel' ] model_path = os.path.join(root, 'inference') @@ -325,9 +344,10 @@ class APP_Image2Doc(QWidget): try: download_with_progressbar(url, tarpath) except Exception as e: - print("Error occurred when downloading file, error message:") + print( + "Error occurred when downloading file, error message:") print(e) - + # unzip model tar try: with tarfile.open(tarpath, 'r') as tarObj: @@ 
-341,13 +361,12 @@ class APP_Image2Doc(QWidget): if filename is None: continue file = tarObj.extractfile(member) - with open( - os.path.join(storage_dir, filename), - 'wb') as f: + with open(os.path.join(storage_dir, filename), + 'wb') as f: f.write(file.read()) except Exception as e: - print("Error occurred when unziping file, error message:") - print(e) + print("Error occurred when unziping file, error message:") + print(e) def initPredictor(self, lang='EN'): # init predictor args @@ -356,50 +375,53 @@ class APP_Image2Doc(QWidget): args.ocr = True args.recovery = True args.save_pdf = self.save_pdf - args.table_char_dict_path = os.path.join(root, - "ppocr", "utils", "dict", "table_structure_dict.txt") + args.table_char_dict_path = os.path.join(root, "ppocr", "utils", "dict", + "table_structure_dict.txt") if lang == 'EN': - args.det_model_dir = os.path.join(root, # 此处从这里找到模型存放位置 - "inference", "en_PP-OCRv3_det_infer") - args.rec_model_dir = os.path.join(root, - "inference", "en_PP-OCRv3_rec_infer") - args.table_model_dir = os.path.join(root, - "inference", "en_ppstructure_mobile_v2.0_SLANet_infer") - args.output = os.path.join(root, "output") # 结果保存路径 - args.layout_model_dir = os.path.join(root, - "inference", "picodet_lcnet_x1_0_fgd_layout_infer") + args.det_model_dir = os.path.join( + root, # 此处从这里找到模型存放位置 + "inference", + "en_PP-OCRv3_det_infer") + args.rec_model_dir = os.path.join(root, "inference", + "en_PP-OCRv3_rec_infer") + args.table_model_dir = os.path.join( + root, "inference", "en_ppstructure_mobile_v2.0_SLANet_infer") + args.output = os.path.join(root, "output") # 结果保存路径 + args.layout_model_dir = os.path.join( + root, "inference", "picodet_lcnet_x1_0_fgd_layout_infer") lang_dict = DICT_EN elif lang == 'CN': - args.det_model_dir = os.path.join(root, # 此处从这里找到模型存放位置 - "inference", "cn_PP-OCRv3_det_infer") - args.rec_model_dir = os.path.join(root, - "inference", "cn_PP-OCRv3_rec_infer") - args.table_model_dir = os.path.join(root, - "inference", 
"cn_ppstructure_mobile_v2.0_SLANet_infer") - args.output = os.path.join(root, "output") # 结果保存路径 - args.layout_model_dir = os.path.join(root, - "inference", "picodet_lcnet_x1_0_fgd_layout_cdla_infer") + args.det_model_dir = os.path.join( + root, # 此处从这里找到模型存放位置 + "inference", + "cn_PP-OCRv3_det_infer") + args.rec_model_dir = os.path.join(root, "inference", + "cn_PP-OCRv3_rec_infer") + args.table_model_dir = os.path.join( + root, "inference", "cn_ppstructure_mobile_v2.0_SLANet_infer") + args.output = os.path.join(root, "output") # 结果保存路径 + args.layout_model_dir = os.path.join( + root, "inference", "picodet_lcnet_x1_0_fgd_layout_cdla_infer") lang_dict = DICT_CN else: raise ValueError("Unsupported language") - args.rec_char_dict_path = os.path.join(root, - "ppocr", "utils", - lang_dict['rec_char_dict_path']) - args.layout_dict_path = os.path.join(root, - "ppocr", "utils", "dict", "layout_dict", - lang_dict['layout_dict_path']) + args.rec_char_dict_path = os.path.join(root, "ppocr", "utils", + lang_dict['rec_char_dict_path']) + args.layout_dict_path = os.path.join(root, "ppocr", "utils", "dict", + "layout_dict", + lang_dict['layout_dict_path']) # init predictor return StructureSystem(args) - + def handleOpenFileSignal(self): ''' 可以多选图像文件 ''' - selectedFiles = QFileDialog.getOpenFileNames(self, - "多文件选择", "/", "图片文件 (*.png *.jpeg *.jpg *.bmp *.pdf)")[0] + selectedFiles = QFileDialog.getOpenFileNames( + self, "多文件选择", "/", "图片文件 (*.png *.jpeg *.jpg *.bmp *.pdf)")[0] if len(selectedFiles) > 0: self.imagePaths = selectedFiles - self.screenShot = None # discard screenshot temp image + self.screenShot = None # discard screenshot temp image self.pb.setValue(0) # def screenShotSlot(self): @@ -415,18 +437,19 @@ class APP_Image2Doc(QWidget): # self.pb.setValue(0) def handleStartSignal(self, lang='EN', pdfParser=False): - if self.screenShot: # for screenShot - img_name = 'screenshot_' + time.strftime("%Y%m%d%H%M%S", time.localtime()) + if self.screenShot: # for screenShot + 
img_name = 'screenshot_' + time.strftime("%Y%m%d%H%M%S", + time.localtime()) image = QImageToCvMat(self.screenShot) self.predictAndSave(image, img_name, lang) # update Progress Bar self.pb.setValue(1) - QMessageBox.information(self, - u'Information', "文档提取完成") - elif len(self.imagePaths) > 0 : # for image file selection + QMessageBox.information(self, u'Information', "文档提取完成") + elif len(self.imagePaths) > 0: # for image file selection # Must set image path list and language before start self.output_dir = os.path.join( - os.path.dirname(self.imagePaths[0]), "output") # output_dir shold be same as imagepath + os.path.dirname(self.imagePaths[0]), + "output") # output_dir shold be same as imagepath self._thread.setOutputDir(self.output_dir) self._thread.setImagePath(self.imagePaths) self._thread.setLang(lang) @@ -438,12 +461,10 @@ class APP_Image2Doc(QWidget): self.PDFParserButton.setEnabled(False) # 启动工作进程 self._thread.start() - self.time_start = time.time() # log start time - QMessageBox.information(self, - u'Information', "开始转换") + self.time_start = time.time() # log start time + QMessageBox.information(self, u'Information', "开始转换") else: - QMessageBox.warning(self, - u'Information', "请选择要识别的文件或截图") + QMessageBox.warning(self, u'Information', "请选择要识别的文件或截图") def handleShowResultSignal(self): if self.output_dir is None: @@ -454,15 +475,16 @@ class APP_Image2Doc(QWidget): else: os.system('open ' + os.path.normpath(self.output_dir)) else: - QMessageBox.information(self, - u'Information', "输出文件不存在") + QMessageBox.information(self, u'Information', "输出文件不存在") def handleProgressBarUpdateSingal(self, i): self.pb.setValue(i) # calculate time left of recognition lenbar = self.pb.maximum() - avg_time = (time.time() - self.time_start) / i # Use average time to prevent time fluctuations - time_left = str(datetime.timedelta(seconds=avg_time * (lenbar - i))).split(".")[0] # Remove microseconds + avg_time = (time.time() - self.time_start + ) / i # Use average time to prevent time 
fluctuations + time_left = str(datetime.timedelta(seconds=avg_time * ( + lenbar - i))).split(".")[0] # Remove microseconds self.timeEstLabel.setText(f"Time Left: {time_left}") # show time left def handleProgressBarRangeSingal(self, max): diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index 4e4239a14..ec08f9d0a 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -2,4 +2,5 @@ python-docx PyMuPDF==1.19.0 beautifulsoup4 fonttools>=4.24.0 -fire>=0.3.0 \ No newline at end of file +fire>=0.3.0 +pdf2docx \ No newline at end of file diff --git a/ppstructure/table/predict_table.py b/ppstructure/table/predict_table.py index 8f9c71749..354baf6dd 100644 --- a/ppstructure/table/predict_table.py +++ b/ppstructure/table/predict_table.py @@ -60,12 +60,16 @@ class TableSystem(object): self.args = args if not args.show_log: logger.setLevel(logging.INFO) - args.benchmark = False + benchmark_tmp = False + if args.benchmark: + benchmark_tmp = args.benchmark + args.benchmark = False self.text_detector = predict_det.TextDetector(copy.deepcopy( args)) if text_detector is None else text_detector self.text_recognizer = predict_rec.TextRecognizer(copy.deepcopy( args)) if text_recognizer is None else text_recognizer - args.benchmark = True + if benchmark_tmp: + args.benchmark = True self.table_structurer = predict_strture.TableStructurer(args) if args.table_algorithm in ['TableMaster']: self.match = TableMasterMatcher() From 2325055268e7d85834a6843ea7256159af104827 Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Mon, 24 Oct 2022 09:43:16 +0000 Subject: [PATCH 15/19] update doc --- ppstructure/docs/models_list_en.md | 4 ++-- ppstructure/table/README.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ppstructure/docs/models_list_en.md b/ppstructure/docs/models_list_en.md index 5908f45e8..889ad0970 100644 --- a/ppstructure/docs/models_list_en.md +++ 
b/ppstructure/docs/models_list_en.md @@ -14,8 +14,8 @@ | --- |---------------------------------------------------------------------------------------------------------------------------------------------------------| --- | --- | --- | | picodet_lcnet_x1_0_fgd_layout | The layout analysis English model trained on the PubLayNet dataset based on PicoDet LCNet_x1_0 and FGD . the model can recognition 5 types of areas such as **Text, Title, Table, Picture and List** | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) | [PubLayNet dict](../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt) | | ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis English model trained on the PubLayNet dataset based on PP-YOLOv2 | 221.0M | [inference_moel](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [trained model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | same as above | -| picodet_lcnet_x1_0_fgd_layout_cdla | The layout analysis Chinese model trained on the CDLA dataset, the model can recognition 10 types of areas such as **Table、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation** | 9.70M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) | -| picodet_lcnet_x1_0_fgd_layout_table | The layout analysis model trained on the table dataset, the model can detect tables in Chinese and English documents | 9.70M | [inference 
model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) | +| picodet_lcnet_x1_0_fgd_layout_cdla | The layout analysis Chinese model trained on the CDLA dataset, the model can recognition 10 types of areas such as **Table、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation** | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) | +| picodet_lcnet_x1_0_fgd_layout_table | The layout analysis model trained on the table dataset, the model can detect tables in Chinese and English documents | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) | | ppyolov2_r50vd_dcn_365e_tableBank_word | The layout analysis model trained on the TableBank Word dataset based on PP-YOLOv2, the model can detect tables in English documents | 221.0M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | same as above | | ppyolov2_r50vd_dcn_365e_tableBank_latex | The layout analysis model trained on the TableBank Latex dataset based on PP-YOLOv2, the model can detect tables in English documents | 221.0M | [inference 
model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | same as above | diff --git a/ppstructure/table/README.md b/ppstructure/table/README.md index bacd9ff5d..17f048879 100644 --- a/ppstructure/table/README.md +++ b/ppstructure/table/README.md @@ -32,7 +32,7 @@ We evaluated the algorithm on the PubTabNet[1] eval dataset, and the |Method|Acc|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)|Speed| | --- | --- | --- | ---| -| EDD[2] |x| 88.30 |x| +| EDD[2] |x| 88.30% |x| | TableRec-RARE(ours) | 71.73%| 93.88% |779ms| | SLANet(ours) | 76.31%| 95.89%|766ms| From 6120094ac28efb0742dcdd4e6128573c984d095a Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Tue, 25 Oct 2022 05:08:58 +0000 Subject: [PATCH 16/19] add Telescope to support alg --- tools/program.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/program.py b/tools/program.py index a0594e950..afb8a4725 100755 --- a/tools/program.py +++ b/tools/program.py @@ -642,7 +642,8 @@ def preprocess(is_train=False): 'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE', 'SEED', 'SDMGR', 'LayoutXLM', 'LayoutLM', 'LayoutLMv2', 'PREN', 'FCE', 'SVTR', 'ViTSTR', 'ABINet', 'DB++', 'TableMaster', 'SPIN', 'VisionLAN', - 'Gestalt', 'SLANet', 'RobustScanner', 'CT', 'RFL', 'DRRG', 'CAN' + 'Gestalt', 'SLANet', 'RobustScanner', 'CT', 'RFL', 'DRRG', 'CAN', + 'Telescope' ] if use_xpu: From d1f57063fb419bc75d0889d20010786c8c9e7083 Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Tue, 25 Oct 2022 06:30:25 +0000 Subject: [PATCH 17/19] fix telescope tipc error --- test_tipc/configs/sr_telescope/sr_telescope.yml | 2 +- test_tipc/configs/sr_telescope/train_infer_python.txt | 10 +++++----- test_tipc/prepare.sh | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test_tipc/configs/sr_telescope/sr_telescope.yml 
b/test_tipc/configs/sr_telescope/sr_telescope.yml index d3c10448e..c78a42d0e 100644 --- a/test_tipc/configs/sr_telescope/sr_telescope.yml +++ b/test_tipc/configs/sr_telescope/sr_telescope.yml @@ -51,7 +51,7 @@ Metric: Train: dataset: name: LMDBDataSetSR - data_dir: ./train_data/TextZoom/train + data_dir: ./train_data/TextZoom/test transforms: - SRResize: imgH: 32 diff --git a/test_tipc/configs/sr_telescope/train_infer_python.txt b/test_tipc/configs/sr_telescope/train_infer_python.txt index 4dcfa29ee..7235f07e8 100644 --- a/test_tipc/configs/sr_telescope/train_infer_python.txt +++ b/test_tipc/configs/sr_telescope/train_infer_python.txt @@ -4,12 +4,12 @@ python:python3.7 gpu_list:0|0,1 Global.use_gpu:True|True Global.auto_cast:null -Global.epoch_num:lite_train_lite_infer=2|whole_train_whole_infer=300 +Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=300 Global.save_model_dir:./output/ Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=16 Global.pretrained_model:null train_model_name:latest -train_infer_img_dir:./inference/sr_inference +train_infer_img_dir:./inference/rec_inference null:null ## trainer:norm_train @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval:tools/eval.py -c test_tipc/configs/sr_telescope/sr_telescope.yml -o +eval:null null:null ## ===========================infer_params=========================== @@ -44,8 +44,8 @@ inference:tools/infer/predict_sr.py --sr_image_shape="1,32,128" --rec_algorithm= --rec_batch_num:1 --use_tensorrt:False --precision:fp32 ---rec_model_dir: ---image_dir:./inference/sr_inference +--sr_model_dir: +--image_dir:./inference/rec_inference --save_log_path:./test/output/ --benchmark:True null:null diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh index 452ec31c4..02ee8a24d 100644 --- a/test_tipc/prepare.sh +++ b/test_tipc/prepare.sh @@ -150,6 +150,7 @@ if [ ${MODE} = "lite_train_lite_infer" ];then # pretrain lite 
train data wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams --no-check-certificate wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar --no-check-certificate + cd ./pretrain_models/ && tar xf det_mv3_db_v2.0_train.tar && cd ../ if [[ ${model_name} =~ "ch_PP-OCRv2_det" ]];then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar --no-check-certificate cd ./pretrain_models/ && tar xf ch_PP-OCRv2_det_distill_train.tar && cd ../ @@ -179,7 +180,6 @@ if [ ${MODE} = "lite_train_lite_infer" ];then wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_train.tar --no-check-certificate cd ./pretrain_models/ && tar xf table_structure_tablemaster_train.tar && cd ../ fi - cd ./pretrain_models/ && tar xf det_mv3_db_v2.0_train.tar && cd ../ rm -rf ./train_data/icdar2015 rm -rf ./train_data/ic15_data rm -rf ./train_data/pubtabnet From 6f24523036832a716dbefe30edae7c35e474be9b Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Tue, 25 Oct 2022 07:01:33 +0000 Subject: [PATCH 18/19] update can docs --- doc/doc_ch/algorithm_rec_can.md | 16 +++++++--------- doc/doc_en/algorithm_rec_can_en.md | 8 ++++---- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/doc/doc_ch/algorithm_rec_can.md b/doc/doc_ch/algorithm_rec_can.md index e4f4ba6f3..745a9e8d4 100644 --- a/doc/doc_ch/algorithm_rec_can.md +++ b/doc/doc_ch/algorithm_rec_can.md @@ -57,24 +57,22 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs **注意:** - 我们提供的数据集,即[`CROHME数据集`](https://paddleocr.bj.bcebos.com/dataset/CROHME.tar)将手写公式存储为黑底白字的格式,若您自行准备的数据集与之相反,即以白底黑字模式存储,请在训练时做出如下修改 ``` -python3 tools/train.py -c configs/rec/rec_d28_can.yml --o Train.dataset.transforms.GrayImageChannelFormat.inverse=False +python3 
tools/train.py -c configs/rec/rec_d28_can.yml -o Train.dataset.transforms.GrayImageChannelFormat.inverse=False ``` - 默认每训练1个epoch(1105次iteration)进行1次评估,若您更改训练的batch_size,或更换数据集,请在训练时作出如下修改 ``` -python3 tools/train.py -c configs/rec/rec_d28_can.yml --o Global.eval_batch_step=[0, {length_of_dataset//batch_size}] +python3 tools/train.py -c configs/rec/rec_d28_can.yml -o Global.eval_batch_step=[0, {length_of_dataset//batch_size}] ``` # ### 3.2 评估 -可下载已训练完成的[模型文件](https://paddleocr.bj.bcebos.com/contribution/can_train.tar),使用如下命令进行评估: +可下载已训练完成的[模型文件](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar),使用如下命令进行评估: ```shell # 注意将pretrained_model的路径设置为本地路径。若使用自行训练保存的模型,请注意修改路径和文件名为{path/to/weights}/{model_name}。 -python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/CAN +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/best_accuracy.pdparams ``` @@ -83,7 +81,7 @@ python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec 使用如下命令进行单张图片预测: ```shell # 注意将pretrained_model的路径设置为本地路径。 -python3 tools/infer_rec.py -c configs/rec/rec_d28_can.yml -o Architecture.Head.attdecoder.is_train=False Global.infer_img='./doc/datasets/crohme_demo/hme_00.jpg' Global.pretrained_model=./rec_d28_can_train/CAN +python3 tools/infer_rec.py -c configs/rec/rec_d28_can.yml -o Architecture.Head.attdecoder.is_train=False Global.infer_img='./doc/datasets/crohme_demo/hme_00.jpg' Global.pretrained_model=./rec_d28_can_train/best_accuracy.pdparams # 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/datasets/crohme_demo/'。 ``` @@ -94,11 +92,11 @@ python3 tools/infer_rec.py -c configs/rec/rec_d28_can.yml -o Architecture.Head.a ### 4.1 Python推理 -首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/contribution/can_train.tar) ),可以使用如下命令进行转换: 
+首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar) ),可以使用如下命令进行转换: ```shell # 注意将pretrained_model的路径设置为本地路径。 -python3 tools/export_model.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/CAN Global.save_inference_dir=./inference/rec_d28_can/ Architecture.Head.attdecoder.is_train=False +python3 tools/export_model.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/best_accuracy.pdparams Global.save_inference_dir=./inference/rec_d28_can/ Architecture.Head.attdecoder.is_train=False # 目前的静态图模型默认的输出长度最大为36,如果您需要预测更长的序列,请在导出模型时指定其输出序列为合适的值,例如 Architecture.Head.max_text_length=72 ``` diff --git a/doc/doc_en/algorithm_rec_can_en.md b/doc/doc_en/algorithm_rec_can_en.md index e65bb2aa8..cc1da7733 100644 --- a/doc/doc_en/algorithm_rec_can_en.md +++ b/doc/doc_en/algorithm_rec_can_en.md @@ -53,14 +53,14 @@ Evaluation: ``` # GPU evaluation -python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/CAN +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/best_accuracy.pdparams ``` Prediction: ``` # The configuration file used for prediction must match the training -python3 tools/infer_rec.py -c configs/rec/rec_d28_can.yml -o Architecture.Head.attdecoder.is_train=False Global.infer_img='./doc/crohme_demo/hme_00.jpg' Global.pretrained_model=./rec_d28_can_train/CAN +python3 tools/infer_rec.py -c configs/rec/rec_d28_can.yml -o Architecture.Head.attdecoder.is_train=False Global.infer_img='./doc/crohme_demo/hme_00.jpg' Global.pretrained_model=./rec_d28_can_train/best_accuracy.pdparams ``` @@ -71,7 +71,7 @@ python3 tools/infer_rec.py -c configs/rec/rec_d28_can.yml -o Architecture.Head.a First, the model saved during the CAN handwritten mathematical expression recognition training process 
is converted into an inference model. you can use the following command to convert: ``` -python3 tools/export_model.py -c configs/rec/rec_d28_can.yml -o Global.save_inference_dir=./inference/rec_d28_can/ Architecture.Head.attdecoder.is_train=False +python3 tools/export_model.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/best_accuracy.pdparams Global.save_inference_dir=./inference/rec_d28_can/ Architecture.Head.attdecoder.is_train=False # The default output max length of the model is 36. If you need to predict a longer sequence, please specify its output sequence as an appropriate value when exporting the model, as: Architecture.Head.max_ text_ length=72 ``` @@ -79,7 +79,7 @@ python3 tools/export_model.py -c configs/rec/rec_d28_can.yml -o Global.save_infe For CAN handwritten mathematical expression recognition model inference, the following commands can be executed: ``` -python3 tools/infer/predict_rec.py --image_dir="./doc/crohme_demo/hme_00.jpg" --rec_algorithm="CAN" --rec_batch_num=1 --rec_model_dir="./inference/rec_d28_can/" --rec_char_dict_path="./ppocr/utils/dict/latex_symbol_dict.txt" +python3 tools/infer/predict_rec.py --image_dir="./doc/datasets/crohme_demo/hme_00.jpg" --rec_algorithm="CAN" --rec_batch_num=1 --rec_model_dir="./inference/rec_d28_can/" --rec_char_dict_path="./ppocr/utils/dict/latex_symbol_dict.txt" # If you need to predict on a picture with black characters on a white background, please set: -- rec_ image_ inverse=False ``` From 2c1747b6385b9fefa15f805635601d640844ef9b Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Tue, 25 Oct 2022 08:38:42 +0000 Subject: [PATCH 19/19] fix cls diff with python --- deploy/cpp_infer/src/ocr_cls.cpp | 5 +++++ deploy/cpp_infer/src/preprocess_op.cpp | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/deploy/cpp_infer/src/ocr_cls.cpp b/deploy/cpp_infer/src/ocr_cls.cpp index abcfed125..13a03d6ad 100644 --- a/deploy/cpp_infer/src/ocr_cls.cpp +++ 
b/deploy/cpp_infer/src/ocr_cls.cpp @@ -45,6 +45,11 @@ void Classifier::Run(std::vector img_list, this->normalize_op_.Run(&resize_img, this->mean_, this->scale_, this->is_scale_); + if (resize_img.cols < cls_image_shape[2]) { + cv::copyMakeBorder(resize_img, resize_img, 0, 0, 0, + cls_image_shape[2] - resize_img.cols, + cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); + } norm_img_batch.push_back(resize_img); } std::vector input(batch_num * cls_image_shape[0] * diff --git a/deploy/cpp_infer/src/preprocess_op.cpp b/deploy/cpp_infer/src/preprocess_op.cpp index 19cd6c3f7..b0261a9ed 100644 --- a/deploy/cpp_infer/src/preprocess_op.cpp +++ b/deploy/cpp_infer/src/preprocess_op.cpp @@ -132,10 +132,6 @@ void ClsResizeImg::Run(const cv::Mat &img, cv::Mat &resize_img, cv::resize(img, resize_img, cv::Size(resize_w, imgH), 0.f, 0.f, cv::INTER_LINEAR); - if (resize_w < imgW) { - cv::copyMakeBorder(resize_img, resize_img, 0, 0, 0, imgW - resize_w, - cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0)); - } } void TableResizeImg::Run(const cv::Mat &img, cv::Mat &resize_img,