From abe7550c8a49abb47dec128a433bdb0324f627cd Mon Sep 17 00:00:00 2001 From: Leif <4603009@qq.com> Date: Wed, 31 Aug 2022 21:20:33 +0800 Subject: [PATCH 01/11] update readme update readme --- PPOCRLabel/README_ch.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md index 3ea684a3f..767102fb1 100644 --- a/PPOCRLabel/README_ch.md +++ b/PPOCRLabel/README_ch.md @@ -2,7 +2,7 @@ # PPOCRLabelv2 -PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置PP-OCR模型对数据自动标注和重新识别。使用Python3和PyQT5编写,支持矩形框标注和四点标注模式,导出格式可直接用于PaddleOCR检测和识别模型的训练。 +PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置PP-OCR模型对数据自动标注和重新识别。使用Python3和PyQT5编写,支持矩形框标注、表格标注、不规则文本标注、关键信息标注模式,导出格式可直接用于PaddleOCR检测和识别模型的训练。 | 常规标注 | 表格标注 | | :-------------------------------------------------: | :--------------------------------------------: | From 42395baffc7b168fbb00ac67b7ad9dc7db353061 Mon Sep 17 00:00:00 2001 From: whjdark <44253501+whjdark@users.noreply.github.com> Date: Wed, 28 Sep 2022 20:30:43 +0800 Subject: [PATCH 02/11] canvas.paintEvent() bug fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://github.com/PaddlePaddle/PaddleOCR/issues/7681 https://github.com/PaddlePaddle/PaddleOCR/issues/7702 self.prevPoint存储使用浮点型,p.drawLine只接受整型,触发上述异常。 --- PPOCRLabel/libs/canvas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PPOCRLabel/libs/canvas.py b/PPOCRLabel/libs/canvas.py index 81f379951..44d899cbc 100644 --- a/PPOCRLabel/libs/canvas.py +++ b/PPOCRLabel/libs/canvas.py @@ -611,8 +611,8 @@ class Canvas(QWidget): if self.drawing() and not self.prevPoint.isNull() and not self.outOfPixmap(self.prevPoint): p.setPen(QColor(0, 0, 0)) - p.drawLine(self.prevPoint.x(), 0, self.prevPoint.x(), self.pixmap.height()) - p.drawLine(0, self.prevPoint.y(), self.pixmap.width(), self.prevPoint.y()) + p.drawLine(int(self.prevPoint.x()), 0, int(self.prevPoint.x()), self.pixmap.height()) + p.drawLine(0, int(self.prevPoint.y()), 
self.pixmap.width(), int(self.prevPoint.y())) self.setAutoFillBackground(True) if self.verified: @@ -909,4 +909,4 @@ class Canvas(QWidget): def updateShapeIndex(self): for i in range(len(self.shapes)): self.shapes[i].idx = i - self.update() \ No newline at end of file + self.update() From 2ff9055c1a9a9d499bd1d60f7f69338a540e6344 Mon Sep 17 00:00:00 2001 From: whjdark <44253501+whjdark@users.noreply.github.com> Date: Thu, 29 Sep 2022 16:09:43 +0800 Subject: [PATCH 03/11] =?UTF-8?q?[bug=20fix]=20KIE=E6=A8=A1=E5=BC=8F?= =?UTF-8?q?=E4=B8=8B=E5=85=B3=E9=94=AE=E5=AD=97=E6=97=A0=E6=B3=95=E4=BF=9D?= =?UTF-8?q?=E5=AD=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://github.com/PaddlePaddle/PaddleOCR/issues/7753 autoSaveFunc(self)修改后没有对shape进行保存 --- PPOCRLabel/PPOCRLabel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py index 390c2b159..153837f8f 100644 --- a/PPOCRLabel/PPOCRLabel.py +++ b/PPOCRLabel/PPOCRLabel.py @@ -2715,6 +2715,10 @@ class MainWindow(QMainWindow): self._update_shape_color(shape) self.keyDialog.addLabelHistory(key_text) + + # save changed shape + self.singleLabel(shape) + self.setDirty() def undoShapeEdit(self): self.canvas.restoreShape() From 968c142e2aef92865553dd116a49db88d8507fdc Mon Sep 17 00:00:00 2001 From: Evezerest <50011306+Evezerest@users.noreply.github.com> Date: Thu, 29 Sep 2022 16:45:22 +0800 Subject: [PATCH 04/11] Update PPOCRLabel.py --- PPOCRLabel/PPOCRLabel.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py index 153837f8f..0a3ae1cb3 100644 --- a/PPOCRLabel/PPOCRLabel.py +++ b/PPOCRLabel/PPOCRLabel.py @@ -2716,9 +2716,8 @@ class MainWindow(QMainWindow): self._update_shape_color(shape) self.keyDialog.addLabelHistory(key_text) - # save changed shape - self.singleLabel(shape) - self.setDirty() + # save changed shape + self.setDirty() def undoShapeEdit(self): 
self.canvas.restoreShape() From 050e2a6879a28640e6915ed7d85ba7cb5cfb58b2 Mon Sep 17 00:00:00 2001 From: Leif <4603009@qq.com> Date: Thu, 29 Sep 2022 16:58:26 +0800 Subject: [PATCH 05/11] Update menu Update menu --- PPOCRLabel/resources/strings/strings-en.properties | 2 +- PPOCRLabel/resources/strings/strings-zh-CN.properties | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PPOCRLabel/resources/strings/strings-en.properties b/PPOCRLabel/resources/strings/strings-en.properties index 1b628016c..3dfc34e00 100644 --- a/PPOCRLabel/resources/strings/strings-en.properties +++ b/PPOCRLabel/resources/strings/strings-en.properties @@ -113,4 +113,4 @@ keyDialogTip=Enter object label keyChange=Change Box Key TableRecognition=Table Recognition cellreRecognition=Cell Re-Recognition -exportJSON=Export Excel Label(PubTabNet) +exportJSON=Export Table Label diff --git a/PPOCRLabel/resources/strings/strings-zh-CN.properties b/PPOCRLabel/resources/strings/strings-zh-CN.properties index 0758729a8..00dfd31da 100644 --- a/PPOCRLabel/resources/strings/strings-zh-CN.properties +++ b/PPOCRLabel/resources/strings/strings-zh-CN.properties @@ -113,4 +113,4 @@ keyDialogTip=请输入类型名称 keyChange=更改Box关键字类别 TableRecognition=表格识别 cellreRecognition=单元格重识别 -exportJSON=导出表格JSON标注 \ No newline at end of file +exportJSON=导出表格标注 \ No newline at end of file From 3e8c78b8c1210cd07f0ea1ab83dc521937814fd4 Mon Sep 17 00:00:00 2001 From: WenmuZhou <572459439@qq.com> Date: Fri, 30 Sep 2022 07:01:43 +0000 Subject: [PATCH 06/11] fix tablerec-rare train error --- ppocr/modeling/heads/table_att_head.py | 35 +++++++++++--------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/ppocr/modeling/heads/table_att_head.py b/ppocr/modeling/heads/table_att_head.py index 50910c5b7..e3fc8436e 100644 --- a/ppocr/modeling/heads/table_att_head.py +++ b/ppocr/modeling/heads/table_att_head.py @@ -82,7 +82,8 @@ class TableAttentionHead(nn.Layer): batch_size = fea.shape[0] hidden = 
paddle.zeros((batch_size, self.hidden_size)) - output_hiddens = paddle.zeros((batch_size, self.max_text_length + 1, self.hidden_size)) + output_hiddens = paddle.zeros( + (batch_size, self.max_text_length + 1, self.hidden_size)) if self.training and targets is not None: structure = targets[0] for i in range(self.max_text_length + 1): @@ -91,19 +92,13 @@ class TableAttentionHead(nn.Layer): (outputs, hidden), alpha = self.structure_attention_cell( hidden, fea, elem_onehots) output_hiddens[:, i, :] = outputs - # output_hiddens.append(paddle.unsqueeze(outputs, axis=1)) - output = paddle.concat(output_hiddens, axis=1) - structure_probs = self.structure_generator(output) - if self.loc_type == 1: - loc_preds = self.loc_generator(output) - loc_preds = F.sigmoid(loc_preds) - else: - loc_fea = fea.transpose([0, 2, 1]) - loc_fea = self.loc_fea_trans(loc_fea) - loc_fea = loc_fea.transpose([0, 2, 1]) - loc_concat = paddle.concat([output, loc_fea], axis=2) - loc_preds = self.loc_generator(loc_concat) - loc_preds = F.sigmoid(loc_preds) + structure_probs = self.structure_generator(output_hiddens) + loc_fea = fea.transpose([0, 2, 1]) + loc_fea = self.loc_fea_trans(loc_fea) + loc_fea = loc_fea.transpose([0, 2, 1]) + loc_concat = paddle.concat([output_hiddens, loc_fea], axis=2) + loc_preds = self.loc_generator(loc_concat) + loc_preds = F.sigmoid(loc_preds) else: temp_elem = paddle.zeros(shape=[batch_size], dtype="int32") structure_probs = None @@ -118,17 +113,15 @@ class TableAttentionHead(nn.Layer): (outputs, hidden), alpha = self.structure_attention_cell( hidden, fea, elem_onehots) output_hiddens[:, i, :] = outputs - # output_hiddens.append(paddle.unsqueeze(outputs, axis=1)) structure_probs_step = self.structure_generator(outputs) temp_elem = structure_probs_step.argmax(axis=1, dtype="int32") - output = output_hiddens - structure_probs = self.structure_generator(output) + structure_probs = self.structure_generator(output_hiddens) structure_probs = F.softmax(structure_probs) loc_fea 
= fea.transpose([0, 2, 1]) loc_fea = self.loc_fea_trans(loc_fea) loc_fea = loc_fea.transpose([0, 2, 1]) - loc_concat = paddle.concat([output, loc_fea], axis=2) + loc_concat = paddle.concat([output_hiddens, loc_fea], axis=2) loc_preds = self.loc_generator(loc_concat) loc_preds = F.sigmoid(loc_preds) return {'structure_probs': structure_probs, 'loc_preds': loc_preds} @@ -203,8 +196,10 @@ class SLAHead(nn.Layer): fea = fea.transpose([0, 2, 1]) # (NTC)(batch, width, channels) hidden = paddle.zeros((batch_size, self.hidden_size)) - structure_preds = paddle.zeros((batch_size, self.max_text_length + 1, self.num_embeddings)) - loc_preds = paddle.zeros((batch_size, self.max_text_length + 1, self.loc_reg_num)) + structure_preds = paddle.zeros( + (batch_size, self.max_text_length + 1, self.num_embeddings)) + loc_preds = paddle.zeros( + (batch_size, self.max_text_length + 1, self.loc_reg_num)) structure_preds.stop_gradient = True loc_preds.stop_gradient = True if self.training and targets is not None: From 57cb2183f9427968577e59fe7515337b9855ff60 Mon Sep 17 00:00:00 2001 From: andyjpaddle <jiangkaitao@baidu.com> Date: Sat, 8 Oct 2022 03:21:52 +0000 Subject: [PATCH 07/11] add doc for infer pdf --- doc/doc_ch/inference_ppocr.md | 5 ++++- doc/doc_en/inference_ppocr_en.md | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/doc_ch/inference_ppocr.md b/doc/doc_ch/inference_ppocr.md index 514f90539..6723fcc12 100644 --- a/doc/doc_ch/inference_ppocr.md +++ b/doc/doc_ch/inference_ppocr.md @@ -144,7 +144,7 @@ Predicts of ./doc/imgs_words/ch/word_4.jpg:['0', 0.9999982] **注意** `PP-OCRv3`的识别模型使用的输入shape为`3,48,320`, 如果使用其他识别模型,则需根据模型设置参数`--rec_image_shape`。此外,`PP-OCRv3`的识别模型默认使用的`rec_algorithm`为`SVTR_LCNet`,注意和原始`SVTR`的区别。 -以超轻量中文OCR模型推理为例,在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径、参数`det_model_dir`,`cls_model_dir`和`rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 
文件夹里面。 +以超轻量中文OCR模型推理为例,在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径,也支持PDF文件、参数`det_model_dir`,`cls_model_dir`和`rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 文件夹里面。 ```shell # 使用方向分类器 @@ -153,8 +153,11 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --de python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false # 使用多进程 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6 +# 使用PDF文件,可以通过使用`page_num`参数来控制推理前几页,默认为0,表示推理所有页 +python3 tools/infer/predict_system.py --image_dir="./xxx.pdf" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --page_num=2 ``` + 执行命令后,识别结果图像如下:  diff --git a/doc/doc_en/inference_ppocr_en.md b/doc/doc_en/inference_ppocr_en.md index 4c9db51e1..5af3ce0ce 100755 --- a/doc/doc_en/inference_ppocr_en.md +++ b/doc/doc_en/inference_ppocr_en.md @@ -149,11 +149,12 @@ When performing prediction, you need to specify the path of a single image or a ```shell # use direction classifier python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true - # not use use direction classifier python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false # use multi-process python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" 
--rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6 +# use PDF files, you can infer the first few pages by using the `page_num` parameter, the default is 0, which means infer all pages +python3 tools/infer/predict_system.py --image_dir="./xxx.pdf" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --page_num=2 ``` From 59812f6b154a8815064c69bc627ade76dd3c8e3c Mon Sep 17 00:00:00 2001 From: andyjpaddle <jiangkaitao@baidu.com> Date: Sat, 8 Oct 2022 03:24:05 +0000 Subject: [PATCH 08/11] add doc for infer pdf --- doc/doc_en/inference_ppocr_en.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/doc_en/inference_ppocr_en.md b/doc/doc_en/inference_ppocr_en.md index 5af3ce0ce..4c3576983 100755 --- a/doc/doc_en/inference_ppocr_en.md +++ b/doc/doc_en/inference_ppocr_en.md @@ -144,7 +144,7 @@ After executing the command, the prediction results (classification angle and sc **Note**: The input shape used by the recognition model of `PP-OCRv3` is `3, 48, 320`. If you use other recognition models, you need to set the parameter `--rec_image_shape` according to the model. In addition, the `rec_algorithm` used by the recognition model of `PP-OCRv3` is `SVTR_LCNet` by default. Note the difference from the original `SVTR`. -When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `cls_model_dir` specifies the path to angle classification inference model and the parameter `rec_model_dir` specifies the path to identify the inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model. 
The parameter `use_mp` specifies whether to use multi-process to infer `total_process_num` specifies process number when using multi-process. The parameter . The visualized recognition results are saved to the `./inference_results` folder by default. +When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir` (a PDF file is also supported); the parameter `det_model_dir` specifies the path to the detection inference model, the parameter `cls_model_dir` specifies the path to the angle classification inference model, and the parameter `rec_model_dir` specifies the path to the recognition inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model. The parameter `use_mp` specifies whether to use multi-process inference, and the parameter `total_process_num` specifies the number of processes when using multi-process. The visualized recognition results are saved to the `./inference_results` folder by default. 
```shell # use direction classifier From 2eabedf0f8dc47077c3686175e2e7e4732654b20 Mon Sep 17 00:00:00 2001 From: littletomatodonkey <dazhiningsibuqu@163.com> Date: Sat, 8 Oct 2022 12:10:27 +0800 Subject: [PATCH 09/11] add slanet dist training (#7825) * add slanet dist training * fix * fix --- doc/doc_ch/distributed_training.md | 30 ++++++++++++++++++++------- doc/doc_en/distributed_training_en.md | 28 ++++++++++++++++++------- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/doc/doc_ch/distributed_training.md b/doc/doc_ch/distributed_training.md index 6afa4a5b9..dbbc4dc8b 100644 --- a/doc/doc_ch/distributed_training.md +++ b/doc/doc_ch/distributed_training.md @@ -41,16 +41,30 @@ python3 -m paddle.distributed.launch \ ## 性能效果测试 -* 在2机8卡P40的机器上,基于26W公开识别数据集(LSVT, RCTW, MTWI)上进行训练,最终耗时如下。 +* 在2机8卡P40的机器上进行模型训练,不同模型的精度、训练耗时、多机加速比情况如下所示。 -| 模型 | 配置 | 精度 | 单机8卡耗时 | 2机8卡耗时 | 加速比 | -|------|-----|--------|--------|--------|-----| -| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 67.0% | 2.50d | 1.67d | **1.5** | +| 模型 | 配置 | 数据集 | 单机8卡耗时/精度 | 2机8卡耗时/精度 | 加速比 | +|:------:|:-----:|:--------:|:--------:|:--------:|:-----:| +| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 26W中文数据集 | 2.50d/66.7% | 1.67d/67.0% | **1.5** | -* 在4机8卡V100的机器上,基于全量数据训练,最终耗时如下 +* 在3机8卡V100的机器上进行模型训练,不同模型的精度、训练耗时、多机加速比情况如下所示。 + +| 模型 | 配置 | 数据集 | 单机8卡耗时/精度 | 3机8卡耗时/精度 | 加速比 | +|:------:|:-----:|:--------:|:--------:|:--------:|:-----:| +| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.8h/76.2% | 19.75h/74.77% | **2.52** | -| 模型 | 配置 | 精度 | 单机8卡耗时 | 4机8卡耗时 | 加速比 | -|------|-----|--------|--------|--------|-----| -| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | 74.0% | 10d | 2.84d | **3.5** | + > 注意:这里3机8卡训练时,单卡batch size相比于单机8卡不变,学习率乘以2 (默认乘以3的话,精度仅有73.42%) + + +* 
在4机8卡V100的机器上进行模型训练,不同模型的精度、训练耗时、多机加速比情况如下所示。 + + +| 模型 | 配置 | 数据集 | 单机8卡耗时/精度 | 4机8卡耗时/精度 | 加速比 | +|:------:|:-----:|:--------:|:--------:|:--------:|:-----:| +| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d/- | 2.84d/74.0% | **3.5** | + + +* **注意** + * 在训练的GPU卡数过多时,精度会稍微有所损失(1%左右),此时可以尝试通过添加warmup或者适当增加迭代轮数来弥补精度损失。 diff --git a/doc/doc_en/distributed_training_en.md b/doc/doc_en/distributed_training_en.md index 5a219ed2b..a9db354ad 100644 --- a/doc/doc_en/distributed_training_en.md +++ b/doc/doc_en/distributed_training_en.md @@ -40,17 +40,29 @@ python3 -m paddle.distributed.launch \ ## Performance comparison -* On two 8-card P40 graphics cards, the final time consumption and speedup ratio for public recognition dataset (LSVT, RCTW, MTWI) containing 260k images are as follows. +* We conducted model training on 2x8 P40 GPUs. Accuracy, training time, and multi machine acceleration ratio of different models are shown below. + +| Model | Configuration | Configuration | 8 GPU training time / Accuracy | 3x8 GPU training time / Accuracy | Acceleration ratio | -| Model | Config file | Recognition acc | single 8-card training time | two 8-card training time | Speedup ratio | -|------|-----|--------|--------|--------|-----| -| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 67.0% | 2.50d | 1.67d | **1.5** | +| Model | Configuration | Configuration | 8 GPU training time / Accuracy | 3x8 GPU training time / Accuracy | Acceleration ratio | +|:------:|:-----:|:--------:|:--------:|:--------:|:-----:| +| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 260k Chinese dataset | 2.50d/66.7% | 1.67d/67.0% | **1.5** | -* On four 8-card V100 graphics cards, the final time consumption and speedup ratio for full data are as follows. +* We conducted model training on 3x8 V100 GPUs. 
Accuracy, training time, and multi machine acceleration ratio of different models are shown below. + +| Model | Configuration | Dataset | 8 GPU training time / Accuracy | 3x8 GPU training time / Accuracy | Acceleration ratio | +|:------:|:-----:|:--------:|:--------:|:--------:|:-----:| +| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.8h/76.2% | 19.75h/74.77% | **2.52** | -| Model | Config file | Recognition acc | single 8-card training time | four 8-card training time | Speedup ratio | -|------|-----|--------|--------|--------|-----| -| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | 74.0% | 10d | 2.84d | **3.5** | + > Note: when training with 3x8 GPUs, the single card batch size is unchanged compared with the 1x8 GPUs' training process, and the learning rate is multiplied by 2 (if it is multiplied by 3 by default, the accuracy is only 73.42%). + + +* We conducted model training on 4x8 V100 GPUs. Accuracy, training time, and multi machine acceleration ratio of different models are shown below. 
+ + +| Model | Configuration | Dataset | 8 GPU training time / Accuracy | 4x8 GPU training time / Accuracy | Acceleration ratio | +|:------:|:-----:|:--------:|:--------:|:--------:|:-----:| +| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d/- | 2.84d/74.0% | **3.5** | From a5df6c34e09f3b8a190e9b2fcde7ae7b55b48ead Mon Sep 17 00:00:00 2001 From: andyjpaddle <jiangkaitao@baidu.com> Date: Sat, 8 Oct 2022 08:28:30 +0000 Subject: [PATCH 10/11] update doc for whl --- doc/doc_ch/quickstart.md | 57 +++++++++++++++- doc/doc_ch/whl.md | 126 ++++++++++++++++++++++++++++-------- doc/doc_en/quickstart_en.md | 57 +++++++++++++++- doc/doc_en/whl_en.md | 121 +++++++++++++++++++++++++++------- paddleocr.py | 68 ++++++++++++------- 5 files changed, 353 insertions(+), 76 deletions(-) diff --git a/doc/doc_ch/quickstart.md b/doc/doc_ch/quickstart.md index e425cdd8a..cac7664c2 100644 --- a/doc/doc_ch/quickstart.md +++ b/doc/doc_ch/quickstart.md @@ -75,6 +75,11 @@ cd /path/to/ppocr_img ...... 
``` + 此外,paddleocr也支持输入pdf文件,并且可以通过指定参数`page_num`来控制推理前面几页,默认为0,表示推理所有页。 + ```bash + paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 + ``` + - 单独使用检测:设置`--rec`为`false` ```bash @@ -165,12 +170,14 @@ from paddleocr import PaddleOCR, draw_ocr ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory img_path = './imgs/11.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -196,6 +203,50 @@ im_show.save('result.jpg') <a name="3"></a> +如果输入是PDF文件,那么可以参考下面代码进行可视化 + +```python +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr目前支持的多语言语种可以通过修改lang参数进行切换 +# 例如`ch`, `en`, `fr`, `german`, `korean`, `japan` +ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory +img_path = './xxx.pdf' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# 显示结果 +import fitz +from PIL import Image +import cv2 +import numpy as np +imgs = [] +with fitz.open(img_path) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) +for idx in range(len(result)): + res = result[idx] + image = imgs[idx] + boxes = [line[0] for line in res] + txts = [line[1][0] for line in res] + scores = [line[1][1] for line in res] + 
im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf') + im_show = Image.fromarray(im_show) + im_show.save('result_page_{}.jpg'.format(idx)) +``` + ## 3. 小结 通过本节内容,相信您已经熟练掌握PaddleOCR whl包的使用方法并获得了初步效果。 diff --git a/doc/doc_ch/whl.md b/doc/doc_ch/whl.md index 315329464..83f062801 100644 --- a/doc/doc_ch/whl.md +++ b/doc/doc_ch/whl.md @@ -33,12 +33,14 @@ from paddleocr import PaddleOCR, draw_ocr ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs/11.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -71,12 +73,14 @@ from paddleocr import PaddleOCR, draw_ocr ocr = PaddleOCR() # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs/11.jpg' result = ocr.ocr(img_path, cls=False) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -109,8 +113,10 @@ from paddleocr import PaddleOCR ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' result = ocr.ocr(img_path, det=False, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) ``` 结果是一个list,每个item只包含识别结果和识别置信度 @@ -127,12 +133,14 @@ from paddleocr import PaddleOCR, draw_ocr ocr = PaddleOCR() # need to run only once to download and load model into memory img_path = 
'PaddleOCR/doc/imgs/11.jpg' result = ocr.ocr(img_path, rec=False) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') im_show = Image.fromarray(im_show) @@ -163,8 +171,10 @@ from paddleocr import PaddleOCR ocr = PaddleOCR() # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' result = ocr.ocr(img_path, det=False) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) ``` 结果是一个list,每个item只包含识别结果和识别置信度 @@ -181,8 +191,10 @@ from paddleocr import PaddleOCR ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' result = ocr.ocr(img_path, det=False, rec=False, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) ``` 结果是一个list,每个item只包含分类结果和分类置信度 @@ -212,6 +224,11 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true ...... 
``` +此外,paddleocr也支持输入pdf文件,并且可以通过指定参数`page_num`来控制推理前面几页,默认为0,表示推理所有页。 +```bash +paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 +``` + * 检测+识别 ```bash @@ -290,12 +307,14 @@ ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_m use_angle_cls=True) img_path = 'PaddleOCR/doc/imgs/11.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -325,12 +344,14 @@ from paddleocr import PaddleOCR, draw_ocr, download_with_progressbar ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] download_with_progressbar(img_path, 'tmp.jpg') image = Image.open('tmp.jpg').convert('RGB') boxes = [line[0] for line in result] @@ -362,12 +383,14 @@ img_path = 'PaddleOCR/doc/imgs/11.jpg' img = cv2.imread(img_path) # img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY), 如果你自己训练的模型支持灰度图,可以将这句话的注释取消 result = ocr.ocr(img, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # 显示结果 from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -376,14 +399,65 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc im_show = Image.fromarray(im_show) im_show.save('result.jpg') ``` +## 5 PDF文件作为输入 +- 命令行模式 -## 5 参数说明 
+可以通过指定参数`page_num`来控制推理前面几页,默认为0,表示推理所有页。 +```bash +paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 +``` +- 代码使用 + +```python +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr目前支持的多语言语种可以通过修改lang参数进行切换 +# 例如`ch`, `en`, `fr`, `german`, `korean`, `japan` +ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory +img_path = './xxx.pdf' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# 显示结果 +import fitz +from PIL import Image +import cv2 +import numpy as np +imgs = [] +with fitz.open(img_path) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) +for idx in range(len(result)): + res = result[idx] + image = imgs[idx] + boxes = [line[0] for line in res] + txts = [line[1][0] for line in res] + scores = [line[1][1] for line in res] + im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf') + im_show = Image.fromarray(im_show) + im_show.save('result_page_{}.jpg'.format(idx)) +``` + +## 6 参数说明 | 字段 | 说明 | 默认值 | |-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| | use_gpu | 是否使用GPU | TRUE | | gpu_mem | 初始化占用的GPU内存大小 | 8000M | -| image_dir | 通过命令行调用时执行预测的图片或文件夹路径 | | +| image_dir | 通过命令行调用时执行预测的图片或文件夹路径 | +| page_num | 当输入类型为pdf文件时有效,指定预测前面page_num页,默认预测所有页 | 0 | | 
det_algorithm | 使用的检测算法类型 | DB | | det_model_dir | 检测模型所在文件夹。传参方式有两种,1. None: 自动下载内置模型到 `~/.paddleocr/det`;2.自己转换好的inference模型路径,模型路径下必须包含model和params文件 | None | | det_max_side_len | 检测算法前向时图片长边的最大尺寸,当长边超出这个值时会将长边resize到这个大小,短边等比例缩放 | 960 | diff --git a/doc/doc_en/quickstart_en.md b/doc/doc_en/quickstart_en.md index 9e1de839f..ea38845f5 100644 --- a/doc/doc_en/quickstart_en.md +++ b/doc/doc_en/quickstart_en.md @@ -86,6 +86,12 @@ If you do not use the provided test image, you can replace the following `--imag ...... ``` + pdf file is also supported, you can infer the first few pages by using the `page_num` parameter, the default is 0, which means infer all pages + + ```bash + paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 + ``` + * Only detection: set `--rec` to `false` ```bash @@ -176,12 +182,15 @@ from paddleocr import PaddleOCR,draw_ocr ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to download and load model into memory img_path = './imgs_en/img_12.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # draw result from PIL import Image +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -206,6 +215,50 @@ Visualization of results <img src="../imgs_results/whl/12_det_rec.jpg" width="800"> </div> +If the input is a PDF file, you can refer to the following code for visualization + +```python +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr supports Chinese, English, French, German, Korean and Japanese. +# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan` +# to switch the language model in order. 
+ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory +img_path = './xxx.pdf' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# draw result +import fitz +from PIL import Image +import cv2 +import numpy as np +imgs = [] +with fitz.open(img_path) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) +for idx in range(len(result)): + res = result[idx] + image = imgs[idx] + boxes = [line[0] for line in res] + txts = [line[1][0] for line in res] + scores = [line[1][1] for line in res] + im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf') + im_show = Image.fromarray(im_show) + im_show.save('result_page_{}.jpg'.format(idx)) +``` <a name="3"></a> diff --git a/doc/doc_en/whl_en.md b/doc/doc_en/whl_en.md index da2dff67c..77e80faa6 100644 --- a/doc/doc_en/whl_en.md +++ b/doc/doc_en/whl_en.md @@ -25,12 +25,14 @@ from paddleocr import PaddleOCR,draw_ocr ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) - +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # draw result from PIL import Image +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -60,11 +62,14 @@ from paddleocr import PaddleOCR,draw_ocr ocr = PaddleOCR(lang='en') # 
need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' result = ocr.ocr(img_path, cls=False) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # draw result from PIL import Image +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -94,8 +99,10 @@ from paddleocr import PaddleOCR ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to load model into memory img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png' result = ocr.ocr(img_path, det=False, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) ``` Output will be a list, each item contains recognition text and confidence @@ -109,12 +116,14 @@ from paddleocr import PaddleOCR,draw_ocr ocr = PaddleOCR() # need to run only once to download and load model into memory img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' result = ocr.ocr(img_path,rec=False) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # draw result from PIL import Image - +result = result[0] image = Image.open(img_path).convert('RGB') im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') im_show = Image.fromarray(im_show) @@ -141,8 +150,10 @@ from paddleocr import PaddleOCR ocr = PaddleOCR(lang='en') # need to run only once to load model into memory img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png' result = ocr.ocr(img_path, det=False, cls=False) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) ``` Output will be a list, each item contains recognition text and confidence @@ -156,8 +167,10 @@ from paddleocr import PaddleOCR ocr = 
PaddleOCR(use_angle_cls=True) # need to run only once to load model into memory img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png' result = ocr.ocr(img_path, det=False, rec=False, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) ``` Output will be a list, each item contains classification result and confidence @@ -185,6 +198,11 @@ Output will be a list, each item contains bounding box, text and recognition con ...... ``` +pdf file is also supported, you can infer the first few pages by using the `page_num` parameter, the default is 0, which means infer all pages +```bash +paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 +``` + * detection and recognition ```bash paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --lang en @@ -253,11 +271,14 @@ from paddleocr import PaddleOCR,draw_ocr ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True) img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # draw result from PIL import Image +result = result[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -283,11 +304,14 @@ from paddleocr import PaddleOCR, draw_ocr ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg' result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # show result from PIL import Image +result = result[0] image = 
Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] @@ -312,12 +336,14 @@ img_path = 'PaddleOCR/doc/imgs/11.jpg' img = cv2.imread(img_path) # img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY), If your own training model supports grayscale images, you can uncomment this line result = ocr.ocr(img_path, cls=True) -for line in result: - print(line) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) # show result from PIL import Image - +result = result[0] download_with_progressbar(img_path, 'tmp.jpg') image = Image.open('tmp.jpg').convert('RGB') boxes = [line[0] for line in result] @@ -327,15 +353,66 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc im_show = Image.fromarray(im_show) im_show.save('result.jpg') ``` +## 5 PDF file +- Use by command line +you can infer the first few pages by using the `page_num` parameter, the default is 0, which means infer all pages +```bash +paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 +``` +- Use by code -## 5 Parameter Description +```python +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr supports Chinese, English, French, German, Korean and Japanese. +# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan` +# to switch the language model in order. 
+ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory +img_path = './xxx.pdf' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# draw result +import fitz +from PIL import Image +import cv2 +import numpy as np +imgs = [] +with fitz.open(img_path) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) +for idx in range(len(result)): + res = result[idx] + image = imgs[idx] + boxes = [line[0] for line in res] + txts = [line[1][0] for line in res] + scores = [line[1][1] for line in res] + im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf') + im_show = Image.fromarray(im_show) + im_show.save('result_page_{}.jpg'.format(idx)) +``` + +## 6 Parameter Description | Parameter | Description | Default value | |-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| | use_gpu | use GPU or not | TRUE | | gpu_mem | GPU memory size used for initialization | 8000M | | image_dir | The images path or folder path for predicting when used by the command line | | +| page_num | Valid when the input type is pdf file, specify to predict the previous page_num pages, all pages are predicted by default | 0 | | det_algorithm | Type of detection algorithm selected | DB | | det_model_dir | the text detection inference model folder. 
There are two ways to transfer parameters, 1. None: Automatically download the built-in model to `~/.paddleocr/det`; 2. The path of the inference model converted by yourself, the model and params files must be included in the model path | None | | det_max_side_len | The maximum size of the long side of the image. When the long side exceeds this value, the long side will be resized to this size, and the short side will be scaled proportionally | 960 | diff --git a/paddleocr.py b/paddleocr.py index fa732fc11..de22397b0 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -47,7 +47,7 @@ __all__ = [ ] SUPPORT_DET_MODEL = ['DB'] -VERSION = '2.6.0.1' +VERSION = '2.6.0.2' SUPPORT_REC_MODEL = ['CRNN', 'SVTR_LCNet'] BASE_DIR = os.path.expanduser("~/.paddleocr/") @@ -428,8 +428,8 @@ def check_img(img): download_with_progressbar(img, 'tmp.jpg') img = 'tmp.jpg' image_file = img - img, flag, _ = check_and_read(image_file) - if not flag: + img, flag_gif, flag_pdf = check_and_read(image_file) + if not flag_gif and not flag_pdf: with open(image_file, 'rb') as f: img = img_decode(f.read()) if img is None: @@ -499,6 +499,7 @@ class PaddleOCR(predict_system.TextSystem): logger.debug(params) # init det_model and rec_model super().__init__(params) + self.page_num = params.page_num def ocr(self, img, det=True, rec=True, cls=True): """ @@ -519,24 +520,43 @@ class PaddleOCR(predict_system.TextSystem): ) img = check_img(img) - - if det and rec: - dt_boxes, rec_res, _ = self.__call__(img, cls) - return [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)] - elif det and not rec: - dt_boxes, elapse = self.text_detector(img) - if dt_boxes is None: - return None - return [box.tolist() for box in dt_boxes] + # for infer pdf file + if isinstance(img, list): + if self.page_num > len(img) or self.page_num == 0: + self.page_num = len(img) + imgs = img[:self.page_num] else: - if not isinstance(img, list): - img = [img] - if self.use_angle_cls and cls: - img, cls_res, elapse = 
self.text_classifier(img) - if not rec: - return cls_res - rec_res, elapse = self.text_recognizer(img) - return rec_res + imgs = [img] + if det and rec: + ocr_res = [] + for idx, img in enumerate(imgs): + dt_boxes, rec_res, _ = self.__call__(img, cls) + tmp_res = [[box.tolist(), res] + for box, res in zip(dt_boxes, rec_res)] + ocr_res.append(tmp_res) + return ocr_res + elif det and not rec: + ocr_res = [] + for idx, img in enumerate(imgs): + dt_boxes, elapse = self.text_detector(img) + tmp_res = [box.tolist() for box in dt_boxes] + ocr_res.append(tmp_res) + return ocr_res + else: + ocr_res = [] + cls_res = [] + for idx, img in enumerate(imgs): + if not isinstance(img, list): + img = [img] + if self.use_angle_cls and cls: + img, cls_res_tmp, elapse = self.text_classifier(img) + if not rec: + cls_res.append(cls_res_tmp) + rec_res, elapse = self.text_recognizer(img) + ocr_res.append(rec_res) + if not rec: + return cls_res + return ocr_res class PPStructure(StructureSystem): @@ -632,8 +652,10 @@ def main(): rec=args.rec, cls=args.use_angle_cls) if result is not None: - for line in result: - logger.info(line) + for idx in range(len(result)): + res = result[idx] + for line in res: + logger.info(line) elif args.type == 'structure': img, flag_gif, flag_pdf = check_and_read(img_path) if not flag_gif and not flag_pdf: @@ -681,7 +703,7 @@ def main(): "error in layout recovery image:{}, err msg: {}".format( img_name, ex)) continue - + for item in all_res: item.pop('img') item.pop('res') From eeef62b3c6fef19aeaad3306ace1dbc4a7848084 Mon Sep 17 00:00:00 2001 From: littletomatodonkey <dazhiningsibuqu@163.com> Date: Sat, 8 Oct 2022 16:37:12 +0800 Subject: [PATCH 11/11] fix PREN export and infer (#7833) --- .../backbones/rec_efficientb3_pren.py | 285 +++++++++++------- ppocr/postprocess/rec_postprocess.py | 3 +- tools/export_model.py | 2 +- tools/infer/predict_rec.py | 4 +- 4 files changed, 174 insertions(+), 120 deletions(-) diff --git 
a/ppocr/modeling/backbones/rec_efficientb3_pren.py b/ppocr/modeling/backbones/rec_efficientb3_pren.py index 57eef1788..701e436c1 100644 --- a/ppocr/modeling/backbones/rec_efficientb3_pren.py +++ b/ppocr/modeling/backbones/rec_efficientb3_pren.py @@ -21,124 +21,165 @@ from __future__ import division from __future__ import print_function import math -from collections import namedtuple +import re +import collections import paddle import paddle.nn as nn import paddle.nn.functional as F __all__ = ['EfficientNetb3'] +GlobalParams = collections.namedtuple('GlobalParams', [ + 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 'num_classes', + 'width_coefficient', 'depth_coefficient', 'depth_divisor', 'min_depth', + 'drop_connect_rate', 'image_size' +]) -class EffB3Params: +BlockArgs = collections.namedtuple('BlockArgs', [ + 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', + 'expand_ratio', 'id_skip', 'stride', 'se_ratio' +]) + + +class BlockDecoder: @staticmethod - def get_global_params(): - """ - The fllowing are efficientnetb3's arch superparams, but to fit for scene - text recognition task, the resolution(image_size) here is changed - from 300 to 64. 
- """ - GlobalParams = namedtuple('GlobalParams', [ - 'drop_connect_rate', 'width_coefficient', 'depth_coefficient', - 'depth_divisor', 'image_size' - ]) - global_params = GlobalParams( - drop_connect_rate=0.3, - width_coefficient=1.2, - depth_coefficient=1.4, - depth_divisor=8, - image_size=64) - return global_params + def _decode_block_string(block_string): + assert isinstance(block_string, str) + + ops = block_string.split('_') + options = {} + for op in ops: + splits = re.split(r'(\d.*)', op) + if len(splits) >= 2: + key, value = splits[:2] + options[key] = value + + assert (('s' in options and len(options['s']) == 1) or + (len(options['s']) == 2 and options['s'][0] == options['s'][1])) + + return BlockArgs( + kernel_size=int(options['k']), + num_repeat=int(options['r']), + input_filters=int(options['i']), + output_filters=int(options['o']), + expand_ratio=int(options['e']), + id_skip=('noskip' not in block_string), + se_ratio=float(options['se']) if 'se' in options else None, + stride=[int(options['s'][0])]) @staticmethod - def get_block_params(): - BlockParams = namedtuple('BlockParams', [ - 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', - 'expand_ratio', 'id_skip', 'se_ratio', 'stride' - ]) - block_params = [ - BlockParams(3, 1, 32, 16, 1, True, 0.25, 1), - BlockParams(3, 2, 16, 24, 6, True, 0.25, 2), - BlockParams(5, 2, 24, 40, 6, True, 0.25, 2), - BlockParams(3, 3, 40, 80, 6, True, 0.25, 2), - BlockParams(5, 3, 80, 112, 6, True, 0.25, 1), - BlockParams(5, 4, 112, 192, 6, True, 0.25, 2), - BlockParams(3, 1, 192, 320, 6, True, 0.25, 1) - ] - return block_params + def decode(string_list): + assert isinstance(string_list, list) + blocks_args = [] + for block_string in string_list: + blocks_args.append(BlockDecoder._decode_block_string(block_string)) + return blocks_args + + +def efficientnet(width_coefficient=None, + depth_coefficient=None, + dropout_rate=0.2, + drop_connect_rate=0.2, + image_size=None, + num_classes=1000): + blocks_args = [ + 
'r1_k3_s11_e1_i32_o16_se0.25', + 'r2_k3_s22_e6_i16_o24_se0.25', + 'r2_k5_s22_e6_i24_o40_se0.25', + 'r3_k3_s22_e6_i40_o80_se0.25', + 'r3_k5_s11_e6_i80_o112_se0.25', + 'r4_k5_s22_e6_i112_o192_se0.25', + 'r1_k3_s11_e6_i192_o320_se0.25', + ] + blocks_args = BlockDecoder.decode(blocks_args) + + global_params = GlobalParams( + batch_norm_momentum=0.99, + batch_norm_epsilon=1e-3, + dropout_rate=dropout_rate, + drop_connect_rate=drop_connect_rate, + num_classes=num_classes, + width_coefficient=width_coefficient, + depth_coefficient=depth_coefficient, + depth_divisor=8, + min_depth=None, + image_size=image_size, ) + return blocks_args, global_params class EffUtils: @staticmethod def round_filters(filters, global_params): - """Calculate and round number of filters based on depth multiplier.""" + """ Calculate and round number of filters based on depth multiplier. """ multiplier = global_params.width_coefficient if not multiplier: return filters divisor = global_params.depth_divisor + min_depth = global_params.min_depth filters *= multiplier - new_filters = int(filters + divisor / 2) // divisor * divisor + min_depth = min_depth or divisor + new_filters = max(min_depth, + int(filters + divisor / 2) // divisor * divisor) if new_filters < 0.9 * filters: new_filters += divisor return int(new_filters) @staticmethod def round_repeats(repeats, global_params): - """Round number of filters based on depth multiplier.""" + """ Round number of filters based on depth multiplier. 
""" multiplier = global_params.depth_coefficient if not multiplier: return repeats return int(math.ceil(multiplier * repeats)) -class ConvBlock(nn.Layer): - def __init__(self, block_params): - super(ConvBlock, self).__init__() - self.block_args = block_params - self.has_se = (self.block_args.se_ratio is not None) and \ - (0 < self.block_args.se_ratio <= 1) - self.id_skip = block_params.id_skip +class MbConvBlock(nn.Layer): + def __init__(self, block_args): + super(MbConvBlock, self).__init__() + self._block_args = block_args + self.has_se = (self._block_args.se_ratio is not None) and \ + (0 < self._block_args.se_ratio <= 1) + self.id_skip = block_args.id_skip # expansion phase - self.input_filters = self.block_args.input_filters - output_filters = \ - self.block_args.input_filters * self.block_args.expand_ratio - if self.block_args.expand_ratio != 1: - self.expand_conv = nn.Conv2D( - self.input_filters, output_filters, 1, bias_attr=False) - self.bn0 = nn.BatchNorm(output_filters) + self.inp = self._block_args.input_filters + oup = self._block_args.input_filters * self._block_args.expand_ratio + if self._block_args.expand_ratio != 1: + self._expand_conv = nn.Conv2D(self.inp, oup, 1, bias_attr=False) + self._bn0 = nn.BatchNorm(oup) # depthwise conv phase - k = self.block_args.kernel_size - s = self.block_args.stride - self.depthwise_conv = nn.Conv2D( - output_filters, - output_filters, - groups=output_filters, + k = self._block_args.kernel_size + s = self._block_args.stride + if isinstance(s, list): + s = s[0] + self._depthwise_conv = nn.Conv2D( + oup, + oup, + groups=oup, kernel_size=k, stride=s, padding='same', bias_attr=False) - self.bn1 = nn.BatchNorm(output_filters) + self._bn1 = nn.BatchNorm(oup) # squeeze and excitation layer, if desired if self.has_se: num_squeezed_channels = max(1, - int(self.block_args.input_filters * - self.block_args.se_ratio)) - self.se_reduce = nn.Conv2D(output_filters, num_squeezed_channels, 1) - self.se_expand = 
nn.Conv2D(num_squeezed_channels, output_filters, 1) + int(self._block_args.input_filters * + self._block_args.se_ratio)) + self._se_reduce = nn.Conv2D(oup, num_squeezed_channels, 1) + self._se_expand = nn.Conv2D(num_squeezed_channels, oup, 1) - # output phase - self.final_oup = self.block_args.output_filters - self.project_conv = nn.Conv2D( - output_filters, self.final_oup, 1, bias_attr=False) - self.bn2 = nn.BatchNorm(self.final_oup) - self.swish = nn.Swish() + # output phase and some util class + self.final_oup = self._block_args.output_filters + self._project_conv = nn.Conv2D(oup, self.final_oup, 1, bias_attr=False) + self._bn2 = nn.BatchNorm(self.final_oup) + self._swish = nn.Swish() - def drop_connect(self, inputs, p, training): + def _drop_connect(self, inputs, p, training): if not training: return inputs - batch_size = inputs.shape[0] keep_prob = 1 - p random_tensor = keep_prob @@ -151,22 +192,23 @@ class ConvBlock(nn.Layer): def forward(self, inputs, drop_connect_rate=None): # expansion and depthwise conv x = inputs - if self.block_args.expand_ratio != 1: - x = self.swish(self.bn0(self.expand_conv(inputs))) - x = self.swish(self.bn1(self.depthwise_conv(x))) + if self._block_args.expand_ratio != 1: + x = self._swish(self._bn0(self._expand_conv(inputs))) + x = self._swish(self._bn1(self._depthwise_conv(x))) # squeeze and excitation if self.has_se: x_squeezed = F.adaptive_avg_pool2d(x, 1) - x_squeezed = self.se_expand(self.swish(self.se_reduce(x_squeezed))) + x_squeezed = self._se_expand( + self._swish(self._se_reduce(x_squeezed))) x = F.sigmoid(x_squeezed) * x - x = self.bn2(self.project_conv(x)) + x = self._bn2(self._project_conv(x)) # skip conntection and drop connect - if self.id_skip and self.block_args.stride == 1 and \ - self.input_filters == self.final_oup: + if self.id_skip and self._block_args.stride == 1 and \ + self.inp == self.final_oup: if drop_connect_rate: - x = self.drop_connect( + x = self._drop_connect( x, p=drop_connect_rate, 
training=self.training) x = x + inputs return x @@ -175,54 +217,63 @@ class ConvBlock(nn.Layer): class EfficientNetb3_PREN(nn.Layer): def __init__(self, in_channels): super(EfficientNetb3_PREN, self).__init__() - self.blocks_params = EffB3Params.get_block_params() - self.global_params = EffB3Params.get_global_params() + """ + the fllowing are efficientnetb3's superparams, + they means efficientnetb3 network's width, depth, resolution and + dropout respectively, to fit for text recognition task, the resolution + here is changed from 300 to 64. + """ + w, d, s, p = 1.2, 1.4, 64, 0.3 + self._blocks_args, self._global_params = efficientnet( + width_coefficient=w, + depth_coefficient=d, + dropout_rate=p, + image_size=s) self.out_channels = [] # stem - stem_channels = EffUtils.round_filters(32, self.global_params) - self.conv_stem = nn.Conv2D( - in_channels, stem_channels, 3, 2, padding='same', bias_attr=False) - self.bn0 = nn.BatchNorm(stem_channels) + out_channels = EffUtils.round_filters(32, self._global_params) + self._conv_stem = nn.Conv2D( + in_channels, out_channels, 3, 2, padding='same', bias_attr=False) + self._bn0 = nn.BatchNorm(out_channels) - self.blocks = [] + # build blocks + self._blocks = [] # to extract three feature maps for fpn based on efficientnetb3 backbone - self.concerned_block_idxes = [7, 17, 25] - concerned_idx = 0 - for i, block_params in enumerate(self.blocks_params): - block_params = block_params._replace( - input_filters=EffUtils.round_filters(block_params.input_filters, - self.global_params), - output_filters=EffUtils.round_filters( - block_params.output_filters, self.global_params), - num_repeat=EffUtils.round_repeats(block_params.num_repeat, - self.global_params)) - self.blocks.append( - self.add_sublayer("{}-0".format(i), ConvBlock(block_params))) - concerned_idx += 1 - if concerned_idx in self.concerned_block_idxes: - self.out_channels.append(block_params.output_filters) - if block_params.num_repeat > 1: - block_params = 
block_params._replace( - input_filters=block_params.output_filters, stride=1) - for j in range(block_params.num_repeat - 1): - self.blocks.append( - self.add_sublayer('{}-{}'.format(i, j + 1), - ConvBlock(block_params))) - concerned_idx += 1 - if concerned_idx in self.concerned_block_idxes: - self.out_channels.append(block_params.output_filters) + self._concerned_block_idxes = [7, 17, 25] + _concerned_idx = 0 + for i, block_args in enumerate(self._blocks_args): + block_args = block_args._replace( + input_filters=EffUtils.round_filters(block_args.input_filters, + self._global_params), + output_filters=EffUtils.round_filters(block_args.output_filters, + self._global_params), + num_repeat=EffUtils.round_repeats(block_args.num_repeat, + self._global_params)) + self._blocks.append( + self.add_sublayer(f"{i}-0", MbConvBlock(block_args))) + _concerned_idx += 1 + if _concerned_idx in self._concerned_block_idxes: + self.out_channels.append(block_args.output_filters) + if block_args.num_repeat > 1: + block_args = block_args._replace( + input_filters=block_args.output_filters, stride=1) + for j in range(block_args.num_repeat - 1): + self._blocks.append( + self.add_sublayer(f'{i}-{j+1}', MbConvBlock(block_args))) + _concerned_idx += 1 + if _concerned_idx in self._concerned_block_idxes: + self.out_channels.append(block_args.output_filters) - self.swish = nn.Swish() + self._swish = nn.Swish() def forward(self, inputs): outs = [] - - x = self.swish(self.bn0(self.conv_stem(inputs))) - for idx, block in enumerate(self.blocks): - drop_connect_rate = self.global_params.drop_connect_rate + x = self._swish(self._bn0(self._conv_stem(inputs))) + for idx, block in enumerate(self._blocks): + drop_connect_rate = self._global_params.drop_connect_rate if drop_connect_rate: - drop_connect_rate *= float(idx) / len(self.blocks) + drop_connect_rate *= float(idx) / len(self._blocks) x = block(x, drop_connect_rate=drop_connect_rate) - if idx in self.concerned_block_idxes: + if idx in 
self._concerned_block_idxes: outs.append(x) return outs diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 749060a05..e8ab4b9f3 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -562,7 +562,8 @@ class PRENLabelDecode(BaseRecLabelDecode): return result_list def __call__(self, preds, label=None, *args, **kwargs): - preds = preds.numpy() + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() preds_idx = preds.argmax(axis=2) preds_prob = preds.max(axis=2) text = self.decode(preds_idx, preds_prob) diff --git a/tools/export_model.py b/tools/export_model.py index 8610df83e..0797bd726 100755 --- a/tools/export_model.py +++ b/tools/export_model.py @@ -77,7 +77,7 @@ def export_single_model(model, elif arch_config["algorithm"] == "PREN": other_shape = [ paddle.static.InputSpec( - shape=[None, 3, 64, 512], dtype="float32"), + shape=[None, 3, 64, 256], dtype="float32"), ] model = to_static(model, input_spec=other_shape) elif arch_config["model_type"] == "sr": diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index 176e2c68e..32f664362 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -100,6 +100,8 @@ class TextRecognizer(object): "use_space_char": args.use_space_char, "rm_symbol": True } + elif self.rec_algorithm == "PREN": + postprocess_params = {'name': 'PRENLabelDecode'} self.postprocess_op = build_post_process(postprocess_params) self.predictor, self.input_tensor, self.output_tensors, self.config = \ utility.create_predictor(args, 'rec', logger) @@ -384,7 +386,7 @@ class TextRecognizer(object): self.rec_image_shape) norm_img = norm_img[np.newaxis, :] norm_img_batch.append(norm_img) - elif self.rec_algorithm == "VisionLAN": + elif self.rec_algorithm in ["VisionLAN", "PREN"]: norm_img = self.resize_norm_img_vl(img_list[indices[ino]], self.rec_image_shape) norm_img = norm_img[np.newaxis, :]