Merge branch 'dygraph' of https://github.com/PaddlePaddle/PaddleOCR into rfl_branch

pull/7741/head
zhiminzhang0830 2022-10-09 10:07:46 +08:00
commit c25eec882a
19 changed files with 600 additions and 241 deletions


@@ -2715,6 +2715,9 @@ class MainWindow(QMainWindow):
         self._update_shape_color(shape)
         self.keyDialog.addLabelHistory(key_text)
+        # save changed shape
+        self.setDirty()

     def undoShapeEdit(self):
         self.canvas.restoreShape()


@@ -2,7 +2,7 @@
 # PPOCRLabelv2

-PPOCRLabel is a semi-automatic graphic annotation tool for OCR tasks, with a built-in PP-OCR model that automatically annotates and re-recognizes data. It is written in Python3 and PyQT5 and supports rectangular-box annotation and four-point annotation modes; the exported format can be used directly to train PaddleOCR detection and recognition models.
+PPOCRLabel is a semi-automatic graphic annotation tool for OCR tasks, with a built-in PP-OCR model that automatically annotates and re-recognizes data. It is written in Python3 and PyQT5 and supports rectangular-box annotation, table annotation, irregular-text annotation, and key-information annotation modes; the exported format can be used directly to train PaddleOCR detection and recognition models.

 | Regular annotation | Table annotation |
 | :-------------------------------------------------: | :--------------------------------------------: |


@@ -611,8 +611,8 @@ class Canvas(QWidget):
         if self.drawing() and not self.prevPoint.isNull() and not self.outOfPixmap(self.prevPoint):
             p.setPen(QColor(0, 0, 0))
-            p.drawLine(self.prevPoint.x(), 0, self.prevPoint.x(), self.pixmap.height())
-            p.drawLine(0, self.prevPoint.y(), self.pixmap.width(), self.prevPoint.y())
+            p.drawLine(int(self.prevPoint.x()), 0, int(self.prevPoint.x()), self.pixmap.height())
+            p.drawLine(0, int(self.prevPoint.y()), self.pixmap.width(), int(self.prevPoint.y()))

         self.setAutoFillBackground(True)
         if self.verified:

@@ -909,4 +909,4 @@ class Canvas(QWidget):
     def updateShapeIndex(self):
         for i in range(len(self.shapes)):
             self.shapes[i].idx = i
         self.update()


@@ -113,4 +113,4 @@ keyDialogTip=Enter object label
 keyChange=Change Box Key
 TableRecognition=Table Recognition
 cellreRecognition=Cell Re-Recognition
-exportJSON=Export Excel Label(PubTabNet)
+exportJSON=Export Table Label


@@ -113,4 +113,4 @@ keyDialogTip=请输入类型名称
 keyChange=更改Box关键字类别
 TableRecognition=表格识别
 cellreRecognition=单元格重识别
-exportJSON=导出表格JSON标注
+exportJSON=导出表格标注


@@ -41,16 +41,30 @@ python3 -m paddle.distributed.launch \
 ## Performance tests

-* Model training on a 2-machine, 8-card P40 setup with a 260k-image public recognition dataset (LSVT, RCTW, MTWI); the final time consumption is as follows.
+* Model training on a 2-machine, 8-card P40 setup; the accuracy, training time, and multi-machine speedup of different models are shown below.

-| Model | Config | Accuracy | 1x8 GPU time | 2x8 GPU time | Speedup |
-|------|-----|--------|--------|--------|-----|
-| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 67.0% | 2.50d | 1.67d | **1.5** |
+| Model | Config | Dataset | 1x8 GPU time / accuracy | 2x8 GPU time / accuracy | Speedup |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 260k Chinese dataset | 2.50d/66.7% | 1.67d/67.0% | **1.5** |

-* Model training on a 4-machine, 8-card V100 setup with the full dataset; the final time consumption is as follows.
+* Model training on a 3-machine, 8-card V100 setup; the accuracy, training time, and multi-machine speedup of different models are shown below.
+
+| Model | Config | Dataset | 1x8 GPU time / accuracy | 3x8 GPU time / accuracy | Speedup |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.8h/76.2% | 19.75h/74.77% | **2.52** |
+
+> Note: for 3x8-card training, the per-card batch size is kept the same as in single-machine 8-card training, and the learning rate is multiplied by 2 (with the default multiplier of 3, the accuracy is only 73.42%).

-| Model | Config | Accuracy | 1x8 GPU time | 4x8 GPU time | Speedup |
-|------|-----|--------|--------|--------|-----|
-| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | 74.0% | 10d | 2.84d | **3.5** |
+* Model training on a 4-machine, 8-card V100 setup; the accuracy, training time, and multi-machine speedup of different models are shown below.
+
+| Model | Config | Dataset | 1x8 GPU time / accuracy | 4x8 GPU time / accuracy | Speedup |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d/- | 2.84d/74.0% | **3.5** |
+
+* **Note**
+    * When training on a large number of GPU cards, accuracy drops slightly (about 1%); you can try adding warmup or moderately increasing the number of epochs to compensate for the loss, as sketched below.
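The warmup suggested in the note can be enabled as a config override on the launch command. A minimal sketch, assuming the referenced config exposes the usual `Optimizer.lr.warmup_epoch` and `Global.epoch_num` fields (the `--ips` value and the numbers are placeholders to tune for your run):

```shell
# illustrative values: extend warmup and epochs to offset multi-machine accuracy loss
python3 -m paddle.distributed.launch \
    --ips="xx.xx.xx.xx,xx.xx.xx.xx" \
    --gpus '0,1,2,3,4,5,6,7' \
    tools/train.py \
    -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml \
    -o Optimizer.lr.warmup_epoch=5 Global.epoch_num=800
```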


@@ -144,7 +144,7 @@ Predicts of ./doc/imgs_words/ch/word_4.jpg:['0', 0.9999982]
 **Note**: the recognition model of `PP-OCRv3` uses an input shape of `3,48,320`; if you use another recognition model, set the parameter `--rec_image_shape` according to that model. In addition, the recognition model of `PP-OCRv3` uses `SVTR_LCNet` as its default `rec_algorithm`; note the difference from the original `SVTR`.

-Taking ultra-lightweight Chinese OCR model inference as an example: when running prediction, specify the path of a single image or an image folder with the parameter `image_dir`, and specify the detection, direction classification, and recognition inference model paths with `det_model_dir`, `cls_model_dir`, and `rec_model_dir` respectively. The parameter `use_angle_cls` controls whether the direction classification model is enabled. `use_mp` indicates whether to use multiple processes, and `total_process_num` sets the number of processes when multiprocessing is used. Visualized recognition results are saved to the ./inference_results folder by default.
+Taking ultra-lightweight Chinese OCR model inference as an example: when running prediction, specify the path of a single image or an image folder (PDF files are also supported) with the parameter `image_dir`, and specify the detection, direction classification, and recognition inference model paths with `det_model_dir`, `cls_model_dir`, and `rec_model_dir` respectively. The parameter `use_angle_cls` controls whether the direction classification model is enabled. `use_mp` indicates whether to use multiple processes, and `total_process_num` sets the number of processes when multiprocessing is used. Visualized recognition results are saved to the ./inference_results folder by default.

 ```shell
 # use the direction classifier

@@ -153,8 +153,11 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --de
 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false

 # use multiple processes
 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6
+
+# use a PDF file; the `page_num` parameter controls how many leading pages are inferred (the default 0 means all pages)
+python3 tools/infer/predict_system.py --image_dir="./xxx.pdf" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --page_num=2
 ```

 After running the command, the visualized recognition result looks like this:

 ![](../imgs_results/system_res_00018069_v3.jpg)


@@ -75,6 +75,11 @@ cd /path/to/ppocr_img
 ......
 ```

+In addition, paddleocr also supports PDF input; the parameter `page_num` controls how many leading pages are inferred. The default 0 means all pages are inferred.
+
+```bash
+paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
+```
+
 - Detection only: set `--rec` to `false`

 ```bash

@@ -165,12 +170,14 @@ from paddleocr import PaddleOCR, draw_ocr
 ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory
 img_path = './imgs/11.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # show the result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]

@@ -196,6 +203,50 @@ im_show.save('result.jpg')
 <a name="3"></a>

+If the input is a PDF file, you can refer to the following code for visualization:
+
+```python
+from paddleocr import PaddleOCR, draw_ocr
+
+# PaddleOCR currently supports multiple languages, switched via the lang parameter,
+# e.g. `ch`, `en`, `fr`, `german`, `korean`, `japan`
+ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory
+img_path = './xxx.pdf'
+result = ocr.ocr(img_path, cls=True)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
+
+# show the result
+import fitz
+from PIL import Image
+import cv2
+import numpy as np
+imgs = []
+with fitz.open(img_path) as pdf:
+    for pg in range(0, pdf.pageCount):
+        page = pdf[pg]
+        mat = fitz.Matrix(2, 2)
+        pm = page.getPixmap(matrix=mat, alpha=False)
+        # if width or height > 2000 pixels, don't enlarge the image
+        if pm.width > 2000 or pm.height > 2000:
+            pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+        imgs.append(img)
+for idx in range(len(result)):
+    res = result[idx]
+    image = imgs[idx]
+    boxes = [line[0] for line in res]
+    txts = [line[1][0] for line in res]
+    scores = [line[1][1] for line in res]
+    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
+    im_show = Image.fromarray(im_show)
+    im_show.save('result_page_{}.jpg'.format(idx))
+```
+
 ## 3. Summary

 Through this section you should now have a good command of the PaddleOCR whl package and have obtained first results.


@@ -33,12 +33,14 @@ from paddleocr import PaddleOCR, draw_ocr
 ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs/11.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # show the result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]

@@ -71,12 +73,14 @@ from paddleocr import PaddleOCR, draw_ocr
 ocr = PaddleOCR() # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs/11.jpg'
 result = ocr.ocr(img_path, cls=False)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # show the result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]

@@ -109,8 +113,10 @@ from paddleocr import PaddleOCR
 ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg'
 result = ocr.ocr(img_path, det=False, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
 ```

 The result is a list; each item contains only the recognition text and its confidence.

@@ -127,12 +133,14 @@ from paddleocr import PaddleOCR, draw_ocr
 ocr = PaddleOCR() # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs/11.jpg'
 result = ocr.ocr(img_path, rec=False)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # show the result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf')
 im_show = Image.fromarray(im_show)

@@ -163,8 +171,10 @@ from paddleocr import PaddleOCR
 ocr = PaddleOCR() # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg'
 result = ocr.ocr(img_path, det=False)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
 ```

 The result is a list; each item contains only the recognition text and its confidence.

@@ -181,8 +191,10 @@ from paddleocr import PaddleOCR
 ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg'
 result = ocr.ocr(img_path, det=False, rec=False, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
 ```

 The result is a list; each item contains only the classification result and its confidence.
@@ -212,6 +224,11 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true
 ......
 ```

+In addition, paddleocr also supports PDF input; the parameter `page_num` controls how many leading pages are inferred. The default 0 means all pages are inferred.
+
+```bash
+paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
+```
+
 - Detection + recognition:

 ```bash

@@ -290,12 +307,14 @@ ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_m
                 use_angle_cls=True)
 img_path = 'PaddleOCR/doc/imgs/11.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # show the result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]

@@ -325,12 +344,14 @@ from paddleocr import PaddleOCR, draw_ocr, download_with_progressbar
 ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory
 img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # show the result
 from PIL import Image
+result = result[0]
 download_with_progressbar(img_path, 'tmp.jpg')
 image = Image.open('tmp.jpg').convert('RGB')
 boxes = [line[0] for line in result]

@@ -362,12 +383,14 @@ img_path = 'PaddleOCR/doc/imgs/11.jpg'
 img = cv2.imread(img_path)
 # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # uncomment this line if your own trained model supports grayscale images
 result = ocr.ocr(img, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # show the result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]
@@ -376,14 +399,65 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc
 im_show = Image.fromarray(im_show)
 im_show.save('result.jpg')
 ```

+## 5 PDF files as input
+
+- Command-line mode
+
+  The parameter `page_num` controls how many leading pages are inferred. The default 0 means all pages are inferred.
+
+```bash
+paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
+```
+
+- In code
+
+```python
+from paddleocr import PaddleOCR, draw_ocr
+
+# PaddleOCR currently supports multiple languages, switched via the lang parameter,
+# e.g. `ch`, `en`, `fr`, `german`, `korean`, `japan`
+ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory
+img_path = './xxx.pdf'
+result = ocr.ocr(img_path, cls=True)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
+
+# show the result
+import fitz
+from PIL import Image
+import cv2
+import numpy as np
+imgs = []
+with fitz.open(img_path) as pdf:
+    for pg in range(0, pdf.pageCount):
+        page = pdf[pg]
+        mat = fitz.Matrix(2, 2)
+        pm = page.getPixmap(matrix=mat, alpha=False)
+        # if width or height > 2000 pixels, don't enlarge the image
+        if pm.width > 2000 or pm.height > 2000:
+            pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+        imgs.append(img)
+for idx in range(len(result)):
+    res = result[idx]
+    image = imgs[idx]
+    boxes = [line[0] for line in res]
+    txts = [line[1][0] for line in res]
+    scores = [line[1][1] for line in res]
+    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
+    im_show = Image.fromarray(im_show)
+    im_show.save('result_page_{}.jpg'.format(idx))
+```
+
-## 5 Parameter description
+## 6 Parameter description

 | Field | Description | Default |
 |-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------|
 | use_gpu | whether to use GPU | TRUE |
 | gpu_mem | GPU memory size used for initialization | 8000M |
 | image_dir | the image or folder path for prediction when invoked from the command line | |
+| page_num | valid when the input is a pdf file; predict only the first page_num pages, all pages are predicted by default | 0 |
 | det_algorithm | type of detection algorithm to use | DB |
 | det_model_dir | folder of the detection inference model. Two ways to pass it: 1. None: automatically download the built-in model to `~/.paddleocr/det`; 2. the path of an inference model you converted yourself; the model path must contain the model and params files | None |
 | det_max_side_len | the maximum size of the long side of the image for detection; when the long side exceeds this value it is resized to this size and the short side is scaled proportionally | 960 |


@@ -40,17 +40,29 @@ python3 -m paddle.distributed.launch \
 ## Performance comparison

-* On two 8-card P40 machines, the final time consumption and speedup ratio for a public recognition dataset (LSVT, RCTW, MTWI) containing 260k images are as follows.
+* We conducted model training on 2x8 P40 GPUs. The accuracy, training time, and multi-machine acceleration ratio of different models are shown below.

-| Model | Config file | Recognition acc | single 8-card training time | two 8-card training time | Speedup ratio |
-|------|-----|--------|--------|--------|-----|
-| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 67.0% | 2.50d | 1.67d | **1.5** |
+| Model | Configuration | Dataset | 8 GPU training time / Accuracy | 2x8 GPU training time / Accuracy | Acceleration ratio |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 260k Chinese dataset | 2.50d/66.7% | 1.67d/67.0% | **1.5** |

-* On four 8-card V100 machines, the final time consumption and speedup ratio for the full dataset are as follows.
+* We conducted model training on 3x8 V100 GPUs. The accuracy, training time, and multi-machine acceleration ratio of different models are shown below.
+
+| Model | Configuration | Dataset | 8 GPU training time / Accuracy | 3x8 GPU training time / Accuracy | Acceleration ratio |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.8h/76.2% | 19.75h/74.77% | **2.52** |
+
+> Note: when training on 3x8 GPUs, the single-card batch size is unchanged compared with 1x8-GPU training, and the learning rate is multiplied by 2 (if it is instead multiplied by 3, the accuracy is only 73.42%).

-| Model | Config file | Recognition acc | single 8-card training time | four 8-card training time | Speedup ratio |
-|------|-----|--------|--------|--------|-----|
-| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | 74.0% | 10d | 2.84d | **3.5** |
+* We conducted model training on 4x8 V100 GPUs. The accuracy, training time, and multi-machine acceleration ratio of different models are shown below.
+
+| Model | Configuration | Dataset | 8 GPU training time / Accuracy | 4x8 GPU training time / Accuracy | Acceleration ratio |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d/- | 2.84d/74.0% | **3.5** |
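For reference, the learning-rate adjustment in the note above is typically applied as a launch-time override. A minimal sketch, assuming `SLANet.yml` exposes the standard `Optimizer.lr.learning_rate` field and, purely for illustration, a single-machine base learning rate of 0.001 (the `--ips` value is a placeholder):

```shell
# 3x8 cards: keep the per-card batch size, multiply the base learning rate by 2
python3 -m paddle.distributed.launch \
    --ips="xx.xx.xx.xx,xx.xx.xx.xx,xx.xx.xx.xx" \
    --gpus '0,1,2,3,4,5,6,7' \
    tools/train.py \
    -c configs/table/SLANet.yml \
    -o Optimizer.lr.learning_rate=0.002  # assumes a base LR of 0.001 (illustrative)
```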


@@ -144,16 +144,17 @@ After executing the command, the prediction results (classification angle and sc
 **Note**: The input shape used by the recognition model of `PP-OCRv3` is `3, 48, 320`. If you use other recognition models, you need to set the parameter `--rec_image_shape` according to the model. In addition, the `rec_algorithm` used by the recognition model of `PP-OCRv3` is `SVTR_LCNet` by default. Note the difference from the original `SVTR`.

-When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`; the parameter `det_model_dir` specifies the path of the detection inference model, `cls_model_dir` the path of the angle classification inference model, and `rec_model_dir` the path of the recognition inference model. The parameter `use_angle_cls` controls whether to enable the angle classification model. The parameter `use_mp` specifies whether to infer with multiple processes, and `total_process_num` specifies the number of processes when multi-process is used. The visualized recognition results are saved to the `./inference_results` folder by default.
+When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir` (a PDF file is also supported); the parameter `det_model_dir` specifies the path of the detection inference model, `cls_model_dir` the path of the angle classification inference model, and `rec_model_dir` the path of the recognition inference model. The parameter `use_angle_cls` controls whether to enable the angle classification model. The parameter `use_mp` specifies whether to infer with multiple processes, and `total_process_num` specifies the number of processes when multi-process is used. The visualized recognition results are saved to the `./inference_results` folder by default.

 ```shell
 # use direction classifier
 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true

 # do not use direction classifier
 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false

 # use multi-process
 python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6
+
+# use PDF files; you can infer the first few pages with the `page_num` parameter, the default is 0, which means infer all pages
+python3 tools/infer/predict_system.py --image_dir="./xxx.pdf" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --page_num=2
 ```


@@ -86,6 +86,12 @@ If you do not use the provided test image, you can replace the following `--imag
 ......
 ```

+PDF files are also supported; you can infer the first few pages with the `page_num` parameter. The default is 0, which means infer all pages.
+
+```bash
+paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
+```
+
 * Only detection: set `--rec` to `false`

 ```bash

@@ -176,12 +182,15 @@ from paddleocr import PaddleOCR,draw_ocr
 ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to download and load model into memory
 img_path = './imgs_en/img_12.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # draw result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]

@@ -206,6 +215,50 @@ Visualization of results
 <img src="../imgs_results/whl/12_det_rec.jpg" width="800">
 </div>

+If the input is a PDF file, you can refer to the following code for visualization:
+
+```python
+from paddleocr import PaddleOCR, draw_ocr
+
+# PaddleOCR supports Chinese, English, French, German, Korean and Japanese.
+# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan`
+# to switch the language model.
+ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory
+img_path = './xxx.pdf'
+result = ocr.ocr(img_path, cls=True)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
+
+# draw result
+import fitz
+from PIL import Image
+import cv2
+import numpy as np
+imgs = []
+with fitz.open(img_path) as pdf:
+    for pg in range(0, pdf.pageCount):
+        page = pdf[pg]
+        mat = fitz.Matrix(2, 2)
+        pm = page.getPixmap(matrix=mat, alpha=False)
+        # if width or height > 2000 pixels, don't enlarge the image
+        if pm.width > 2000 or pm.height > 2000:
+            pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+        imgs.append(img)
+for idx in range(len(result)):
+    res = result[idx]
+    image = imgs[idx]
+    boxes = [line[0] for line in res]
+    txts = [line[1][0] for line in res]
+    scores = [line[1][1] for line in res]
+    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
+    im_show = Image.fromarray(im_show)
+    im_show.save('result_page_{}.jpg'.format(idx))
+```
+
 <a name="3"></a>


@@ -25,12 +25,14 @@ from paddleocr import PaddleOCR,draw_ocr
 ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # draw result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]

@@ -60,11 +62,14 @@ from paddleocr import PaddleOCR,draw_ocr
 ocr = PaddleOCR(lang='en') # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
 result = ocr.ocr(img_path, cls=False)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # draw result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]

@@ -94,8 +99,10 @@ from paddleocr import PaddleOCR
 ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to load model into memory
 img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png'
 result = ocr.ocr(img_path, det=False, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
 ```

 Output will be a list; each item contains the recognition text and its confidence.

@@ -109,12 +116,14 @@ from paddleocr import PaddleOCR,draw_ocr
 ocr = PaddleOCR() # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
 result = ocr.ocr(img_path, rec=False)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # draw result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf')
 im_show = Image.fromarray(im_show)

@@ -141,8 +150,10 @@ from paddleocr import PaddleOCR
 ocr = PaddleOCR(lang='en') # need to run only once to load model into memory
 img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png'
 result = ocr.ocr(img_path, det=False, cls=False)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
 ```

 Output will be a list; each item contains the recognition text and its confidence.

@@ -156,8 +167,10 @@ from paddleocr import PaddleOCR
 ocr = PaddleOCR(use_angle_cls=True) # need to run only once to load model into memory
 img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png'
 result = ocr.ocr(img_path, det=False, rec=False, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
 ```

 Output will be a list; each item contains the classification result and its confidence.
@@ -185,6 +198,11 @@ Output will be a list, each item contains bounding box, text and recognition con
 ......
 ```

+PDF files are also supported; you can infer the first few pages with the `page_num` parameter. The default is 0, which means infer all pages.
+
+```bash
+paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
+```
+
 * detection and recognition

 ```bash
 paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --lang en

@@ -253,11 +271,14 @@ from paddleocr import PaddleOCR,draw_ocr
 ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True)
 img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # draw result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]

@@ -283,11 +304,14 @@ from paddleocr import PaddleOCR, draw_ocr
 ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory
 img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg'
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # show result
 from PIL import Image
+result = result[0]
 image = Image.open(img_path).convert('RGB')
 boxes = [line[0] for line in result]
 txts = [line[1][0] for line in result]

@@ -312,12 +336,14 @@ img_path = 'PaddleOCR/doc/imgs/11.jpg'
 img = cv2.imread(img_path)
 # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # if your own trained model supports grayscale images, you can uncomment this line
 result = ocr.ocr(img_path, cls=True)
-for line in result:
-    print(line)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)

 # show result
 from PIL import Image
+result = result[0]
 download_with_progressbar(img_path, 'tmp.jpg')
 image = Image.open('tmp.jpg').convert('RGB')
 boxes = [line[0] for line in result]
@@ -327,15 +353,66 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc
 im_show = Image.fromarray(im_show)
 im_show.save('result.jpg')
 ```

+## 5 PDF file
+
+- Use by command line
+
+  You can infer the first few pages with the `page_num` parameter; the default is 0, which means infer all pages.
+
+```bash
+paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2
+```
+
+- Use by code
+
+```python
+from paddleocr import PaddleOCR, draw_ocr
+
+# PaddleOCR supports Chinese, English, French, German, Korean and Japanese.
+# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan`
+# to switch the language model.
+ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory
+img_path = './xxx.pdf'
+result = ocr.ocr(img_path, cls=True)
+for idx in range(len(result)):
+    res = result[idx]
+    for line in res:
+        print(line)
+
+# draw result
+import fitz
+from PIL import Image
+import cv2
+import numpy as np
+imgs = []
+with fitz.open(img_path) as pdf:
+    for pg in range(0, pdf.pageCount):
+        page = pdf[pg]
+        mat = fitz.Matrix(2, 2)
+        pm = page.getPixmap(matrix=mat, alpha=False)
+        # if width or height > 2000 pixels, don't enlarge the image
+        if pm.width > 2000 or pm.height > 2000:
+            pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+        img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+        imgs.append(img)
+for idx in range(len(result)):
+    res = result[idx]
+    image = imgs[idx]
+    boxes = [line[0] for line in res]
+    txts = [line[1][0] for line in res]
+    scores = [line[1][1] for line in res]
+    im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf')
+    im_show = Image.fromarray(im_show)
+    im_show.save('result_page_{}.jpg'.format(idx))
+```
+
-## 5 Parameter Description
+## 6 Parameter Description

 | Parameter | Description | Default value |
 |-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------|
 | use_gpu | use GPU or not | TRUE |
 | gpu_mem | GPU memory size used for initialization | 8000M |
 | image_dir | the image path or folder path for prediction when used by the command line | |
+| page_num | valid when the input type is a pdf file; predict only the first page_num pages, all pages are predicted by default | 0 |
 | det_algorithm | type of detection algorithm selected | DB |
 | det_model_dir | the text detection inference model folder. There are two ways to transfer parameters: 1. None: automatically download the built-in model to `~/.paddleocr/det`; 2. the path of the inference model converted by yourself; the model and params files must be included in the model path | None |
 | det_max_side_len | the maximum size of the long side of the image. When the long side exceeds this value, the long side will be resized to this size, and the short side will be scaled proportionally | 960 |


@@ -47,7 +47,7 @@ __all__ = [
 ]

 SUPPORT_DET_MODEL = ['DB']
-VERSION = '2.6.0.1'
+VERSION = '2.6.0.2'
 SUPPORT_REC_MODEL = ['CRNN', 'SVTR_LCNet']
 BASE_DIR = os.path.expanduser("~/.paddleocr/")

@@ -428,8 +428,8 @@ def check_img(img):
         download_with_progressbar(img, 'tmp.jpg')
         img = 'tmp.jpg'
     image_file = img
-    img, flag, _ = check_and_read(image_file)
-    if not flag:
+    img, flag_gif, flag_pdf = check_and_read(image_file)
+    if not flag_gif and not flag_pdf:
         with open(image_file, 'rb') as f:
             img = img_decode(f.read())
         if img is None:

@@ -500,6 +500,7 @@ class PaddleOCR(predict_system.TextSystem):
         logger.debug(params)
         # init det_model and rec_model
         super().__init__(params)
+        self.page_num = params.page_num

     def ocr(self, img, det=True, rec=True, cls=True):
         """

@@ -520,24 +521,43 @@ class PaddleOCR(predict_system.TextSystem):
             )

         img = check_img(img)
+        # for infer pdf file
+        if isinstance(img, list):
+            if self.page_num > len(img) or self.page_num == 0:
+                self.page_num = len(img)
+            imgs = img[:self.page_num]
+        else:
+            imgs = [img]
         if det and rec:
-            dt_boxes, rec_res, _ = self.__call__(img, cls)
-            return [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)]
+            ocr_res = []
+            for idx, img in enumerate(imgs):
+                dt_boxes, rec_res, _ = self.__call__(img, cls)
+                tmp_res = [[box.tolist(), res]
+                           for box, res in zip(dt_boxes, rec_res)]
+                ocr_res.append(tmp_res)
+            return ocr_res
         elif det and not rec:
-            dt_boxes, elapse = self.text_detector(img)
-            if dt_boxes is None:
-                return None
-            return [box.tolist() for box in dt_boxes]
+            ocr_res = []
+            for idx, img in enumerate(imgs):
+                dt_boxes, elapse = self.text_detector(img)
+                tmp_res = [box.tolist() for box in dt_boxes]
+                ocr_res.append(tmp_res)
+            return ocr_res
         else:
-            if not isinstance(img, list):
-                img = [img]
-            if self.use_angle_cls and cls:
-                img, cls_res, elapse = self.text_classifier(img)
-                if not rec:
-                    return cls_res
-            rec_res, elapse = self.text_recognizer(img)
-            return rec_res
+            ocr_res = []
+            cls_res = []
+            for idx, img in enumerate(imgs):
+                if not isinstance(img, list):
+                    img = [img]
+                if self.use_angle_cls and cls:
+                    img, cls_res_tmp, elapse = self.text_classifier(img)
+                    if not rec:
+                        cls_res.append(cls_res_tmp)
+                rec_res, elapse = self.text_recognizer(img)
+                ocr_res.append(rec_res)
+            if not rec:
+                return cls_res
+            return ocr_res
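For reference, a minimal sketch of how the reworked `ocr()` is consumed after this change (placeholder input path; det+rec default path): the return value is now a list with one entry per inferred page or image, each entry holding `[box, (text, score)]` pairs.

```python
from paddleocr import PaddleOCR

ocr = PaddleOCR(use_angle_cls=True, page_num=2)  # page_num only applies to PDF inputs
result = ocr.ocr('./xxx.pdf', cls=True)          # placeholder path
for page_idx, page_res in enumerate(result):     # one entry per inferred page
    for box, (text, score) in page_res:          # det+rec items are [box, (text, score)]
        print(page_idx, text, score)
```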
 class PPStructure(StructureSystem):

@@ -633,8 +653,10 @@ def main():
             rec=args.rec,
             cls=args.use_angle_cls)
         if result is not None:
-            for line in result:
-                logger.info(line)
+            for idx in range(len(result)):
+                res = result[idx]
+                for line in res:
+                    logger.info(line)
     elif args.type == 'structure':
         img, flag_gif, flag_pdf = check_and_read(img_path)
         if not flag_gif and not flag_pdf:

@@ -682,7 +704,7 @@ def main():
                     "error in layout recovery image:{}, err msg: {}".format(
                         img_name, ex))
                 continue
             for item in all_res:
                 item.pop('img')
                 item.pop('res')


@@ -21,124 +21,165 @@ from __future__ import division
 from __future__ import print_function

 import math
-from collections import namedtuple
+import re
+import collections

 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

 __all__ = ['EfficientNetb3']

+GlobalParams = collections.namedtuple('GlobalParams', [
+    'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 'num_classes',
+    'width_coefficient', 'depth_coefficient', 'depth_divisor', 'min_depth',
+    'drop_connect_rate', 'image_size'
+])
+
+BlockArgs = collections.namedtuple('BlockArgs', [
+    'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
+    'expand_ratio', 'id_skip', 'stride', 'se_ratio'
+])

-class EffB3Params:
-    @staticmethod
-    def get_global_params():
-        """
-        The following are efficientnetb3's arch hyperparameters; to fit the
-        scene text recognition task, the resolution (image_size) here is
-        changed from 300 to 64.
-        """
-        GlobalParams = namedtuple('GlobalParams', [
-            'drop_connect_rate', 'width_coefficient', 'depth_coefficient',
-            'depth_divisor', 'image_size'
-        ])
-        global_params = GlobalParams(
-            drop_connect_rate=0.3,
-            width_coefficient=1.2,
-            depth_coefficient=1.4,
-            depth_divisor=8,
-            image_size=64)
-        return global_params
-
-    @staticmethod
-    def get_block_params():
-        BlockParams = namedtuple('BlockParams', [
-            'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
-            'expand_ratio', 'id_skip', 'se_ratio', 'stride'
-        ])
-        block_params = [
-            BlockParams(3, 1, 32, 16, 1, True, 0.25, 1),
-            BlockParams(3, 2, 16, 24, 6, True, 0.25, 2),
-            BlockParams(5, 2, 24, 40, 6, True, 0.25, 2),
-            BlockParams(3, 3, 40, 80, 6, True, 0.25, 2),
-            BlockParams(5, 3, 80, 112, 6, True, 0.25, 1),
-            BlockParams(5, 4, 112, 192, 6, True, 0.25, 2),
-            BlockParams(3, 1, 192, 320, 6, True, 0.25, 1)
-        ]
-        return block_params
+class BlockDecoder:
+    @staticmethod
+    def _decode_block_string(block_string):
+        assert isinstance(block_string, str)
+
+        ops = block_string.split('_')
+        options = {}
+        for op in ops:
+            splits = re.split(r'(\d.*)', op)
+            if len(splits) >= 2:
+                key, value = splits[:2]
+                options[key] = value
+
+        assert (('s' in options and len(options['s']) == 1) or
+                (len(options['s']) == 2 and options['s'][0] == options['s'][1]))
+
+        return BlockArgs(
+            kernel_size=int(options['k']),
+            num_repeat=int(options['r']),
+            input_filters=int(options['i']),
+            output_filters=int(options['o']),
+            expand_ratio=int(options['e']),
+            id_skip=('noskip' not in block_string),
+            se_ratio=float(options['se']) if 'se' in options else None,
+            stride=[int(options['s'][0])])
+
+    @staticmethod
+    def decode(string_list):
+        assert isinstance(string_list, list)
+        blocks_args = []
+        for block_string in string_list:
+            blocks_args.append(BlockDecoder._decode_block_string(block_string))
+        return blocks_args
+
+
+def efficientnet(width_coefficient=None,
+                 depth_coefficient=None,
+                 dropout_rate=0.2,
+                 drop_connect_rate=0.2,
+                 image_size=None,
+                 num_classes=1000):
+    blocks_args = [
+        'r1_k3_s11_e1_i32_o16_se0.25',
+        'r2_k3_s22_e6_i16_o24_se0.25',
+        'r2_k5_s22_e6_i24_o40_se0.25',
+        'r3_k3_s22_e6_i40_o80_se0.25',
+        'r3_k5_s11_e6_i80_o112_se0.25',
+        'r4_k5_s22_e6_i112_o192_se0.25',
+        'r1_k3_s11_e6_i192_o320_se0.25',
+    ]
+    blocks_args = BlockDecoder.decode(blocks_args)
+    global_params = GlobalParams(
+        batch_norm_momentum=0.99,
+        batch_norm_epsilon=1e-3,
+        dropout_rate=dropout_rate,
+        drop_connect_rate=drop_connect_rate,
+        num_classes=num_classes,
+        width_coefficient=width_coefficient,
+        depth_coefficient=depth_coefficient,
+        depth_divisor=8,
+        min_depth=None,
+        image_size=image_size, )
+    return blocks_args, global_params
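For reference, a block string such as `'r2_k3_s22_e6_i16_o24_se0.25'` packs the MBConv hyperparameters into `_`-separated tokens: repeats (`r`), kernel size (`k`), stride (`s`), expand ratio (`e`), input/output filters (`i`/`o`), and squeeze-excitation ratio (`se`). A standalone sketch of the same parsing logic, runnable on its own:

```python
import collections
import re

BlockArgs = collections.namedtuple('BlockArgs', [
    'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
    'expand_ratio', 'id_skip', 'stride', 'se_ratio'
])

def decode_block_string(block_string):
    # split e.g. 'k3' into ('k', '3') and collect the options
    options = {}
    for op in block_string.split('_'):
        splits = re.split(r'(\d.*)', op)
        if len(splits) >= 2:
            key, value = splits[:2]
            options[key] = value
    return BlockArgs(
        kernel_size=int(options['k']),
        num_repeat=int(options['r']),
        input_filters=int(options['i']),
        output_filters=int(options['o']),
        expand_ratio=int(options['e']),
        id_skip=('noskip' not in block_string),
        se_ratio=float(options['se']) if 'se' in options else None,
        stride=[int(options['s'][0])])

print(decode_block_string('r2_k3_s22_e6_i16_o24_se0.25'))
# BlockArgs(kernel_size=3, num_repeat=2, input_filters=16, output_filters=24,
#           expand_ratio=6, id_skip=True, stride=[2], se_ratio=0.25)
```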
 class EffUtils:
     @staticmethod
     def round_filters(filters, global_params):
         """ Calculate and round number of filters based on depth multiplier. """
         multiplier = global_params.width_coefficient
         if not multiplier:
             return filters
         divisor = global_params.depth_divisor
+        min_depth = global_params.min_depth
         filters *= multiplier
-        new_filters = int(filters + divisor / 2) // divisor * divisor
+        min_depth = min_depth or divisor
+        new_filters = max(min_depth,
+                          int(filters + divisor / 2) // divisor * divisor)
         if new_filters < 0.9 * filters:
             new_filters += divisor
         return int(new_filters)

     @staticmethod
     def round_repeats(repeats, global_params):
         """ Round number of repeats based on depth multiplier. """
         multiplier = global_params.depth_coefficient
         if not multiplier:
             return repeats
         return int(math.ceil(multiplier * repeats))
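As a quick numeric check of the rounding above with EfficientNet-B3's width coefficient of 1.2 and divisor 8 (a standalone sketch mirroring the code):

```python
def round_filters(filters, multiplier=1.2, divisor=8):
    # scale by the width multiplier, round to the nearest multiple of divisor,
    # and never round down by more than 10%
    filters *= multiplier
    new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor)
    if new_filters < 0.9 * filters:
        new_filters += divisor
    return int(new_filters)

print(round_filters(32))   # 40  -> the stem width used by the backbone
print(round_filters(320))  # 384 -> the last block's output filters
```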
-class ConvBlock(nn.Layer):
-    def __init__(self, block_params):
-        super(ConvBlock, self).__init__()
-        self.block_args = block_params
-        self.has_se = (self.block_args.se_ratio is not None) and \
-            (0 < self.block_args.se_ratio <= 1)
-        self.id_skip = block_params.id_skip
+class MbConvBlock(nn.Layer):
+    def __init__(self, block_args):
+        super(MbConvBlock, self).__init__()
+        self._block_args = block_args
+        self.has_se = (self._block_args.se_ratio is not None) and \
+            (0 < self._block_args.se_ratio <= 1)
+        self.id_skip = block_args.id_skip

         # expansion phase
-        self.input_filters = self.block_args.input_filters
-        output_filters = \
-            self.block_args.input_filters * self.block_args.expand_ratio
-        if self.block_args.expand_ratio != 1:
-            self.expand_conv = nn.Conv2D(
-                self.input_filters, output_filters, 1, bias_attr=False)
-            self.bn0 = nn.BatchNorm(output_filters)
+        self.inp = self._block_args.input_filters
+        oup = self._block_args.input_filters * self._block_args.expand_ratio
+        if self._block_args.expand_ratio != 1:
+            self._expand_conv = nn.Conv2D(self.inp, oup, 1, bias_attr=False)
+            self._bn0 = nn.BatchNorm(oup)

         # depthwise conv phase
-        k = self.block_args.kernel_size
-        s = self.block_args.stride
-        self.depthwise_conv = nn.Conv2D(
-            output_filters,
-            output_filters,
-            groups=output_filters,
+        k = self._block_args.kernel_size
+        s = self._block_args.stride
+        if isinstance(s, list):
+            s = s[0]
+        self._depthwise_conv = nn.Conv2D(
+            oup,
+            oup,
+            groups=oup,
             kernel_size=k,
             stride=s,
             padding='same',
             bias_attr=False)
-        self.bn1 = nn.BatchNorm(output_filters)
+        self._bn1 = nn.BatchNorm(oup)

         # squeeze and excitation layer, if desired
         if self.has_se:
             num_squeezed_channels = max(1,
-                                        int(self.block_args.input_filters *
-                                            self.block_args.se_ratio))
-            self.se_reduce = nn.Conv2D(output_filters, num_squeezed_channels, 1)
-            self.se_expand = nn.Conv2D(num_squeezed_channels, output_filters, 1)
+                                        int(self._block_args.input_filters *
+                                            self._block_args.se_ratio))
+            self._se_reduce = nn.Conv2D(oup, num_squeezed_channels, 1)
+            self._se_expand = nn.Conv2D(num_squeezed_channels, oup, 1)

-        # output phase
-        self.final_oup = self.block_args.output_filters
-        self.project_conv = nn.Conv2D(
-            output_filters, self.final_oup, 1, bias_attr=False)
-        self.bn2 = nn.BatchNorm(self.final_oup)
-        self.swish = nn.Swish()
+        # output phase and some util class
+        self.final_oup = self._block_args.output_filters
+        self._project_conv = nn.Conv2D(oup, self.final_oup, 1, bias_attr=False)
+        self._bn2 = nn.BatchNorm(self.final_oup)
+        self._swish = nn.Swish()

-    def drop_connect(self, inputs, p, training):
+    def _drop_connect(self, inputs, p, training):
         if not training:
             return inputs
         batch_size = inputs.shape[0]
         keep_prob = 1 - p
         random_tensor = keep_prob
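The hunk cuts off inside `_drop_connect`; for reference, a minimal sketch of the standard drop-connect (stochastic depth) computation that such a body completes, written with plain paddle ops:

```python
import paddle

def drop_connect(inputs, p, training):
    # randomly zero whole samples in the batch and rescale the survivors
    if not training:
        return inputs
    keep_prob = 1 - p
    random_tensor = keep_prob + paddle.rand([inputs.shape[0], 1, 1, 1], dtype=inputs.dtype)
    binary_tensor = paddle.floor(random_tensor)  # 1 with prob keep_prob, else 0
    return inputs / keep_prob * binary_tensor
```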
@@ -151,22 +192,23 @@ class ConvBlock(nn.Layer):
     def forward(self, inputs, drop_connect_rate=None):
         # expansion and depthwise conv
         x = inputs
-        if self.block_args.expand_ratio != 1:
-            x = self.swish(self.bn0(self.expand_conv(inputs)))
-        x = self.swish(self.bn1(self.depthwise_conv(x)))
+        if self._block_args.expand_ratio != 1:
+            x = self._swish(self._bn0(self._expand_conv(inputs)))
+        x = self._swish(self._bn1(self._depthwise_conv(x)))

         # squeeze and excitation
         if self.has_se:
             x_squeezed = F.adaptive_avg_pool2d(x, 1)
-            x_squeezed = self.se_expand(self.swish(self.se_reduce(x_squeezed)))
+            x_squeezed = self._se_expand(
+                self._swish(self._se_reduce(x_squeezed)))
             x = F.sigmoid(x_squeezed) * x
-        x = self.bn2(self.project_conv(x))
+        x = self._bn2(self._project_conv(x))

         # skip connection and drop connect
-        if self.id_skip and self.block_args.stride == 1 and \
-                self.input_filters == self.final_oup:
+        if self.id_skip and self._block_args.stride == 1 and \
+                self.inp == self.final_oup:
             if drop_connect_rate:
-                x = self.drop_connect(
+                x = self._drop_connect(
                     x, p=drop_connect_rate, training=self.training)
             x = x + inputs
         return x
@ -175,54 +217,63 @@ class ConvBlock(nn.Layer):
 class EfficientNetb3_PREN(nn.Layer):
     def __init__(self, in_channels):
         super(EfficientNetb3_PREN, self).__init__()
-        self.blocks_params = EffB3Params.get_block_params()
-        self.global_params = EffB3Params.get_global_params()
+        """
+        The following are EfficientNet-B3's hyperparameters; they denote
+        the network's width, depth, resolution and dropout coefficients,
+        respectively. To fit the text recognition task, the resolution
+        here is changed from 300 to 64.
+        """
+        w, d, s, p = 1.2, 1.4, 64, 0.3
+        self._blocks_args, self._global_params = efficientnet(
+            width_coefficient=w,
+            depth_coefficient=d,
+            dropout_rate=p,
+            image_size=s)
         self.out_channels = []
         # stem
-        stem_channels = EffUtils.round_filters(32, self.global_params)
-        self.conv_stem = nn.Conv2D(
-            in_channels, stem_channels, 3, 2, padding='same', bias_attr=False)
-        self.bn0 = nn.BatchNorm(stem_channels)
+        out_channels = EffUtils.round_filters(32, self._global_params)
+        self._conv_stem = nn.Conv2D(
+            in_channels, out_channels, 3, 2, padding='same', bias_attr=False)
+        self._bn0 = nn.BatchNorm(out_channels)

-        self.blocks = []
+        # build blocks
+        self._blocks = []
         # to extract three feature maps for fpn based on efficientnetb3 backbone
-        self.concerned_block_idxes = [7, 17, 25]
-        concerned_idx = 0
-        for i, block_params in enumerate(self.blocks_params):
-            block_params = block_params._replace(
-                input_filters=EffUtils.round_filters(block_params.input_filters,
-                                                     self.global_params),
-                output_filters=EffUtils.round_filters(
-                    block_params.output_filters, self.global_params),
-                num_repeat=EffUtils.round_repeats(block_params.num_repeat,
-                                                  self.global_params))
-            self.blocks.append(
-                self.add_sublayer("{}-0".format(i), ConvBlock(block_params)))
-            concerned_idx += 1
-            if concerned_idx in self.concerned_block_idxes:
-                self.out_channels.append(block_params.output_filters)
-            if block_params.num_repeat > 1:
-                block_params = block_params._replace(
-                    input_filters=block_params.output_filters, stride=1)
-            for j in range(block_params.num_repeat - 1):
-                self.blocks.append(
-                    self.add_sublayer('{}-{}'.format(i, j + 1),
-                                      ConvBlock(block_params)))
-                concerned_idx += 1
-                if concerned_idx in self.concerned_block_idxes:
-                    self.out_channels.append(block_params.output_filters)
+        self._concerned_block_idxes = [7, 17, 25]
+        _concerned_idx = 0
+        for i, block_args in enumerate(self._blocks_args):
+            block_args = block_args._replace(
+                input_filters=EffUtils.round_filters(block_args.input_filters,
+                                                     self._global_params),
+                output_filters=EffUtils.round_filters(block_args.output_filters,
+                                                      self._global_params),
+                num_repeat=EffUtils.round_repeats(block_args.num_repeat,
+                                                  self._global_params))
+            self._blocks.append(
+                self.add_sublayer(f"{i}-0", MbConvBlock(block_args)))
+            _concerned_idx += 1
+            if _concerned_idx in self._concerned_block_idxes:
+                self.out_channels.append(block_args.output_filters)
+            if block_args.num_repeat > 1:
+                block_args = block_args._replace(
+                    input_filters=block_args.output_filters, stride=1)
+            for j in range(block_args.num_repeat - 1):
+                self._blocks.append(
+                    self.add_sublayer(f'{i}-{j+1}', MbConvBlock(block_args)))
+                _concerned_idx += 1
+                if _concerned_idx in self._concerned_block_idxes:
+                    self.out_channels.append(block_args.output_filters)

-        self.swish = nn.Swish()
+        self._swish = nn.Swish()

     def forward(self, inputs):
         outs = []
-        x = self.swish(self.bn0(self.conv_stem(inputs)))
-        for idx, block in enumerate(self.blocks):
-            drop_connect_rate = self.global_params.drop_connect_rate
+        x = self._swish(self._bn0(self._conv_stem(inputs)))
+        for idx, block in enumerate(self._blocks):
+            drop_connect_rate = self._global_params.drop_connect_rate
             if drop_connect_rate:
-                drop_connect_rate *= float(idx) / len(self.blocks)
+                drop_connect_rate *= float(idx) / len(self._blocks)
             x = block(x, drop_connect_rate=drop_connect_rate)
-            if idx in self.concerned_block_idxes:
+            if idx in self._concerned_block_idxes:
                 outs.append(x)
         return outs
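To make the compound-scaling docstring above concrete: the helpers below mirror what EffUtils.round_filters and EffUtils.round_repeats typically compute in EfficientNet implementations (an assumption; the repository's utilities may differ in detail), showing how w=1.2 widens channel counts and d=1.4 deepens stages:

import math

def round_filters(filters, width_coefficient, divisor=8):
    # scale channel count by the width coefficient, then snap to a
    # multiple of `divisor` without shrinking by more than 10%
    filters *= width_coefficient
    new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor)
    if new_filters < 0.9 * filters:
        new_filters += divisor
    return int(new_filters)

def round_repeats(repeats, depth_coefficient):
    # scale block repeat count by the depth coefficient, rounding up
    return int(math.ceil(depth_coefficient * repeats))

print(round_filters(32, 1.2))  # stem: 32 -> 40 channels at w=1.2
print(round_repeats(2, 1.4))   # a 2-repeat stage becomes 3 at d=1.4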

View File

@@ -82,7 +82,8 @@ class TableAttentionHead(nn.Layer):
         batch_size = fea.shape[0]
         hidden = paddle.zeros((batch_size, self.hidden_size))
-        output_hiddens = paddle.zeros((batch_size, self.max_text_length + 1, self.hidden_size))
+        output_hiddens = paddle.zeros(
+            (batch_size, self.max_text_length + 1, self.hidden_size))
         if self.training and targets is not None:
             structure = targets[0]
             for i in range(self.max_text_length + 1):
@@ -91,19 +92,13 @@ class TableAttentionHead(nn.Layer):
                 (outputs, hidden), alpha = self.structure_attention_cell(
                     hidden, fea, elem_onehots)
                 output_hiddens[:, i, :] = outputs
-            # output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
-            output = paddle.concat(output_hiddens, axis=1)
-            structure_probs = self.structure_generator(output)
-            if self.loc_type == 1:
-                loc_preds = self.loc_generator(output)
-                loc_preds = F.sigmoid(loc_preds)
-            else:
-                loc_fea = fea.transpose([0, 2, 1])
-                loc_fea = self.loc_fea_trans(loc_fea)
-                loc_fea = loc_fea.transpose([0, 2, 1])
-                loc_concat = paddle.concat([output, loc_fea], axis=2)
-                loc_preds = self.loc_generator(loc_concat)
-                loc_preds = F.sigmoid(loc_preds)
+            structure_probs = self.structure_generator(output_hiddens)
+            loc_fea = fea.transpose([0, 2, 1])
+            loc_fea = self.loc_fea_trans(loc_fea)
+            loc_fea = loc_fea.transpose([0, 2, 1])
+            loc_concat = paddle.concat([output_hiddens, loc_fea], axis=2)
+            loc_preds = self.loc_generator(loc_concat)
+            loc_preds = F.sigmoid(loc_preds)
         else:
             temp_elem = paddle.zeros(shape=[batch_size], dtype="int32")
             structure_probs = None
@@ -118,17 +113,15 @@ class TableAttentionHead(nn.Layer):
                 (outputs, hidden), alpha = self.structure_attention_cell(
                     hidden, fea, elem_onehots)
                 output_hiddens[:, i, :] = outputs
-                # output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
                 structure_probs_step = self.structure_generator(outputs)
                 temp_elem = structure_probs_step.argmax(axis=1, dtype="int32")

-            output = output_hiddens
-            structure_probs = self.structure_generator(output)
+            structure_probs = self.structure_generator(output_hiddens)
             structure_probs = F.softmax(structure_probs)
             loc_fea = fea.transpose([0, 2, 1])
             loc_fea = self.loc_fea_trans(loc_fea)
             loc_fea = loc_fea.transpose([0, 2, 1])
-            loc_concat = paddle.concat([output, loc_fea], axis=2)
+            loc_concat = paddle.concat([output_hiddens, loc_fea], axis=2)
             loc_preds = self.loc_generator(loc_concat)
             loc_preds = F.sigmoid(loc_preds)
         return {'structure_probs': structure_probs, 'loc_preds': loc_preds}
@@ -203,8 +196,10 @@ class SLAHead(nn.Layer):
         fea = fea.transpose([0, 2, 1])  # (NTC)(batch, width, channels)
         hidden = paddle.zeros((batch_size, self.hidden_size))
-        structure_preds = paddle.zeros((batch_size, self.max_text_length + 1, self.num_embeddings))
-        loc_preds = paddle.zeros((batch_size, self.max_text_length + 1, self.loc_reg_num))
+        structure_preds = paddle.zeros(
+            (batch_size, self.max_text_length + 1, self.num_embeddings))
+        loc_preds = paddle.zeros(
+            (batch_size, self.max_text_length + 1, self.loc_reg_num))
         structure_preds.stop_gradient = True
         loc_preds.stop_gradient = True
         if self.training and targets is not None:
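Both head rewrites above share one pattern: instead of appending per-step outputs to a Python list and concatenating at the end, they preallocate the output tensor once and write each decode step into a slice. A minimal sketch of the pattern, with illustrative shapes:

import paddle

batch_size, max_len, hidden_size = 2, 5, 8
output_hiddens = paddle.zeros((batch_size, max_len, hidden_size))
for i in range(max_len):
    # stand-in for one attention/GRU decode step
    step_out = paddle.rand((batch_size, hidden_size))
    output_hiddens[:, i, :] = step_out  # slice assignment, no concat needed
# output_hiddens now feeds the structure and location branches directly

Writing into a fixed buffer avoids the repeated list-append plus concat and keeps the output shape known up front, which is friendlier to static-graph export.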

View File

@@ -650,7 +650,8 @@ class PRENLabelDecode(BaseRecLabelDecode):
         return result_list

     def __call__(self, preds, label=None, *args, **kwargs):
-        preds = preds.numpy()
+        if isinstance(preds, paddle.Tensor):
+            preds = preds.numpy()
         preds_idx = preds.argmax(axis=2)
         preds_prob = preds.max(axis=2)
         text = self.decode(preds_idx, preds_prob)
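The isinstance guard above lets PRENLabelDecode accept both a paddle.Tensor (dygraph training and eval) and a plain numpy array (the Paddle Inference predictor's output). A quick sketch of both call paths, with random data standing in for real predictions:

import numpy as np
import paddle

preds_np = np.random.rand(1, 5, 10).astype("float32")  # (batch, seq, classes)
for preds in (preds_np, paddle.to_tensor(preds_np)):
    if isinstance(preds, paddle.Tensor):
        preds = preds.numpy()
    preds_idx = preds.argmax(axis=2)   # best class index per step
    preds_prob = preds.max(axis=2)     # its probability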

View File

@@ -77,7 +77,7 @@ def export_single_model(model,
     elif arch_config["algorithm"] == "PREN":
         other_shape = [
             paddle.static.InputSpec(
-                shape=[None, 3, 64, 512], dtype="float32"),
+                shape=[None, 3, 64, 256], dtype="float32"),
         ]
         model = to_static(model, input_spec=other_shape)
     elif arch_config["model_type"] == "sr":

View File

@@ -106,6 +106,8 @@ class TextRecognizer(object):
                 "character_dict_path": None,
                 "use_space_char": args.use_space_char
             }
+        elif self.rec_algorithm == "PREN":
+            postprocess_params = {'name': 'PRENLabelDecode'}
         self.postprocess_op = build_post_process(postprocess_params)
         self.predictor, self.input_tensor, self.output_tensors, self.config = \
             utility.create_predictor(args, 'rec', logger)
@@ -400,7 +402,7 @@ class TextRecognizer(object):
                                                  self.rec_image_shape)
                 norm_img = norm_img[np.newaxis, :]
                 norm_img_batch.append(norm_img)
-            elif self.rec_algorithm == "VisionLAN":
+            elif self.rec_algorithm in ["VisionLAN", "PREN"]:
                 norm_img = self.resize_norm_img_vl(img_list[indices[ino]],
                                                    self.rec_image_shape)
                 norm_img = norm_img[np.newaxis, :]
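Taken together, the predict_rec.py hunks wire PREN into the standalone inference pipeline: the postprocess config selects PRENLabelDecode, and preprocessing reuses VisionLAN's pad-free resize to the fixed input shape. The sketch below is an assumption about what resize_norm_img_vl roughly does, for orientation only:

import cv2
import numpy as np

def resize_norm_img_vl(img, image_shape=(3, 64, 256)):
    # resize straight to the target size (no aspect-ratio padding),
    # flip BGR to RGB, and scale pixel values to [0, 1]
    imgC, imgH, imgW = image_shape
    img = img[:, :, ::-1]
    resized = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
    return resized.astype("float32").transpose((2, 0, 1)) / 255.0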