pdf to markdown document (#13942)
parent
fa3f7dbc72
commit
8728b47046
|
@ -85,6 +85,20 @@ Recovery by using OCR:
|
||||||
paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en'
|
paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### 2.1.7 layout recovery(PDF to Markdown)
|
||||||
|
|
||||||
|
Without using the LaTeXOCR model for formula recognition:
|
||||||
|
|
||||||
|
```bash linenums="1"
|
||||||
|
paddleocr --image_dir=ppstructure/docs/recovery/UnrealText.pdf --type=structure --recovery=true --recovery_to_markdown=true --lang='en'
|
||||||
|
```
|
||||||
|
|
||||||
|
Using the LaTeXOCR model for formula recognition (the Chinese layout model must be used, i.e. `--lang='ch'`):
|
||||||
|
|
||||||
|
```bash linenums="1"
|
||||||
|
paddleocr --image_dir=ppstructure/docs/recovery/UnrealText.pdf --type=structure --recovery=true --formula=true --recovery_to_markdown=true --lang='ch'
|
||||||
|
```
|
||||||
|
|
||||||
### 2.2 Use by python script
|
### 2.2 Use by python script
|
||||||
|
|
||||||
#### 2.2.1 image orientation + layout analysis + table recognition
|
#### 2.2.1 image orientation + layout analysis + table recognition
|
||||||
|
@ -271,6 +285,35 @@ res = sorted_layout_boxes(result, w)
|
||||||
convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0])
|
convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### 2.2.7 layout recovery(PDF to Markdown)
|
||||||
|
|
||||||
|
```python linenums="1"
|
||||||
|
import os
|
||||||
|
import cv2
|
||||||
|
from paddleocr import PPStructure,save_structure_res
|
||||||
|
from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes
|
||||||
|
from paddleocr.ppstructure.recovery.recovery_to_markdown import convert_info_markdown
|
||||||
|
|
||||||
|
# Chinese image
|
||||||
|
table_engine = PPStructure(recovery=True)
|
||||||
|
# English image
|
||||||
|
# table_engine = PPStructure(recovery=True, lang='en')
|
||||||
|
|
||||||
|
save_folder = './output'
|
||||||
|
img_path = 'ppstructure/docs/table/1.png'
|
||||||
|
img = cv2.imread(img_path)
|
||||||
|
result = table_engine(img)
|
||||||
|
save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
|
||||||
|
|
||||||
|
for line in result:
|
||||||
|
line.pop('img')
|
||||||
|
print(line)
|
||||||
|
|
||||||
|
h, w, _ = img.shape
|
||||||
|
res = sorted_layout_boxes(result, w)
|
||||||
|
convert_info_markdown(res, save_folder, os.path.basename(img_path).split('.')[0])
|
||||||
|
```
|
||||||
|
|
||||||
### 2.3 Result description
|
### 2.3 Result description
|
||||||
|
|
||||||
The return of PP-Structure is a list of dicts, the example is as follows:
|
The return of PP-Structure is a list of dicts, the example is as follows:
|
||||||
|
@ -311,28 +354,32 @@ Please refer to: [Key Information Extraction](../ppocr/model_train/kie.en.md) .
|
||||||
|
|
||||||
### 2.4 Parameter Description
|
### 2.4 Parameter Description
|
||||||
|
|
||||||
| field | description | default |
|
| field | description | default |
|
||||||
|---|---|---|
|
|-------------------------|----------------------------------------------------------------------------------------------------------------------------|---|
|
||||||
| output | result save path | ./output/table |
|
| output | result save path | ./output/table |
|
||||||
| table_max_len | long side of the image resize in table structure model | 488 |
|
| table_max_len | long side of the image resize in table structure model | 488 |
|
||||||
| table_model_dir | Table structure model inference model path| None |
|
| table_model_dir | Table structure model inference model path | None |
|
||||||
| table_char_dict_path | The dictionary path of table structure model | ../ppocr/utils/dict/table_structure_dict.txt |
|
| table_char_dict_path | The dictionary path of table structure model | ../ppocr/utils/dict/table_structure_dict.txt |
|
||||||
| merge_no_span_structure | In the table recognition model, whether to merge '\<td>' and '\</td>' | False |
|
| merge_no_span_structure | In the table recognition model, whether to merge '\<td>' and '\</td>' | False |
|
||||||
| layout_model_dir | Layout analysis model inference model path| None |
|
| formula_model_dir | Formula recognition model inference model path | None |
|
||||||
| layout_dict_path | The dictionary path of layout analysis model| ../ppocr/utils/dict/layout_publaynet_dict.txt |
|
| formula_char_dict_path | The dictionary path of formula recognition model | ../ppocr/utils/dict/latex_ocr_tokenizer.json |
|
||||||
| layout_score_threshold | The box threshold path of layout analysis model| 0.5|
|
| layout_model_dir | Layout analysis model inference model path | None |
|
||||||
| layout_nms_threshold | The nms threshold path of layout analysis model| 0.5|
|
| layout_dict_path | The dictionary path of layout analysis model | ../ppocr/utils/dict/layout_publaynet_dict.txt |
|
||||||
| kie_algorithm | kie model algorithm| LayoutXLM|
|
| layout_score_threshold | The detection box score threshold of the layout analysis model | 0.5|
|
||||||
| ser_model_dir | Ser model inference model path| None|
|
| layout_nms_threshold | The NMS threshold of the layout analysis model | 0.5|
|
||||||
| ser_dict_path | The dictionary path of Ser model| ../train_data/XFUND/class_list_xfun.txt|
|
| kie_algorithm | kie model algorithm | LayoutXLM|
|
||||||
| mode | structure or kie | structure |
|
| ser_model_dir | Ser model inference model path | None|
|
||||||
| image_orientation | Whether to perform image orientation classification in forward | False |
|
| ser_dict_path | The dictionary path of Ser model | ../train_data/XFUND/class_list_xfun.txt|
|
||||||
| layout | Whether to perform layout analysis in forward | True |
|
| mode | structure or kie | structure |
|
||||||
| table | Whether to perform table recognition in forward | True |
|
| image_orientation | Whether to perform image orientation classification in forward | False |
|
||||||
| ocr | Whether to perform ocr for non-table areas in layout analysis. When layout is False, it will be automatically set to False| True |
|
| layout | Whether to perform layout analysis in forward | True |
|
||||||
| recovery | Whether to perform layout recovery in forward| False |
|
| table | Whether to perform table recognition in forward | True |
|
||||||
| save_pdf | Whether to convert docx to pdf when recovery| False |
|
| formula | Whether to perform formula recognition in forward | False |
|
||||||
| structure_version | Structure version, optional PP-structure and PP-structurev2 | PP-structure |
|
| ocr | Whether to perform ocr for non-table areas in layout analysis. When layout is False, it will be automatically set to False | True |
|
||||||
|
| recovery | Whether to perform layout recovery in forward | False |
|
||||||
|
| recovery_to_markdown | Whether to convert the layout recovery results into a markdown file | False |
|
||||||
|
| save_pdf | Whether to convert docx to pdf when recovery | False |
|
||||||
|
| structure_version | Structure version, optional PP-structure and PP-structurev2 | PP-structure |
|
||||||
|
|
||||||
Most of the parameters are consistent with the PaddleOCR whl package, see [whl package documentation](../ppocr/blog/whl.en.md)
|
Most of the parameters are consistent with the PaddleOCR whl package, see [whl package documentation](../ppocr/blog/whl.en.md)
|
||||||
|
|
||||||
|
|
|
@ -103,6 +103,20 @@ paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=t
|
||||||
paddleocr --image_dir=ppstructure/docs/recovery/UnrealText.pdf --type=structure --recovery=true --lang='en'
|
paddleocr --image_dir=ppstructure/docs/recovery/UnrealText.pdf --type=structure --recovery=true --lang='en'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### 2.1.7 版面恢复+转换为markdown文件
|
||||||
|
|
||||||
|
不使用LaTeXOCR模型进行公式识别:
|
||||||
|
|
||||||
|
```bash linenums="1"
|
||||||
|
paddleocr --image_dir=ppstructure/docs/recovery/UnrealText.pdf --type=structure --recovery=true --recovery_to_markdown=true --lang='en'
|
||||||
|
```
|
||||||
|
|
||||||
|
使用LaTeXOCR模型进行公式识别,其中必须使用中文layout模型:
|
||||||
|
|
||||||
|
```bash linenums="1"
|
||||||
|
paddleocr --image_dir=ppstructure/docs/recovery/UnrealText.pdf --type=structure --recovery=true --formula=true --recovery_to_markdown=true --lang='ch'
|
||||||
|
```
|
||||||
|
|
||||||
### 2.2 Python脚本使用
|
### 2.2 Python脚本使用
|
||||||
|
|
||||||
#### 2.2.1 图像方向分类+版面分析+表格识别
|
#### 2.2.1 图像方向分类+版面分析+表格识别
|
||||||
|
@ -289,6 +303,35 @@ res = sorted_layout_boxes(result, w)
|
||||||
convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0])
|
convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### 2.2.7 版面恢复+转换为markdown文件
|
||||||
|
|
||||||
|
```python linenums="1"
|
||||||
|
import os
|
||||||
|
import cv2
|
||||||
|
from paddleocr import PPStructure,save_structure_res
|
||||||
|
from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes
|
||||||
|
from paddleocr.ppstructure.recovery.recovery_to_markdown import convert_info_markdown
|
||||||
|
|
||||||
|
# 中文测试图
|
||||||
|
table_engine = PPStructure(recovery=True)
|
||||||
|
# 英文测试图
|
||||||
|
# table_engine = PPStructure(recovery=True, lang='en')
|
||||||
|
|
||||||
|
save_folder = './output'
|
||||||
|
img_path = 'ppstructure/docs/table/1.png'
|
||||||
|
img = cv2.imread(img_path)
|
||||||
|
result = table_engine(img)
|
||||||
|
save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
|
||||||
|
|
||||||
|
for line in result:
|
||||||
|
line.pop('img')
|
||||||
|
print(line)
|
||||||
|
|
||||||
|
h, w, _ = img.shape
|
||||||
|
res = sorted_layout_boxes(result, w)
|
||||||
|
convert_info_markdown(res, save_folder, os.path.basename(img_path).split('.')[0])
|
||||||
|
```
|
||||||
|
|
||||||
### 2.3 返回结果说明
|
### 2.3 返回结果说明
|
||||||
|
|
||||||
PP-Structure的返回结果为一个dict组成的list,示例如下:
|
PP-Structure的返回结果为一个dict组成的list,示例如下:
|
||||||
|
@ -329,28 +372,32 @@ dict 里各个字段说明如下:
|
||||||
|
|
||||||
### 2.4 参数说明
|
### 2.4 参数说明
|
||||||
|
|
||||||
| 字段 | 说明 | 默认值 |
|
| 字段 | 说明 | 默认值 |
|
||||||
| ----- | ---- | ------ |
|
|-------------------------|-------------------------------------------------| ------ |
|
||||||
| output | 结果保存地址 | ./output/table |
|
| output | 结果保存地址 | ./output/table |
|
||||||
| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 |
|
| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 |
|
||||||
| table_model_dir | 表格结构模型 inference 模型地址 | None |
|
| table_model_dir | 表格结构模型 inference 模型地址 | None |
|
||||||
| table_char_dict_path | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt |
|
| table_char_dict_path | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt |
|
||||||
| merge_no_span_structure | 表格识别模型中,是否对'\<td>'和'\</td>' 进行合并 | False |
|
| merge_no_span_structure | 表格识别模型中,是否对'\<td>'和'\</td>' 进行合并 | False |
|
||||||
| layout_model_dir | 版面分析模型 inference 模型地址 | None |
|
| formula_model_dir | 公式识别模型 inference 模型地址 | None |
|
||||||
| layout_dict_path | 版面分析模型字典 | ../ppocr/utils/dict/layout_publaynet_dict.txt |
|
| formula_char_dict_path | 公式识别模型所用字典地址 | ../ppocr/utils/dict/latex_ocr_tokenizer.json |
|
||||||
| layout_score_threshold | 版面分析模型检测框阈值 | 0.5 |
|
| layout_model_dir | 版面分析模型 inference 模型地址 | None |
|
||||||
| layout_nms_threshold | 版面分析模型nms阈值 | 0.5 |
|
| layout_dict_path | 版面分析模型字典 | ../ppocr/utils/dict/layout_publaynet_dict.txt |
|
||||||
| kie_algorithm | kie模型算法 | LayoutXLM |
|
| layout_score_threshold | 版面分析模型检测框阈值 | 0.5 |
|
||||||
| ser_model_dir | ser模型 inference 模型地址 | None |
|
| layout_nms_threshold | 版面分析模型nms阈值 | 0.5 |
|
||||||
| ser_dict_path | ser模型字典 | ../train_data/XFUND/class_list_xfun.txt |
|
| kie_algorithm | kie模型算法 | LayoutXLM |
|
||||||
| mode | structure or kie | structure |
|
| ser_model_dir | ser模型 inference 模型地址 | None |
|
||||||
| image_orientation | 前向中是否执行图像方向分类 | False |
|
| ser_dict_path | ser模型字典 | ../train_data/XFUND/class_list_xfun.txt |
|
||||||
| layout | 前向中是否执行版面分析 | True |
|
| mode | structure or kie | structure |
|
||||||
| table | 前向中是否执行表格识别 | True |
|
| image_orientation | 前向中是否执行图像方向分类 | False |
|
||||||
|
| layout | 前向中是否执行版面分析 | True |
|
||||||
|
| table | 前向中是否执行表格识别 | True |
|
||||||
|
| formula | 前向中是否执行公式识别 | False |
|
||||||
| ocr | 对于版面分析中的非表格区域,是否执行ocr。当layout为False时会被自动设置为False | True |
|
| ocr | 对于版面分析中的非表格区域,是否执行ocr。当layout为False时会被自动设置为False | True |
|
||||||
| recovery | 前向中是否执行版面恢复 | False |
|
| recovery | 前向中是否执行版面恢复 | False |
|
||||||
| save_pdf | 版面恢复导出docx文件的同时,是否导出pdf文件 | False |
|
| recovery_to_markdown | 是否将版面恢复结果转换为markdown文件 | False |
|
||||||
| structure_version | 模型版本,可选 PP-structure和PP-structurev2 | PP-structure |
|
| save_pdf | 版面恢复导出docx文件的同时,是否导出pdf文件 | False |
|
||||||
|
| structure_version | 模型版本,可选 PP-structure和PP-structurev2 | PP-structure |
|
||||||
|
|
||||||
大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../ppocr/blog/whl.md)
|
大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../ppocr/blog/whl.md)
|
||||||
|
|
||||||
|
|
28
paddleocr.py
28
paddleocr.py
|
@ -66,6 +66,7 @@ from tools.infer.utility import draw_ocr, str2bool, check_gpu
|
||||||
from ppstructure.utility import init_args, draw_structure_result
|
from ppstructure.utility import init_args, draw_structure_result
|
||||||
from ppstructure.predict_system import StructureSystem, save_structure_res, to_excel
|
from ppstructure.predict_system import StructureSystem, save_structure_res, to_excel
|
||||||
from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx
|
from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx
|
||||||
|
from ppstructure.recovery.recovery_to_markdown import convert_info_markdown
|
||||||
|
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
|
@ -79,6 +80,7 @@ __all__ = [
|
||||||
"to_excel",
|
"to_excel",
|
||||||
"sorted_layout_boxes",
|
"sorted_layout_boxes",
|
||||||
"convert_info_docx",
|
"convert_info_docx",
|
||||||
|
"convert_info_markdown",
|
||||||
]
|
]
|
||||||
|
|
||||||
SUPPORT_DET_MODEL = ["DB"]
|
SUPPORT_DET_MODEL = ["DB"]
|
||||||
|
@ -356,6 +358,16 @@ MODEL_URLS = {
|
||||||
"dict_path": "ppocr/utils/dict/layout_dict/layout_cdla_dict.txt",
|
"dict_path": "ppocr/utils/dict/layout_dict/layout_cdla_dict.txt",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
"formula": {
|
||||||
|
"en": {
|
||||||
|
"url": "https://paddleocr.bj.bcebos.com/contribution/rec_latex_ocr_infer.tar",
|
||||||
|
"dict_path": "ppocr/utils/dict/latex_ocr_tokenizer.json",
|
||||||
|
},
|
||||||
|
"ch": {
|
||||||
|
"url": "https://paddleocr.bj.bcebos.com/contribution/rec_latex_ocr_infer.tar",
|
||||||
|
"dict_path": "ppocr/utils/dict/latex_ocr_tokenizer.json",
|
||||||
|
},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -396,6 +408,7 @@ def parse_args(mMain=True):
|
||||||
"rec_char_dict_path",
|
"rec_char_dict_path",
|
||||||
"table_char_dict_path",
|
"table_char_dict_path",
|
||||||
"layout_dict_path",
|
"layout_dict_path",
|
||||||
|
"formula_char_dict_path",
|
||||||
]:
|
]:
|
||||||
action.default = None
|
action.default = None
|
||||||
if mMain:
|
if mMain:
|
||||||
|
@ -845,12 +858,21 @@ class PPStructure(StructureSystem):
|
||||||
os.path.join(BASE_DIR, "whl", "layout"),
|
os.path.join(BASE_DIR, "whl", "layout"),
|
||||||
layout_model_config["url"],
|
layout_model_config["url"],
|
||||||
)
|
)
|
||||||
|
formula_model_config = get_model_config(
|
||||||
|
"STRUCTURE", params.structure_version, "formula", lang
|
||||||
|
)
|
||||||
|
params.formula_model_dir, formula_url = confirm_model_dir_url(
|
||||||
|
params.formula_model_dir,
|
||||||
|
os.path.join(BASE_DIR, "whl", "formula"),
|
||||||
|
formula_model_config["url"],
|
||||||
|
)
|
||||||
# download model
|
# download model
|
||||||
if not params.use_onnx:
|
if not params.use_onnx:
|
||||||
maybe_download(params.det_model_dir, det_url)
|
maybe_download(params.det_model_dir, det_url)
|
||||||
maybe_download(params.rec_model_dir, rec_url)
|
maybe_download(params.rec_model_dir, rec_url)
|
||||||
maybe_download(params.table_model_dir, table_url)
|
maybe_download(params.table_model_dir, table_url)
|
||||||
maybe_download(params.layout_model_dir, layout_url)
|
maybe_download(params.layout_model_dir, layout_url)
|
||||||
|
maybe_download(params.formula_model_dir, formula_url)
|
||||||
|
|
||||||
if params.rec_char_dict_path is None:
|
if params.rec_char_dict_path is None:
|
||||||
params.rec_char_dict_path = str(
|
params.rec_char_dict_path = str(
|
||||||
|
@ -864,6 +886,10 @@ class PPStructure(StructureSystem):
|
||||||
params.layout_dict_path = str(
|
params.layout_dict_path = str(
|
||||||
Path(__file__).parent / layout_model_config["dict_path"]
|
Path(__file__).parent / layout_model_config["dict_path"]
|
||||||
)
|
)
|
||||||
|
if params.formula_char_dict_path is None:
|
||||||
|
params.formula_char_dict_path = str(
|
||||||
|
Path(__file__).parent / formula_model_config["dict_path"]
|
||||||
|
)
|
||||||
logger.debug(params)
|
logger.debug(params)
|
||||||
super().__init__(params)
|
super().__init__(params)
|
||||||
|
|
||||||
|
@ -1005,6 +1031,8 @@ def main():
|
||||||
if args.recovery and all_res != []:
|
if args.recovery and all_res != []:
|
||||||
try:
|
try:
|
||||||
convert_info_docx(img, all_res, args.output, img_name)
|
convert_info_docx(img, all_res, args.output, img_name)
|
||||||
|
if args.recovery_to_markdown:
|
||||||
|
convert_info_markdown(all_res, args.output, img_name)
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
logger.error(
|
logger.error(
|
||||||
"error in layout recovery image:{}, err msg: {}".format(
|
"error in layout recovery image:{}, err msg: {}".format(
|
||||||
|
|
|
@ -9,6 +9,7 @@
|
||||||
- [2.1.4 表格识别](#214-表格识别)
|
- [2.1.4 表格识别](#214-表格识别)
|
||||||
- [2.1.5 关键信息抽取](#215-关键信息抽取)
|
- [2.1.5 关键信息抽取](#215-关键信息抽取)
|
||||||
- [2.1.6 版面恢复](#216-版面恢复)
|
- [2.1.6 版面恢复](#216-版面恢复)
|
||||||
|
- [2.1.7 版面恢复+转换为markdown文件](#217-版面恢复转换为markdown文件)
|
||||||
- [2.2 Python脚本使用](#22-Python脚本使用)
|
- [2.2 Python脚本使用](#22-Python脚本使用)
|
||||||
- [2.2.1 图像方向分类+版面分析+表格识别](#221-图像方向分类版面分析表格识别)
|
- [2.2.1 图像方向分类+版面分析+表格识别](#221-图像方向分类版面分析表格识别)
|
||||||
- [2.2.2 版面分析+表格识别](#222-版面分析表格识别)
|
- [2.2.2 版面分析+表格识别](#222-版面分析表格识别)
|
||||||
|
@ -16,6 +17,7 @@
|
||||||
- [2.2.4 表格识别](#224-表格识别)
|
- [2.2.4 表格识别](#224-表格识别)
|
||||||
- [2.2.5 关键信息抽取](#225-关键信息抽取)
|
- [2.2.5 关键信息抽取](#225-关键信息抽取)
|
||||||
- [2.2.6 版面恢复](#226-版面恢复)
|
- [2.2.6 版面恢复](#226-版面恢复)
|
||||||
|
- [2.2.7 版面恢复+转换为markdown文件](#227-版面恢复转换为markdown文件)
|
||||||
- [2.3 返回结果说明](#23-返回结果说明)
|
- [2.3 返回结果说明](#23-返回结果说明)
|
||||||
- [2.3.1 版面分析+表格识别](#231-版面分析表格识别)
|
- [2.3.1 版面分析+表格识别](#231-版面分析表格识别)
|
||||||
- [2.3.2 关键信息抽取](#232-关键信息抽取)
|
- [2.3.2 关键信息抽取](#232-关键信息抽取)
|
||||||
|
@ -126,6 +128,22 @@ paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=t
|
||||||
paddleocr --image_dir=ppstructure/docs/recovery/UnrealText.pdf --type=structure --recovery=true --lang='en'
|
paddleocr --image_dir=ppstructure/docs/recovery/UnrealText.pdf --type=structure --recovery=true --lang='en'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
<a name="217"></a>
|
||||||
|
|
||||||
|
#### 2.1.7 版面恢复+转换为markdown文件
|
||||||
|
|
||||||
|
不使用LaTeXOCR模型进行公式识别:
|
||||||
|
|
||||||
|
```bash linenums="1"
|
||||||
|
paddleocr --image_dir=ppstructure/docs/recovery/UnrealText.pdf --type=structure --recovery=true --recovery_to_markdown=true --lang='en'
|
||||||
|
```
|
||||||
|
|
||||||
|
使用LaTeXOCR模型进行公式识别,其中必须使用中文layout模型:
|
||||||
|
|
||||||
|
```bash linenums="1"
|
||||||
|
paddleocr --image_dir=ppstructure/docs/recovery/UnrealText.pdf --type=structure --recovery=true --formula=true --recovery_to_markdown=true --lang='ch'
|
||||||
|
```
|
||||||
|
|
||||||
<a name="22"></a>
|
<a name="22"></a>
|
||||||
|
|
||||||
### 2.2 Python脚本使用
|
### 2.2 Python脚本使用
|
||||||
|
@ -322,6 +340,37 @@ res = sorted_layout_boxes(result, w)
|
||||||
convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0])
|
convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
<a name="227"></a>
|
||||||
|
|
||||||
|
#### 2.2.7 版面恢复+转换为markdown文件
|
||||||
|
|
||||||
|
```python linenums="1"
|
||||||
|
import os
|
||||||
|
import cv2
|
||||||
|
from paddleocr import PPStructure,save_structure_res
|
||||||
|
from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes
|
||||||
|
from paddleocr.ppstructure.recovery.recovery_to_markdown import convert_info_markdown
|
||||||
|
|
||||||
|
# 中文测试图
|
||||||
|
table_engine = PPStructure(recovery=True)
|
||||||
|
# 英文测试图
|
||||||
|
# table_engine = PPStructure(recovery=True, lang='en')
|
||||||
|
|
||||||
|
save_folder = './output'
|
||||||
|
img_path = 'ppstructure/docs/table/1.png'
|
||||||
|
img = cv2.imread(img_path)
|
||||||
|
result = table_engine(img)
|
||||||
|
save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
|
||||||
|
|
||||||
|
for line in result:
|
||||||
|
line.pop('img')
|
||||||
|
print(line)
|
||||||
|
|
||||||
|
h, w, _ = img.shape
|
||||||
|
res = sorted_layout_boxes(result, w)
|
||||||
|
convert_info_markdown(res, save_folder, os.path.basename(img_path).split('.')[0])
|
||||||
|
```
|
||||||
|
|
||||||
<a name="23"></a>
|
<a name="23"></a>
|
||||||
### 2.3 返回结果说明
|
### 2.3 返回结果说明
|
||||||
PP-Structure的返回结果为一个dict组成的list,示例如下:
|
PP-Structure的返回结果为一个dict组成的list,示例如下:
|
||||||
|
@ -363,28 +412,32 @@ dict 里各个字段说明如下:
|
||||||
<a name="24"></a>
|
<a name="24"></a>
|
||||||
### 2.4 参数说明
|
### 2.4 参数说明
|
||||||
|
|
||||||
| 字段 | 说明 | 默认值 |
|
| 字段 | 说明 | 默认值 |
|
||||||
|---|---|---|
|
|---|-----------------------------------------------|---|
|
||||||
| output | 结果保存地址 | ./output/table |
|
| output | 结果保存地址 | ./output/table |
|
||||||
| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 |
|
| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 |
|
||||||
| table_model_dir | 表格结构模型 inference 模型地址| None |
|
| table_model_dir | 表格结构模型 inference 模型地址 | None |
|
||||||
| table_char_dict_path | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt |
|
| table_char_dict_path | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt |
|
||||||
| merge_no_span_structure | 表格识别模型中,是否对'\<td>'和'\</td>' 进行合并 | False |
|
| merge_no_span_structure | 表格识别模型中,是否对'\<td>'和'\</td>' 进行合并 | False |
|
||||||
| layout_model_dir | 版面分析模型 inference 模型地址 | None |
|
| formula_model_dir | 公式识别模型 inference 模型地址 | None |
|
||||||
| layout_dict_path | 版面分析模型字典| ../ppocr/utils/dict/layout_publaynet_dict.txt |
|
| formula_char_dict_path | 公式识别模型所用字典地址 | ../ppocr/utils/dict/latex_ocr_tokenizer.json |
|
||||||
| layout_score_threshold | 版面分析模型检测框阈值| 0.5|
|
| layout_model_dir | 版面分析模型 inference 模型地址 | None |
|
||||||
| layout_nms_threshold | 版面分析模型nms阈值| 0.5|
|
| layout_dict_path | 版面分析模型字典 | ../ppocr/utils/dict/layout_publaynet_dict.txt |
|
||||||
| kie_algorithm | kie模型算法| LayoutXLM|
|
| layout_score_threshold | 版面分析模型检测框阈值 | 0.5|
|
||||||
| ser_model_dir | ser模型 inference 模型地址| None|
|
| layout_nms_threshold | 版面分析模型nms阈值 | 0.5|
|
||||||
| ser_dict_path | ser模型字典| ../train_data/XFUND/class_list_xfun.txt|
|
| kie_algorithm | kie模型算法 | LayoutXLM|
|
||||||
| mode | structure or kie | structure |
|
| ser_model_dir | ser模型 inference 模型地址 | None|
|
||||||
| image_orientation | 前向中是否执行图像方向分类 | False |
|
| ser_dict_path | ser模型字典 | ../train_data/XFUND/class_list_xfun.txt|
|
||||||
| layout | 前向中是否执行版面分析 | True |
|
| mode | structure or kie | structure |
|
||||||
| table | 前向中是否执行表格识别 | True |
|
| image_orientation | 前向中是否执行图像方向分类 | False |
|
||||||
| ocr | 对于版面分析中的非表格区域,是否执行ocr。当layout为False时会被自动设置为False| True |
|
| layout | 前向中是否执行版面分析 | True |
|
||||||
| recovery | 前向中是否执行版面恢复| False |
|
| table | 前向中是否执行表格识别 | True |
|
||||||
| save_pdf | 版面恢复导出docx文件的同时,是否导出pdf文件 | False |
|
| formula | 前向中是否执行公式识别 | False |
|
||||||
| structure_version | 模型版本,可选 PP-structure和PP-structurev2 | PP-structure |
|
| ocr | 对于版面分析中的非表格区域,是否执行ocr。当layout为False时会被自动设置为False | True |
|
||||||
|
| recovery | 前向中是否执行版面恢复 | False |
|
||||||
|
| recovery_to_markdown | 是否将版面恢复结果转换为markdown文件 | False |
|
||||||
|
| save_pdf | 版面恢复导出docx文件的同时,是否导出pdf文件 | False |
|
||||||
|
| structure_version | 模型版本,可选 PP-structure和PP-structurev2 | PP-structure |
|
||||||
|
|
||||||
大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../../doc/doc_ch/whl.md)
|
大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../../doc/doc_ch/whl.md)
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@
|
||||||
- [2.1.4 table recognition](#214-table-recognition)
|
- [2.1.4 table recognition](#214-table-recognition)
|
||||||
- [2.1.5 Key Information Extraction](#215-Key-Information-Extraction)
|
- [2.1.5 Key Information Extraction](#215-Key-Information-Extraction)
|
||||||
- [2.1.6 layout recovery](#216-layout-recovery)
|
- [2.1.6 layout recovery](#216-layout-recovery)
|
||||||
|
- [2.1.7 layout recovery(PDF to Markdown)](#217-layout-recoverypdf-to-markdown)
|
||||||
- [2.2 Use by python script](#22-use-by-python-script)
|
- [2.2 Use by python script](#22-use-by-python-script)
|
||||||
- [2.2.1 image orientation + layout analysis + table recognition](#221-image-orientation--layout-analysis--table-recognition)
|
- [2.2.1 image orientation + layout analysis + table recognition](#221-image-orientation--layout-analysis--table-recognition)
|
||||||
- [2.2.2 layout analysis + table recognition](#222-layout-analysis--table-recognition)
|
- [2.2.2 layout analysis + table recognition](#222-layout-analysis--table-recognition)
|
||||||
|
@ -16,6 +17,7 @@
|
||||||
- [2.2.4 table recognition](#224-table-recognition)
|
- [2.2.4 table recognition](#224-table-recognition)
|
||||||
- [2.2.5 Key Information Extraction](#225-Key-Information-Extraction)
|
- [2.2.5 Key Information Extraction](#225-Key-Information-Extraction)
|
||||||
- [2.2.6 layout recovery](#226-layout-recovery)
|
- [2.2.6 layout recovery](#226-layout-recovery)
|
||||||
|
- [2.2.7 layout recovery(PDF to Markdown)](#227-layout-recoverypdf-to-markdown)
|
||||||
- [2.3 Result description](#23-result-description)
|
- [2.3 Result description](#23-result-description)
|
||||||
- [2.3.1 layout analysis + table recognition](#231-layout-analysis--table-recognition)
|
- [2.3.1 layout analysis + table recognition](#231-layout-analysis--table-recognition)
|
||||||
- [2.3.2 Key Information Extraction](#232-Key-Information-Extraction)
|
- [2.3.2 Key Information Extraction](#232-Key-Information-Extraction)
|
||||||
|
@ -110,6 +112,21 @@ Recovery by using OCR:
|
||||||
paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en'
|
paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
<a name="217"></a>
|
||||||
|
#### 2.1.7 layout recovery(PDF to Markdown)
|
||||||
|
|
||||||
|
Without using the LaTeXOCR model for formula recognition:
|
||||||
|
|
||||||
|
```bash linenums="1"
|
||||||
|
paddleocr --image_dir=ppstructure/docs/recovery/UnrealText.pdf --type=structure --recovery=true --recovery_to_markdown=true --lang='en'
|
||||||
|
```
|
||||||
|
|
||||||
|
Using the LaTeXOCR model for formula recognition (the Chinese layout model must be used, i.e. `--lang='ch'`):
|
||||||
|
|
||||||
|
```bash linenums="1"
|
||||||
|
paddleocr --image_dir=ppstructure/docs/recovery/UnrealText.pdf --type=structure --recovery=true --formula=true --recovery_to_markdown=true --lang='ch'
|
||||||
|
```
|
||||||
|
|
||||||
<a name="22"></a>
|
<a name="22"></a>
|
||||||
### 2.2 Use by python script
|
### 2.2 Use by python script
|
||||||
|
|
||||||
|
@ -303,6 +320,36 @@ res = sorted_layout_boxes(result, w)
|
||||||
convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0])
|
convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
<a name="227"></a>
|
||||||
|
#### 2.2.7 layout recovery(PDF to Markdown)
|
||||||
|
|
||||||
|
```python linenums="1"
|
||||||
|
import os
|
||||||
|
import cv2
|
||||||
|
from paddleocr import PPStructure,save_structure_res
|
||||||
|
from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes
|
||||||
|
from paddleocr.ppstructure.recovery.recovery_to_markdown import convert_info_markdown
|
||||||
|
|
||||||
|
# Chinese image
|
||||||
|
table_engine = PPStructure(recovery=True)
|
||||||
|
# English image
|
||||||
|
# table_engine = PPStructure(recovery=True, lang='en')
|
||||||
|
|
||||||
|
save_folder = './output'
|
||||||
|
img_path = 'ppstructure/docs/table/1.png'
|
||||||
|
img = cv2.imread(img_path)
|
||||||
|
result = table_engine(img)
|
||||||
|
save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
|
||||||
|
|
||||||
|
for line in result:
|
||||||
|
line.pop('img')
|
||||||
|
print(line)
|
||||||
|
|
||||||
|
h, w, _ = img.shape
|
||||||
|
res = sorted_layout_boxes(result, w)
|
||||||
|
convert_info_markdown(res, save_folder, os.path.basename(img_path).split('.')[0])
|
||||||
|
```
|
||||||
|
|
||||||
<a name="23"></a>
|
<a name="23"></a>
|
||||||
### 2.3 Result description
|
### 2.3 Result description
|
||||||
|
|
||||||
|
@ -351,9 +398,11 @@ Please refer to: [Key Information Extraction](../kie/README.md) .
|
||||||
| table_model_dir | Table structure model inference model path| None |
|
| table_model_dir | Table structure model inference model path| None |
|
||||||
| table_char_dict_path | The dictionary path of table structure model | ../ppocr/utils/dict/table_structure_dict.txt |
|
| table_char_dict_path | The dictionary path of table structure model | ../ppocr/utils/dict/table_structure_dict.txt |
|
||||||
| merge_no_span_structure | In the table recognition model, whether to merge '\<td>' and '\</td>' | False |
|
| merge_no_span_structure | In the table recognition model, whether to merge '\<td>' and '\</td>' | False |
|
||||||
|
| formula_model_dir | Formula recognition model inference model path | None |
|
||||||
|
| formula_char_dict_path | The dictionary path of formula recognition model | ../ppocr/utils/dict/latex_ocr_tokenizer.json |
|
||||||
| layout_model_dir | Layout analysis model inference model path| None |
|
| layout_model_dir | Layout analysis model inference model path| None |
|
||||||
| layout_dict_path | The dictionary path of layout analysis model| ../ppocr/utils/dict/layout_publaynet_dict.txt |
|
| layout_dict_path | The dictionary path of layout analysis model| ../ppocr/utils/dict/layout_publaynet_dict.txt |
|
||||||
| layout_score_threshold | The box threshold path of layout analysis model| 0.5|
|
| layout_score_threshold | The box threshold path of layout analysis model| 0.5|
|
||||||
| layout_nms_threshold | The nms threshold path of layout analysis model| 0.5|
|
| layout_nms_threshold | The nms threshold path of layout analysis model| 0.5|
|
||||||
| kie_algorithm | kie model algorithm| LayoutXLM|
|
| kie_algorithm | kie model algorithm| LayoutXLM|
|
||||||
| ser_model_dir | Ser model inference model path| None|
|
| ser_model_dir | Ser model inference model path| None|
|
||||||
|
@ -362,8 +411,10 @@ Please refer to: [Key Information Extraction](../kie/README.md) .
|
||||||
| image_orientation | Whether to perform image orientation classification in forward | False |
|
| image_orientation | Whether to perform image orientation classification in forward | False |
|
||||||
| layout | Whether to perform layout analysis in forward | True |
|
| layout | Whether to perform layout analysis in forward | True |
|
||||||
| table | Whether to perform table recognition in forward | True |
|
| table | Whether to perform table recognition in forward | True |
|
||||||
|
| formula | Whether to perform formula recognition in forward | False |
|
||||||
| ocr | Whether to perform ocr for non-table areas in layout analysis. When layout is False, it will be automatically set to False| True |
|
| ocr | Whether to perform ocr for non-table areas in layout analysis. When layout is False, it will be automatically set to False| True |
|
||||||
| recovery | Whether to perform layout recovery in forward| False |
|
| recovery | Whether to perform layout recovery in forward| False |
|
||||||
|
| recovery_to_markdown | Whether to convert the layout recovery results into a markdown file | False |
|
||||||
| save_pdf | Whether to convert docx to pdf when recovery| False |
|
| save_pdf | Whether to convert docx to pdf when recovery| False |
|
||||||
| structure_version | Structure version, optional PP-structure and PP-structurev2 | PP-structure |
|
| structure_version | Structure version, optional PP-structure and PP-structurev2 | PP-structure |
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue