2020-08-22 19:42:14 +08:00
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
2022-03-23 22:14:33 +08:00
import importlib
2020-08-22 19:42:14 +08:00
__dir__ = os . path . dirname ( __file__ )
2021-11-10 20:20:45 +08:00
import paddle
2024-04-26 16:54:49 +08:00
from paddle . utils import try_import
2021-11-10 20:20:45 +08:00
2024-04-21 21:46:20 +08:00
sys . path . append ( os . path . join ( __dir__ , " " ) )
2020-08-22 19:42:14 +08:00
import cv2
2021-06-10 14:24:59 +08:00
import logging
2020-08-22 19:42:14 +08:00
import numpy as np
from pathlib import Path
2022-12-07 09:51:54 +08:00
import base64
from io import BytesIO
2024-06-05 09:55:50 +08:00
import pprint
2022-12-07 09:51:54 +08:00
from PIL import Image
2023-12-13 10:16:39 +08:00
from tools . infer import predict_system
2020-08-22 19:42:14 +08:00
2023-08-06 11:17:13 +08:00
2023-07-18 11:36:49 +08:00
def _import_file ( module_name , file_path , make_importable = False ) :
spec = importlib . util . spec_from_file_location ( module_name , file_path )
module = importlib . util . module_from_spec ( spec )
spec . loader . exec_module ( module )
if make_importable :
sys . modules [ module_name ] = module
return module
2023-08-06 11:17:13 +08:00
tools = _import_file (
2024-04-21 21:46:20 +08:00
" tools " , os . path . join ( __dir__ , " tools/__init__.py " ) , make_importable = True
)
ppocr = importlib . import_module ( " ppocr " , " paddleocr " )
ppstructure = importlib . import_module ( " ppstructure " , " paddleocr " )
2023-08-06 11:17:13 +08:00
from ppocr . utils . logging import get_logger
2023-10-18 17:37:23 +08:00
logger = get_logger ( )
2024-04-21 21:46:20 +08:00
from ppocr . utils . utility import (
check_and_read ,
get_image_file_list ,
alpha_to_color ,
binarize_img ,
)
from ppocr . utils . network import (
maybe_download ,
download_with_progressbar ,
is_link ,
confirm_model_dir_url ,
)
2021-11-10 20:20:45 +08:00
from tools . infer . utility import draw_ocr , str2bool , check_gpu
2021-08-02 15:28:07 +08:00
from ppstructure . utility import init_args , draw_structure_result
2022-08-25 16:32:44 +08:00
from ppstructure . predict_system import StructureSystem , save_structure_res , to_excel
2020-08-22 19:42:14 +08:00
2023-08-06 11:17:13 +08:00
logger = get_logger ( )
2021-09-06 18:33:21 +08:00
__all__ = [
2024-04-21 21:46:20 +08:00
" PaddleOCR " ,
" PPStructure " ,
" draw_ocr " ,
" draw_structure_result " ,
" save_structure_res " ,
" download_with_progressbar " ,
" to_excel " ,
2021-09-06 18:33:21 +08:00
]
2024-04-21 21:46:20 +08:00
SUPPORT_DET_MODEL = [ " DB " ]
SUPPORT_REC_MODEL = [ " CRNN " , " SVTR_LCNet " ]
2021-09-06 18:33:21 +08:00
BASE_DIR = os . path . expanduser ( " ~/.paddleocr/ " )
2024-04-21 21:46:20 +08:00
DEFAULT_OCR_MODEL_VERSION = " PP-OCRv4 "
SUPPORT_OCR_MODEL_VERSION = [ " PP-OCR " , " PP-OCRv2 " , " PP-OCRv3 " , " PP-OCRv4 " ]
DEFAULT_STRUCTURE_MODEL_VERSION = " PP-StructureV2 "
SUPPORT_STRUCTURE_MODEL_VERSION = [ " PP-Structure " , " PP-StructureV2 " ]
2021-09-06 18:33:21 +08:00
MODEL_URLS = {
2024-04-21 21:46:20 +08:00
" OCR " : {
" PP-OCRv4 " : {
" det " : {
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_infer.tar " ,
} ,
" en " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar " ,
} ,
" ml " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar "
} ,
2023-08-06 11:17:13 +08:00
} ,
2024-04-21 21:46:20 +08:00
" rec " : {
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/ppocr_keys_v1.txt " ,
} ,
" en " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/en_dict.txt " ,
} ,
" korean " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/korean_PP-OCRv4_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/korean_dict.txt " ,
} ,
" japan " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/japan_PP-OCRv4_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/japan_dict.txt " ,
} ,
" chinese_cht " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/chinese_cht_dict.txt " ,
} ,
" ta " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/ta_PP-OCRv4_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/ta_dict.txt " ,
} ,
" te " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/te_PP-OCRv4_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/te_dict.txt " ,
} ,
" ka " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/ka_PP-OCRv4_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/ka_dict.txt " ,
} ,
" latin " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/latin_dict.txt " ,
} ,
" arabic " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/arabic_PP-OCRv4_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/arabic_dict.txt " ,
} ,
" cyrillic " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/cyrillic_dict.txt " ,
} ,
" devanagari " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv4/multilingual/devanagari_PP-OCRv4_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/devanagari_dict.txt " ,
2023-08-06 11:17:13 +08:00
} ,
} ,
2024-04-21 21:46:20 +08:00
" cls " : {
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar " ,
2023-08-06 11:17:13 +08:00
}
} ,
} ,
2024-04-21 21:46:20 +08:00
" PP-OCRv3 " : {
" det " : {
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar " ,
} ,
" en " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar " ,
} ,
" ml " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar "
} ,
2022-04-29 11:38:59 +08:00
} ,
2024-04-21 21:46:20 +08:00
" rec " : {
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/ppocr_keys_v1.txt " ,
} ,
" en " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/en_dict.txt " ,
} ,
" korean " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/korean_dict.txt " ,
} ,
" japan " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/japan_dict.txt " ,
} ,
" chinese_cht " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/chinese_cht_dict.txt " ,
} ,
" ta " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/ta_dict.txt " ,
} ,
" te " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/te_dict.txt " ,
} ,
" ka " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/ka_dict.txt " ,
} ,
" latin " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/latin_dict.txt " ,
} ,
" arabic " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/arabic_dict.txt " ,
} ,
" cyrillic " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/cyrillic_dict.txt " ,
} ,
" devanagari " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/devanagari_dict.txt " ,
2022-05-07 11:52:06 +08:00
} ,
2022-04-29 11:38:59 +08:00
} ,
2024-04-21 21:46:20 +08:00
" cls " : {
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar " ,
2022-04-29 11:38:59 +08:00
}
} ,
} ,
2024-04-21 21:46:20 +08:00
" PP-OCRv2 " : {
" det " : {
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar " ,
2021-11-10 20:20:45 +08:00
} ,
2021-09-06 18:33:21 +08:00
} ,
2024-04-21 21:46:20 +08:00
" rec " : {
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/ppocr_keys_v1.txt " ,
2021-11-10 20:20:45 +08:00
}
2022-05-30 17:25:21 +08:00
} ,
2024-04-21 21:46:20 +08:00
" cls " : {
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar " ,
2022-05-30 17:25:21 +08:00
}
} ,
2020-12-07 19:10:19 +08:00
} ,
2024-04-21 21:46:20 +08:00
" PP-OCR " : {
" det " : {
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar " ,
} ,
" en " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_ppocr_mobile_v2.0_det_infer.tar " ,
} ,
" structure " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar "
} ,
2021-09-06 18:33:21 +08:00
} ,
2024-04-21 21:46:20 +08:00
" rec " : {
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/ppocr_keys_v1.txt " ,
} ,
" en " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/en_dict.txt " ,
} ,
" french " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/french_dict.txt " ,
} ,
" german " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/german_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/german_dict.txt " ,
} ,
" korean " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/korean_dict.txt " ,
} ,
" japan " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/japan_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/japan_dict.txt " ,
} ,
" chinese_cht " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/chinese_cht_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/chinese_cht_dict.txt " ,
} ,
" ta " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ta_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/ta_dict.txt " ,
} ,
" te " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/te_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/te_dict.txt " ,
} ,
" ka " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/ka_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/ka_dict.txt " ,
} ,
" latin " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/latin_ppocr_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/latin_dict.txt " ,
} ,
" arabic " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/arabic_ppocr_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/arabic_dict.txt " ,
} ,
" cyrillic " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/cyrillic_ppocr_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/cyrillic_dict.txt " ,
} ,
" devanagari " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/devanagari_ppocr_mobile_v2.0_rec_infer.tar " ,
" dict_path " : " ./ppocr/utils/dict/devanagari_dict.txt " ,
} ,
" structure " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar " ,
" dict_path " : " ppocr/utils/dict/table_dict.txt " ,
} ,
2021-09-06 18:33:21 +08:00
} ,
2024-04-21 21:46:20 +08:00
" cls " : {
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar " ,
2021-11-10 20:20:45 +08:00
}
2021-09-06 18:33:21 +08:00
} ,
2024-04-21 21:46:20 +08:00
} ,
2021-11-10 20:20:45 +08:00
} ,
2024-04-21 21:46:20 +08:00
" STRUCTURE " : {
" PP-Structure " : {
" table " : {
" en " : {
" url " : " https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar " ,
" dict_path " : " ppocr/utils/dict/table_structure_dict.txt " ,
2021-11-10 20:20:45 +08:00
}
2021-09-06 18:33:21 +08:00
}
2022-08-10 22:51:57 +08:00
} ,
2024-04-21 21:46:20 +08:00
" PP-StructureV2 " : {
" table " : {
" en " : {
" url " : " https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar " ,
" dict_path " : " ppocr/utils/dict/table_structure_dict.txt " ,
} ,
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar " ,
" dict_path " : " ppocr/utils/dict/table_structure_dict_ch.txt " ,
} ,
2022-08-10 22:51:57 +08:00
} ,
2024-04-21 21:46:20 +08:00
" layout " : {
" en " : {
" url " : " https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar " ,
" dict_path " : " ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt " ,
} ,
" ch " : {
" url " : " https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar " ,
" dict_path " : " ppocr/utils/dict/layout_dict/layout_cdla_dict.txt " ,
} ,
} ,
} ,
} ,
2020-08-22 19:42:14 +08:00
}
2021-06-02 20:10:59 +08:00
def parse_args(mMain=True):
    """Build the argument set used by this module on top of ``init_args()``.

    The parser returned by ``init_args()`` is extended with the language,
    task-selection and model-version options, and the dictionary-path
    options have their defaults cleared to ``None``.

    Args:
        mMain: When true, the command line is parsed and the parsed
            namespace is returned. When false, nothing is parsed; a
            namespace holding the default of every registered option is
            returned instead.

    Returns:
        argparse.Namespace: Parsed arguments (``mMain=True``) or the option
        defaults (``mMain=False``).
    """
    import argparse

    parser = init_args()
    parser.add_help = mMain
    parser.add_argument(" --lang ", type=str, default=" ch ")
    parser.add_argument(" --det ", type=str2bool, default=True)
    parser.add_argument(" --rec ", type=str2bool, default=True)
    parser.add_argument(" --type ", type=str, default=" ocr ")
    parser.add_argument(" --savefile ", type=str2bool, default=False)
    parser.add_argument(
        " --ocr_version ",
        type=str,
        choices=SUPPORT_OCR_MODEL_VERSION,
        # Keep the CLI default in lock-step with the module-level default.
        default=DEFAULT_OCR_MODEL_VERSION,
        help=" OCR Model version, the current model support list is as follows: "
        " 1. PP-OCRv4/v3 Support Chinese and English detection and recognition model, and direction classifier model "
        " 2. PP-OCRv2 Support Chinese detection and recognition model. "
        " 3. PP-OCR support Chinese detection, recognition and direction classifier and multilingual recognition model. ",
    )
    parser.add_argument(
        " --structure_version ",
        type=str,
        choices=SUPPORT_STRUCTURE_MODEL_VERSION,
        # Same reasoning as for --ocr_version above.
        default=DEFAULT_STRUCTURE_MODEL_VERSION,
        help=" Model version, the current model support list is as follows: "
        " 1. PP-Structure Support en table structure model. "
        " 2. PP-StructureV2 Support ch and en table structure model. ",
    )

    # Dictionary paths are cleared to None so callers can tell "not given"
    # apart from a parser-supplied default.
    dict_path_options = (
        " rec_char_dict_path ",
        " table_char_dict_path ",
        " layout_dict_path ",
    )
    for action in parser._actions:
        if action.dest in dict_path_options:
            action.default = None

    if mMain:
        return parser.parse_args()

    # Library use: expose every option's default without touching sys.argv.
    inference_args_dict = {
        action.dest: action.default for action in parser._actions
    }
    return argparse.Namespace(**inference_args_dict)
2020-08-22 19:42:14 +08:00
2021-08-02 17:04:53 +08:00
def parse_lang(lang):
    """Map a language code to its recognition and detection model languages.

    Individual language codes that share a script are first collapsed onto
    the script-level recognition model (latin, arabic, cyrillic,
    devanagari). The result must be a key of the recognition table of the
    default OCR model version. The detection language is then derived from
    the recognition language.

    Args:
        lang: Language code requested by the caller.

    Returns:
        tuple: ``(lang, det_lang)`` - the recognition model language and the
        detection model language.

    Raises:
        AssertionError: If the (collapsed) language is not a key of the
            default version's recognition table.
    """
    latin_lang = [
        " af ", " az ", " bs ", " cs ", " cy ", " da ", " de ", " es ",
        " et ", " fr ", " ga ", " hr ", " hu ", " id ", " is ", " it ",
        " ku ", " la ", " lt ", " lv ", " mi ", " ms ", " mt ", " nl ",
        " no ", " oc ", " pi ", " pl ", " pt ", " ro ", " rs_latin ",
        " sk ", " sl ", " sq ", " sv ", " sw ", " tl ", " tr ", " uz ",
        " vi ", " french ", " german ",
    ]
    arabic_lang = [" ar ", " fa ", " ug ", " ur "]
    cyrillic_lang = [
        " ru ", " rs_cyrillic ", " be ", " bg ", " uk ", " mn ", " abq ",
        " ady ", " kbd ", " ava ", " dar ", " inh ", " che ", " lbe ",
        " lez ", " tab ",
    ]
    devanagari_lang = [
        " hi ", " mr ", " ne ", " bh ", " mai ", " ang ", " bho ", " mah ",
        " sck ", " new ", " gom ", " sa ", " bgc ",
    ]

    # First matching script group wins, mirroring an if/elif chain.
    script_groups = (
        (" latin ", latin_lang),
        (" arabic ", arabic_lang),
        (" cyrillic ", cyrillic_lang),
        (" devanagari ", devanagari_lang),
    )
    for script_name, members in script_groups:
        if lang in members:
            lang = script_name
            break

    supported = MODEL_URLS[" OCR "][DEFAULT_OCR_MODEL_VERSION][" rec "]
    if lang not in supported:
        # Raised explicitly (instead of using an `assert` statement) so the
        # validation is not stripped when Python runs with -O; the exception
        # type and message stay the same for existing callers.
        raise AssertionError(
            " param lang must in {} , but got {} ".format(supported.keys(), lang)
        )

    if lang == " ch ":
        det_lang = " ch "
    elif lang == " structure ":
        det_lang = " structure "
    elif lang in [" en ", " latin "]:
        det_lang = " en "
    else:
        # Everything else uses the multilingual detection model.
        det_lang = " ml "
    return lang, det_lang
2021-11-10 20:20:45 +08:00
def get_model_config(type, version, model_type, lang):
    """Look up the model entry for a model family, version, kind and language.

    If the requested version, model kind or language is missing from the
    table, the lookup falls back to the default version of the family. When
    the default version cannot satisfy the request either, an error is
    logged and the process exits with status -1.

    Args:
        type: Model family key of ``MODEL_URLS``.
        version: Requested model version.
        model_type: Kind of model inside the version table.
        lang: Language key inside the model-kind table.

    Returns:
        The entry stored in ``MODEL_URLS`` for the resolved combination.

    Raises:
        NotImplementedError: If ``type`` is not one of the known families.
    """
    if type == " OCR ":
        fallback_version = DEFAULT_OCR_MODEL_VERSION
    elif type == " STRUCTURE ":
        fallback_version = DEFAULT_STRUCTURE_MODEL_VERSION
    else:
        raise NotImplementedError

    catalog = MODEL_URLS[type]

    # Unknown version: silently use the family default.
    if version not in catalog:
        version = fallback_version

    # Model kind missing from the chosen version: try the default version.
    if model_type not in catalog[version]:
        if model_type not in catalog[fallback_version]:
            logger.error(
                " {} models is not support, we only support {} ".format(
                    model_type, catalog[fallback_version].keys()
                )
            )
            sys.exit(-1)
        version = fallback_version

    # Language missing from the chosen version: try the default version.
    if lang not in catalog[version][model_type]:
        if lang not in catalog[fallback_version][model_type]:
            logger.error(
                " lang {} is not support, we only support {} for {} models ".format(
                    lang,
                    catalog[fallback_version][model_type].keys(),
                    model_type,
                )
            )
            sys.exit(-1)
        version = fallback_version

    return catalog[version][model_type][lang]
2021-09-06 18:33:21 +08:00
2022-08-31 19:59:29 +08:00
def img_decode(content: bytes):
    """Decode an encoded image held in memory with OpenCV.

    The bytes are viewed as a uint8 buffer and passed to ``cv2.imdecode``
    with ``cv2.IMREAD_UNCHANGED``; whatever ``cv2.imdecode`` produces is
    returned as-is.
    """
    return cv2.imdecode(
        np.frombuffer(content, dtype=np.uint8), cv2.IMREAD_UNCHANGED
    )
2022-08-31 19:59:29 +08:00
2023-09-21 14:51:32 +08:00
def check_img(img, alpha_color=(255, 255, 255)):
    """Turn the supported image inputs into decoded image data.

    Encoded bytes are decoded with ``img_decode``. A string is treated as a
    link (downloaded first) or a local file path; it is read with
    ``check_and_read`` and, when that reports neither gif nor pdf, decoded
    with OpenCV, falling back to Pillow if OpenCV cannot decode it. Any other
    input is passed through. Finally, 2-D arrays are expanded from gray to
    three channels (BGR order) and 4-channel arrays have their alpha channel
    blended with ``alpha_color`` via ``alpha_to_color``.

    Args:
        img: Image data - encoded bytes, a link, a local file path, or an
            already decoded object (e.g. a numpy array or a list).
        alpha_color: Background color handed to ``alpha_to_color`` for
            4-channel images.

    Returns:
        tuple: ``(img, flag_gif, flag_pdf)``. ``img`` is ``None`` when the
        image could not be loaded. The two flags are the ones reported by
        ``check_and_read`` (both ``False`` for non-string inputs).
    """
    flag_gif, flag_pdf = False, False

    if isinstance(img, bytes):
        img = img_decode(img)
        if img is None:
            # Report undecodable byte input instead of silently handing a
            # None image back to the caller.
            logger.error("error in decoding image from bytes")
            return None, flag_gif, flag_pdf

    if isinstance(img, str):
        # download net image
        if is_link(img):
            download_with_progressbar(img, " tmp.jpg ")
            img = " tmp.jpg "
        image_file = img
        img, flag_gif, flag_pdf = check_and_read(image_file)
        if not flag_gif and not flag_pdf:
            with open(image_file, " rb ") as f:
                img_str = f.read()
            img = img_decode(img_str)
            if img is None:
                # OpenCV could not decode the file: re-encode it as JPEG
                # through Pillow and decode that instead.
                try:
                    buf = BytesIO()
                    Image.open(BytesIO(img_str)).convert(" RGB ").save(buf, " jpeg ")
                    img_array = np.frombuffer(buf.getvalue(), np.uint8)
                    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
                except Exception:
                    # Narrowed from a bare `except:` so KeyboardInterrupt and
                    # SystemExit are no longer swallowed.
                    logger.error(" error in loading image: {} ".format(image_file))
                    return None, flag_gif, flag_pdf
        if img is None:
            logger.error(" error in loading image: {} ".format(image_file))
            return None, flag_gif, flag_pdf

    # single channel image array.shape:h,w
    if isinstance(img, np.ndarray) and len(img.shape) == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    # four channel image array.shape:h,w,c
    if isinstance(img, np.ndarray) and len(img.shape) == 3 and img.shape[2] == 4:
        img = alpha_to_color(img, alpha_color)
    return img, flag_gif, flag_pdf
2022-08-31 19:59:29 +08:00
2020-08-22 19:42:14 +08:00
class PaddleOCR(predict_system.TextSystem):
    """Text detection / classification / recognition front end.

    Resolves the detection, recognition and classifier model directories for
    the requested language and OCR version, downloads them when needed, and
    then initialises the underlying ``TextSystem`` with the resulting
    parameters.
    """

    def __init__(self, **kwargs):
        """
        paddleocr package

        args:
            **kwargs: other params show in paddleocr --help

        raises:
            AssertionError: if ``ocr_version`` is not a supported version.
        """
        params = parse_args(mMain=False)
        params.__dict__.update(**kwargs)
        if params.ocr_version not in SUPPORT_OCR_MODEL_VERSION:
            # Explicit raise (not an `assert` statement) so the check also
            # runs under `python -O`; type and message are unchanged.
            raise AssertionError(
                " ocr_version must in {} , but get {} ".format(
                    SUPPORT_OCR_MODEL_VERSION, params.ocr_version
                )
            )
        params.use_gpu = check_gpu(params.use_gpu)

        if not params.show_log:
            logger.setLevel(logging.INFO)
        self.use_angle_cls = params.use_angle_cls
        lang, det_lang = parse_lang(params.lang)

        # init model dir
        det_model_config = get_model_config(" OCR ", params.ocr_version, " det ", det_lang)
        params.det_model_dir, det_url = confirm_model_dir_url(
            params.det_model_dir,
            os.path.join(BASE_DIR, " whl ", " det ", det_lang),
            det_model_config[" url "],
        )
        rec_model_config = get_model_config(" OCR ", params.ocr_version, " rec ", lang)
        params.rec_model_dir, rec_url = confirm_model_dir_url(
            params.rec_model_dir,
            os.path.join(BASE_DIR, " whl ", " rec ", lang),
            rec_model_config[" url "],
        )
        cls_model_config = get_model_config(" OCR ", params.ocr_version, " cls ", " ch ")
        params.cls_model_dir, cls_url = confirm_model_dir_url(
            params.cls_model_dir,
            os.path.join(BASE_DIR, " whl ", " cls "),
            cls_model_config[" url "],
        )
        if params.ocr_version in [" PP-OCRv3 ", " PP-OCRv4 "]:
            params.rec_image_shape = " 3, 48, 320 "
        else:
            params.rec_image_shape = " 3, 32, 320 "
        # download model if using paddle infer
        if not params.use_onnx:
            maybe_download(params.det_model_dir, det_url)
            maybe_download(params.rec_model_dir, rec_url)
            maybe_download(params.cls_model_dir, cls_url)

        if params.det_algorithm not in SUPPORT_DET_MODEL:
            logger.error(" det_algorithm must in {} ".format(SUPPORT_DET_MODEL))
            sys.exit(0)
        if params.rec_algorithm not in SUPPORT_REC_MODEL:
            logger.error(" rec_algorithm must in {} ".format(SUPPORT_REC_MODEL))
            sys.exit(0)

        if params.rec_char_dict_path is None:
            # Fall back to the dictionary shipped next to this file.
            params.rec_char_dict_path = str(
                Path(__file__).parent / rec_model_config[" dict_path "]
            )

        logger.debug(params)
        # init det_model and rec_model
        super().__init__(params)
        self.page_num = params.page_num

    def ocr(
        self,
        img,
        det=True,
        rec=True,
        cls=True,
        bin=False,
        inv=False,
        alpha_color=(255, 255, 255),
        slice=None,
    ):
        """
        OCR with PaddleOCR

        args:
            img: img for OCR, support ndarray, img_path, bytes and list of ndarray
            det: use text detection or not. If False, only rec will be exec. Default is True
            rec: use text recognition or not. If False, only det will be exec. Default is True
            cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
            bin: binarize image to black and white. Default is False.
            inv: invert image colors. Default is False.
            alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
            slice: use sliding window inference for large images, det and rec must be True. Requires int values for slice["horizontal_stride"], slice["vertical_stride"], slice["merge_x_thres"], slice["merge_y_thres"] (See doc/doc_en/slice_en.md). Default is None, which is treated as an empty dict.

        returns:
            A list with one entry per processed image/page. With det and rec
            it holds ``[box, recognition_result]`` pairs, with det only it
            holds boxes, and an entry is None when nothing was found.
            Without det it holds the recognizer output, or the classifier
            output when rec is also False.

        raises:
            AssertionError: if ``img`` is not an ndarray, list, str or bytes.
        """
        if not isinstance(img, (np.ndarray, list, str, bytes)):
            # Explicit raise so the type check survives `python -O`.
            raise AssertionError("img must be an ndarray, list, str or bytes")
        # A fresh dict per call avoids sharing one mutable default object.
        slice = {} if slice is None else slice

        if isinstance(img, list) and det:
            logger.error(" When input a list of images, det must be false ")
            # sys.exit rather than the site-provided `exit` helper, which is
            # not guaranteed to exist (e.g. `python -S`, frozen apps).
            sys.exit(0)
        if cls and not self.use_angle_cls:
            logger.warning(
                " Since the angle classifier is not initialized, it will not be used during the forward process "
            )

        img, flag_gif, flag_pdf = check_img(img, alpha_color)
        # for infer pdf file
        if isinstance(img, list) and flag_pdf:
            # page_num == 0 (or larger than the document) means "all pages".
            if self.page_num > len(img) or self.page_num == 0:
                imgs = img
            else:
                imgs = img[: self.page_num]
        else:
            imgs = [img]

        def preprocess_image(_image):
            _image = alpha_to_color(_image, alpha_color)
            if inv:
                _image = cv2.bitwise_not(_image)
            if bin:
                _image = binarize_img(_image)
            return _image

        if det and rec:
            ocr_res = []
            for img in imgs:
                img = preprocess_image(img)
                dt_boxes, rec_res, _ = self.__call__(img, cls, slice)
                if not dt_boxes and not rec_res:
                    ocr_res.append(None)
                    continue
                tmp_res = [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)]
                ocr_res.append(tmp_res)
            return ocr_res

        if det and not rec:
            ocr_res = []
            for img in imgs:
                img = preprocess_image(img)
                dt_boxes, elapse = self.text_detector(img)
                if dt_boxes.size == 0:
                    ocr_res.append(None)
                    continue
                tmp_res = [box.tolist() for box in dt_boxes]
                ocr_res.append(tmp_res)
            return ocr_res

        # No detection: inputs are treated as already-cropped text images.
        ocr_res = []
        cls_res = []
        for img in imgs:
            if not isinstance(img, list):
                img = [preprocess_image(img)]
            if self.use_angle_cls and cls:
                img, cls_res_tmp, elapse = self.text_classifier(img)
                if not rec:
                    cls_res.append(cls_res_tmp)
            if rec:
                # Recognition is skipped entirely when rec is False; its
                # output was previously computed and then thrown away.
                rec_res, elapse = self.text_recognizer(img)
                ocr_res.append(rec_res)
        return ocr_res if rec else cls_res
2020-08-24 11:30:56 +08:00
2022-03-30 17:24:24 +08:00
class PPStructure(StructureSystem):
    """Structure-analysis engine configured from CLI defaults plus keyword overrides."""

    def __init__(self, **kwargs):
        """Resolve parameters and model files, then initialise ``StructureSystem``.

        Defaults come from ``parse_args(mMain=False)``; every entry of
        ``kwargs`` overrides the attribute of the same name. Model
        directories/URLs are resolved for the det, rec, table and layout
        models, the models are fetched with ``maybe_download`` unless
        ``use_onnx`` is set, and any dictionary path left as ``None`` is
        filled from the matching model config.
        """
        params = parse_args(mMain=False)
        vars(params).update(kwargs)
        # NOTE(review): an assert is skipped when Python runs with -O.
        assert (
            params.structure_version in SUPPORT_STRUCTURE_MODEL_VERSION
        ), "structure_version must in {}, but get {}".format(
            SUPPORT_STRUCTURE_MODEL_VERSION, params.structure_version
        )
        params.use_gpu = check_gpu(params.use_gpu)
        params.mode = "structure"

        if not params.show_log:
            logger.setLevel(logging.INFO)

        lang, det_lang = parse_lang(params.lang)
        # Table models are looked up under "ch" or "en" only.
        table_lang = "ch" if lang == "ch" else "en"
        if params.structure_version == "PP-Structure":
            params.merge_no_span_structure = False

        # Resolve the directory and download URL of each model.
        det_cfg = get_model_config("OCR", params.ocr_version, "det", det_lang)
        params.det_model_dir, det_url = confirm_model_dir_url(
            params.det_model_dir,
            os.path.join(BASE_DIR, "whl", "det", det_lang),
            det_cfg["url"],
        )

        rec_cfg = get_model_config("OCR", params.ocr_version, "rec", lang)
        params.rec_model_dir, rec_url = confirm_model_dir_url(
            params.rec_model_dir,
            os.path.join(BASE_DIR, "whl", "rec", lang),
            rec_cfg["url"],
        )

        table_cfg = get_model_config(
            "STRUCTURE", params.structure_version, "table", table_lang
        )
        params.table_model_dir, table_url = confirm_model_dir_url(
            params.table_model_dir,
            os.path.join(BASE_DIR, "whl", "table"),
            table_cfg["url"],
        )

        layout_cfg = get_model_config(
            "STRUCTURE", params.structure_version, "layout", lang
        )
        params.layout_model_dir, layout_url = confirm_model_dir_url(
            params.layout_model_dir,
            os.path.join(BASE_DIR, "whl", "layout"),
            layout_cfg["url"],
        )

        # Fetch the model files; skipped entirely when ONNX models are used.
        if not params.use_onnx:
            for model_dir, model_url in (
                (params.det_model_dir, det_url),
                (params.rec_model_dir, rec_url),
                (params.table_model_dir, table_url),
                (params.layout_model_dir, layout_url),
            ):
                maybe_download(model_dir, model_url)

        # Dictionary paths not supplied by the caller are taken from the model
        # configs, relative to the directory containing this file.
        package_dir = Path(__file__).parent
        if params.rec_char_dict_path is None:
            params.rec_char_dict_path = str(package_dir / rec_cfg["dict_path"])
        if params.table_char_dict_path is None:
            params.table_char_dict_path = str(package_dir / table_cfg["dict_path"])
        if params.layout_dict_path is None:
            params.layout_dict_path = str(package_dir / layout_cfg["dict_path"])

        logger.debug(params)
        super().__init__(params)

    def __call__(
        self,
        img,
        return_ocr_result_in_table=False,
        img_idx=0,
        alpha_color=(255, 255, 255),
    ):
        """Analyse one input and return the parent's result (or one per PDF page).

        ``img`` and ``alpha_color`` are handed to ``check_img``. If that yields
        a list flagged as PDF, each element is analysed with its position as
        ``img_idx`` and the list of per-page results is returned; otherwise the
        single result for ``img`` is returned using the caller's ``img_idx``.
        """
        img, flag_gif, flag_pdf = check_img(img, alpha_color)
        analyze = super().__call__

        # Single-input case: anything that is not a list of PDF pages.
        if not isinstance(img, list) or not flag_pdf:
            single_res, _ = analyze(img, return_ocr_result_in_table, img_idx=img_idx)
            return single_res

        page_count = len(img)
        page_results = []
        for page_no, page_img in enumerate(img):
            logger.info("processing {}/{} page:".format(page_no + 1, page_count))
            page_res, _ = analyze(
                page_img, return_ocr_result_in_table, img_idx=page_no
            )
            page_results.append(page_res)
        return page_results
2020-08-24 11:30:56 +08:00
def main():
    """Command-line entry point: run OCR or structure analysis on the inputs.

    Arguments are read with ``parse_args(mMain=True)``. When ``args.image_dir``
    is a link it is downloaded to ``tmp.jpg``; otherwise it is expanded with
    ``get_image_file_list``. Depending on ``args.type`` each input is handled
    by a ``PaddleOCR`` or a ``PPStructure`` engine. Results are logged and,
    where requested, written below ``args.output``.

    Raises:
        NotImplementedError: if ``args.type`` is neither ``"ocr"`` nor
            ``"structure"``.
    """
    # for cmd
    args = parse_args(mMain=True)
    image_dir = args.image_dir
    if is_link(image_dir):
        download_with_progressbar(image_dir, "tmp.jpg")
        image_file_list = ["tmp.jpg"]
    else:
        image_file_list = get_image_file_list(args.image_dir)
    if len(image_file_list) == 0:
        logger.error("no images find in {}".format(args.image_dir))
        return

    if args.type == "ocr":
        engine = PaddleOCR(**(args.__dict__))
    elif args.type == "structure":
        engine = PPStructure(**(args.__dict__))
    else:
        raise NotImplementedError

    for img_path in image_file_list:
        img_name = os.path.basename(img_path).split(".")[0]
        logger.info("{}{}{}".format("*" * 10, img_path, "*" * 10))
        if args.type == "ocr":
            result = engine.ocr(
                img_path,
                det=args.det,
                rec=args.rec,
                cls=args.use_angle_cls,
                bin=args.binarize,
                inv=args.invert,
                alpha_color=args.alphacolor,
            )
            if result is not None:
                lines = []
                for res in result:
                    # The result list holds None for an image/page on which
                    # nothing was detected; iterating it would raise TypeError.
                    if res is None:
                        continue
                    for line in res:
                        logger.info(line)
                        lines.append(pprint.pformat(line) + "\n")
                if args.savefile:
                    # makedirs also copes with nested output paths and with a
                    # directory that already exists.
                    os.makedirs(args.output, exist_ok=True)
                    outfile = os.path.join(args.output, img_name + ".txt")
                    with open(outfile, "w", encoding="utf-8") as f:
                        f.writelines(lines)

        elif args.type == "structure":
            img, flag_gif, flag_pdf = check_and_read(img_path)
            if not flag_gif and not flag_pdf:
                img = cv2.imread(img_path)

            if args.recovery and args.use_pdf2docx_api and flag_pdf:
                # Direct PDF -> DOCX conversion; no model inference involved.
                try_import("pdf2docx")
                from pdf2docx.converter import Converter

                os.makedirs(args.output, exist_ok=True)
                docx_file = os.path.join(args.output, "{}.docx".format(img_name))
                cv = Converter(img_path)
                try:
                    cv.convert(docx_file)
                finally:
                    # Release the converter even when the conversion fails.
                    cv.close()
                logger.info("docx save to {}".format(docx_file))
                continue

            if not flag_pdf:
                if img is None:
                    logger.error("error in loading image:{}".format(img_path))
                    continue
                img_paths = [[img_path, img]]
            else:
                # Dump every PDF page as a JPEG under <output>/<img_name>/.
                page_dir = os.path.join(args.output, img_name)
                os.makedirs(page_dir, exist_ok=True)
                img_paths = []
                for index, pdf_img in enumerate(img):
                    pdf_img_path = os.path.join(
                        page_dir, img_name + "_" + str(index) + ".jpg"
                    )
                    cv2.imwrite(pdf_img_path, pdf_img)
                    img_paths.append([pdf_img_path, pdf_img])

            all_res = []
            for index, (_, img) in enumerate(img_paths):
                logger.info("processing {}/{} page:".format(index + 1, len(img_paths)))
                result = engine(img, img_idx=index)
                save_structure_res(result, args.output, img_name, index)

                if args.recovery and result != []:
                    from copy import deepcopy
                    from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes

                    # Only the width is needed, so 2-D images work as well.
                    w = img.shape[1]
                    result_cp = deepcopy(result)
                    result_sorted = sorted_layout_boxes(result_cp, w)
                    all_res += result_sorted

            if args.recovery and all_res != []:
                try:
                    from ppstructure.recovery.recovery_to_doc import convert_info_docx

                    # `img` is the image of the last processed page here.
                    convert_info_docx(img, all_res, args.output, img_name)
                except Exception as ex:
                    logger.error(
                        "error in layout recovery image:{}, err msg: {}".format(
                            img_name, ex
                        )
                    )
                    continue

            for item in all_res:
                # Drop the "img" and "res" entries before logging the item.
                item.pop("img")
                item.pop("res")
                logger.info(item)
            logger.info("result save to {}".format(args.output))