mirror of
https://github.com/PaddlePaddle/PaddleOCR.git
synced 2025-06-03 21:53:39 +08:00
pdf2word v0.2.2
pdf2word v0.2.2
This commit is contained in:
parent
3907c72a08
commit
0f70eaf285
@@ -7,8 +7,11 @@ import functools
|
|||||||
import cv2
|
import cv2
|
||||||
import platform
|
import platform
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import fitz
|
||||||
|
from PIL import Image
|
||||||
|
from pdf2docx.converter import Converter
|
||||||
from qtpy.QtWidgets import QApplication, QWidget, QPushButton, QProgressBar, \
|
from qtpy.QtWidgets import QApplication, QWidget, QPushButton, QProgressBar, \
|
||||||
QGridLayout, QMessageBox, QLabel, QFileDialog
|
QGridLayout, QMessageBox, QLabel, QFileDialog, QCheckBox
|
||||||
from qtpy.QtCore import Signal, QThread, QObject
|
from qtpy.QtCore import Signal, QThread, QObject
|
||||||
from qtpy.QtGui import QImage, QPixmap, QIcon
|
from qtpy.QtGui import QImage, QPixmap, QIcon
|
||||||
|
|
||||||
@@ -17,6 +20,7 @@ root = os.path.abspath(os.path.join(file, '../../'))
|
|||||||
sys.path.append(file)
|
sys.path.append(file)
|
||||||
sys.path.insert(0, root)
|
sys.path.insert(0, root)
|
||||||
|
|
||||||
|
|
||||||
from ppstructure.predict_system import StructureSystem, save_structure_res
|
from ppstructure.predict_system import StructureSystem, save_structure_res
|
||||||
from ppstructure.utility import parse_args, draw_structure_result
|
from ppstructure.utility import parse_args, draw_structure_result
|
||||||
from ppocr.utils.network import download_with_progressbar
|
from ppocr.utils.network import download_with_progressbar
|
||||||
@@ -24,7 +28,7 @@ from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_in
|
|||||||
# from ScreenShotWidget import ScreenShotWidget
|
# from ScreenShotWidget import ScreenShotWidget
|
||||||
|
|
||||||
__APPNAME__ = "pdf2word"
|
__APPNAME__ = "pdf2word"
|
||||||
__VERSION__ = "0.1.1"
|
__VERSION__ = "0.2.2"
|
||||||
|
|
||||||
URLs_EN = {
|
URLs_EN = {
|
||||||
# 下载超英文轻量级PP-OCRv3模型的检测模型并解压
|
# 下载超英文轻量级PP-OCRv3模型的检测模型并解压
|
||||||
@@ -75,9 +79,7 @@ def QImageToCvMat(incomingImage) -> np.array:
|
|||||||
|
|
||||||
|
|
||||||
def readImage(image_file) -> list:
|
def readImage(image_file) -> list:
|
||||||
if os.path.basename(image_file)[-3:] in ['pdf']:
|
if os.path.basename(image_file)[-3:] == 'pdf':
|
||||||
import fitz
|
|
||||||
from PIL import Image
|
|
||||||
imgs = []
|
imgs = []
|
||||||
with fitz.open(image_file) as pdf:
|
with fitz.open(image_file) as pdf:
|
||||||
for pg in range(0, pdf.pageCount):
|
for pg in range(0, pdf.pageCount):
|
||||||
@@ -102,17 +104,22 @@ def readImage(image_file) -> list:
|
|||||||
|
|
||||||
class Worker(QThread):
|
class Worker(QThread):
|
||||||
progressBarValue = Signal(int)
|
progressBarValue = Signal(int)
|
||||||
|
progressBarRange = Signal(int)
|
||||||
endsignal = Signal()
|
endsignal = Signal()
|
||||||
|
exceptedsignal = Signal(str) #发送一个异常信号
|
||||||
loopFlag = True
|
loopFlag = True
|
||||||
|
|
||||||
def __init__(self, predictors, save_pdf, vis_font_path):
|
def __init__(self, predictors, save_pdf, vis_font_path, use_pdf2docx_api):
|
||||||
super(Worker, self).__init__()
|
super(Worker, self).__init__()
|
||||||
self.predictors = predictors
|
self.predictors = predictors
|
||||||
self.save_pdf = save_pdf
|
self.save_pdf = save_pdf
|
||||||
self.vis_font_path = vis_font_path
|
self.vis_font_path = vis_font_path
|
||||||
self.lang = 'EN'
|
self.lang = 'EN'
|
||||||
self.imagePaths = []
|
self.imagePaths = []
|
||||||
|
self.use_pdf2docx_api = use_pdf2docx_api
|
||||||
self.outputDir = None
|
self.outputDir = None
|
||||||
|
self.totalPageCnt = 0
|
||||||
|
self.pageCnt = 0
|
||||||
self.setStackSize(1024*1024)
|
self.setStackSize(1024*1024)
|
||||||
|
|
||||||
def setImagePath(self, imagePaths):
|
def setImagePath(self, imagePaths):
|
||||||
@@ -124,60 +131,90 @@ class Worker(QThread):
|
|||||||
def setOutputDir(self, outputDir):
|
def setOutputDir(self, outputDir):
|
||||||
self.outputDir = outputDir
|
self.outputDir = outputDir
|
||||||
|
|
||||||
def predictAndSave(self, imgs, img_name):
|
def setPDFParser(self, enabled):
|
||||||
|
self.use_pdf2docx_api = enabled
|
||||||
|
|
||||||
|
def resetPageCnt(self):
|
||||||
|
self.pageCnt = 0
|
||||||
|
|
||||||
|
def resetTotalPageCnt(self):
|
||||||
|
self.totalPageCnt = 0
|
||||||
|
|
||||||
|
def ppocrPrecitor(self, imgs, img_name):
|
||||||
all_res = []
|
all_res = []
|
||||||
|
# update progress bar ranges
|
||||||
|
self.totalPageCnt += len(imgs)
|
||||||
|
self.progressBarRange.emit(self.totalPageCnt)
|
||||||
|
# processing pages
|
||||||
for index, img in enumerate(imgs):
|
for index, img in enumerate(imgs):
|
||||||
res, time_dict = self.predictors[self.lang](img)
|
res, time_dict = self.predictors[self.lang](img)
|
||||||
|
|
||||||
# save output
|
# save output
|
||||||
save_structure_res(res, self.outputDir, img_name)
|
save_structure_res(res, self.outputDir, img_name)
|
||||||
draw_img = draw_structure_result(img, res, self.vis_font_path)
|
# draw_img = draw_structure_result(img, res, self.vis_font_path)
|
||||||
img_save_path = os.path.join(self.outputDir, img_name, 'show_{}.jpg'.format(index))
|
# img_save_path = os.path.join(self.outputDir, img_name, 'show_{}.jpg'.format(index))
|
||||||
if res != []:
|
# if res != []:
|
||||||
cv2.imwrite(img_save_path, draw_img)
|
# cv2.imwrite(img_save_path, draw_img)
|
||||||
|
|
||||||
# recovery
|
# recovery
|
||||||
h, w, _ = img.shape
|
h, w, _ = img.shape
|
||||||
res = sorted_layout_boxes(res, w)
|
res = sorted_layout_boxes(res, w)
|
||||||
all_res += res
|
all_res += res
|
||||||
|
self.pageCnt += 1
|
||||||
|
self.progressBarValue.emit(self.pageCnt)
|
||||||
|
|
||||||
|
if all_res != []:
|
||||||
try:
|
try:
|
||||||
convert_info_docx(img, all_res, self.outputDir, img_name, self.save_pdf)
|
convert_info_docx(imgs, all_res, self.outputDir, img_name)
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
print(self,
|
print("error in layout recovery image:{}, err msg: {}".
|
||||||
"error in layout recovery image:{}, err msg: {}".format(
|
format(img_name, ex))
|
||||||
img_name, ex))
|
print("Predict time : {:.3f}s".format(time_dict['all']))
|
||||||
|
|
||||||
print('result save to {}'.format(self.outputDir))
|
print('result save to {}'.format(self.outputDir))
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
|
self.resetPageCnt()
|
||||||
|
self.resetTotalPageCnt()
|
||||||
try:
|
try:
|
||||||
findex = 0
|
|
||||||
os.makedirs(self.outputDir, exist_ok=True)
|
os.makedirs(self.outputDir, exist_ok=True)
|
||||||
for i, image_file in enumerate(self.imagePaths):
|
for i, image_file in enumerate(self.imagePaths):
|
||||||
if self.loopFlag == True:
|
if not self.loopFlag:
|
||||||
|
break
|
||||||
|
# using use_pdf2docx_api for PDF parsing
|
||||||
|
if self.use_pdf2docx_api \
|
||||||
|
and os.path.basename(image_file)[-3:] == 'pdf':
|
||||||
|
self.totalPageCnt += 1
|
||||||
|
self.progressBarRange.emit(self.totalPageCnt)
|
||||||
|
print('===============using use_pdf2docx_api===============')
|
||||||
|
img_name = os.path.basename(image_file).split('.')[0]
|
||||||
|
docx_file = os.path.join(
|
||||||
|
self.outputDir, '{}.docx'.format(img_name))
|
||||||
|
cv = Converter(image_file)
|
||||||
|
cv.convert(docx_file)
|
||||||
|
cv.close()
|
||||||
|
print('docx save to {}'.format(docx_file))
|
||||||
|
self.pageCnt += 1
|
||||||
|
self.progressBarValue.emit(self.pageCnt)
|
||||||
|
else:
|
||||||
|
# using PPOCR for PDF/Image parsing
|
||||||
imgs = readImage(image_file)
|
imgs = readImage(image_file)
|
||||||
if len(imgs) == 0:
|
if len(imgs) == 0:
|
||||||
continue
|
continue
|
||||||
img_name = os.path.basename(image_file).split('.')[0]
|
img_name = os.path.basename(image_file).split('.')[0]
|
||||||
os.makedirs(os.path.join(self.outputDir, img_name), exist_ok=True)
|
os.makedirs(os.path.join(self.outputDir, img_name), exist_ok=True)
|
||||||
self.predictAndSave(imgs, img_name)
|
self.ppocrPrecitor(imgs, img_name)
|
||||||
findex += 1
|
# file processed
|
||||||
self.progressBarValue.emit(findex)
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
self.endsignal.emit()
|
self.endsignal.emit()
|
||||||
self.exec()
|
# self.exec()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
self.exceptedsignal.emit(str(e)) # 将异常发送给UI进程
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
class APP_Image2Doc(QWidget):
|
class APP_Image2Doc(QWidget):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.setFixedHeight(90)
|
self.setFixedHeight(100)
|
||||||
self.setFixedWidth(400)
|
self.setFixedWidth(420)
|
||||||
|
|
||||||
# settings
|
# settings
|
||||||
self.imagePaths = []
|
self.imagePaths = []
|
||||||
@@ -187,6 +224,7 @@ class APP_Image2Doc(QWidget):
|
|||||||
self.output_dir = None
|
self.output_dir = None
|
||||||
self.vis_font_path = os.path.join(root,
|
self.vis_font_path = os.path.join(root,
|
||||||
"doc", "fonts", "simfang.ttf")
|
"doc", "fonts", "simfang.ttf")
|
||||||
|
self.use_pdf2docx_api = False
|
||||||
|
|
||||||
# ProgressBar
|
# ProgressBar
|
||||||
self.pb = QProgressBar()
|
self.pb = QProgressBar()
|
||||||
@@ -207,10 +245,12 @@ class APP_Image2Doc(QWidget):
|
|||||||
}
|
}
|
||||||
|
|
||||||
# 设置工作进程
|
# 设置工作进程
|
||||||
self._thread = Worker(predictors, self.save_pdf, self.vis_font_path)
|
self._thread = Worker(predictors, self.save_pdf, self.vis_font_path, self.use_pdf2docx_api)
|
||||||
self._thread.progressBarValue.connect(self.handleProgressBarSingal)
|
self._thread.progressBarValue.connect(self.handleProgressBarUpdateSingal)
|
||||||
self._thread.endsignal.connect(self.handleEndsignalSignal)
|
self._thread.endsignal.connect(self.handleEndsignalSignal)
|
||||||
self._thread.finished.connect(QObject.deleteLater)
|
# self._thread.finished.connect(QObject.deleteLater)
|
||||||
|
self._thread.progressBarRange.connect(self.handleProgressBarRangeSingal)
|
||||||
|
self._thread.exceptedsignal.connect(self.handleThreadException)
|
||||||
self.time_start = 0 # save start time
|
self.time_start = 0 # save start time
|
||||||
|
|
||||||
def setupUi(self):
|
def setupUi(self):
|
||||||
@@ -233,25 +273,30 @@ class APP_Image2Doc(QWidget):
|
|||||||
self.startCNButton.setIcon(QIcon(QPixmap("./icons/chinese.png")))
|
self.startCNButton.setIcon(QIcon(QPixmap("./icons/chinese.png")))
|
||||||
layout.addWidget(self.startCNButton, 0, 1, 1, 1)
|
layout.addWidget(self.startCNButton, 0, 1, 1, 1)
|
||||||
self.startCNButton.clicked.connect(
|
self.startCNButton.clicked.connect(
|
||||||
functools.partial(self.handleStartSignal, 'CN'))
|
functools.partial(self.handleStartSignal, 'CN', False))
|
||||||
|
|
||||||
self.startENButton = QPushButton("英文转换")
|
self.startENButton = QPushButton("英文转换")
|
||||||
self.startENButton.setIcon(QIcon(QPixmap("./icons/english.png")))
|
self.startENButton.setIcon(QIcon(QPixmap("./icons/english.png")))
|
||||||
layout.addWidget(self.startENButton, 0, 2, 1, 1)
|
layout.addWidget(self.startENButton, 0, 2, 1, 1)
|
||||||
self.startENButton.clicked.connect(
|
self.startENButton.clicked.connect(
|
||||||
functools.partial(self.handleStartSignal, 'EN'))
|
functools.partial(self.handleStartSignal, 'EN', False))
|
||||||
|
|
||||||
|
self.PDFParserButton = QPushButton('PDF解析', self)
|
||||||
|
layout.addWidget(self.PDFParserButton, 0, 3, 1, 1)
|
||||||
|
self.PDFParserButton.clicked.connect(
|
||||||
|
functools.partial(self.handleStartSignal, 'CN', True))
|
||||||
|
|
||||||
self.showResultButton = QPushButton("显示结果")
|
self.showResultButton = QPushButton("显示结果")
|
||||||
self.showResultButton.setIcon(QIcon(QPixmap("./icons/folder-open.png")))
|
self.showResultButton.setIcon(QIcon(QPixmap("./icons/folder-open.png")))
|
||||||
layout.addWidget(self.showResultButton, 0, 3, 1, 1)
|
layout.addWidget(self.showResultButton, 0, 4, 1, 1)
|
||||||
self.showResultButton.clicked.connect(self.handleShowResultSignal)
|
self.showResultButton.clicked.connect(self.handleShowResultSignal)
|
||||||
|
|
||||||
# ProgressBar
|
# ProgressBar
|
||||||
layout.addWidget(self.pb, 2, 0, 1, 4)
|
layout.addWidget(self.pb, 2, 0, 1, 5)
|
||||||
# time estimate label
|
# time estimate label
|
||||||
self.timeEstLabel = QLabel(
|
self.timeEstLabel = QLabel(
|
||||||
("Time Left: --"))
|
("Time Left: --"))
|
||||||
layout.addWidget(self.timeEstLabel, 3, 0, 1, 4)
|
layout.addWidget(self.timeEstLabel, 3, 0, 1, 5)
|
||||||
|
|
||||||
self.setLayout(layout)
|
self.setLayout(layout)
|
||||||
|
|
||||||
@@ -355,7 +400,6 @@ class APP_Image2Doc(QWidget):
|
|||||||
if len(selectedFiles) > 0:
|
if len(selectedFiles) > 0:
|
||||||
self.imagePaths = selectedFiles
|
self.imagePaths = selectedFiles
|
||||||
self.screenShot = None # discard screenshot temp image
|
self.screenShot = None # discard screenshot temp image
|
||||||
self.pb.setRange(0, len(self.imagePaths))
|
|
||||||
self.pb.setValue(0)
|
self.pb.setValue(0)
|
||||||
|
|
||||||
# def screenShotSlot(self):
|
# def screenShotSlot(self):
|
||||||
@@ -370,7 +414,7 @@ class APP_Image2Doc(QWidget):
|
|||||||
# self.pb.setRange(0, 1)
|
# self.pb.setRange(0, 1)
|
||||||
# self.pb.setValue(0)
|
# self.pb.setValue(0)
|
||||||
|
|
||||||
def handleStartSignal(self, lang):
|
def handleStartSignal(self, lang='EN', pdfParser=False):
|
||||||
if self.screenShot: # for screenShot
|
if self.screenShot: # for screenShot
|
||||||
img_name = 'screenshot_' + time.strftime("%Y%m%d%H%M%S", time.localtime())
|
img_name = 'screenshot_' + time.strftime("%Y%m%d%H%M%S", time.localtime())
|
||||||
image = QImageToCvMat(self.screenShot)
|
image = QImageToCvMat(self.screenShot)
|
||||||
@@ -386,10 +430,12 @@ class APP_Image2Doc(QWidget):
|
|||||||
self._thread.setOutputDir(self.output_dir)
|
self._thread.setOutputDir(self.output_dir)
|
||||||
self._thread.setImagePath(self.imagePaths)
|
self._thread.setImagePath(self.imagePaths)
|
||||||
self._thread.setLang(lang)
|
self._thread.setLang(lang)
|
||||||
|
self._thread.setPDFParser(pdfParser)
|
||||||
# disenble buttons
|
# disenble buttons
|
||||||
self.openFileButton.setEnabled(False)
|
self.openFileButton.setEnabled(False)
|
||||||
self.startCNButton.setEnabled(False)
|
self.startCNButton.setEnabled(False)
|
||||||
self.startENButton.setEnabled(False)
|
self.startENButton.setEnabled(False)
|
||||||
|
self.PDFParserButton.setEnabled(False)
|
||||||
# 启动工作进程
|
# 启动工作进程
|
||||||
self._thread.start()
|
self._thread.start()
|
||||||
self.time_start = time.time() # log start time
|
self.time_start = time.time() # log start time
|
||||||
@@ -411,7 +457,7 @@ class APP_Image2Doc(QWidget):
|
|||||||
QMessageBox.information(self,
|
QMessageBox.information(self,
|
||||||
u'Information', "输出文件不存在")
|
u'Information', "输出文件不存在")
|
||||||
|
|
||||||
def handleProgressBarSingal(self, i):
|
def handleProgressBarUpdateSingal(self, i):
|
||||||
self.pb.setValue(i)
|
self.pb.setValue(i)
|
||||||
# calculate time left of recognition
|
# calculate time left of recognition
|
||||||
lenbar = self.pb.maximum()
|
lenbar = self.pb.maximum()
|
||||||
@@ -419,13 +465,24 @@ class APP_Image2Doc(QWidget):
|
|||||||
time_left = str(datetime.timedelta(seconds=avg_time * (lenbar - i))).split(".")[0] # Remove microseconds
|
time_left = str(datetime.timedelta(seconds=avg_time * (lenbar - i))).split(".")[0] # Remove microseconds
|
||||||
self.timeEstLabel.setText(f"Time Left: {time_left}") # show time left
|
self.timeEstLabel.setText(f"Time Left: {time_left}") # show time left
|
||||||
|
|
||||||
|
def handleProgressBarRangeSingal(self, max):
|
||||||
|
self.pb.setRange(0, max)
|
||||||
|
|
||||||
def handleEndsignalSignal(self):
|
def handleEndsignalSignal(self):
|
||||||
# enble buttons
|
# enble buttons
|
||||||
self.openFileButton.setEnabled(True)
|
self.openFileButton.setEnabled(True)
|
||||||
self.startCNButton.setEnabled(True)
|
self.startCNButton.setEnabled(True)
|
||||||
self.startENButton.setEnabled(True)
|
self.startENButton.setEnabled(True)
|
||||||
|
self.PDFParserButton.setEnabled(True)
|
||||||
QMessageBox.information(self, u'Information', "转换结束")
|
QMessageBox.information(self, u'Information', "转换结束")
|
||||||
|
|
||||||
|
def handleCBChangeSignal(self):
|
||||||
|
self._thread.setPDFParser(self.checkBox.isChecked())
|
||||||
|
|
||||||
|
def handleThreadException(self, message):
|
||||||
|
self._thread.quit()
|
||||||
|
QMessageBox.information(self, message)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
app = QApplication(sys.argv)
|
app = QApplication(sys.argv)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user