mirror of
https://github.com/PaddlePaddle/PaddleOCR.git
synced 2025-06-03 21:53:39 +08:00
【OCR Issue No.9】移除明确不适合放在ppocr依赖中的依赖项 (#11946)
* modify requirements * Update requirements.txt * Update requirements.txt * try import pdfconvert * try import lxml * try import lxml * try import premailer * try import openpyxl * Apply suggestions from code review
This commit is contained in:
parent
b32677cd3b
commit
b5eedf727e
@ -19,6 +19,7 @@ import importlib
|
||||
__dir__ = os.path.dirname(__file__)
|
||||
|
||||
import paddle
|
||||
from paddle.utils import try_import
|
||||
|
||||
sys.path.append(os.path.join(__dir__, ""))
|
||||
|
||||
@ -910,6 +911,7 @@ def main():
|
||||
img = cv2.imread(img_path)
|
||||
|
||||
if args.recovery and args.use_pdf2docx_api and flag_pdf:
|
||||
try_import("pdf2docx")
|
||||
from pdf2docx.converter import Converter
|
||||
|
||||
docx_file = os.path.join(args.output, "{}.docx".format(img_name))
|
||||
|
@ -25,7 +25,6 @@ from paddle.utils import try_import
|
||||
|
||||
fitz = try_import("fitz")
|
||||
from PIL import Image
|
||||
from pdf2docx.converter import Converter
|
||||
from qtpy.QtWidgets import (
|
||||
QApplication,
|
||||
QWidget,
|
||||
@ -209,6 +208,9 @@ class Worker(QThread):
|
||||
break
|
||||
# using use_pdf2docx_api for PDF parsing
|
||||
if self.use_pdf2docx_api and os.path.basename(image_file)[-3:] == "pdf":
|
||||
try_import("pdf2docx")
|
||||
from pdf2docx.converter import Converter
|
||||
|
||||
self.totalPageCnt += 1
|
||||
self.progressBarRange.emit(self.totalPageCnt)
|
||||
print("===============using use_pdf2docx_api===============")
|
||||
|
@ -28,6 +28,7 @@ import time
|
||||
import logging
|
||||
from copy import deepcopy
|
||||
|
||||
from paddle.utils import try_import
|
||||
from ppocr.utils.utility import get_image_file_list, check_and_read
|
||||
from ppocr.utils.logging import get_logger
|
||||
from ppocr.utils.visual import draw_ser_results, draw_re_results
|
||||
@ -300,6 +301,7 @@ def main(args):
|
||||
img_name = os.path.basename(image_file).split(".")[0]
|
||||
|
||||
if args.recovery and args.use_pdf2docx_api and flag_pdf:
|
||||
try_import("pdf2docx")
|
||||
from pdf2docx.converter import Converter
|
||||
|
||||
os.makedirs(args.output, exist_ok=True)
|
||||
|
@ -2,4 +2,3 @@ python-docx
|
||||
beautifulsoup4
|
||||
fonttools>=4.24.0
|
||||
fire>=0.3.0
|
||||
pdf2docx
|
@ -12,10 +12,10 @@
|
||||
from rapidfuzz.distance import Levenshtein
|
||||
from apted import APTED, Config
|
||||
from apted.helpers import Tree
|
||||
from lxml import etree, html
|
||||
from collections import deque
|
||||
from .parallel import parallel_process
|
||||
from tqdm import tqdm
|
||||
from paddle.utils import try_import
|
||||
|
||||
|
||||
class TableTree(Tree):
|
||||
@ -161,6 +161,9 @@ class TEDS(object):
|
||||
"""Computes TEDS score between the prediction and the ground truth of a
|
||||
given sample
|
||||
"""
|
||||
try_import("lxml")
|
||||
from lxml import etree, html
|
||||
|
||||
if (not pred) or (not true):
|
||||
return 0.0
|
||||
parser = html.HTMLParser(remove_comments=True, encoding="utf-8")
|
||||
|
@ -1,19 +1,26 @@
|
||||
# This is where we handle translating css styles into openpyxl styles
|
||||
# and cascading those from parent to child in the dom.
|
||||
|
||||
from openpyxl.cell import cell
|
||||
from openpyxl.styles import (
|
||||
Font,
|
||||
Alignment,
|
||||
PatternFill,
|
||||
NamedStyle,
|
||||
Border,
|
||||
Side,
|
||||
Color,
|
||||
)
|
||||
from openpyxl.styles.fills import FILL_SOLID
|
||||
from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
|
||||
from openpyxl.styles.colors import BLACK
|
||||
# openpyxl is imported defensively: if it is not installed, importing this
# module still succeeds and only a warning is emitted. In that case the
# openpyxl names below stay undefined, so any code path that uses them will
# raise NameError until openpyxl is installed.
try:
    from openpyxl.cell import cell
    from openpyxl.styles import (
        Font,
        Alignment,
        PatternFill,
        NamedStyle,
        Border,
        Side,
        Color,
    )
    from openpyxl.styles.fills import FILL_SOLID
    from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
    from openpyxl.styles.colors import BLACK
except ImportError:
    # Catch only import failures. A bare ``except:`` would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated errors raised while
    # openpyxl itself is being imported.
    import warnings

    warnings.warn(
        "Can not import openpyxl, some functions in the ppstructure may not work. Please manually install openpyxl before using ppstructure."
    )

# Date number-format string defined locally rather than imported from openpyxl.
FORMAT_DATE_MMDDYYYY = "mm/dd/yyyy"
|
||||
|
||||
|
@ -1,11 +1,9 @@
|
||||
# Do imports like python3 so our package works for 2 and 3
|
||||
from __future__ import absolute_import
|
||||
|
||||
from lxml import html
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.utils import get_column_letter
|
||||
from premailer import Premailer
|
||||
|
||||
from tablepyxl.style import Table
|
||||
from paddle.utils import try_import
|
||||
|
||||
|
||||
def string_to_int(s):
|
||||
@ -15,6 +13,9 @@ def string_to_int(s):
|
||||
|
||||
|
||||
def get_Tables(doc):
|
||||
try_import("lxml")
|
||||
from lxml import etree, html
|
||||
|
||||
tree = html.fromstring(doc)
|
||||
comments = tree.xpath("//comment()")
|
||||
for comment in comments:
|
||||
@ -27,7 +28,9 @@ def write_rows(worksheet, elem, row, column=1):
|
||||
Writes every tr child element of elem to a row in the worksheet
|
||||
returns the next row after all rows are written
|
||||
"""
|
||||
try_import("openpyxl")
|
||||
from openpyxl.cell.cell import MergedCell
|
||||
from openpyxl.utils import get_column_letter
|
||||
|
||||
initial_column = column
|
||||
for table_row in elem.rows:
|
||||
@ -87,6 +90,11 @@ def document_to_workbook(doc, wb=None, base_url=None):
|
||||
every table in the document.
|
||||
The workbook is returned
|
||||
"""
|
||||
try_import("premailer")
|
||||
try_import("openpyxl")
|
||||
from premailer import Premailer
|
||||
from openpyxl import Workbook
|
||||
|
||||
if not wb:
|
||||
wb = Workbook()
|
||||
wb.remove(wb.active)
|
||||
|
@ -9,9 +9,5 @@ rapidfuzz
|
||||
opencv-python<=4.6.0.66
|
||||
opencv-contrib-python<=4.6.0.66
|
||||
cython
|
||||
lxml
|
||||
premailer
|
||||
openpyxl
|
||||
attrdict
|
||||
Pillow>=10.0.0
|
||||
pyyaml
|
||||
pyyaml
|
||||
|
Loading…
x
Reference in New Issue
Block a user