【OCR Issue No.9】移除明确不适合放在ppocr依赖中的依赖项 (#11946)

* modify requirements * Update requirements.txt * Update requirements.txt * try import pdfconvert * try import lxml * try import lxml * try import premailer * try import openpyxl * Apply suggestions from code review
2025-06-03 21:53:39 +08:00 · 2024-04-26 16:54:49 +08:00 · 2024-04-26 16:54:49 +08:00 · b5eedf727e
commit b5eedf727e
parent b32677cd3b
8 changed files with 44 additions and 25 deletions
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -19,6 +19,7 @@ import importlib
 __dir__ = os.path.dirname(__file__)

 import paddle
+from paddle.utils import try_import

 sys.path.append(os.path.join(__dir__, ""))

@@ -910,6 +911,7 @@ def main():
                img = cv2.imread(img_path)

            if args.recovery and args.use_pdf2docx_api and flag_pdf:
+                try_import("pdf2docx")
                from pdf2docx.converter import Converter

                docx_file = os.path.join(args.output, "{}.docx".format(img_name))
--- a/ppstructure/pdf2word/pdf2word.py
+++ b/ppstructure/pdf2word/pdf2word.py
@@ -25,7 +25,6 @@ from paddle.utils import try_import

 fitz = try_import("fitz")
 from PIL import Image
-from pdf2docx.converter import Converter
 from qtpy.QtWidgets import (
    QApplication,
    QWidget,
@@ -209,6 +208,9 @@ class Worker(QThread):
                    break
                # using use_pdf2docx_api for PDF parsing
                if self.use_pdf2docx_api and os.path.basename(image_file)[-3:] == "pdf":
+                    try_import("pdf2docx")
+                    from pdf2docx.converter import Converter
+
                    self.totalPageCnt += 1
                    self.progressBarRange.emit(self.totalPageCnt)
                    print("===============using use_pdf2docx_api===============")
--- a/ppstructure/predict_system.py
+++ b/ppstructure/predict_system.py
@@ -28,6 +28,7 @@ import time
 import logging
 from copy import deepcopy

+from paddle.utils import try_import
 from ppocr.utils.utility import get_image_file_list, check_and_read
 from ppocr.utils.logging import get_logger
 from ppocr.utils.visual import draw_ser_results, draw_re_results
@@ -300,6 +301,7 @@ def main(args):
        img_name = os.path.basename(image_file).split(".")[0]

        if args.recovery and args.use_pdf2docx_api and flag_pdf:
+            try_import("pdf2docx")
            from pdf2docx.converter import Converter

            os.makedirs(args.output, exist_ok=True)
--- a/ppstructure/recovery/requirements.txt
+++ b/ppstructure/recovery/requirements.txt
@@ -2,4 +2,3 @@ python-docx
 beautifulsoup4
 fonttools>=4.24.0
 fire>=0.3.0
-pdf2docx
--- a/ppstructure/table/table_metric/table_metric.py
+++ b/ppstructure/table/table_metric/table_metric.py
@@ -12,10 +12,10 @@
 from rapidfuzz.distance import Levenshtein
 from apted import APTED, Config
 from apted.helpers import Tree
-from lxml import etree, html
 from collections import deque
 from .parallel import parallel_process
 from tqdm import tqdm
+from paddle.utils import try_import


 class TableTree(Tree):
@@ -161,6 +161,9 @@ class TEDS(object):
        """Computes TEDS score between the prediction and the ground truth of a
        given sample
        """
+        try_import("lxml")
+        from lxml import etree, html
+
        if (not pred) or (not true):
            return 0.0
        parser = html.HTMLParser(remove_comments=True, encoding="utf-8")
--- a/ppstructure/table/tablepyxl/style.py
+++ b/ppstructure/table/tablepyxl/style.py
@@ -1,19 +1,26 @@
 # This is where we handle translating css styles into openpyxl styles
 # and cascading those from parent to child in the dom.

-from openpyxl.cell import cell
-from openpyxl.styles import (
-    Font,
-    Alignment,
-    PatternFill,
-    NamedStyle,
-    Border,
-    Side,
-    Color,
-)
-from openpyxl.styles.fills import FILL_SOLID
-from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
-from openpyxl.styles.colors import BLACK
+try:
+    from openpyxl.cell import cell
+    from openpyxl.styles import (
+        Font,
+        Alignment,
+        PatternFill,
+        NamedStyle,
+        Border,
+        Side,
+        Color,
+    )
+    from openpyxl.styles.fills import FILL_SOLID
+    from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
+    from openpyxl.styles.colors import BLACK
+except:
+    import warnings
+
+    warnings.warn(
+        "Can not import openpyxl, some functions in the ppstructure may not work. Please manually install openpyxl before using ppstructure."
+    )

 FORMAT_DATE_MMDDYYYY = "mm/dd/yyyy"

--- a/ppstructure/table/tablepyxl/tablepyxl.py
+++ b/ppstructure/table/tablepyxl/tablepyxl.py
@@ -1,11 +1,9 @@
 # Do imports like python3 so our package works for 2 and 3
 from __future__ import absolute_import

-from lxml import html
-from openpyxl import Workbook
-from openpyxl.utils import get_column_letter
-from premailer import Premailer
+
 from tablepyxl.style import Table
+from paddle.utils import try_import


 def string_to_int(s):
@@ -15,6 +13,9 @@ def string_to_int(s):


 def get_Tables(doc):
+    try_import("lxml")
+    from lxml import etree, html
+
    tree = html.fromstring(doc)
    comments = tree.xpath("//comment()")
    for comment in comments:
@@ -27,7 +28,9 @@ def write_rows(worksheet, elem, row, column=1):
    Writes every tr child element of elem to a row in the worksheet
    returns the next row after all rows are written
    """
+    try_import("openpyxl")
    from openpyxl.cell.cell import MergedCell
+    from openpyxl.utils import get_column_letter

    initial_column = column
    for table_row in elem.rows:
@@ -87,6 +90,11 @@ def document_to_workbook(doc, wb=None, base_url=None):
    every table in the document.
    The workbook is returned
    """
+    try_import("premailer")
+    try_import("openpyxl")
+    from premailer import Premailer
+    from openpyxl import Workbook
+
    if not wb:
        wb = Workbook()
        wb.remove(wb.active)
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,9 +9,5 @@ rapidfuzz
 opencv-python<=4.6.0.66
 opencv-contrib-python<=4.6.0.66
 cython
-lxml
-premailer
-openpyxl
-attrdict
 Pillow>=10.0.0
-pyyaml
+pyyaml