Merge branch 'feature/img' into feature/0611

周伟奇
Showing 3 changed files with 23 additions and 3 deletions
src/apps/doc/management/commands/ocr_process.py
src/celery_compare/tasks.py
src/common/tools/pdf_to_img.py
--- a/src/apps/doc/management/commands/ocr_process.py
View file @906f258
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @906f258
@@ -585,7 +585,7 @@ class Command(BaseCommand, LoggerMixin):
                    img_save_path = os.path.join(doc_data_path, 'img')
                    pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))

-                    pdf_handler = PDFHandler(pdf_path, img_save_path)
+                    pdf_handler = PDFHandler(pdf_path, img_save_path, doc.document_name)
                    max_count_obj = Configs.objects.filter(id=2).first()
                    try:
                        max_img_count = int(max_count_obj.value)
--- a/src/celery_compare/tasks.py
View file @906f258
+++ b/src/celery_compare/tasks.py
View file @906f258
@@ -27,7 +27,7 @@ from apps.doc.named_enum import RequestTeam, RequestTrigger, ProcessName
 from common.tools.comparison import cp

 compare_log = logging.getLogger('compare')
-log_base = '[CA Compare]'
+log_base = '[Compare]'


 def name_check(ocr_res_dict, second_ocr_field, second_compare_list, second_id_num, name):
--- a/src/common/tools/pdf_to_img.py
View file @906f258
+++ b/src/common/tools/pdf_to_img.py
View file @906f258
 import os
+import shutil
 import fitz
 from PIL import Image
 from io import BytesIO
@@ -22,12 +23,25 @@ WH_COUPLE_5 = (100, 200)

 class PDFHandler:

-    def __init__(self, path, img_dir_path):
+    def __init__(self, path, img_dir_path, document_name=None):
        self.path = path
        self.img_dir_path = img_dir_path
        self.img_path_list = []
        self.img_count = 0
        self.xref_set = set()
+        self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'}
+        self.suffix = self.get_suffix(document_name)
+
+    def get_suffix(self, file_name):
+        """Return file_name's lower-cased image extension (with dot), or None."""
+        if file_name is None:
+            return None
+        try:
+            lower_suffix = os.path.splitext(file_name)[1].lower()
+        except TypeError:  # file_name is not str, bytes or os.PathLike
+            return None
+        # Only extensions listed in img_suffixs count as images.
+        return lower_suffix if lower_suffix in self.img_suffixs else None

    def get_img_save_path(self, pno, img_index=0, ext='png'):
        return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext))
@@ -197,6 +211,12 @@ class PDFHandler:
        self.img_path_list = []
        self.xref_set = set()
        os.makedirs(self.img_dir_path, exist_ok=True)
+
+        if self.suffix in self.img_suffixs:
+            img_save_path = self.get_img_save_path(0, ext=self.suffix[1:])
+            shutil.copy(self.path, img_save_path)
+            self.img_path_list.append(img_save_path)
+        else:
            with fitz.Document(self.path) as pdf:
                if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
                    self.img_count = pdf.pageCount