Merge branch 'feature/0611' of gitlab.situdata.com:zhouweiqi/bmw-ocr into feature/0611

周伟奇
Showing 3 changed files with 48 additions and 28 deletions
src/apps/doc/management/commands/ocr_process.py
src/celery_compare/tasks.py
src/common/tools/pdf_to_img.py
--- a/src/apps/doc/management/commands/ocr_process.py
View file @c1f24ad
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @c1f24ad
@@ -585,7 +585,7 @@ class Command(BaseCommand, LoggerMixin):
                    img_save_path = os.path.join(doc_data_path, 'img')
                    pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))

-                    pdf_handler = PDFHandler(pdf_path, img_save_path)
+                    pdf_handler = PDFHandler(pdf_path, img_save_path, doc.document_name)
                    max_count_obj = Configs.objects.filter(id=2).first()
                    try:
                        max_img_count = int(max_count_obj.value)
--- a/src/celery_compare/tasks.py
View file @c1f24ad
+++ b/src/celery_compare/tasks.py
View file @c1f24ad
@@ -27,7 +27,7 @@ from apps.doc.named_enum import RequestTeam, RequestTrigger, ProcessName
 from common.tools.comparison import cp

 compare_log = logging.getLogger('compare')
-log_base = '[CA Compare]'
+log_base = '[Compare]'


 def name_check(ocr_res_dict, second_ocr_field, second_compare_list, second_id_num, name):
--- a/src/common/tools/pdf_to_img.py
View file @c1f24ad
+++ b/src/common/tools/pdf_to_img.py
View file @c1f24ad
 import os
+import shutil
 import fitz
 from PIL import Image
 from io import BytesIO
@@ -22,12 +23,25 @@ WH_COUPLE_5 = (100, 200)

 class PDFHandler:

-    def __init__(self, path, img_dir_path):
+    def __init__(self, path, img_dir_path, document_name=None):
+        # path: source file that is opened (or copied) when images are extracted.
+        # img_dir_path: directory under which the resulting images are saved.
+        # document_name: optional original file name; only its extension is used.
        self.path = path
        self.img_dir_path = img_dir_path
        self.img_path_list = []
        self.img_count = 0
        self.xref_set = set()
+        # Lower-case image extensions (leading dot included) checked by get_suffix.
+        self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'}
+        # Image extension of document_name, or None when no name was given or
+        # its extension is not in self.img_suffixs.
+        self.suffix = self.get_suffix(document_name)
+
+    def get_suffix(self, file_name):
+        """Return the lower-cased image extension of ``file_name``, if any.
+
+        Args:
+            file_name: Original document name; may be ``None``.
+
+        Returns:
+            The extension with its leading dot (e.g. ``'.jpg'``) when it is
+            one of ``self.img_suffixs``; ``None`` for a missing name, a name
+            that is not path-like, or any other extension.
+        """
+        if file_name is None:
+            return None
+        try:
+            # Keep the try body to the calls that can fail on a bad name.
+            _, src_suffix = os.path.splitext(file_name)
+            lower_suffix = src_suffix.lower()
+        except (TypeError, AttributeError):
+            # Name is not a str/bytes/path-like value: treat as "not an image".
+            return None
+        if lower_suffix in self.img_suffixs:
+            return lower_suffix
+        return None

    def get_img_save_path(self, pno, img_index=0, ext='png'):
        return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext))
@@ -197,33 +211,39 @@ class PDFHandler:
        self.img_path_list = []
        self.xref_set = set()
        os.makedirs(self.img_dir_path, exist_ok=True)
-        with fitz.Document(self.path) as pdf:
-            if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
-                self.img_count = pdf.pageCount
-                return
-            for pno in range(pdf.pageCount):
-                il = pdf.getPageImageList(pno)  # 获取页面图片对象
-                # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
-
-                # 1.页面图片对象数目为0时，保存整个页面为png图片
-                if len(il) == 0:
-                    page = pdf.loadPage(pno)
-                    self.page_to_png(page)
-                # 2.页面图片对象数目为1时：
-                # 小图(如电子账单的盖章)：保存整个页面为png图片
-                # 大图：提取图片对象
-                elif len(il) == 1:
-                    xref, smask, width, height, _, colorspace, _, _, _ = il[0]
-                    # 小图
-                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
+
+        if self.suffix in self.img_suffixs:
+            img_save_path = self.get_img_save_path(0, ext=self.suffix[1:])
+            shutil.copy(self.path, img_save_path)
+            self.img_path_list.append(img_save_path)
+        else:
+            with fitz.Document(self.path) as pdf:
+                if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
+                    self.img_count = pdf.pageCount
+                    return
+                for pno in range(pdf.pageCount):
+                    il = pdf.getPageImageList(pno)  # 获取页面图片对象
+                    # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
+
+                    # 1.页面图片对象数目为0时，保存整个页面为png图片
+                    if len(il) == 0:
                        page = pdf.loadPage(pno)
                        self.page_to_png(page)
-                    # 大图
-                    elif xref not in self.xref_set:
-                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
-                # 3.页面图片对象数目大于1时，特殊处理
-                else:
-                    self.merge_il(pdf, pno, il)
+                    # 2.页面图片对象数目为1时：
+                    # 小图(如电子账单的盖章)：保存整个页面为png图片
+                    # 大图：提取图片对象
+                    elif len(il) == 1:
+                        xref, smask, width, height, _, colorspace, _, _, _ = il[0]
+                        # 小图
+                        if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
+                            page = pdf.loadPage(pno)
+                            self.page_to_png(page)
+                        # 大图
+                        elif xref not in self.xref_set:
+                            self.extract_single_image(pdf, xref, smask, colorspace, pno)
+                    # 3.页面图片对象数目大于1时，特殊处理
+                    else:
+                        self.merge_il(pdf, pno, il)
        self.img_count = len(self.img_path_list)

    def extract_page_image(self):