Merge branch 'feature/uat-tmp' of gitlab.situdata.com:zhouweiqi/bmw-ocr into feature/uat-tmp

冯轩
Showing 6 changed files with 35 additions and 13 deletions
.gitignore
src/apps/doc/consts.py
src/apps/doc/management/commands/ocr_process.py
src/common/fsm_econtract/fsm_contract_ocr.py
src/common/fsm_econtract/tools.py
src/common/tools/pdf_to_img.py
--- a/.gitignore
View file @ba0dc00
+++ b/.gitignore
View file @ba0dc00
@@ -31,4 +31,5 @@ conf/*
 data/*

 test*
-flow_test.py
\ No newline at end of file
+flow_test.py
+pdf_test.py
\ No newline at end of file
--- a/src/apps/doc/consts.py
View file @ba0dc00
+++ b/src/apps/doc/consts.py
View file @ba0dc00
@@ -2434,14 +2434,14 @@ ECONTRACT_KEYWORDS_MAP = {

 FSM_ECONTRACT_KEYWORDS_MAP = {
    AFC_PREFIX: [
-        ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY),
+        ('延长保修服务合约', FSM_CONTRACT_WEP_CLASSIFY),
        ('长悦保养套餐服务合约', FSM_CONTRACT_MSI_CLASSIFY),
        ('汽车销售合同补充合同', FSM_CONTRACT_SC2_CLASSIFY),
        ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
    ],
    HIL_PREFIX: [
-        ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY),
-        ('长悦保养套餐服务合同', FSM_CONTRACT_MSI_CLASSIFY),
+        ('延长保修服务合约', FSM_CONTRACT_WEP_CLASSIFY),
+        ('长悦保养套餐服务合约', FSM_CONTRACT_MSI_CLASSIFY),
        ('汽车销售合同补充合同', FSM_CONTRACT_SC2_CLASSIFY),
        ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
    ]
--- a/src/apps/doc/management/commands/ocr_process.py
View file @ba0dc00
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @ba0dc00
@@ -1339,8 +1339,8 @@ class Command(BaseCommand, LoggerMixin):
                                pdf_handler.extract_image(max_img_count)
                                end_time = time.time()
                                speed_time = int(end_time - start_time)
-                                self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
-                                    self.log_base, task_str, times, speed_time))
+                                self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
+                                    self.log_base, task_str, times, speed_time, pdf_handler.is_new_modify))
                            except Exception as e:
                                self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
                                                      '[error={3}]'.format(self.log_base, task_str, times,
--- a/src/common/fsm_econtract/fsm_contract_ocr.py
View file @ba0dc00
+++ b/src/common/fsm_econtract/fsm_contract_ocr.py
View file @ba0dc00
@@ -6,7 +6,7 @@ retriever_list = [Retriever(WEP_FIELD), Retriever(MSI_FIELD), Retriever(SC_FIELD

 def predict(pdf_info, file_type=0):
    retriever =  retriever_list[file_type]
-    pdf_text_list, pdf_img_list = pdf_info_rebuild(pdf_info) 
+    pdf_text_list, pdf_img_list = pdf_info_rebuild(pdf_info, file_type=file_type) 
    return retriever.get_target_fields(pdf_text_list, pdf_img_list)


--- a/src/common/fsm_econtract/tools.py
View file @ba0dc00
+++ b/src/common/fsm_econtract/tools.py
View file @ba0dc00
-def pdf_info_rebuild(pdf_info, fix_bbox=True):
+def pdf_info_rebuild(pdf_info, fix_bbox=True, file_type=0):
    pdf_text_info = dict()
    pdf_img_info = dict()
    for pno_str, page_info in pdf_info.items():
@@ -11,7 +11,8 @@ def pdf_info_rebuild(pdf_info, fix_bbox=True):
                    for span in line['spans']:
                        bbox, text = span['bbox'], span['text'].strip()
                        if len(text) != 0 and text not in text_set:
-                            text_set.add(text)
+                            if file_type != 3:  # 汽车销售合同补充协议，相同的总价会被过滤，所以取消
+                                text_set.add(text)
                            # bbox的高，不准
                            if fix_bbox and bbox[-1] - bbox[1] < span['size']:
                                bbox[-1] = bbox[-1] + span['size']
--- a/src/common/tools/pdf_to_img.py
View file @ba0dc00
+++ b/src/common/tools/pdf_to_img.py
View file @ba0dc00
@@ -12,8 +12,10 @@ from unicodedata import normalize
 # 页面保存为png图片参数
 ZOOM_X_1 = ZOOM_Y_1 = 1.0
 ZOOM_X_2 = ZOOM_Y_2 = 2.0
+ZOOM_X_3 = ZOOM_Y_3 = 3.0
 trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0)  # zoom factor 1 in each dimension
 trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0)  # zoom factor 2 in each dimension
+trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_X_3).preRotate(0)  # zoom factor 3 in each dimension

 # 特殊filter处理
 ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}
@@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100)
 WH_COUPLE_4 = (100, 300)
 WH_COUPLE_5 = (100, 200)

+# 碎图宽度阈值
+TINY_IMG_MAX_WIDTH = 1400
+
+# 大图宽高阈值
+WH_COUPLE_6 = (1800, 1400)
+WH_COUPLE_7 = (2500, 3000)

 class PDFBuild:

@@ -55,6 +63,7 @@ class PDFHandler:
        self.img_dir_path = img_dir_path
        self.img_path_list = []
        self.img_count = 0
+        self.is_new_modify = 0 # 用于记录受新改动影响的PDF
        self.xref_set = set()
        self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'}
        self.suffix = self.get_suffix(document_name)
@@ -165,8 +174,10 @@ class PDFHandler:
        except Exception as e:
            pass

-    def page_to_png(self, page):
-        if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500:
+    def page_to_png(self, page, is_big_img=False):
+        if is_big_img:
+            pm = page.getPixmap(matrix=trans_3, alpha=False)
+        elif page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500:
            pm = page.getPixmap(matrix=trans_1, alpha=False)
        else:
            pm = page.getPixmap(matrix=trans_2, alpha=False)
@@ -236,8 +247,8 @@ class PDFHandler:
        self.xref_set.add(xref)
        self.img_path_list.append(img_save_path)

-    @staticmethod
-    def split_il(il):
+    # @staticmethod
+    def split_il(self, il):
        broken_il = []
        start = 0
        length = len(il)
@@ -247,6 +258,10 @@ class PDFHandler:
            if il[i][-1] in ADOBE_FILTER_SET:
                page_to_png = True
                break
+            if il[i][2] >= TINY_IMG_MAX_WIDTH:
+                self.is_new_modify = 1
+                page_to_png = True
+                break
        else:
            for i in range(length):
                # 当图片对象够大时，不作碎图合并处理，而是单纯提取
@@ -446,6 +461,11 @@ class PDFHandler:
                            page = pdf.loadPage(pno)
                            self.page_to_png(page)
                        # 大图
+                        elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
+                            self.is_new_modify = 1
+                            is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1]) # 防止图片过大
+                            page = pdf.loadPage(pno)
+                            self.page_to_png(page, is_big_img=is_big_img) 
                        elif xref not in self.xref_set:
                            self.extract_single_image(pdf, xref, smask, colorspace, pno)
                    # 3.页面图片对象数目大于1时，特殊处理