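Fix bug: match CNY as well as RMB amount lines, highlight contract numbers that differ from the most common one, and re-enable the e-bank check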

周伟奇
Showing 2 changed files with 78 additions and 48 deletions
src/apps/doc/management/commands/folder_wsc_process.py
src/common/tools/pdf_to_img.py
--- a/src/apps/doc/management/commands/folder_wsc_process.py
View file @92b21d6
+++ b/src/apps/doc/management/commands/folder_wsc_process.py
View file @92b21d6
@@ -19,16 +19,16 @@ from common.mixins import LoggerMixin
 from common.tools.pdf_to_img import PDFHandler
 from apps.doc import consts
 from apps.doc.exceptions import OCR1Exception, OCR4Exception
-from apps.doc.ocr.wb import BSWorkbook
+from apps.doc.ocr.wb import BSWorkbook, PatternFill


 class Finder:
    """Summary
-
+    
    Attributes:
        ocr_results (TYPE): Description
    """
-
+    
    def __init__(self, ocr_results=None):
        self.ocr_results = ocr_results

@@ -82,15 +82,15 @@ class Finder:
            for key in ocr_results[pno]:
                bbox, text = ocr_results[pno][key]
                ocr_texts += text
-            pattern = re.compile("[^\u4e00-\u9fa5]")  # 匹配不是中文的其他字符
+            pattern = re.compile("[^\u4e00-\u9fa5]")        # 匹配不是中文的其他字符
            ocr_texts = pattern.sub('', ocr_texts)

-            score = fuzz.ratio(page_template, ocr_texts) / 100.
+            score = fuzz.ratio(page_template, ocr_texts)/100.
            classes.append([pno, score])
        pred = sorted(classes, key=lambda x: x[1], reverse=True)[0]
        return pred

-    def get_top_key(self, ocr_results, key_string):  # 加入过滤词典
+    def get_top_key(self, ocr_results, key_string):          # 加入过滤词典
        """找到与 key_string 最匹配的字段的 key
        """
        if len(ocr_results) == 0:
@@ -111,7 +111,7 @@ class Finder:
                continue
            inter = Polygon(g).intersection(Polygon(p)).area
            union = g.area + p.area - inter
-            iou = inter / union
+            iou = inter/union
            iou_list.append([iou, key])
        if len(iou_list) == 0:
            return -1, -1
@@ -128,8 +128,8 @@ class Finder:
            bbox, text = ocr_results[key]
            # 定制化规则, 比如过滤一些词呀什么的
            # 该例中, 我们要去掉非中文字符
-            pattern = re.compile("[^\u4e00-\u9fa5]")  # 匹配不是中文的其他字符
-            text = pattern.sub('', text)
+            pattern = re.compile("[^\u4e00-\u9fa5]")        # 匹配不是中文的其他字符
+            text = pattern.sub('', text) 
            tmp_ocr_results[key] = [bbox, text]

        # 先根据 key_string 找到 key 的位置所在, 再判断该位置是否包含 value
@@ -141,8 +141,8 @@ class Finder:
            if len(words) == 0:
                # 将 bbox 右移一个单位
                x0, y0, x1, y1, x2, y2, x3, y3 = bbox
-                rw = abs(x0 - x1)
-                anchor = [x0 + rw, y0, x1 + rw, y1, x2 + rw, y2, x3 + rw, y3]
+                rw = abs(x0-x1)
+                anchor = [x0+rw, y0, x1+rw, y1, x2+rw, y2, x3+rw, y3]
                iou, key = self.get_top_iou(ocr_results, anchor)
                if ratio > 0.3:
                    bbox, text = ocr_results[key]
@@ -223,7 +223,7 @@ class Finder:
                bbox, text = self.ocr_results[pno][key]
                all_texts += text

-        searchObj = re.search(r'保证人\[(.*?)\]与甲方', all_texts)
+        searchObj = re.search( r'保证人\[(.*?)\]与甲方', all_texts)
        if searchObj:
            words = f'[{searchObj.group(1)}]'
            words = words.replace('【', '[').replace('】', ']').replace(',', '，').replace('(', '（').replace(')', '）')
@@ -256,7 +256,9 @@ class Finder:
        if score > 0.5:
            if len(self.ocr_results[pno]) > 0:
                # 根据关键词，找这一行字符
-                lines = self.get_line(self.ocr_results[pno], 'RMB')
+                lines = ''
+                for i in ['RMB', 'CNY']:
+                    lines += self.get_line(self.ocr_results[pno], i)
                # searchObj = re.search( r'RMB(.*?)in', lines)
                searchObj = re.search(r'[0-9,.]+', lines)
                if searchObj:
@@ -264,10 +266,10 @@ class Finder:
                    amount_eng = words

                lines = self.get_line(self.ocr_results[pno], '人民币')
-                searchObj = re.search(r'大写(.*?)综合', lines)
+                searchObj = re.search( r'大写(.*?)综合', lines)
                if searchObj:
                    words = searchObj.group(1)
-                    pattern = re.compile("[^\u4e00-\u9fa5]")  # 匹配不是中文的其他字符
+                    pattern = re.compile("[^\u4e00-\u9fa5]")        # 匹配不是中文的其他字符
                    words = pattern.sub('', words)
                    words = words.replace("仔", "仟").replace("任", "仟")
                    words = words.replace("值", "佰")
@@ -276,15 +278,15 @@ class Finder:
                    words = words.replace("政", "玖")
                    words = words.replace("垒", "叁")
                    amount_chn = words
-
+                
                lines = self.get_line(self.ocr_results[pno], 'ending')
                if len(lines) > 0:
                    start, end = lines.split('ending')
-                    searchStart = re.search(r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', start)
+                    searchStart = re.search( r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', start)
                    if searchStart:
                        words = searchStart.group()
                        term_start_eng = words
-                    searchEnd = re.search(r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', end)
+                    searchEnd = re.search( r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', end)
                    if searchEnd:
                        words = searchEnd.group()
                        term_end_eng = words
@@ -292,29 +294,29 @@ class Finder:
                lines = self.get_line(self.ocr_results[pno], '至')
                if len(lines) > 0:
                    start, end = lines.split('至')
-                    searchStart = re.search(r'[0-9]{4}-[0-9]+-[0-9]+', start)
+                    searchStart = re.search( r'[0-9]{4}-[0-9]+-[0-9]+', start)
                    if searchStart:
                        words = searchStart.group()
                        term_start_chn = words
-                    searchEnd = re.search(r'[0-9]{4}-[0-9]+-[0-9]+', end)
+                    searchEnd = re.search( r'[0-9]{4}-[0-9]+-[0-9]+', end)
                    if searchEnd:
                        words = searchEnd.group()
                        term_end_chn = words
-
+                
                lines = self.get_line(self.ocr_results[pno], 'above')
-                searchObj = re.search(r'aboveto([0-9]+)', lines.replace('O', '0'))
+                searchObj = re.search( r'aboveto([0-9]+)', lines.replace('O', '0').replace('too', 'to0'))
                if searchObj:
                    words = searchObj.group(1)
                    deposit_eng = f'{words}%'

                lines = self.get_line(self.ocr_results[pno], '授信额度的')
-                searchObj = re.search(r'授信额度的([0-9]+)', lines.replace('O', '0'))
+                searchObj = re.search( r'授信额度的([0-9]+)', lines.replace('O', '0'))
                if searchObj:
                    words = searchObj.group(1)
                    deposit_chn = f'{words}%'

        return amount_eng, amount_chn, term_start_eng, term_end_eng, \
-               term_start_chn, term_end_chn, deposit_eng, deposit_chn
+                    term_start_chn, term_end_chn, deposit_eng, deposit_chn

    def get_other_arrangements_and_conditions(self):
        """获取其它约定与条件文本段落
@@ -330,7 +332,7 @@ class Finder:
        searchObj = re.search(r'Conditions:(.*?)其他约定与条件', all_texts, re.I)
        if searchObj:
            words = searchObj.group(1)
-            pattern = re.compile("[\u4e00-\u9fa5]")  # 去除中文字符
+            pattern = re.compile("[\u4e00-\u9fa5]")        # 去除中文字符
            words = pattern.sub('', words)
            other_arrangements_and_conditions_eng = words

@@ -356,7 +358,7 @@ class Finder:
        self.init_result["保证人"] = guarantor

        amount_eng, amount_chn, term_start_eng, term_end_eng, \
-        term_start_chn, term_end_chn, deposit_eng, deposit_chn = self.get_info_in_page_39()
+            term_start_chn, term_end_chn, deposit_eng, deposit_chn = self.get_info_in_page_39()
        self.init_result["综合授信额度金额英文"] = amount_eng
        self.init_result["综合授信额度金额中文"] = amount_chn
        self.init_result["综合授信额度期限开始日期英文"] = term_start_eng
@@ -371,7 +373,6 @@ class Finder:
        self.init_result["其他约定与条件中文"] = words_chn
        return self.init_result

-
 class TIFFHandler:

    def __init__(self, path, img_save_path):
@@ -409,6 +410,7 @@ class Command(BaseCommand, LoggerMixin):
        self.input_dir = conf.WSC_DIR
        # ocr相关
        self.go_ocr_url = conf.WSC_GO_URL
+        self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
        # 优雅退出信号：15
        signal.signal(signal.SIGTERM, self.signal_handler)

@@ -435,6 +437,19 @@ class Command(BaseCommand, LoggerMixin):
        excel_path = os.path.join(wb_output_dir, excel_name)
        return img_save_path, excel_path, pdf_save_path

+    @staticmethod
+    def get_mode_code(code_list):
+        """Return the most frequent code, or None for fewer than two distinct codes."""
+        counts = {}
+        for code in code_list:
+            counts[code] = counts.get(code, 0) + 1
+        # Nothing to compare against; this guard also covers an empty list, which
+        # previously fell through to sorted(...)[0] and raised IndexError.
+        if len(counts) <= 1:
+            return None
+        # max() keeps the first-seen code on ties, as the stable reverse sort did.
+        return max(counts.items(), key=lambda item: item[1])[0]
+
    def res_process(self, all_res, excel_path):
        try:
            self.finder.ocr_results = all_res
@@ -442,11 +457,26 @@ class Command(BaseCommand, LoggerMixin):

            wb = BSWorkbook(set(), set(), set(), set(), set())
            ws = wb.create_sheet(self.sheet_name)
+            row_idx = 1
+            code_idx = 1
+            mode_code = None
            for write_field, field_value in results.items():
+                row_idx += 1
                if isinstance(field_value, list):
+                    if write_field == '合同编号列表':
+                        code_idx = row_idx
+                        mode_code = self.get_mode_code(field_value)
                    ws.append((write_field, *field_value))
                else:
                    ws.append((write_field, field_value))
+
+            if isinstance(mode_code, str):
+                for cell in ws[code_idx]:
+                    if cell.value == '合同编号列表':
+                        continue
+                    if cell.value != mode_code:
+                        cell.fill = self.amount_fill
+
            wb.remove_base_sheet()
            wb.save(excel_path)
        except Exception as e:
--- a/src/common/tools/pdf_to_img.py
View file @92b21d6
+++ b/src/common/tools/pdf_to_img.py
View file @92b21d6
@@ -257,19 +257,19 @@ class PDFHandler:
            self.page_to_png(page)

    def check_ebank(self, pdf):
-        page_text_list = []
+        # page_text_list = []
        text_item_sum = 0
        for pno in range(pdf.pageCount):
            page = pdf.loadPage(pno)
-            if page.rotation is None:
-                rotation = 0
-            elif isinstance(page.rotation, int):
-                divisor, remainder = divmod(page.rotation, 90)
-                if remainder != 0:
-                    return
-                rotation = divmod(divisor, 4)[1]
-            else:
-                return
+            # if page.rotation is None:
+            #     rotation = 0
+            # elif isinstance(page.rotation, int):
+            #     divisor, remainder = divmod(page.rotation, 90)
+            #     if remainder != 0:
+            #         return
+            #     rotation = divmod(divisor, 4)[1]
+            # else:
+            #     return
            textpage = page.getTextPage()
            text = textpage.extractDICT()
            text_list = []
@@ -284,17 +284,17 @@ class PDFHandler:
            text_item_sum += len(text_list)
            if text_item_sum < (pno + 1) * 5:
                return
-            else:
-                page_text_list.append(
-                    {
-                        'width': text.get('width'),
-                        'height': text.get('height'),
-                        'rotation': rotation,
-                        'text': text_list
-                    }
-                )
+            # else:
+            #     page_text_list.append(
+            #         {
+            #             'width': text.get('width'),
+            #             'height': text.get('height'),
+            #             'rotation': rotation,
+            #             'text': text_list
+            #         }
+            #     )
        self.is_ebank = True
-        self.page_text_list = page_text_list
+        # self.page_text_list = page_text_list

    def extract_image(self, max_img_count=None):
        self.img_path_list = []
@@ -310,7 +310,7 @@ class PDFHandler:
                if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
                    self.img_count = pdf.pageCount
                    return
-                # self.check_ebank(pdf)
+                self.check_ebank(pdf)
                for pno in range(pdf.pageCount):
                    il = pdf.getPageImageList(pno)  # 获取页面图片对象
                    # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)