fix afc e-contract

周伟奇
Showing 2 changed files with 90 additions and 86 deletions
src/common/electronic_afc_contract/afc_contract_ocr.py
src/common/electronic_afc_contract/get_char.py
--- a/src/common/electronic_afc_contract/afc_contract_ocr.py
View file @87525e9
+++ b/src/common/electronic_afc_contract/afc_contract_ocr.py
View file @87525e9
@@ -9,8 +9,23 @@ from .get_char import Finder
 def predict(pdf_info):
+    """Collect every text span of the PDF and run the field finder on it.
+
+    Args:
+        pdf_info: Mapping of page number -> page dict. Each page dict has a
+            'blocks' list; blocks with 'type' 0 carry 'lines', each line
+            carries 'spans', and each span carries 'bbox' and 'text'.
+
+    Returns:
+        The value returned by ``Finder.get_info()``.
+    """
+    ocr_results = {}
+    for pno in pdf_info:
+        page_spans = {}
+        for block in pdf_info[pno]['blocks']:
+            # Only blocks whose type is 0 are scanned; the rest are skipped.
+            if block['type'] != 0:
+                continue
+            for line in block['lines']:
+                for span in line['spans']:
+                    xmin, ymin, xmax, ymax = span['bbox']
+                    # Flat 4-point polygon, clockwise from the top-left corner.
+                    polygon = [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]
+                    # Normalise full-width colons to ':' and drop ideographic
+                    # spaces so keyword matching can rely on the ASCII colon.
+                    text = span['text'].replace("：", ":").replace("　", "")
+                    # Key by a running span counter: keying by block index
+                    # would keep only the last span of every block.
+                    page_spans[len(page_spans)] = [polygon, text]
+        ocr_results[pno] = page_spans
-    # 输入是整个 PDF 中的信息
+    # The input is the information of the whole PDF
-    f = Finder(pdf_info)
+    f = Finder(pdf_info, ocr_results=ocr_results)
    results = f.get_info()
    return results
--- a/src/common/electronic_afc_contract/get_char.py
View file @87525e9
+++ b/src/common/electronic_afc_contract/get_char.py
View file @87525e9
@@ -11,14 +11,13 @@ from fuzzywuzzy import fuzz
 class Finder:
+    def __init__(self, pdf_info, ocr_results):
-    def __init__(self, pdf_info):
        self.pdf_info = pdf_info
+        self.ocr_results = ocr_results
        self.is_asp = False
        self.item = {"words": None,
                     "position": None,
                    }
    def gen_init_result(self, is_asp):
        # 格式化算法输出
        self.init_result = {"page_1": {"合同编号": self.item,
@@ -109,8 +108,10 @@ class Finder:
                                                       "日期": self.item,
                                                        },
                                          }
+    def poly_to_rectangle(self, poly):
+        """Convert a flat 4-point polygon into an axis-aligned bounding box.
+
+        Args:
+            poly: Eight numbers laid out as [x0, y0, x1, y1, x2, y2, x3, y3].
+
+        Returns:
+            list: [xmin, ymin, xmax, ymax] enclosing all four vertices.
+
+        Raises:
+            ValueError: If ``poly`` does not hold exactly eight values.
+        """
+        x0, y0, x1, y1, x2, y2, x3, y3 = poly
+        # Take the extremes per axis rather than trusting a fixed vertex
+        # order, so the box is correct whichever corner the polygon starts at.
+        xs = (x0, x1, x2, x3)
+        ys = (y0, y1, y2, y3)
+        return [min(xs), min(ys), max(xs), max(ys)]
    def get_contract_no(self, page_num):
        """传入页码,查看该页码右上角的编号
@@ -121,47 +122,41 @@ class Finder:
            sting: 
        """
        contract_no = self.item.copy()
+        # contract_no['words'] = ''
+        # contract_no['position'] = [-1, -1, -1, -1]
        # 只看第一页
-        for block in self.pdf_info[page_num]['blocks']:
+        for key in self.ocr_results[page_num]:
-            if block['type'] != 0:
+            bbox, text = self.ocr_results[page_num][key]
-                continue
+            if '合同编号:' in text:
-            for line in block['lines']:
+                words = text.split(':')[-1]
-                for span in line['spans']:
+                location = self.poly_to_rectangle(bbox)
-                    bbox, text = span['bbox'], span['text']
+                contract_no['words'] = words
-                    if '合同编号：' in text:
+                contract_no['position'] = location
-                        words = text.split('：')[-1]
-                        contract_no['position'] = bbox
-                        contract_no['words'] = words
        return contract_no
    def get_vehicle_price(self, page_num='0'):
        vehicle_price = self.item.copy()
-        for block in self.pdf_info[page_num]['blocks']:
+        # vehicle_price['words'] = ''
-            if block['type'] != 0:
+        # vehicle_price['position'] = [-1, -1, -1, -1]
-                continue
+        for key in self.ocr_results[page_num]:
-            for line in block['lines']:
+            bbox, text = self.ocr_results[page_num][key]
-                for span in line['spans']:
+            if '所购车辆价格为人民币' in text:
-                    bbox, text = span['bbox'], span['text']
+                words = text.split('币')[-1]
-                    if '所购车辆价格为人民币' in text:
+                location = self.poly_to_rectangle(bbox)
-                        words = text.split('币')[-1]
+                vehicle_price['words'] = words
-                        vehicle_price['position'] = bbox
+                vehicle_price['position'] = location
-                        vehicle_price['words'] = words
        return vehicle_price
    def get_vin(self, page_num='0'):
        vin = self.item.copy()
-        for block in self.pdf_info[page_num]['blocks']:
+        # vin['words'] = ''
-            if block['type'] != 0:
+        # vin['position'] = [-1, -1, -1, -1]
-                continue
+        for key in self.ocr_results[page_num]:
-            for line in block['lines']:
+            bbox, text = self.ocr_results[page_num][key]
-                for span in line['spans']:
+            if '车架号:' in text:
-                    bbox, text = span['bbox'], span['text']
+                words = text.split(':')[-1]
-                    if '车架号：' in text:
+                location = self.poly_to_rectangle(bbox)
-                        words = text.split('：')[-1]
+                vin['words'] = words
-                        vin['position'] = bbox
+                vin['position'] = location
-                        vin['words'] = words
        return vin
    def get_loan_principal(self, page_num='0'):
        chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                            '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
@@ -202,7 +197,6 @@ class Finder:
                            asp_2['position'] = bbox
                            asp_2['words'] = words
        return upper, lower, asp_1, asp_2
    def get_loan_term(self, page_num='0'):
        loan_term = self.item.copy()
        all_text = ''
@@ -226,10 +220,20 @@ class Finder:
                            loan_term['position'] = bbox
                            loan_term['words'] = words
        return loan_term
+    def mergelist(self, text_list):
+        """Glue each '所购' entry to the Chinese-text entries that follow it.
+
+        An entry containing '所购' absorbs the next entry whenever that next
+        entry holds at least one character in the range U+4E00-U+9FA5. The
+        merged entry can then absorb further entries in the same way.
+
+        Args:
+            text_list: List of strings to scan.
+
+        Returns:
+            list: A new list with the merged entries; ``text_list`` itself
+            is not modified.
+        """
+        has_chinese = re.compile("[\u4e00-\u9fa5]").search
+        merged = []
+        for text in text_list:
+            # Looking back at the previous entry (instead of ahead at
+            # index + 1) means a trailing '所购' entry cannot index past the
+            # end of the list.
+            if merged and '所购' in merged[-1] and has_chinese(text):
+                merged[-1] += text
+            else:
+                merged.append(text)
+        return merged
    def get_asp_details(self, page_num):
        asp_details_table_term = self.item.copy()
        asp_details_table = []
        asp_details_text_list = []
        table = False
@@ -244,26 +248,22 @@ class Finder:
                    if '第二条' in text or '征信管理' in text:
                        table = False
                    if table == True:
+                        # print(text)
                        asp_details_text_list.append(text)
+        asp_details_text_list = self.mergelist(asp_details_text_list)
        for i in range((len(asp_details_text_list)+2)//3):
            line = []
            if i == 0:
                line = [asp_details_text_list[0]]
            else:
                for j in range(3):
                    line.append(asp_details_text_list[i*3-2+j])
            asp_details_table.append(line)
        if len(asp_details_table) > 0:
            asp_details_table_term['words'] = asp_details_table
        return asp_details_table_term
    def get_signature(self):
        signature = self.item.copy()
        for block in self.pdf_info['0']['blocks']:
            if block['type'] != 0:
                continue
@@ -275,7 +275,6 @@ class Finder:
                        signature['words'] = words
                        signature['position'] = bbox
        return signature
    def get_somebody(self, top, bottom):
        # 指定上下边界后,返回上下边界内的客户信息
        _name = self.item.copy()
@@ -300,6 +299,7 @@ class Finder:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if y_top < bbox[3] < y_bottom:
+                        # print(top, bottom, text)
                        if '姓名/名称' in text:
                            words = text.split('：')[-1]
                            _name['position'] = bbox
@@ -309,7 +309,6 @@ class Finder:
                            _id['position'] = bbox
                            _id['words'] = words
        return _name, _id
    def get_seller(self):
        seller = self.item.copy()
        # 先找到 key
@@ -336,7 +335,6 @@ class Finder:
                            seller['position'] = bbox
                            seller['words'] = text
        return seller
    def get_payback_account(self):
        account = self.item.copy()
        account_name = self.item.copy()
@@ -389,7 +387,6 @@ class Finder:
                                account_bank['position'] = bbox
                                account_bank['words'] = words
        return account, account_name, account_bank
    def get_repayment_schedule(self):
        repayment_schedule = self.item.copy()
        # 只看第二页
@@ -408,23 +405,17 @@ class Finder:
                        table = False
                    if table == True:
                        repayment_schedule_text_list.append(text)
        for i in range(len(repayment_schedule_text_list)//5):
            line = []
            # 5表示5列的意思
            for j in range(5):
                line.append(repayment_schedule_text_list[i*5+j])
            if str(i+1) == line[1]:
                break
            repayment_schedule_table.append(line)
        if len(repayment_schedule_table) > 0:
            repayment_schedule['words'] = repayment_schedule_table
        return repayment_schedule
    def get_signature_role_1(self):
        signature_role_1 = self.init_item.copy()
        # 先定位签字区域
@@ -459,7 +450,6 @@ class Finder:
        signature_role_1['position'] = position
        signature_role_1['words'] = words
        return signature_role_1
    def get_signature_role_2(self):
        signature_role_2 = self.init_item.copy()
        # 先定位签字区域
@@ -494,7 +484,6 @@ class Finder:
        signature_role_2['position'] = position
        signature_role_2['words'] = words
        return signature_role_2
    def get_signature_role_3(self):
        signature_role_3 = self.init_item.copy()
        # 先定位签字区域
@@ -529,7 +518,6 @@ class Finder:
        signature_role_3['position'] = position
        signature_role_3['words'] = words
        return signature_role_3
    def get_signature_role_4(self):
        signature_role_4 = self.init_item.copy()
        # 先定位签字区域
@@ -564,7 +552,6 @@ class Finder:
        signature_role_4['position'] = position
        signature_role_4['words'] = words
        return signature_role_4
    def get_signature_role_5(self):
        signature_role_5 = self.init_item.copy()
        # 先定位签字区域
@@ -600,7 +587,6 @@ class Finder:
        signature_role_5['position'] = position
        signature_role_5['words'] = words
        return signature_role_5
    def get_last_page_signature(self, page_num, top, bottom):
        signature_name = self.item.copy()
        signature_date = self.item.copy()
@@ -616,6 +602,7 @@ class Finder:
                        anchor_top = bbox[1]
                    if bottom in text:
                        anchor_bottom = bbox[1]
+        # print(top, anchor_top, anchor_bottom)
        if anchor_top is not None and anchor_bottom is not None:
            for block in self.pdf_info[page_num]['blocks']:
                if block['type'] != 0:
@@ -629,9 +616,8 @@ class Finder:
                            signature_name['words'] = name
                            signature_name['position'] = bbox
                            signature_date['words'] = date
-                            signature_name['position'] = bbox
+                            signature_date['position'] = bbox
        return signature_name, signature_date
    def get_info(self):
        """
            block['type'] == 0 : 表示该元素为图片
@@ -639,21 +625,22 @@ class Finder:
        Returns:
            dict: Description
        """
        # 先判断是否为 ASP 产品
        # 只看第一页，判断是否有 '附加产品融资贷款本金总金额' 这一句话，若有则为 ASP 产品
        # print(self.pdf_info['0']['blocks'])
-        for block in self.pdf_info['0']['blocks']:
+        # for block in self.pdf_info['0']['blocks']:
-            if block['type'] != 0:
+        #     if block['type'] != 0:
-                continue
+        #         continue
-            for line in block['lines']:
+        #     for line in block['lines']:
-                for span in line['spans']:
+        #         for span in line['spans']:
-                    bbox, text = span['bbox'], span['text']
+        #             bbox, text = span['bbox'], span['text']
-                    if '附加产品融资贷款本金总金额' == text:
+        #             if '附加产品融资贷款本金总金额' == text:
-                        self.is_asp = True
+        #                 self.is_asp = True
+        for key in self.ocr_results['0']:
+            bbox, text = self.ocr_results['0'][key]
+            if '附加产品融资贷款本金总金额' in text:
+                self.is_asp = True
        self.gen_init_result(self.is_asp)
        # Page 1
        # 找合同编号
        contract_no = self.get_contract_no(page_num='0')
@@ -663,7 +650,7 @@ class Finder:
        self.init_result['page_1']['所购车辆价格'] = vehicle_price
        # 车架号
        vin = self.get_vin()
-        self.init_result['page_1']['车架号'] = vehicle_price
+        self.init_result['page_1']['车架号'] = vin
        # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目
        upper, lower, asp_1, asp_2 = self.get_loan_principal()
        self.init_result['page_1']['贷款本金金额']['大写'] = upper
@@ -685,11 +672,14 @@ class Finder:
        contract_no = self.get_contract_no(page_num='0')
        self.init_result['page_2']['合同编号'] = contract_no
        # 找借款人及抵押人(地址字段原本有空格)
-        borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人：', bottom='共同借款人及共同抵押人：')
+        borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人：', bottom='共同借款人：')
+        # 这是为了同时兼容 8.1 版本
+        if borrower_name['words'] == None:
+            borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人：', bottom='共同借款人及共同抵押人：')
        self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name
        self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id
        # 找共同借款人及共同抵押人
-        co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人及共同抵押人：', bottom='保证人1：')
+        co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人：', bottom='保证人1：')
        self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name
        self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id
        # 保证人1
@@ -755,11 +745,11 @@ class Finder:
            contract_no = self.get_contract_no(page_num='6')
            self.init_result['page_7']['合同编号'] = contract_no
            signature_name, signature_date = self.get_last_page_signature(page_num='6',
-                                                top='借款人(抵押人)', bottom='共同借款人(共同抵押人)')
+                                                top='合同编号', bottom='共同借款人')
            self.init_result['page_7']['主借人签字']['签字'] = signature_name
            self.init_result['page_7']['主借人签字']['日期'] = signature_date
            signature_name, signature_date = self.get_last_page_signature(page_num='6',
-                                                top='共同借款人(共同抵押人)', bottom='保证人1')
+                                                top='共同借款人', bottom='保证人1')
            self.init_result['page_7']['共借人签字']['签字'] = signature_name
            self.init_result['page_7']['共借人签字']['日期'] = signature_date
            signature_name, signature_date = self.get_last_page_signature(page_num='6',
@@ -771,7 +761,7 @@ class Finder:
            self.init_result['page_7']['保证人2签字']['签字'] = signature_name
            self.init_result['page_7']['保证人2签字']['日期'] = signature_date
            signature_name, signature_date = self.get_last_page_signature(page_num='6',
-                                                top='在本人面前亲笔签署本合同', bottom='(以下无正文)')
+                                                top='在本人面前亲笔签署本合同', bottom='以下无正文')
            self.init_result['page_7']['见证人签字']['签字'] = signature_name
            self.init_result['page_7']['见证人签字']['日期'] = signature_date
        else:
@@ -784,11 +774,11 @@ class Finder:
            contract_no = self.get_contract_no(page_num='7')
            self.init_result['page_8']['合同编号'] = contract_no
            signature_name, signature_date = self.get_last_page_signature(page_num='7',
-                                                top='借款人(抵押人)', bottom='共同借款人(共同抵押人)')
+                                                top='合同编号', bottom='共同借款人')
            self.init_result['page_8']['主借人签字']['签字'] = signature_name
            self.init_result['page_8']['主借人签字']['日期'] = signature_date
            signature_name, signature_date = self.get_last_page_signature(page_num='7',
-                                                top='共同借款人(共同抵押人)', bottom='保证人1')
+                                                top='共同借款人', bottom='保证人1')
            self.init_result['page_8']['共借人签字']['签字'] = signature_name
            self.init_result['page_8']['共借人签字']['日期'] = signature_date
            signature_name, signature_date = self.get_last_page_signature(page_num='7',
@@ -800,10 +790,9 @@ class Finder:
            self.init_result['page_8']['保证人2签字']['签字'] = signature_name
            self.init_result['page_8']['保证人2签字']['日期'] = signature_date
            signature_name, signature_date = self.get_last_page_signature(page_num='7',
-                                                top='在本人面前亲笔签署本合同', bottom='(以下无正文)')
+                                                top='在本人面前亲笔签署本合同', bottom='以下无正文')
            self.init_result['page_8']['见证人签字']['签字'] = signature_name
            self.init_result['page_8']['见证人签字']['日期'] = signature_date
        # 重新定制输出
        new_results = {"is_asp": self.is_asp,
                       "page_info": self.init_result