e-contract part 1
Showing
15 changed files
with
2228 additions
and
17 deletions
| ... | @@ -1773,3 +1773,21 @@ APPLICANT_TYPE_MAP = { | ... | @@ -1773,3 +1773,21 @@ APPLICANT_TYPE_MAP = { |
| 1773 | } | 1773 | } |
| 1774 | 1774 | ||
| 1775 | APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] | 1775 | APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] |
| 1776 | |||
# E-contract file kinds, grouped by business prefix.
# Each entry is ((classify_1, classify_2), file_name_template); the '{0}'
# placeholder in the template is meant to be filled with str.format.
# A second classify of 0 presumably means "no second contract type" — confirm
# against the consumer of these pairs.
FILE_NAME_PREFIX_MAP = {
    AFC_PREFIX: [
        ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'),
        ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
    ],
    HIL_PREFIX: [
        ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'),
        ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'),
        ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
    ]
}

# Maps the string form of a HIL contract classify to an integer file type.
# NOTE(review): contract 2 maps to 2 and contract 3 maps to 1 — the crossed
# numbering looks deliberate but should be confirmed with the OCR module.
HIL_CONTRACT_TYPE_MAP = {
    str(HIL_CONTRACT_1_CLASSIFY): 0,
    str(HIL_CONTRACT_2_CLASSIFY): 2,
    str(HIL_CONTRACT_3_CLASSIFY): 1,
}
| ... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
| ... | @@ -18,6 +18,8 @@ from settings import conf | ... | @@ -18,6 +18,8 @@ from settings import conf |
| 18 | from common.mixins import LoggerMixin | 18 | from common.mixins import LoggerMixin |
| 19 | from common.tools.file_tools import write_zip_file | 19 | from common.tools.file_tools import write_zip_file |
| 20 | from common.tools.pdf_to_img import PDFHandler | 20 | from common.tools.pdf_to_img import PDFHandler |
| 21 | from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict | ||
| 22 | from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict | ||
| 21 | from apps.doc import consts | 23 | from apps.doc import consts |
| 22 | # from apps.doc.ocr.edms import EDMS, rh | 24 | # from apps.doc.ocr.edms import EDMS, rh |
| 23 | from apps.doc.ocr.ecm import ECM, rh | 25 | from apps.doc.ocr.ecm import ECM, rh |
| ... | @@ -47,6 +49,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -47,6 +49,7 @@ class Command(BaseCommand, LoggerMixin): |
| 47 | def __init__(self): | 49 | def __init__(self): |
| 48 | super().__init__() | 50 | super().__init__() |
| 49 | self.log_base = '[doc ocr process]' | 51 | self.log_base = '[doc ocr process]' |
| 52 | self.e_log_base = '[e-contract ocr process]' | ||
| 50 | # 处理文件开关 | 53 | # 处理文件开关 |
| 51 | self.switch = True | 54 | self.switch = True |
| 52 | # 睡眠时间 | 55 | # 睡眠时间 |
| ... | @@ -90,13 +93,20 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -90,13 +93,20 @@ class Command(BaseCommand, LoggerMixin): |
| 90 | task_str, is_priority = rh.dequeue() | 93 | task_str, is_priority = rh.dequeue() |
| 91 | if task_str is None: | 94 | if task_str is None: |
| 92 | self.online_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) | 95 | self.online_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) |
| 93 | return None, None, None | 96 | return None, None, None, None, None |
| 94 | 97 | ||
| 95 | self.online_log.info('{0} [get_doc_info] [task={1}] [is_priority={2}]'.format( | 98 | self.online_log.info('{0} [get_doc_info] [task={1}] [is_priority={2}]'.format( |
| 96 | self.log_base, task_str, is_priority)) | 99 | self.log_base, task_str, is_priority)) |
| 97 | try: | 100 | try: |
| 98 | # doc, business_type = self.get_doc_object(task_str) | 101 | # doc, business_type = self.get_doc_object(task_str) |
| 99 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | 102 | info_tuple = task_str.split(consts.SPLIT_STR) |
| 103 | if len(info_tuple) == 2: | ||
| 104 | business_type, doc_id_str = info_tuple | ||
| 105 | classify_1_str = classify_2_str = '0' | ||
| 106 | rebuild_task_str = task_str | ||
| 107 | else: | ||
| 108 | business_type, doc_id_str, classify_1_str, classify_2_str = info_tuple | ||
| 109 | rebuild_task_str = '{0}{1}{2}'.format(business_type, consts.SPLIT_STR, doc_id_str) | ||
| 100 | doc_id = int(doc_id_str) | 110 | doc_id = int(doc_id_str) |
| 101 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc | 111 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc |
| 102 | doc = doc_class.objects.filter(id=doc_id).first() | 112 | doc = doc_class.objects.filter(id=doc_id).first() |
| ... | @@ -104,11 +114,11 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -104,11 +114,11 @@ class Command(BaseCommand, LoggerMixin): |
| 104 | if doc is None: | 114 | if doc is None: |
| 105 | self.online_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format( | 115 | self.online_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format( |
| 106 | self.log_base, task_str, is_priority)) | 116 | self.log_base, task_str, is_priority)) |
| 107 | return None, None, None | 117 | return None, None, None, None, None |
| 108 | elif doc.status != DocStatus.INIT.value: | 118 | elif doc.status != DocStatus.INIT.value: |
| 109 | self.online_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] ' | 119 | self.online_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] ' |
| 110 | '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status)) | 120 | '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status)) |
| 111 | return None, None, None | 121 | return None, None, None, None, None |
| 112 | doc.status = DocStatus.PROCESSING.value | 122 | doc.status = DocStatus.PROCESSING.value |
| 113 | doc.start_time = timezone.now() | 123 | doc.start_time = timezone.now() |
| 114 | doc.save() | 124 | doc.save() |
| ... | @@ -120,7 +130,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -120,7 +130,7 @@ class Command(BaseCommand, LoggerMixin): |
| 120 | else: | 130 | else: |
| 121 | self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format( | 131 | self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format( |
| 122 | self.log_base, task_str, is_priority)) | 132 | self.log_base, task_str, is_priority)) |
| 123 | return doc, business_type, task_str | 133 | return doc, business_type, rebuild_task_str, classify_1_str, classify_2_str |
| 124 | 134 | ||
| 125 | # def pdf_download(self, doc, pdf_path): | 135 | # def pdf_download(self, doc, pdf_path): |
| 126 | # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | 136 | # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
| ... | @@ -915,11 +925,11 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -915,11 +925,11 @@ class Command(BaseCommand, LoggerMixin): |
| 915 | # summary['confidence'] = max(summary['confidence']) | 925 | # summary['confidence'] = max(summary['confidence']) |
| 916 | return merged_bs_summary | 926 | return merged_bs_summary |
| 917 | 927 | ||
| 918 | def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock, error_list): | 928 | def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock, error_list, res_dict, finish_queue): |
| 919 | while self.switch: | 929 | while self.switch: |
| 920 | try: | 930 | try: |
| 921 | # 1. 从队列获取文件信息 | 931 | # 1. 从队列获取文件信息 |
| 922 | doc, business_type, task_str = self.get_doc_info() | 932 | doc, business_type, task_str, classify_1_str, classify_2_str = self.get_doc_info() |
| 923 | # 队列为空时的处理 | 933 | # 队列为空时的处理 |
| 924 | if doc is None: | 934 | if doc is None: |
| 925 | time.sleep(self.sleep_time_doc_get) | 935 | time.sleep(self.sleep_time_doc_get) |
| ... | @@ -930,14 +940,16 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -930,14 +940,16 @@ class Command(BaseCommand, LoggerMixin): |
| 930 | error_list.append(1) | 940 | error_list.append(1) |
| 931 | return | 941 | return |
| 932 | else: | 942 | else: |
| 933 | try: | ||
| 934 | # 2. 从EDMS获取PDF文件 | ||
| 935 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) | 943 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) |
| 936 | os.makedirs(doc_data_path, exist_ok=True) | 944 | os.makedirs(doc_data_path, exist_ok=True) |
| 937 | img_save_path = os.path.join(doc_data_path, 'img') | 945 | img_save_path = os.path.join(doc_data_path, 'img') |
| 938 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 946 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
| 939 | 947 | ||
| 940 | pdf_handler = PDFHandler(pdf_path, img_save_path, doc.document_name) | 948 | pdf_handler = PDFHandler(pdf_path, img_save_path, doc.document_name) |
| 949 | |||
| 950 | if classify_1_str == '0' or classify_1_str == str(consts.HMH_CLASSIFY): | ||
| 951 | try: | ||
| 952 | # 2. 从EDMS获取PDF文件 | ||
| 941 | max_count_obj = Configs.objects.filter(id=2).first() | 953 | max_count_obj = Configs.objects.filter(id=2).first() |
| 942 | try: | 954 | try: |
| 943 | max_img_count = int(max_count_obj.value) | 955 | max_img_count = int(max_count_obj.value) |
| ... | @@ -1057,6 +1069,107 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1057,6 +1069,107 @@ class Command(BaseCommand, LoggerMixin): |
| 1057 | self.log_base, traceback.format_exc())) | 1069 | self.log_base, traceback.format_exc())) |
| 1058 | error_list.append(1) | 1070 | error_list.append(1) |
| 1059 | return | 1071 | return |
| 1072 | else: # e-contract | ||
| 1073 | try: | ||
| 1074 | # pdf下载 处理 图片存储 识别 | ||
| 1075 | for times in range(consts.RETRY_TIMES): | ||
| 1076 | try: | ||
| 1077 | self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme, business_type) | ||
| 1078 | self.online_log.info('{0} [edms download success] [task={1}] [times={2}] ' | ||
| 1079 | '[pdf_path={3}]'.format(self.e_log_base, task_str, times, pdf_path)) | ||
| 1080 | |||
| 1081 | self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( | ||
| 1082 | self.e_log_base, task_str, times)) | ||
| 1083 | pdf_handler.e_contract_process() | ||
| 1084 | self.online_log.info( | ||
| 1085 | '{0} [pdf to img end] [task={1}] [times={2}]'.format(self.e_log_base, task_str, times)) | ||
| 1086 | except Exception as e: | ||
| 1087 | self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] ' | ||
| 1088 | '[error={3}]'.format(self.e_log_base, task_str, times, | ||
| 1089 | traceback.format_exc())) | ||
| 1090 | else: | ||
| 1091 | break | ||
| 1092 | else: | ||
| 1093 | raise Exception('download or pdf to img failed') | ||
| 1094 | |||
| 1095 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): | ||
| 1096 | ocr_result = afc_predict(pdf_handler.pdf_info) | ||
| 1097 | page_res = {} | ||
| 1098 | for page_num, page_info in ocr_result.get('page_info', {}).items(): | ||
| 1099 | if isinstance(page_num, str) and page_num.startswith('page_'): | ||
| 1100 | page_res[page_num] = { | ||
| 1101 | 'classify': int(classify_1_str), | ||
| 1102 | 'page_num': page_num, | ||
| 1103 | 'page_info': page_info | ||
| 1104 | } | ||
| 1105 | |||
| 1106 | else: | ||
| 1107 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) | ||
| 1108 | file_type_2 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_2_str) | ||
| 1109 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) | ||
| 1110 | rebuild_res_1 = {} | ||
| 1111 | page_res = {} | ||
| 1112 | for field_name, field_info in ocr_result_1.items(): | ||
| 1113 | page_num = field_info.pop('page', 'page_1') | ||
| 1114 | rebuild_res_1.setdefault(page_num, dict())[field_name] = field_info | ||
| 1115 | for page_num, page_info in rebuild_res_1.items(): | ||
| 1116 | if isinstance(page_num, str) and page_num.startswith('page_'): | ||
| 1117 | page_res[page_num] = { | ||
| 1118 | 'classify': int(classify_1_str), | ||
| 1119 | 'page_num': page_num, | ||
| 1120 | 'page_info': page_info | ||
| 1121 | } | ||
| 1122 | if isinstance(file_type_2, int): | ||
| 1123 | rebuild_res_2 = {} | ||
| 1124 | ocr_result_2 = hil_predict(pdf_handler.pdf_info, file_type_2) | ||
| 1125 | for field_name, field_info in ocr_result_2.items(): | ||
| 1126 | page_num = field_info.pop('page', 'page_1') | ||
| 1127 | rebuild_res_2.setdefault(page_num, dict())[field_name] = field_info | ||
| 1128 | for page_num, page_info in ocr_result_2.items(): | ||
| 1129 | if isinstance(page_num, str) and page_num.startswith('page_'): | ||
| 1130 | page_res[page_num] = { | ||
| 1131 | 'classify': int(classify_2_str), | ||
| 1132 | 'page_num': page_num, | ||
| 1133 | 'page_info': page_info | ||
| 1134 | } | ||
| 1135 | |||
| 1136 | contract_res = {} | ||
| 1137 | for img_path_tmp, page_key in pdf_handler.img_path_pno_list: | ||
| 1138 | if page_key in page_res: | ||
| 1139 | img_contract_res = { | ||
| 1140 | 'code': 1, | ||
| 1141 | 'data': [ | ||
| 1142 | { | ||
| 1143 | 'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY), | ||
| 1144 | 'data': page_res[page_key] | ||
| 1145 | } | ||
| 1146 | ] | ||
| 1147 | } | ||
| 1148 | else: | ||
| 1149 | img_contract_res = { | ||
| 1150 | 'code': 1, | ||
| 1151 | 'data': [ | ||
| 1152 | { | ||
| 1153 | 'classify': int(classify_1_str), | ||
| 1154 | } | ||
| 1155 | ] | ||
| 1156 | } | ||
| 1157 | contract_res[img_path_tmp] = img_contract_res | ||
| 1158 | |||
| 1159 | with lock: | ||
| 1160 | res_dict[task_str] = contract_res | ||
| 1161 | finish_queue.put(task_str) | ||
| 1162 | except Exception as e: | ||
| 1163 | try: | ||
| 1164 | doc.status = DocStatus.PROCESS_FAILED.value | ||
| 1165 | doc.save() | ||
| 1166 | self.online_log.warn('{0} [process failed (e-contract)] [task={1}] ' | ||
| 1167 | '[error={2}]'.format(self.e_log_base, task_str, traceback.format_exc())) | ||
| 1168 | except Exception as e: | ||
| 1169 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( | ||
| 1170 | self.e_log_base, traceback.format_exc())) | ||
| 1171 | error_list.append(1) | ||
| 1172 | return | ||
| 1060 | 1173 | ||
| 1061 | def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list): | 1174 | def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list): |
| 1062 | while len(error_list) == 0 or not img_queue.empty(): | 1175 | while len(error_list) == 0 or not img_queue.empty(): |
| ... | @@ -1801,7 +1914,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1801,7 +1914,7 @@ class Command(BaseCommand, LoggerMixin): |
| 1801 | finish_queue = Queue() | 1914 | finish_queue = Queue() |
| 1802 | 1915 | ||
| 1803 | process_list = [] | 1916 | process_list = [] |
| 1804 | pdf_process = Process(target=self.pdf_2_img_2_queue, args=(img_queue, todo_count_dict, lock, error_list)) | 1917 | pdf_process = Process(target=self.pdf_2_img_2_queue, args=(img_queue, todo_count_dict, lock, error_list, res_dict, finish_queue)) |
| 1805 | process_list.append(pdf_process) | 1918 | process_list.append(pdf_process) |
| 1806 | 1919 | ||
| 1807 | for url in self.ocr_1_urls.values(): | 1920 | for url in self.ocr_1_urls.values(): | ... | ... |
| ... | @@ -789,3 +789,24 @@ class HILCACompareResultRecord(models.Model): | ... | @@ -789,3 +789,24 @@ class HILCACompareResultRecord(models.Model): |
| 789 | db_table = 'hil_ca_compare_result_record' | 789 | db_table = 'hil_ca_compare_result_record' |
| 790 | 790 | ||
| 791 | 791 | ||
class HILContract(models.Model):
    """Row of the existing ``hil_contract`` table: one record per application id."""
    id = models.AutoField(primary_key=True, verbose_name="id")  # primary key
    # Marked "index" in the original note; no db_index is declared here, so the
    # index (if any) lives in the externally managed schema.
    application_id = models.CharField(max_length=64, verbose_name="申请id")
    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')  # set once on insert

    class Meta:
        managed = False  # table is not created or migrated by Django
        db_table = 'hil_contract'
| 800 | |||
| 801 | |||
class AFCContract(models.Model):
    """Row of the existing ``afc_contract`` table: one record per application id."""
    id = models.AutoField(primary_key=True, verbose_name="id")  # primary key
    # Marked "index" in the original note; no db_index is declared here, so the
    # index (if any) lives in the externally managed schema.
    application_id = models.CharField(max_length=64, verbose_name="申请id")
    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')  # set once on insert

    class Meta:
        managed = False  # table is not created or migrated by Django
        db_table = 'afc_contract'
        # Project-specific Meta option; presumably routes this model to the
        # 'afc' database — confirm against the project's DB router.
        situ_db_label = 'afc'
| 811 | |||
| 812 | ... | ... |
| 1 | import os | ||
| 1 | import base64 | 2 | import base64 |
| 2 | import requests | 3 | import requests |
| 3 | from common.redis_cache import redis_handler as rh | 4 | from common.redis_cache import redis_handler as rh |
| ... | @@ -44,7 +45,6 @@ class ECM: | ... | @@ -44,7 +45,6 @@ class ECM: |
| 44 | "b_coborrower_id", "b_coborrower_name", "b_guarantor_id", "b_guarantor_name", | 45 | "b_coborrower_id", "b_coborrower_name", "b_guarantor_id", "b_guarantor_name", |
| 45 | "b_frontend_partner", "b_dealer_code", "b_dealer_name", "b_input_date", "b_comment", | 46 | "b_frontend_partner", "b_dealer_code", "b_dealer_name", "b_input_date", "b_comment", |
| 46 | "b_contract_no", "b_location"] | 47 | "b_contract_no", "b_location"] |
| 47 | self.contract_prefix = '电子' | ||
| 48 | 48 | ||
| 49 | def update_oauth_token(self): | 49 | def update_oauth_token(self): |
| 50 | response = requests.post(self.oauth_url, headers=self.oauth_headers, data=self.oauth_payload, verify=False) | 50 | response = requests.post(self.oauth_url, headers=self.oauth_headers, data=self.oauth_payload, verify=False) |
| ... | @@ -69,9 +69,9 @@ class ECM: | ... | @@ -69,9 +69,9 @@ class ECM: |
| 69 | def get_headers(self): | 69 | def get_headers(self): |
| 70 | return {'Authorization': '{0} {1}'.format(self.token_type, self.get_oauth_token())} | 70 | return {'Authorization': '{0} {1}'.format(self.token_type, self.get_oauth_token())} |
| 71 | 71 | ||
| 72 | def search(self, application_id, business_type): | 72 | def search(self, application_id, business_type, prefix): |
| 73 | sql = "select * from {0} where b_application_no='{1}' and object_name like '{2}%'".format( | 73 | sql = "select * from {0} where b_application_no='{1}' and object_name like '{2}%'".format( |
| 74 | self.settlement_type, application_id, self.contract_prefix) | 74 | self.settlement_type, application_id, prefix) |
| 75 | search_args = { | 75 | search_args = { |
| 76 | "userName": self.username, | 76 | "userName": self.username, |
| 77 | "password": self.pwd, | 77 | "password": self.pwd, |
| ... | @@ -96,7 +96,6 @@ class ECM: | ... | @@ -96,7 +96,6 @@ class ECM: |
| 96 | result.append((object_name, object_id)) | 96 | result.append((object_name, object_id)) |
| 97 | return result | 97 | return result |
| 98 | 98 | ||
| 99 | |||
| 100 | def download(self, save_path, object_id, document_scheme, business_type): | 99 | def download(self, save_path, object_id, document_scheme, business_type): |
| 101 | doc_type, _, _ = self.doc_type_map.get(document_scheme) | 100 | doc_type, _, _ = self.doc_type_map.get(document_scheme) |
| 102 | download_json = { | 101 | download_json = { | ... | ... |
| ... | @@ -36,12 +36,14 @@ from .models import ( | ... | @@ -36,12 +36,14 @@ from .models import ( |
| 36 | AFCSECompareResultRecord, | 36 | AFCSECompareResultRecord, |
| 37 | HILCACompareResultRecord, | 37 | HILCACompareResultRecord, |
| 38 | HILSECompareResultRecord, | 38 | HILSECompareResultRecord, |
| 39 | HILContract, | ||
| 40 | AFCContract, | ||
| 39 | ) | 41 | ) |
| 40 | from .named_enum import ErrorType | 42 | from .named_enum import ErrorType |
| 41 | from .mixins import DocHandler | 43 | from .mixins import DocHandler |
| 42 | from . import consts | 44 | from . import consts |
| 43 | from apps.account.authentication import OAuth2AuthenticationWithUser | 45 | from apps.account.authentication import OAuth2AuthenticationWithUser |
| 44 | from celery_compare.tasks import compare | 46 | from celery_compare.tasks import compare, forwarding_station |
| 45 | 47 | ||
| 46 | 48 | ||
| 47 | class CustomDate(fields.Date): | 49 | class CustomDate(fields.Date): |
| ... | @@ -1164,5 +1166,11 @@ class SEContractView(GenericView): | ... | @@ -1164,5 +1166,11 @@ class SEContractView(GenericView): |
| 1164 | # pos上传e-contract信息接口 SE | 1166 | # pos上传e-contract信息接口 SE |
@use_args(se_contract_args, location='data')
def post(self, request, args):
    """Register an e-contract notification and schedule its forwarding task.

    Reads ``applicationId`` and ``applicationEntity`` from ``args['content']``,
    stores a contract row in the HIL or AFC table depending on the entity,
    queues ``forwarding_station`` with a delay, logs the request and
    answers with an OK response.
    """
    payload = args.get('content', {})
    application_id = payload.get('applicationId', '')
    entity = payload.get('applicationEntity', '')

    # Only an entity equal to the HIL prefix goes to the HIL table;
    # everything else is stored as AFC.
    if entity == consts.HIL_PREFIX:
        table_class = HILContract
    else:
        table_class = AFCContract
    table_class.objects.create(application_id=application_id)

    forwarding_station.apply_async(
        (application_id, entity),
        queue='queue_compare',
        countdown=conf.DELAY_SECONDS,
    )
    message = '[e-contract] [application_id={0}] [entity={1}]'.format(application_id, entity)
    self.running_log.info(message)
    return response.ok()
| ... | @@ -27,10 +27,13 @@ from apps.doc.models import ( | ... | @@ -27,10 +27,13 @@ from apps.doc.models import ( |
| 27 | AFCCACompareResult, | 27 | AFCCACompareResult, |
| 28 | HILSECompareResult, | 28 | HILSECompareResult, |
| 29 | HILCACompareResult, | 29 | HILCACompareResult, |
| 30 | AFCDoc, | ||
| 31 | HILDoc | ||
| 30 | ) | 32 | ) |
| 31 | from apps.doc import consts | 33 | from apps.doc import consts |
| 32 | from apps.doc.ocr.gcap import gcap | 34 | from apps.doc.ocr.gcap import gcap |
| 33 | from apps.doc.ocr.cms import cms | 35 | from apps.doc.ocr.cms import cms |
| 36 | from apps.doc.ocr.ecm import ECM, rh | ||
| 34 | from apps.doc.exceptions import GCAPException | 37 | from apps.doc.exceptions import GCAPException |
| 35 | from apps.doc.named_enum import RequestTeam, RequestTrigger, ProcessName, ErrorType | 38 | from apps.doc.named_enum import RequestTeam, RequestTrigger, ProcessName, ErrorType |
| 36 | from common.tools.comparison import cp | 39 | from common.tools.comparison import cp |
| ... | @@ -38,9 +41,11 @@ from common.tools.des import decode_des | ... | @@ -38,9 +41,11 @@ from common.tools.des import decode_des |
| 38 | 41 | ||
| 39 | compare_log = logging.getLogger('compare') | 42 | compare_log = logging.getLogger('compare') |
| 40 | log_base = '[Compare]' | 43 | log_base = '[Compare]' |
| 44 | e_log_base = '[e-contract]' | ||
| 41 | empty_str = '' | 45 | empty_str = '' |
| 42 | empty_error_type = 1000 | 46 | empty_error_type = 1000 |
| 43 | des_key = conf.CMS_DES_KEY | 47 | des_key = conf.CMS_DES_KEY |
| 48 | ecm = ECM() | ||
| 44 | 49 | ||
| 45 | 50 | ||
| 46 | def rotate_bound(image, angle): | 51 | def rotate_bound(image, angle): |
| ... | @@ -1867,4 +1872,32 @@ def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True | ... | @@ -1867,4 +1872,32 @@ def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True |
| 1867 | se_compare(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, is_cms) | 1872 | se_compare(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, is_cms) |
| 1868 | 1873 | ||
| 1869 | 1874 | ||
@app.task
def forwarding_station(application_id, entity):
    """Find an application's e-contract files in ECM and queue them for OCR.

    For every ((classify_1, classify_2), name template) configured for the
    entity's business line, ECM is searched for files whose name starts with
    the formatted template. Each file found gets a doc record and a task
    string ``prefix / doc id / classify_1 / classify_2`` pushed to the queue.
    A failed search is logged and the remaining file kinds are still tried.

    Args:
        application_id: application number, also used to fill the name template.
        entity: application entity as passed by the caller.
    """
    compare_log.info('{0} [forward start] [application_id={1}] [entity={2}]'.format(e_log_base, application_id, entity))
    is_hil = entity in consts.HIL_SET
    doc_class = HILDoc if is_hil else AFCDoc
    entity_prefix = consts.HIL_PREFIX if is_hil else consts.AFC_PREFIX
    # The map is keyed by business prefix, so look it up with the normalized
    # prefix rather than the raw entity; a raw entity that is not itself a key
    # would return None and fail on iteration. Default to "nothing to search".
    for (classify_1, classify_2), prefix in consts.FILE_NAME_PREFIX_MAP.get(entity_prefix, []):
        try:
            file_list = ecm.search(application_id, entity, prefix.format(application_id))  # TODO: fetch only the latest file
        except Exception:
            compare_log.error('{0} [search failed] [application_id={1}] [entity={2}] [error={3}]'.format(
                e_log_base, application_id, entity, traceback.format_exc()))
            continue

        compare_log.info('{0} [search end] [application_id={1}] [entity={2}] [file_list={3}]'.format(
            e_log_base, application_id, entity, file_list))
        for object_name, object_id in file_list:
            doc = doc_class.objects.create(
                metadata_version_id=object_id,
                application_id=application_id,
                document_name=object_name,
                document_scheme='SETTLEMENT',
                data_source='POS',
                upload_finish_time=datetime.now(),
            )
            task = consts.SPLIT_STR.join([entity_prefix, str(doc.id), str(classify_1), str(classify_2)])
            enqueue_res = rh.enqueue([task], False)
            compare_log.info('{0} [upload success] [res={1}] [application_id={2}] [entity={3}] [object_name={4}] '
                             '[object_id={5}] [doc_id={6}]'.format(e_log_base, enqueue_res, application_id, entity,
                                                                   object_name, object_id, doc.id))
    compare_log.info('{0} [forward end] [application_id={1}] [entity={2}]'.format(e_log_base, application_id, entity))
| 1 | # -*- coding: utf-8 -*- | ||
| 2 | # @Author : lk | ||
| 3 | # @Email : 9428.al@gmail.com | ||
| 4 | # @Created Date : 2021-06-29 17:43:46 | ||
| 5 | # @Last Modified : 2021-09-07 14:11:25 | ||
| 6 | # @Description : | ||
| 7 | |||
| 8 | from .get_char import Finder | ||
| 9 | |||
| 10 | |||
def predict(pdf_info):
    """Run the AFC contract field finder over a whole PDF.

    Args:
        pdf_info: the information extracted from the entire PDF.

    Returns:
        Whatever ``Finder(pdf_info).get_info()`` produces.
    """
    return Finder(pdf_info).get_info()
| 17 | |||
| 18 |
| 1 | # -*- coding: utf-8 -*- | ||
| 2 | # @Author : lk | ||
| 3 | # @Email : 9428.al@gmail.com | ||
| 4 | # @Create Date : 2021-07-20 16:42:41 | ||
| 5 | # @Last Modified : 2021-09-07 19:52:39 | ||
| 6 | # @Description : | ||
| 7 | |||
| 8 | import re | ||
| 9 | import numpy as np | ||
| 10 | from fuzzywuzzy import fuzz | ||
| 11 | |||
| 12 | |||
| 13 | class Finder: | ||
| 14 | |||
| 15 | def __init__(self, pdf_info): | ||
| 16 | self.pdf_info = pdf_info | ||
| 17 | self.is_asp = False | ||
| 18 | self.item = {"words": None, | ||
| 19 | "position": None, | ||
| 20 | } | ||
| 21 | |||
def gen_init_result(self, is_asp):
    """Build the empty, per-page result skeleton and store it on ``self.init_result``.

    Pages 1-6 always have the same fields. The signature fields sit on
    page 7 for a plain contract; when ``is_asp`` is true page 7 carries only
    the contract number and the signature fields move to page 8.

    Every leaf is an independent copy of ``self.item``, so filling one field
    in place can never leak into another field or into the template.

    Args:
        is_asp: truthy when the contract uses the ASP layout described above.
            The argument itself decides the layout, not ``self.is_asp``.
    """
    def blank():
        # Fresh copy of the field template.
        return dict(self.item)

    def blanks(*names):
        # Group of independent empty fields, in the given order.
        return {name: blank() for name in names}

    def principal():
        return blanks("大写", "小写", "车辆贷款本金金额", "附加产品融资贷款本金总金额")

    def party():
        return blanks("name", "id")

    def signature_page():
        # Contract number plus one (signature, date) pair per signer.
        page = {"合同编号": blank()}
        for signer in ("主借人签字", "共借人签字", "保证人1签字", "保证人2签字", "见证人签字"):
            page[signer] = blanks("签字", "日期")
        return page

    result = {
        "page_1": {
            "合同编号": blank(),
            "所购车辆价格": blank(),
            "车架号": blank(),
            "贷款本金金额": principal(),
            "贷款期限": blank(),
            "附加产品融资贷款本金总金额明细": blank(),
            "借款人签字及时间": blank(),
        },
        "page_2": {
            "合同编号": blank(),
            "借款人及抵押人": party(),
            "共同借款人及共同抵押人": party(),
            "保证人1": party(),
            "保证人2": party(),
            "所购车辆价格": blank(),
            "车架号": blank(),
            "经销商": blank(),
            "贷款本金金额": principal(),
            "贷款期限": blank(),
            "还款账户": blanks("账号", "户名", "开户行"),
        },
        "page_3": blanks("合同编号", "还款计划表"),
        "page_4": blanks("合同编号", "附加产品融资贷款本金总金额明细"),
        "page_5": blanks("合同编号"),
        "page_6": blanks("合同编号"),
    }
    if is_asp:
        result["page_7"] = blanks("合同编号")
        result["page_8"] = signature_page()
    else:
        result["page_7"] = signature_page()
    self.init_result = result
| 112 | |||
| 113 | |||
def get_contract_no(self, page_num):
    """Read the contract number printed on the given page.

    Scans every span of the page's type-0 blocks; a span whose text contains
    the contract-number label yields the text after its last colon. When
    several spans match, the last one wins.

    Args:
        page_num (string): key of the page inside ``self.pdf_info``.

    Returns:
        dict: copy of the field template with ``words`` and ``position``
        filled in, or left as ``None`` when nothing matched.
    """
    contract_no = self.item.copy()
    wanted_blocks = (blk for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0)
    all_spans = (sp for blk in wanted_blocks for ln in blk['lines'] for sp in ln['spans'])
    for sp in all_spans:
        position, content = sp['bbox'], sp['text']
        if '合同编号:' not in content:
            continue
        contract_no['position'] = position
        contract_no['words'] = content.split(':')[-1]
    return contract_no
| 136 | |||
def get_vehicle_price(self, page_num='0'):
    """Read the purchased vehicle's price from one page.

    A span containing the price sentence yields the text after its last
    '币' character; the last matching span on the page wins.

    Args:
        page_num: key of the page inside ``self.pdf_info`` (default ``'0'``).

    Returns:
        dict: copy of the field template, filled when a span matched.
    """
    def iter_spans():
        # Yield (bbox, text) for each span of each type-0 block.
        for blk in self.pdf_info[page_num]['blocks']:
            if blk['type'] == 0:
                for ln in blk['lines']:
                    for sp in ln['spans']:
                        yield sp['bbox'], sp['text']

    vehicle_price = self.item.copy()
    for position, content in iter_spans():
        if '所购车辆价格为人民币' in content:
            vehicle_price.update(position=position, words=content.split('币')[-1])
    return vehicle_price
| 150 | |||
def get_vin(self, page_num='0'):
    """Read the vehicle identification number (frame number) from one page.

    Every span of the page's type-0 blocks is inspected; the last span whose
    text contains the frame-number label supplies the result, taken as the
    text after its last colon.

    Args:
        page_num: key of the page inside ``self.pdf_info`` (default ``'0'``).

    Returns:
        dict: copy of the field template, filled when a span matched.
    """
    vin = self.item.copy()
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    hits = [(bbox, text) for bbox, text in spans if '车架号:' in text]
    if hits:
        bbox, text = hits[-1]
        vin['position'] = bbox
        vin['words'] = text.split(':')[-1]
    return vin
| 164 | |||
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from one page.

    First pass over the spans of the page's type-0 blocks:
      * a span whose fuzzy ratio against the Chinese numeral characters
        exceeds 15 is taken as the amount in words (text after its last colon);
      * a span containing the lower-case marker gives the amount in digits;
      * a span equal to the add-on financing label becomes the anchor.
    Second pass, only when an anchor was found: bracketed amounts whose
    vertical centre lies above the anchor are the vehicle loan principal,
    those below it are the add-on financing principal.

    Args:
        page_num: key of the page inside ``self.pdf_info`` (default ``'0'``).

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of the field
        template, filled when the corresponding span was found.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    keyword_text = ''.join(chinese_keywords)  # loop-invariant, built once
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()
    anchor_bbox = None

    # Collect the spans once; both passes walk the same sequence.
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    for bbox, text in spans:
        if fuzz.ratio(keyword_text, text) > 15:
            # NOTE(review): the stripped text is also what the two checks
            # below see, exactly as before — confirm this is intended.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for bbox, text in spans:
            if '人民币:小写:' not in text:
                continue
            found = re.search(r'人民币:小写:\[(.*)\]', text)
            if found is None:
                # Marker present but no bracketed amount: skip the span
                # instead of failing on an empty findall result.
                continue
            span_y = np.mean(bbox[1::2])
            if span_y < anchor_y:
                target = asp_1
            elif span_y > anchor_y:
                target = asp_2
            else:
                continue  # same height as the anchor: neither side
            target['position'] = bbox
            target['words'] = found.group(1)
    return upper, lower, asp_1, asp_2
| 205 | |||
| 206 | def get_loan_term(self, page_num='0'): | ||
| 207 | loan_term = self.item.copy() | ||
| 208 | all_text = '' | ||
| 209 | for block in self.pdf_info[page_num]['blocks']: | ||
| 210 | if block['type'] != 0: | ||
| 211 | continue | ||
| 212 | for line in block['lines']: | ||
| 213 | for span in line['spans']: | ||
| 214 | bbox, text = span['bbox'], span['text'] | ||
| 215 | all_text += text | ||
| 216 | matchs = re.search(r'贷款期限(\d+)个月', all_text) | ||
| 217 | if matchs: | ||
| 218 | words = matchs.group(1) | ||
| 219 | for block in self.pdf_info[page_num]['blocks']: | ||
| 220 | if block['type'] != 0: | ||
| 221 | continue | ||
| 222 | for line in block['lines']: | ||
| 223 | for span in line['spans']: | ||
| 224 | bbox, text = span['bbox'], span['text'] | ||
| 225 | if f'{words}个月' in text: | ||
| 226 | loan_term['position'] = bbox | ||
| 227 | loan_term['words'] = words | ||
| 228 | return loan_term | ||
| 229 | |||
| 230 | def get_asp_details(self, page_num): | ||
| 231 | asp_details_table_term = self.item.copy() | ||
| 232 | |||
| 233 | asp_details_table = [] | ||
| 234 | asp_details_text_list = [] | ||
| 235 | table = False | ||
| 236 | for block in self.pdf_info[page_num]['blocks']: | ||
| 237 | if block['type'] != 0: | ||
| 238 | continue | ||
| 239 | for line in block['lines']: | ||
| 240 | for span in line['spans']: | ||
| 241 | bbox, text = span['bbox'], span['text'] | ||
| 242 | if '附加产品融资贷款本金总金额明细' == text: | ||
| 243 | table = True | ||
| 244 | if '第二条' in text or '征信管理' in text: | ||
| 245 | table = False | ||
| 246 | if table == True: | ||
| 247 | asp_details_text_list.append(text) | ||
| 248 | |||
| 249 | for i in range((len(asp_details_text_list)+2)//3): | ||
| 250 | |||
| 251 | line = [] | ||
| 252 | if i == 0: | ||
| 253 | line = [asp_details_text_list[0]] | ||
| 254 | else: | ||
| 255 | for j in range(3): | ||
| 256 | line.append(asp_details_text_list[i*3-2+j]) | ||
| 257 | |||
| 258 | asp_details_table.append(line) | ||
| 259 | |||
| 260 | if len(asp_details_table) > 0: | ||
| 261 | asp_details_table_term['words'] = asp_details_table | ||
| 262 | return asp_details_table_term | ||
| 263 | |||
| 264 | def get_signature(self): | ||
| 265 | signature = self.item.copy() | ||
| 266 | |||
| 267 | for block in self.pdf_info['0']['blocks']: | ||
| 268 | if block['type'] != 0: | ||
| 269 | continue | ||
| 270 | for line in block['lines']: | ||
| 271 | for span in line['spans']: | ||
| 272 | bbox, text = span['bbox'], span['text'] | ||
| 273 | if '签署日期' in text: | ||
| 274 | words = text | ||
| 275 | signature['words'] = words | ||
| 276 | signature['position'] = bbox | ||
| 277 | return signature | ||
| 278 | |||
| 279 | def get_somebody(self, top, bottom): | ||
| 280 | # 指定上下边界后,返回上下边界内的客户信息 | ||
| 281 | _name = self.item.copy() | ||
| 282 | _id = self.item.copy() | ||
| 283 | # 只看第一页,先划定上下边界 | ||
| 284 | y_top = 0 | ||
| 285 | y_bottom = 0 | ||
| 286 | for block in self.pdf_info['1']['blocks']: | ||
| 287 | if block['type'] != 0: | ||
| 288 | continue | ||
| 289 | for line in block['lines']: | ||
| 290 | for span in line['spans']: | ||
| 291 | bbox, text = span['bbox'], span['text'] | ||
| 292 | if top in text: | ||
| 293 | y_top = bbox[3] | ||
| 294 | if bottom in text: | ||
| 295 | y_bottom = bbox[3] | ||
| 296 | for block in self.pdf_info['1']['blocks']: | ||
| 297 | if block['type'] != 0: | ||
| 298 | continue | ||
| 299 | for line in block['lines']: | ||
| 300 | for span in line['spans']: | ||
| 301 | bbox, text = span['bbox'], span['text'] | ||
| 302 | if y_top < bbox[3] < y_bottom: | ||
| 303 | if '姓名/名称' in text: | ||
| 304 | words = text.split(':')[-1] | ||
| 305 | _name['position'] = bbox | ||
| 306 | _name['words'] = words | ||
| 307 | if '自然人身份证件号码/法人执照号码' in text: | ||
| 308 | words = text.split(':')[-1] | ||
| 309 | _id['position'] = bbox | ||
| 310 | _id['words'] = words | ||
| 311 | return _name, _id | ||
| 312 | |||
| 313 | def get_seller(self): | ||
| 314 | seller = self.item.copy() | ||
| 315 | # 先找到 key | ||
| 316 | anchor_bbox = None | ||
| 317 | for block in self.pdf_info['1']['blocks']: | ||
| 318 | if block['type'] != 0: | ||
| 319 | continue | ||
| 320 | for line in block['lines']: | ||
| 321 | for span in line['spans']: | ||
| 322 | bbox, text = span['bbox'], span['text'] | ||
| 323 | if '经销商' == text: | ||
| 324 | anchor_bbox = bbox | ||
| 325 | # 当找到了 key, 则根据 key 去匹配 value | ||
| 326 | if anchor_bbox: | ||
| 327 | half_width = self.pdf_info['1']['width'] * 0.5 | ||
| 328 | for block in self.pdf_info['1']['blocks']: | ||
| 329 | if block['type'] != 0: | ||
| 330 | continue | ||
| 331 | for line in block['lines']: | ||
| 332 | for span in line['spans']: | ||
| 333 | bbox, text = span['bbox'], span['text'] | ||
| 334 | if anchor_bbox[2]<np.mean(bbox[::2])<half_width and \ | ||
| 335 | anchor_bbox[1]<np.mean(bbox[1::2])<anchor_bbox[3]: | ||
| 336 | seller['position'] = bbox | ||
| 337 | seller['words'] = text | ||
| 338 | return seller | ||
| 339 | |||
| 340 | def get_payback_account(self): | ||
| 341 | account = self.item.copy() | ||
| 342 | account_name = self.item.copy() | ||
| 343 | account_bank = self.item.copy() | ||
| 344 | all_text = '' | ||
| 345 | for block in self.pdf_info['1']['blocks']: | ||
| 346 | if block['type'] != 0: | ||
| 347 | continue | ||
| 348 | for line in block['lines']: | ||
| 349 | for span in line['spans']: | ||
| 350 | bbox, text = span['bbox'], span['text'] | ||
| 351 | all_text += text | ||
| 352 | # 首先确定账户信息是哪种,我们只输出非另行通知的格式 | ||
| 353 | if '☑账号' in all_text: | ||
| 354 | all_text = all_text.replace(' ', '') | ||
| 355 | matchs_1 = re.findall(r'账号:(.*)户名', all_text) | ||
| 356 | if matchs_1: | ||
| 357 | words = matchs_1[0] | ||
| 358 | for block in self.pdf_info['1']['blocks']: | ||
| 359 | if block['type'] != 0: | ||
| 360 | continue | ||
| 361 | for line in block['lines']: | ||
| 362 | for span in line['spans']: | ||
| 363 | bbox, text = span['bbox'], span['text'] | ||
| 364 | if f'{words}' in text: | ||
| 365 | account['position'] = bbox | ||
| 366 | account['words'] = words | ||
| 367 | matchs_2 = re.findall(r'户名:(.*)开户行', all_text) | ||
| 368 | if matchs_2: | ||
| 369 | words = matchs_2[0] | ||
| 370 | for block in self.pdf_info['1']['blocks']: | ||
| 371 | if block['type'] != 0: | ||
| 372 | continue | ||
| 373 | for line in block['lines']: | ||
| 374 | for span in line['spans']: | ||
| 375 | bbox, text = span['bbox'], span['text'] | ||
| 376 | if f'{words}' in text: | ||
| 377 | account_name['position'] = bbox | ||
| 378 | account_name['words'] = words | ||
| 379 | matchs_3 = re.findall(r'开户行:(.*);', all_text) | ||
| 380 | if matchs_3: | ||
| 381 | words = matchs_3[0] | ||
| 382 | for block in self.pdf_info['1']['blocks']: | ||
| 383 | if block['type'] != 0: | ||
| 384 | continue | ||
| 385 | for line in block['lines']: | ||
| 386 | for span in line['spans']: | ||
| 387 | bbox, text = span['bbox'], span['text'] | ||
| 388 | if f'开户行:{words};' in text.replace(' ', ''): | ||
| 389 | account_bank['position'] = bbox | ||
| 390 | account_bank['words'] = words | ||
| 391 | return account, account_name, account_bank | ||
| 392 | |||
| 393 | def get_repayment_schedule(self): | ||
| 394 | repayment_schedule = self.item.copy() | ||
| 395 | # 只看第二页 | ||
| 396 | repayment_schedule_table = [] | ||
| 397 | repayment_schedule_text_list = [] | ||
| 398 | table = False | ||
| 399 | for block in self.pdf_info['2']['blocks']: | ||
| 400 | if block['type'] != 0: | ||
| 401 | continue | ||
| 402 | for line in block['lines']: | ||
| 403 | for span in line['spans']: | ||
| 404 | bbox, text = span['bbox'], span['text'] | ||
| 405 | if '序号' == text: | ||
| 406 | table = True | ||
| 407 | if '以上表格中所列的序号并非还款期数' in text: | ||
| 408 | table = False | ||
| 409 | if table == True: | ||
| 410 | repayment_schedule_text_list.append(text) | ||
| 411 | |||
| 412 | for i in range(len(repayment_schedule_text_list)//5): | ||
| 413 | |||
| 414 | line = [] | ||
| 415 | # 5表示5列的意思 | ||
| 416 | for j in range(5): | ||
| 417 | line.append(repayment_schedule_text_list[i*5+j]) | ||
| 418 | |||
| 419 | if str(i+1) == line[1]: | ||
| 420 | break | ||
| 421 | |||
| 422 | repayment_schedule_table.append(line) | ||
| 423 | |||
| 424 | if len(repayment_schedule_table) > 0: | ||
| 425 | repayment_schedule['words'] = repayment_schedule_table | ||
| 426 | return repayment_schedule | ||
| 427 | |||
| 428 | def get_signature_role_1(self): | ||
| 429 | signature_role_1 = self.init_item.copy() | ||
| 430 | # 先定位签字区域 | ||
| 431 | texts = [] | ||
| 432 | boxes = [] | ||
| 433 | page_num = None | ||
| 434 | position = None | ||
| 435 | words = None | ||
| 436 | region = False | ||
| 437 | for i in list(self.pdf_info.keys()): | ||
| 438 | for block in self.pdf_info[i]['blocks']: | ||
| 439 | if block['type'] != 0: | ||
| 440 | continue | ||
| 441 | for line in block['lines']: | ||
| 442 | for span in line['spans']: | ||
| 443 | bbox, text = span['bbox'], span['text'] | ||
| 444 | if '借款人(抵押人)' in text: | ||
| 445 | region = True | ||
| 446 | if '日期' in text: | ||
| 447 | region = False | ||
| 448 | if region == True: | ||
| 449 | page_num = i | ||
| 450 | texts.append(text) | ||
| 451 | boxes.append(bbox) | ||
| 452 | if len(texts) > 4: | ||
| 453 | words = '有' | ||
| 454 | else: | ||
| 455 | words = '无' | ||
| 456 | boxes = np.array(boxes).reshape((-1, 2)) | ||
| 457 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | ||
| 458 | signature_role_1['page_num'] = page_num | ||
| 459 | signature_role_1['position'] = position | ||
| 460 | signature_role_1['words'] = words | ||
| 461 | return signature_role_1 | ||
| 462 | |||
| 463 | def get_signature_role_2(self): | ||
| 464 | signature_role_2 = self.init_item.copy() | ||
| 465 | # 先定位签字区域 | ||
| 466 | texts = [] | ||
| 467 | boxes = [] | ||
| 468 | page_num = None | ||
| 469 | position = None | ||
| 470 | words = None | ||
| 471 | region = False | ||
| 472 | for i in list(self.pdf_info.keys()): | ||
| 473 | for block in self.pdf_info[i]['blocks']: | ||
| 474 | if block['type'] != 0: | ||
| 475 | continue | ||
| 476 | for line in block['lines']: | ||
| 477 | for span in line['spans']: | ||
| 478 | bbox, text = span['bbox'], span['text'] | ||
| 479 | if '共同借款人(共同抵押人)' in text: | ||
| 480 | region = True | ||
| 481 | if '日期' in text: | ||
| 482 | region = False | ||
| 483 | if region == True: | ||
| 484 | page_num = i | ||
| 485 | texts.append(text) | ||
| 486 | boxes.append(bbox) | ||
| 487 | if len(texts) > 4: | ||
| 488 | words = '有' | ||
| 489 | else: | ||
| 490 | words = '无' | ||
| 491 | boxes = np.array(boxes).reshape((-1, 2)) | ||
| 492 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | ||
| 493 | signature_role_2['page_num'] = page_num | ||
| 494 | signature_role_2['position'] = position | ||
| 495 | signature_role_2['words'] = words | ||
| 496 | return signature_role_2 | ||
| 497 | |||
| 498 | def get_signature_role_3(self): | ||
| 499 | signature_role_3 = self.init_item.copy() | ||
| 500 | # 先定位签字区域 | ||
| 501 | texts = [] | ||
| 502 | boxes = [] | ||
| 503 | page_num = None | ||
| 504 | position = None | ||
| 505 | words = None | ||
| 506 | region = False | ||
| 507 | for i in list(self.pdf_info.keys()): | ||
| 508 | for block in self.pdf_info[i]['blocks']: | ||
| 509 | if block['type'] != 0: | ||
| 510 | continue | ||
| 511 | for line in block['lines']: | ||
| 512 | for span in line['spans']: | ||
| 513 | bbox, text = span['bbox'], span['text'] | ||
| 514 | if '保证人1' in text and int(i) != 0: | ||
| 515 | region = True | ||
| 516 | if '日期' in text: | ||
| 517 | region = False | ||
| 518 | if region == True: | ||
| 519 | page_num = i | ||
| 520 | texts.append(text) | ||
| 521 | boxes.append(bbox) | ||
| 522 | if len(texts) > 4: | ||
| 523 | words = '有' | ||
| 524 | else: | ||
| 525 | words = '无' | ||
| 526 | boxes = np.array(boxes).reshape((-1, 2)) | ||
| 527 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | ||
| 528 | signature_role_3['page_num'] = page_num | ||
| 529 | signature_role_3['position'] = position | ||
| 530 | signature_role_3['words'] = words | ||
| 531 | return signature_role_3 | ||
| 532 | |||
| 533 | def get_signature_role_4(self): | ||
| 534 | signature_role_4 = self.init_item.copy() | ||
| 535 | # 先定位签字区域 | ||
| 536 | texts = [] | ||
| 537 | boxes = [] | ||
| 538 | page_num = None | ||
| 539 | position = None | ||
| 540 | words = None | ||
| 541 | region = False | ||
| 542 | for i in list(self.pdf_info.keys()): | ||
| 543 | for block in self.pdf_info[i]['blocks']: | ||
| 544 | if block['type'] != 0: | ||
| 545 | continue | ||
| 546 | for line in block['lines']: | ||
| 547 | for span in line['spans']: | ||
| 548 | bbox, text = span['bbox'], span['text'] | ||
| 549 | if '保证人2' in text and int(i) != 0: | ||
| 550 | region = True | ||
| 551 | if '日期' in text: | ||
| 552 | region = False | ||
| 553 | if region == True: | ||
| 554 | page_num = i | ||
| 555 | texts.append(text) | ||
| 556 | boxes.append(bbox) | ||
| 557 | if len(texts) > 4: | ||
| 558 | words = '有' | ||
| 559 | else: | ||
| 560 | words = '无' | ||
| 561 | boxes = np.array(boxes).reshape((-1, 2)) | ||
| 562 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | ||
| 563 | signature_role_4['page_num'] = page_num | ||
| 564 | signature_role_4['position'] = position | ||
| 565 | signature_role_4['words'] = words | ||
| 566 | return signature_role_4 | ||
| 567 | |||
| 568 | def get_signature_role_5(self): | ||
| 569 | signature_role_5 = self.init_item.copy() | ||
| 570 | # 先定位签字区域 | ||
| 571 | texts = [] | ||
| 572 | boxes = [] | ||
| 573 | page_num = None | ||
| 574 | position = None | ||
| 575 | words = None | ||
| 576 | region = False | ||
| 577 | for i in list(self.pdf_info.keys()): | ||
| 578 | for block in self.pdf_info[i]['blocks']: | ||
| 579 | if block['type'] != 0: | ||
| 580 | continue | ||
| 581 | for line in block['lines']: | ||
| 582 | for span in line['spans']: | ||
| 583 | bbox, text = span['bbox'], span['text'] | ||
| 584 | if '见证人签字' in text and int(i) != 0: | ||
| 585 | region = True | ||
| 586 | if '年' in text: | ||
| 587 | region = False | ||
| 588 | if region == True: | ||
| 589 | page_num = i | ||
| 590 | texts.append(text) | ||
| 591 | boxes.append(bbox) | ||
| 592 | print(texts) | ||
| 593 | if len(texts) > 4: | ||
| 594 | words = '有' | ||
| 595 | else: | ||
| 596 | words = '无' | ||
| 597 | boxes = np.array(boxes).reshape((-1, 2)) | ||
| 598 | position = [min(boxes[:,0]), min(boxes[:,1]), max(boxes[:,0]), max(boxes[:,1])] | ||
| 599 | signature_role_5['page_num'] = page_num | ||
| 600 | signature_role_5['position'] = position | ||
| 601 | signature_role_5['words'] = words | ||
| 602 | return signature_role_5 | ||
| 603 | |||
| 604 | def get_last_page_signature(self, page_num, top, bottom): | ||
| 605 | signature_name = self.item.copy() | ||
| 606 | signature_date = self.item.copy() | ||
| 607 | anchor_top = None | ||
| 608 | anchor_bottom = None | ||
| 609 | for block in self.pdf_info[page_num]['blocks']: | ||
| 610 | if block['type'] != 0: | ||
| 611 | continue | ||
| 612 | for line in block['lines']: | ||
| 613 | for span in line['spans']: | ||
| 614 | bbox, text = span['bbox'], span['text'] | ||
| 615 | if top in text: | ||
| 616 | anchor_top = bbox[1] | ||
| 617 | if bottom in text: | ||
| 618 | anchor_bottom = bbox[1] | ||
| 619 | if anchor_top is not None and anchor_bottom is not None: | ||
| 620 | for block in self.pdf_info[page_num]['blocks']: | ||
| 621 | if block['type'] != 0: | ||
| 622 | continue | ||
| 623 | for line in block['lines']: | ||
| 624 | for span in line['spans']: | ||
| 625 | bbox, text = span['bbox'], span['text'] | ||
| 626 | if '签署日期' in text and int(anchor_top)<np.mean(bbox[1::2])<int(anchor_bottom): | ||
| 627 | name = text.split(' ')[0] | ||
| 628 | date = text.split(':')[-1] | ||
| 629 | signature_name['words'] = name | ||
| 630 | signature_name['position'] = bbox | ||
| 631 | signature_date['words'] = date | ||
| 632 | signature_name['position'] = bbox | ||
| 633 | return signature_name, signature_date | ||
| 634 | |||
| 635 | def get_info(self): | ||
| 636 | """ | ||
| 637 | block['type'] == 0 : 表示该元素为图片 | ||
| 638 | |||
| 639 | Returns: | ||
| 640 | dict: Description | ||
| 641 | """ | ||
| 642 | |||
| 643 | # 先判断是否为 ASP 产品 | ||
| 644 | # 只看第一页,判断是否有 '附加产品融资贷款本金总金额' 这一句话,若有则为 ASP 产品 | ||
| 645 | # print(self.pdf_info['0']['blocks']) | ||
| 646 | for block in self.pdf_info['0']['blocks']: | ||
| 647 | if block['type'] != 0: | ||
| 648 | continue | ||
| 649 | for line in block['lines']: | ||
| 650 | for span in line['spans']: | ||
| 651 | bbox, text = span['bbox'], span['text'] | ||
| 652 | if '附加产品融资贷款本金总金额' == text: | ||
| 653 | self.is_asp = True | ||
| 654 | |||
| 655 | self.gen_init_result(self.is_asp) | ||
| 656 | |||
| 657 | # Page 1 | ||
| 658 | # 找合同编号 | ||
| 659 | contract_no = self.get_contract_no(page_num='0') | ||
| 660 | self.init_result['page_1']['合同编号'] = contract_no | ||
| 661 | # 所购车辆价格 | ||
| 662 | vehicle_price = self.get_vehicle_price() | ||
| 663 | self.init_result['page_1']['所购车辆价格'] = vehicle_price | ||
| 664 | # 车架号 | ||
| 665 | vin = self.get_vin() | ||
| 666 | self.init_result['page_1']['车架号'] = vehicle_price | ||
| 667 | # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目 | ||
| 668 | upper, lower, asp_1, asp_2 = self.get_loan_principal() | ||
| 669 | self.init_result['page_1']['贷款本金金额']['大写'] = upper | ||
| 670 | self.init_result['page_1']['贷款本金金额']['小写'] = lower | ||
| 671 | self.init_result['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1 | ||
| 672 | self.init_result['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2 | ||
| 673 | # 贷款期限 | ||
| 674 | loan_term = self.get_loan_term() | ||
| 675 | self.init_result['page_1']['贷款期限'] = loan_term | ||
| 676 | # 附加产品融资贷款本金总金额明细(ASP-表格) | ||
| 677 | asp_details_table = self.get_asp_details(page_num='0') | ||
| 678 | self.init_result['page_1']['附加产品融资贷款本金总金额明细'] = asp_details_table | ||
| 679 | # 借款人签字及时间 | ||
| 680 | signature = self.get_signature() | ||
| 681 | self.init_result['page_1']['借款人签字及时间'] = signature | ||
| 682 | ####################################### | ||
| 683 | # Page 2 | ||
| 684 | # 找合同编号 | ||
| 685 | contract_no = self.get_contract_no(page_num='0') | ||
| 686 | self.init_result['page_2']['合同编号'] = contract_no | ||
| 687 | # 找借款人及抵押人(地址字段原本有空格) | ||
| 688 | borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:') | ||
| 689 | self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name | ||
| 690 | self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id | ||
| 691 | # 找共同借款人及共同抵押人 | ||
| 692 | co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人及共同抵押人:', bottom='保证人1:') | ||
| 693 | self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name | ||
| 694 | self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id | ||
| 695 | # 保证人1 | ||
| 696 | first_guarantor_name, first_guarantor_id = self.get_somebody(top='保证人1:', bottom='保证人2:') | ||
| 697 | self.init_result['page_2']['保证人1']['name'] = first_guarantor_name | ||
| 698 | self.init_result['page_2']['保证人1']['id'] = first_guarantor_id | ||
| 699 | # 保证人2 | ||
| 700 | second_guarantor_name, second_guarantor_id = self.get_somebody(top='保证人2:', bottom='第一章') | ||
| 701 | self.init_result['page_2']['保证人2']['name'] = second_guarantor_name | ||
| 702 | self.init_result['page_2']['保证人2']['id'] = second_guarantor_id | ||
| 703 | # 所购车辆价格 | ||
| 704 | vehicle_price = self.get_vehicle_price(page_num='1') | ||
| 705 | self.init_result['page_2']['所购车辆价格'] = vehicle_price | ||
| 706 | # 车架号 | ||
| 707 | vin = self.get_vin(page_num='1') | ||
| 708 | self.init_result['page_2']['车架号'] = vin | ||
| 709 | # 经销商 | ||
| 710 | seller = self.get_seller() | ||
| 711 | self.init_result['page_2']['经销商'] = seller | ||
| 712 | # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目 | ||
| 713 | upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1') | ||
| 714 | self.init_result['page_2']['贷款本金金额']['大写'] = upper | ||
| 715 | self.init_result['page_2']['贷款本金金额']['小写'] = lower | ||
| 716 | self.init_result['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1 | ||
| 717 | self.init_result['page_2']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2 | ||
| 718 | # 贷款期限 | ||
| 719 | loan_term = self.get_loan_term(page_num='1') | ||
| 720 | self.init_result['page_2']['贷款期限'] = loan_term | ||
| 721 | # 还款账户 | ||
| 722 | account, account_name, account_bank = self.get_payback_account() | ||
| 723 | self.init_result['page_2']['还款账户']['账号'] = account | ||
| 724 | self.init_result['page_2']['还款账户']['户名'] = account_name | ||
| 725 | self.init_result['page_2']['还款账户']['开户行'] = account_bank | ||
| 726 | ####################################### | ||
| 727 | # Page 3 | ||
| 728 | # 找合同编号 | ||
| 729 | contract_no = self.get_contract_no(page_num='2') | ||
| 730 | self.init_result['page_3']['合同编号'] = contract_no | ||
| 731 | # 还款计划表(表格) | ||
| 732 | repayment_schedule_table = self.get_repayment_schedule() | ||
| 733 | self.init_result['page_3']['还款计划表'] = repayment_schedule_table | ||
| 734 | ####################################### | ||
| 735 | # Page 4 | ||
| 736 | # 找合同编号 | ||
| 737 | contract_no = self.get_contract_no(page_num='3') | ||
| 738 | self.init_result['page_4']['合同编号'] = contract_no | ||
| 739 | # 附加产品融资贷款本金总金额明细(ASP-表格) | ||
| 740 | asp_details_table = self.get_asp_details(page_num='3') | ||
| 741 | self.init_result['page_4']['附加产品融资贷款本金总金额明细'] = asp_details_table | ||
| 742 | ####################################### | ||
| 743 | # Page 5 | ||
| 744 | # 找合同编号 | ||
| 745 | contract_no = self.get_contract_no(page_num='4') | ||
| 746 | self.init_result['page_5']['合同编号'] = contract_no | ||
| 747 | ####################################### | ||
| 748 | # Page 6 | ||
| 749 | # 找合同编号 | ||
| 750 | contract_no = self.get_contract_no(page_num='5') | ||
| 751 | self.init_result['page_6']['合同编号'] = contract_no | ||
| 752 | if self.is_asp == False: | ||
| 753 | # Page 7 | ||
| 754 | # 找合同编号 | ||
| 755 | contract_no = self.get_contract_no(page_num='6') | ||
| 756 | self.init_result['page_7']['合同编号'] = contract_no | ||
| 757 | signature_name, signature_date = self.get_last_page_signature(page_num='6', | ||
| 758 | top='借款人(抵押人)', bottom='共同借款人(共同抵押人)') | ||
| 759 | self.init_result['page_7']['主借人签字']['签字'] = signature_name | ||
| 760 | self.init_result['page_7']['主借人签字']['日期'] = signature_date | ||
| 761 | signature_name, signature_date = self.get_last_page_signature(page_num='6', | ||
| 762 | top='共同借款人(共同抵押人)', bottom='保证人1') | ||
| 763 | self.init_result['page_7']['共借人签字']['签字'] = signature_name | ||
| 764 | self.init_result['page_7']['共借人签字']['日期'] = signature_date | ||
| 765 | signature_name, signature_date = self.get_last_page_signature(page_num='6', | ||
| 766 | top='保证人1', bottom='保证人2') | ||
| 767 | self.init_result['page_7']['保证人1签字']['签字'] = signature_name | ||
| 768 | self.init_result['page_7']['保证人1签字']['日期'] = signature_date | ||
| 769 | signature_name, signature_date = self.get_last_page_signature(page_num='6', | ||
| 770 | top='保证人2', bottom='在本人面前亲笔签署本合同') | ||
| 771 | self.init_result['page_7']['保证人2签字']['签字'] = signature_name | ||
| 772 | self.init_result['page_7']['保证人2签字']['日期'] = signature_date | ||
| 773 | signature_name, signature_date = self.get_last_page_signature(page_num='6', | ||
| 774 | top='在本人面前亲笔签署本合同', bottom='(以下无正文)') | ||
| 775 | self.init_result['page_7']['见证人签字']['签字'] = signature_name | ||
| 776 | self.init_result['page_7']['见证人签字']['日期'] = signature_date | ||
| 777 | else: | ||
| 778 | # Page 7 | ||
| 779 | # 找合同编号 | ||
| 780 | contract_no = self.get_contract_no(page_num='6') | ||
| 781 | self.init_result['page_7']['合同编号'] = contract_no | ||
| 782 | # Page 8 | ||
| 783 | # 找合同编号 | ||
| 784 | contract_no = self.get_contract_no(page_num='7') | ||
| 785 | self.init_result['page_8']['合同编号'] = contract_no | ||
| 786 | signature_name, signature_date = self.get_last_page_signature(page_num='7', | ||
| 787 | top='借款人(抵押人)', bottom='共同借款人(共同抵押人)') | ||
| 788 | self.init_result['page_8']['主借人签字']['签字'] = signature_name | ||
| 789 | self.init_result['page_8']['主借人签字']['日期'] = signature_date | ||
| 790 | signature_name, signature_date = self.get_last_page_signature(page_num='7', | ||
| 791 | top='共同借款人(共同抵押人)', bottom='保证人1') | ||
| 792 | self.init_result['page_8']['共借人签字']['签字'] = signature_name | ||
| 793 | self.init_result['page_8']['共借人签字']['日期'] = signature_date | ||
| 794 | signature_name, signature_date = self.get_last_page_signature(page_num='7', | ||
| 795 | top='保证人1', bottom='保证人2') | ||
| 796 | self.init_result['page_8']['保证人1签字']['签字'] = signature_name | ||
| 797 | self.init_result['page_8']['保证人1签字']['日期'] = signature_date | ||
| 798 | signature_name, signature_date = self.get_last_page_signature(page_num='7', | ||
| 799 | top='保证人2', bottom='在本人面前亲笔签署本合同') | ||
| 800 | self.init_result['page_8']['保证人2签字']['签字'] = signature_name | ||
| 801 | self.init_result['page_8']['保证人2签字']['日期'] = signature_date | ||
| 802 | signature_name, signature_date = self.get_last_page_signature(page_num='7', | ||
| 803 | top='在本人面前亲笔签署本合同', bottom='(以下无正文)') | ||
| 804 | self.init_result['page_8']['见证人签字']['签字'] = signature_name | ||
| 805 | self.init_result['page_8']['见证人签字']['日期'] = signature_date | ||
| 806 | |||
| 807 | # 重新定制输出 | ||
| 808 | new_results = {"is_asp": self.is_asp, | ||
| 809 | "page_info": self.init_result | ||
| 810 | } | ||
| 811 | return new_results |
| 1 | # -*- coding: utf-8 -*- | ||
| 2 | # @Author : lk | ||
| 3 | # @Email : 9428.al@gmail.com | ||
| 4 | # @Create Date : 2021-07-20 16:42:41 | ||
| 5 | # @Last Modified : 2021-10-28 17:41:00 | ||
| 6 | # @Description : | ||
| 7 | |||
| 8 | import re | ||
| 9 | import cv2 | ||
| 10 | import base64 | ||
| 11 | import numpy as np | ||
| 12 | from fuzzywuzzy import fuzz | ||
| 13 | |||
| 14 | |||
| 15 | class Finder: | ||
| 16 | |||
| 17 | def __init__(self, pdf_info): | ||
| 18 | self.pdf_info = pdf_info | ||
| 19 | self.item = {"words": None, | ||
| 20 | "page": None, | ||
| 21 | "position": None, | ||
| 22 | } | ||
| 23 | # 格式化算法输出 | ||
| 24 | self.init_result = {"合同编号": self.item, | ||
| 25 | "承租人-姓名": self.item, | ||
| 26 | "承租人-证件号码": self.item, | ||
| 27 | "承租人-法定代表人或授权代表": self.item, | ||
| 28 | "保证人1-姓名": self.item, | ||
| 29 | "保证人1-证件号码": self.item, | ||
| 30 | "保证人1-法定代表人或授权代表": self.item, | ||
| 31 | "保证人2-姓名": self.item, | ||
| 32 | "保证人2-证件号码": self.item, | ||
| 33 | "保证人2-法定代表人或授权代表": self.item, | ||
| 34 | "保证人3-姓名": self.item, | ||
| 35 | "保证人3-证件号码": self.item, | ||
| 36 | "保证人3-法定代表人或授权代表": self.item, | ||
| 37 | "合同编号(正文)": self.item, | ||
| 38 | "车辆识别代码": self.item, | ||
| 39 | "车辆卖方(经销商)": self.item, | ||
| 40 | "车辆原始销售价格(《机动车销售统一发票》所列金额)": self.item, | ||
| 41 | "车辆附加产品明细表": self.item, | ||
| 42 | "融资成本总额": self.item, | ||
| 43 | "租期": self.item, | ||
| 44 | "付款计划表": self.item, | ||
| 45 | "银行账户-户名": self.item, | ||
| 46 | "银行账户-银行账号": self.item, | ||
| 47 | "银行账户-开户行": self.item, | ||
| 48 | "签字页-承租人姓名": self.item, | ||
| 49 | "签字页-承租人签章": self.item, | ||
| 50 | "签字页-保证人1姓名": self.item, | ||
| 51 | "签字页-保证人1签章": self.item, | ||
| 52 | "签字页-保证人2姓名": self.item, | ||
| 53 | "签字页-保证人2签章": self.item, | ||
| 54 | "签字页-保证人3姓名": self.item, | ||
| 55 | "签字页-保证人3签章": self.item, | ||
| 56 | } | ||
| 57 | |||
| 58 | # 格式化输出 车辆处置协议 要是别的字段 | ||
| 59 | self.init_result_1 = {"合同编号": self.item, | ||
| 60 | "承租人-姓名": self.item, | ||
| 61 | "承租人-证件号码": self.item, | ||
| 62 | "销售经销商": self.item, | ||
| 63 | "合同编号(正文)": self.item, | ||
| 64 | "签字页-承租人姓名": self.item, | ||
| 65 | "签字页-承租人证件号码": self.item, | ||
| 66 | "签字页-承租人签章": self.item, | ||
| 67 | "签字页-销售经销商": self.item, | ||
| 68 | "签字页-销售经销商签章": self.item, | ||
| 69 | |||
| 70 | } | ||
| 71 | |||
| 72 | # 格式化输出 车辆租赁抵押合同 | ||
| 73 | self.init_result_2 = {"合同编号": self.item, | ||
| 74 | "合同编号(正文)": self.item, | ||
| 75 | "抵押人姓名/名称": self.item, | ||
| 76 | "抵押人证件号码": self.item, | ||
| 77 | "车辆识别代码": self.item, | ||
| 78 | "租金总额": self.item, | ||
| 79 | "融资租赁期限": self.item, | ||
| 80 | "签字页-抵押人姓名": self.item, | ||
| 81 | "签字页-抵押人签章": self.item, | ||
| 82 | "签字页-抵押人配偶姓名": self.item, | ||
| 83 | "签字页-抵押人配偶签章": self.item, | ||
| 84 | } | ||
| 85 | |||
| 86 | def get_contract_no(self, page_num): | ||
| 87 | """传入页码,查看该页码右上角的编号 | ||
| 88 | |||
| 89 | Args: | ||
| 90 | page_num (string): | ||
| 91 | |||
| 92 | Returns: | ||
| 93 | sting: | ||
| 94 | """ | ||
| 95 | contract_no = self.item.copy() | ||
| 96 | # 只看第一页 | ||
| 97 | for block in self.pdf_info[page_num]['blocks']: | ||
| 98 | if block['type'] != 0: | ||
| 99 | continue | ||
| 100 | for line in block['lines']: | ||
| 101 | for span in line['spans']: | ||
| 102 | bbox, text = span['bbox'], span['text'] | ||
| 103 | if '合同编号:' in text: | ||
| 104 | words = text.split(':')[-1] | ||
| 105 | contract_no['position'] = bbox | ||
| 106 | contract_no['page'] = page_num | ||
| 107 | contract_no['words'] = words | ||
| 108 | if contract_no['words'] == '': | ||
| 109 | for block in self.pdf_info[page_num]['blocks']: | ||
| 110 | if block['type'] != 0: | ||
| 111 | continue | ||
| 112 | for line in block['lines']: | ||
| 113 | for span in line['spans']: | ||
| 114 | bbox, text = span['bbox'], span['text'] | ||
| 115 | if bbox[1] < contract_no['position'][3] and 'CH' in text: | ||
| 116 | contract_no['position'] = bbox | ||
| 117 | contract_no['page'] = page_num | ||
| 118 | contract_no['words'] = text | ||
| 119 | return contract_no | ||
| 120 | |||
| 121 | def get_vehicle_price(self, page_num='0'): | ||
| 122 | vehicle_price = self.item.copy() | ||
| 123 | for block in self.pdf_info[page_num]['blocks']: | ||
| 124 | if block['type'] != 0: | ||
| 125 | continue | ||
| 126 | for line in block['lines']: | ||
| 127 | for span in line['spans']: | ||
| 128 | bbox, text = span['bbox'], span['text'] | ||
| 129 | if '所购车辆价格为人民币' in text: | ||
| 130 | words = text.split('币')[-1] | ||
| 131 | vehicle_price['position'] = bbox | ||
| 132 | vehicle_price['words'] = words | ||
| 133 | return vehicle_price | ||
| 134 | |||
| 135 | def get_contract_no_one(self): | ||
| 136 | # 查找正文中的合同编号,有可能存在换行的情况 | ||
| 137 | contract_no = self.item.copy() | ||
| 138 | for pno in self.pdf_info: | ||
| 139 | all_text = '' | ||
| 140 | for block in self.pdf_info[pno]['blocks']: | ||
| 141 | if block['type'] != 0: | ||
| 142 | continue | ||
| 143 | for line in block['lines']: | ||
| 144 | for span in line['spans']: | ||
| 145 | bbox, text = span['bbox'], span['text'] | ||
| 146 | all_text += text | ||
| 147 | all_text = all_text.replace(' ', '') | ||
| 148 | matchObj = re.search(r'(合同编号:\[(.*?)\])', all_text) | ||
| 149 | if matchObj: | ||
| 150 | words = matchObj.group(1) | ||
| 151 | contract_no['position'] = None | ||
| 152 | contract_no['page'] = pno | ||
| 153 | contract_no['words'] = words | ||
| 154 | return contract_no | ||
| 155 | |||
| 156 | matchObj = re.search(r'编号为(.*?)的', all_text) | ||
| 157 | if matchObj: | ||
| 158 | words = matchObj.group(1).strip() | ||
| 159 | contract_no['position'] = None | ||
| 160 | contract_no['page'] = pno | ||
| 161 | contract_no['words'] = words | ||
| 162 | return contract_no | ||
| 163 | |||
| 164 | matchObj = re.search(r'编号为(.*?))的', all_text) | ||
| 165 | if matchObj: | ||
| 166 | words = matchObj.group(1).strip() | ||
| 167 | contract_no['position'] = None | ||
| 168 | contract_no['page'] = pno | ||
| 169 | contract_no['words'] = words | ||
| 170 | return contract_no | ||
| 171 | |||
def get_key_value(self, key, page_num=None):
    """Return the value that follows ``key`` inside a single span.

    Args:
        key: Substring to look for in each span's text.
        page_num: Page key to restrict the search to; when None every
            page of ``self.pdf_info`` is searched.

    Returns:
        A copy of ``self.item``. For the last span containing ``key``,
        ``words`` is the text after the span's final full-width colon,
        ``position`` its bbox and ``page`` its page key. Unfilled when no
        span contains ``key``.
    """
    found = self.item.copy()
    # A specific page and "all pages" share the same scan; only the set of
    # page keys differs.
    pages = self.pdf_info if page_num is None else [page_num]
    for pno in pages:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    content = span['text']
                    if key not in content:
                        continue
                    found['position'] = span['bbox']
                    found['page'] = pno
                    found['words'] = content.split(':')[-1]
    return found
| 202 | |||
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from one page.

    Args:
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        A 4-tuple of copies of ``self.item``:
          * ``upper``: the amount written in Chinese capital numerals,
            taken from spans whose fuzzy similarity to the numeral
            alphabet exceeds 15;
          * ``lower``: the amount following '小写:¥';
          * ``asp_1`` / ``asp_2``: the bracketed amounts after
            '人民币:小写:' located above / below the span whose text is
            exactly '附加产品融资贷款本金总金额'.
        Items that are not found are returned unfilled.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    numeral_alphabet = ''.join(chinese_keywords)
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()
    anchor_bbox = None

    spans = [
        span
        for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    for span in spans:
        bbox, text = span['bbox'], span['text']
        if fuzz.ratio(numeral_alphabet, text) > 15:
            # NOTE: ``text`` is deliberately rebound, as in the original
            # code, so the checks below see the trimmed value.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for span in spans:
            bbox, text = span['bbox'], span['text']
            if '人民币:小写:' not in text:
                continue
            amount = re.search(r'人民币:小写:\[(.*)\]', text)
            if amount is None:
                # Marker without a bracketed amount: skip it instead of
                # raising IndexError the way ``findall(...)[0]`` did.
                continue
            span_y = np.mean(bbox[1::2])
            if span_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = amount.group(1)
            if span_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = amount.group(1)
    return upper, lower, asp_1, asp_2
| 243 | |||
def get_loan_term(self, page_num='0'):
    """Extract the loan term in months from one page.

    The page text is joined first so that '贷款期限<N>个月' is recognised
    even when it is split across spans; the position is then taken from
    the last span that contains '<N>个月'.

    Args:
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        A copy of ``self.item`` with ``words`` set to the digits of the
        term and ``position`` to the bbox of the locating span; unfilled
        when the phrase is absent or no single span contains '<N>个月'.
    """
    loan_term = self.item.copy()
    spans = [
        span
        for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    joined = ''.join(span['text'] for span in spans)
    hit = re.search(r'贷款期限(\d+)个月', joined)
    if hit is None:
        return loan_term

    months = hit.group(1)
    needle = f'{months}个月'
    for span in spans:
        if needle in span['text']:
            loan_term['position'] = span['bbox']
            loan_term['words'] = months
    return loan_term
| 267 | |||
def get_asp_details(self, page_num):
    """Collect the add-on product financing detail table of one page.

    Capturing starts at the span whose text is exactly
    '附加产品融资贷款本金总金额明细' (that span is included) and stops at
    the first span mentioning '第二条' or '征信管理'.

    Args:
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        A copy of ``self.item`` whose ``words`` is a list of rows: the
        first row holds only the title cell, every following row holds
        three consecutive cells. Trailing cells that do not fill a row are
        dropped. ``words`` is left untouched when nothing was captured.
    """
    asp_details_table_term = self.item.copy()

    captured = []
    inside_table = False
    for block in self.pdf_info[page_num]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                cell = span['text']
                if cell == '附加产品融资贷款本金总金额明细':
                    inside_table = True
                if '第二条' in cell or '征信管理' in cell:
                    inside_table = False
                if inside_table:
                    captured.append(cell)

    # Row 0 is the title alone; row k (k >= 1) is cells 3k-2 .. 3k.
    row_count = (len(captured) + 2) // 3
    rows = [
        captured[:1] if idx == 0 else captured[idx * 3 - 2: idx * 3 + 1]
        for idx in range(row_count)
    ]

    if rows:
        asp_details_table_term['words'] = rows
    return asp_details_table_term
| 301 | |||
def get_signature(self):
    """Return the signing-date span found on page '0'.

    Returns:
        A copy of ``self.item`` whose ``words`` and ``position`` come from
        the last span on page '0' containing '签署日期'; unfilled when no
        such span exists.
    """
    signature = self.item.copy()
    text_spans = (
        span
        for block in self.pdf_info['0']['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for span in text_spans:
        if '签署日期' not in span['text']:
            continue
        signature['words'] = span['text']
        signature['position'] = span['bbox']
    return signature
| 316 | |||
def get_somebody(self, top, bottom):
    """Return the customer name and ID found between two marker spans.

    Only the page stored under key '1' is inspected. The bottom edge
    (``bbox[3]``) of the last span containing ``top`` / ``bottom`` defines
    the vertical window; both default to 0 when the marker is missing.

    Args:
        top: Text marking the upper boundary.
        bottom: Text marking the lower boundary.

    Returns:
        ``(_name, _id)``, two copies of ``self.item`` filled from the
        spans inside the window that contain '姓名/名称' and
        '自然人身份证件号码/法人执照号码' respectively.
    """
    _name = self.item.copy()
    _id = self.item.copy()
    spans = [
        span
        for block in self.pdf_info['1']['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # First pass: fix the upper and lower boundaries.
    y_top = 0
    y_bottom = 0
    for span in spans:
        lower_edge = span['bbox'][3]
        if top in span['text']:
            y_top = lower_edge
        if bottom in span['text']:
            y_bottom = lower_edge

    # Second pass: read the fields lying strictly inside the window.
    for span in spans:
        box, content = span['bbox'], span['text']
        if not (y_top < box[3] < y_bottom):
            continue
        if '姓名/名称' in content:
            _name['position'] = box
            _name['words'] = content.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in content:
            _id['position'] = box
            _id['words'] = content.split(':')[-1]
    return _name, _id
| 350 | |||
def get_seller(self):
    """Return the dealer name printed to the right of the '经销商' label.

    Works on the page stored under key '1'. The label is the last span
    whose text is exactly '经销商'; the value is the last span whose
    horizontal centre lies between the label's right edge and the middle
    of the page, and whose vertical centre lies within the label's
    vertical extent.

    Returns:
        A copy of ``self.item`` with ``words`` and ``position`` of the
        value span; unfilled when the label or value is missing.
    """
    seller = self.item.copy()
    page = self.pdf_info['1']
    spans = [
        span
        for block in page['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Locate the key first.
    label_box = None
    for span in spans:
        if span['text'] == '经销商':
            label_box = span['bbox']
    if not label_box:
        return seller

    # With the key found, match the value by geometry.
    middle_x = page['width'] * 0.5
    for span in spans:
        box = span['bbox']
        right_of_label = label_box[2] < np.mean(box[::2]) < middle_x
        if right_of_label and label_box[1] < np.mean(box[1::2]) < label_box[3]:
            seller['position'] = box
            seller['words'] = span['text']
    return seller
| 377 | |||
def get_payback_account(self):
    """Extract the repayment account number, holder and bank from page '1'.

    Only the ticked variant ('☑账号' present in the page text) is
    reported; the "to be notified separately" variant yields unfilled
    items. Values are parsed from the space-stripped page text and then
    located by looking for a span containing the parsed value.

    Returns:
        ``(account, account_name, account_bank)``, three copies of
        ``self.item``. Each one gets ``words`` and ``position`` only when
        a span containing the parsed value is found (last such span wins).
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()
    spans = [
        span
        for block in self.pdf_info['1']['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    all_text = ''.join(span['text'] for span in spans)

    # Determine which account format is used; only the non-"notify later"
    # format is output.
    if '☑账号' not in all_text:
        return account, account_name, account_bank
    all_text = all_text.replace(' ', '')

    def mark(target, value, is_hit):
        # Record ``value`` on ``target`` for every span accepted by
        # ``is_hit``; the last accepted span provides the position.
        for span in spans:
            if is_hit(span['text']):
                target['position'] = span['bbox']
                target['words'] = value

    numbers = re.findall(r'账号:(.*)户名', all_text)
    if numbers:
        mark(account, numbers[0], lambda text: f'{numbers[0]}' in text)

    holders = re.findall(r'户名:(.*)开户行', all_text)
    if holders:
        mark(account_name, holders[0], lambda text: f'{holders[0]}' in text)

    banks = re.findall(r'开户行:(.*);', all_text)
    if banks:
        mark(account_bank, banks[0],
             lambda text: f'开户行:{banks[0]};' in text.replace(' ', ''))
    return account, account_name, account_bank
| 430 | |||
def get_repayment_schedule(self):
    """Collect the repayment schedule table across all pages.

    Cells are captured after a span containing '61.' (that span itself is
    excluded) until a span containing '以上表格中所列序号' is reached.
    The captured cells are grouped four per row and prefixed with a
    running '<n>.' index.

    Returns:
        A copy of ``self.item`` whose ``words`` is the table (header row
        first) and whose ``page`` is the key of the last page where the
        '61.' marker was seen, or None.
    """
    repayment_schedule = self.item.copy()

    cells = []
    collecting = False
    page = None
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    content = span['text']
                    # Order matters: the end marker is never captured and
                    # the start marker only arms capture for later spans.
                    if '以上表格中所列序号' in content:
                        collecting = False
                    if collecting:
                        cells.append(content)
                    if '61.' in content:
                        page = pno
                        collecting = True

    table = [['序号', '融资租赁成本', '融资租赁费用', '租金', '剩余融资租赁成本']]
    # Four value columns per row; incomplete trailing cells are ignored.
    for row_no in range(1, len(cells) // 4 + 1):
        table.append([f'{row_no}.'] + cells[(row_no - 1) * 4: row_no * 4])

    repayment_schedule['words'] = table
    repayment_schedule['page'] = page
    return repayment_schedule
| 464 | |||
def get_signature_role_1(self):
    """Return the last span in the document that mentions '签署日期'.

    Returns:
        A copy of ``self.item`` with ``words``, ``position`` and ``page``
        of that span; unfilled when no page contains the phrase.
    """
    signature_role_1 = self.item.copy()
    located = (
        (pno, span)
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for pno, span in located:
        if '签署日期' not in span['text']:
            continue
        signature_role_1['position'] = span['bbox']
        signature_role_1['page'] = pno
        signature_role_1['words'] = span['text']
    return signature_role_1
| 479 | |||
def get_signature_role_2(self):
    """Report whether the co-borrower signing area carries a signature.

    The signing area starts at a span containing
    '共同借款人(共同抵押人)' (included) and ends at the next span
    containing '日期' (excluded). More than four spans inside the area are
    taken to mean a signature is present.

    Returns:
        A copy of ``self.init_item`` with:
          * ``words``: '有' (present) or '无' (absent);
          * ``page_num``: key of the page holding the last captured span,
            or None;
          * ``position``: ``[x_min, y_min, x_max, y_max]`` over all corner
            points of the captured spans, or None when nothing was
            captured (the original code raised ValueError in that case).
    """
    signature_role_2 = self.init_item.copy()
    # Locate the signing area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in list(self.pdf_info.keys()):
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Each bbox contributes two (x, y) corner points.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
| 514 | |||
def get_signature_role_3(self):
    """Report whether the guarantor-1 signing area carries a signature.

    The signing area starts at a span containing '保证人1' on any page
    whose key is not 0 (included) and ends at the next span containing
    '日期' (excluded). More than four spans inside the area are taken to
    mean a signature is present.

    Returns:
        A copy of ``self.init_item`` with:
          * ``words``: '有' (present) or '无' (absent);
          * ``page_num``: key of the page holding the last captured span,
            or None;
          * ``position``: ``[x_min, y_min, x_max, y_max]`` over all corner
            points of the captured spans, or None when nothing was
            captured (the original code raised ValueError in that case).
    """
    signature_role_3 = self.init_item.copy()
    # Locate the signing area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in list(self.pdf_info.keys()):
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(pno) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Each bbox contributes two (x, y) corner points.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
| 549 | |||
def get_signature_role_4(self):
    """Report whether the guarantor-2 signing area carries a signature.

    The signing area starts at a span containing '保证人2' on any page
    whose key is not 0 (included) and ends at the next span containing
    '日期' (excluded). More than four spans inside the area are taken to
    mean a signature is present.

    Returns:
        A copy of ``self.init_item`` with:
          * ``words``: '有' (present) or '无' (absent);
          * ``page_num``: key of the page holding the last captured span,
            or None;
          * ``position``: ``[x_min, y_min, x_max, y_max]`` over all corner
            points of the captured spans, or None when nothing was
            captured (the original code raised ValueError in that case).
    """
    signature_role_4 = self.init_item.copy()
    # Locate the signing area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in list(self.pdf_info.keys()):
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(pno) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Each bbox contributes two (x, y) corner points.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
| 584 | |||
def get_signature_role_5(self):
    """Report whether the witness signing area carries a signature.

    The signing area starts at a span containing '见证人签字' on any page
    whose key is not 0 (included) and ends at the next span containing
    '年' (excluded). More than four spans inside the area are taken to
    mean a signature is present.

    Returns:
        A copy of ``self.init_item`` with:
          * ``words``: '有' (present) or '无' (absent);
          * ``page_num``: key of the page holding the last captured span,
            or None;
          * ``position``: ``[x_min, y_min, x_max, y_max]`` over all corner
            points of the captured spans, or None when nothing was
            captured (the original code raised ValueError in that case).
    """
    signature_role_5 = self.init_item.copy()
    # Locate the signing area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in list(self.pdf_info.keys()):
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(pno) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    # The leftover debug ``print(texts)`` was removed.
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Each bbox contributes two (x, y) corner points.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]),
                    max(corners[:, 0]), max(corners[:, 1])]
    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
| 620 | |||
def get_last_page_signature(self, page_num, top, bottom):
    """Read the signer name and signing date between two marker spans.

    The top edge (``bbox[1]``) of the last span containing ``top`` /
    ``bottom`` on ``page_num`` delimits a vertical window. Inside it, the
    last span containing '签署日期' whose vertical centre falls strictly
    within the window is split into the name (text before the first
    space) and the date (text after the last full-width colon).

    Args:
        page_num: Key of the page inside ``self.pdf_info``.
        top: Text marking the upper boundary.
        bottom: Text marking the lower boundary.

    Returns:
        ``(signature_name, signature_date)``, two copies of ``self.item``.
        Both carry the bbox of the matching span as ``position``; the
        original code assigned the name's position twice and left the
        date's position unset. Unfilled when a marker or the signing span
        is missing.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()
    spans = [
        span
        for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    anchor_top = None
    anchor_bottom = None
    for span in spans:
        if top in span['text']:
            anchor_top = span['bbox'][1]
        if bottom in span['text']:
            anchor_bottom = span['bbox'][1]

    if anchor_top is not None and anchor_bottom is not None:
        for span in spans:
            bbox, text = span['bbox'], span['text']
            if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
                signature_name['words'] = text.split(' ')[0]
                signature_name['position'] = bbox
                signature_date['words'] = text.split(':')[-1]
                signature_date['position'] = bbox
    return signature_name, signature_date
| 651 | |||
def get_electronic_signature(self, top, bottom):
    """Return the electronic signature span lying between two markers.

    The top edge (``bbox[1]``) of the last span, on any page, containing
    ``top`` / ``bottom`` delimits a vertical window. The result is the
    last span, on any page, that contains '签署日期' and whose vertical
    centre falls strictly inside that window.

    Args:
        top: Text marking the upper boundary.
        bottom: Text marking the lower boundary.

    Returns:
        A copy of ``self.item`` with ``words``, ``page`` and ``position``
        of the matching span; unfilled when a marker or the signature span
        is missing.
    """
    signature = self.item.copy()
    located = [
        (pno, span)
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    anchor_top = None
    anchor_bottom = None
    for _, span in located:
        upper_edge = span['bbox'][1]
        if top in span['text']:
            anchor_top = upper_edge
        if bottom in span['text']:
            anchor_bottom = upper_edge
    if anchor_top is None or anchor_bottom is None:
        return signature

    for pno, span in located:
        box, content = span['bbox'], span['text']
        if '签署日期' not in content:
            continue
        if int(anchor_top) < np.mean(box[1::2]) < int(anchor_bottom):
            signature['words'] = content
            signature['page'] = pno
            signature['position'] = box
    return signature
| 681 | |||
def get_role_info(self, role_key, page_num='0'):
    """Read name, ID number and representative of one role on a page.

    The top-left corner of the last span starting with '保证人3' is the
    anchor that splits the page into four quadrants, one per role:
    lessee (left/above), guarantor 1 (left/below), guarantor 2
    (right/above) and guarantor 3 (right/below). Nothing is extracted
    when the anchor is missing.

    Args:
        role_key: Label of the role, used as a regular expression matched
            at the start of each span ('承租人:', '保证人1:',
            '保证人2:' or '保证人3:' select a quadrant; any other value
            yields the name only).
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        ``(name, id_num, representative)``, three copies of ``self.item``
        filled with ``words`` (text after the last full-width colon),
        ``page`` and ``position`` of the last matching span each.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    spans = [
        span
        for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # The top-left corner of guarantor 3 is the reference point.
    anchor = None
    for span in spans:
        if re.match('保证人3', span['text']) is not None:
            anchor = [span['bbox'][0], span['bbox'][1]]
    if anchor is None:
        return name, id_num, representative

    # role -> (left of anchor?, above anchor?)
    quadrants = {
        '承租人:': (True, True),
        '保证人1:': (True, False),
        '保证人2:': (False, True),
        '保证人3:': (False, False),
    }
    quadrant = quadrants.get(role_key)

    def record(target, box, content):
        target['words'] = content.split(':')[-1]
        target['page'] = page_num
        target['position'] = box

    for span in spans:
        box, content = span['bbox'], span['text']
        # Role name.
        if re.match(role_key, content) is not None:
            record(name, box, content)
        if quadrant is None:
            continue
        is_id = re.match('证件号码:', content) is not None
        is_rep = re.match('法定代表人或授权代表:', content) is not None
        if not (is_id or is_rep):
            continue
        wants_left, wants_above = quadrant
        centre_x, centre_y = np.mean(box[::2]), np.mean(box[1::2])
        x_ok = centre_x < anchor[0] if wants_left else centre_x > anchor[0]
        y_ok = centre_y < anchor[1] if wants_above else centre_y > anchor[1]
        if not (x_ok and y_ok):
            continue
        # ID number located in the role's quadrant.
        if is_id:
            record(id_num, box, content)
        # Legal representative located in the role's quadrant.
        if is_rep:
            record(representative, box, content)
    return name, id_num, representative
| 765 | |||
def get_table_add_product(self):
    """Collect the vehicle add-on product table across all pages.

    Cells are captured from a span containing '总计' (included) up to a
    span containing '注:出租人向承租人购买租赁车辆的对价' (excluded).
    The first two captured cells are the total label and the total
    amount; the remaining cells are grouped three per row.

    Returns:
        A copy of ``self.item`` with:
          * ``words``: header row, one row per complete group of three
            detail cells, and, when anything was captured, a closing
            ``[label, '', amount]`` row (``amount`` is '' if missing);
          * ``page``: key of the last page where the closing note was
            seen, or None;
          * ``position``: always None.
        Incomplete trailing cells are ignored instead of raising
        IndexError as the original ``len(items)//3`` row count did.
    """
    table_add_product = self.item.copy()
    items = []
    start = False
    page = None
    for pno in self.pdf_info:
        for block in self.pdf_info[f'{pno}']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    text = span['text']
                    if '总计' in text:
                        start = True
                    if '注:出租人向承租人购买租赁车辆的对价' in text:
                        page = pno
                        start = False
                    if start:
                        items.append(text)

    lines = [['项目', '购买价格', '实际融资金额']]
    # Detail cells begin after the two leading total cells.
    details = items[2:]
    for first in range(0, len(details) - len(details) % 3, 3):
        lines.append(details[first:first + 3])

    if items:
        total_amount = items[1] if len(items) > 1 else ''
        lines.append([items[0], '', total_amount])

    table_add_product['words'] = lines
    table_add_product['page'] = page
    table_add_product['position'] = None
    return table_add_product
| 799 | |||
def get_contract_no_dy(self):
    """Find the mortgage contract number printed next to its label.

    The label is the last span, on any page, containing '抵押合同编号'.
    The number is the last span, on any page, that contains 'CH-' and
    whose vertical centre lies strictly within the label's vertical
    extent.

    Returns:
        A copy of ``self.item`` with ``words``, ``page`` and ``position``
        of the number span; unfilled when label or number is missing.
    """
    contract_no = self.item.copy()
    located = [
        (pno, span)
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    label_box = None
    for _, span in located:
        if '抵押合同编号' in span['text']:
            label_box = span['bbox']
    if label_box is None:
        return contract_no

    for pno, span in located:
        box, content = span['bbox'], span['text']
        on_label_row = label_box[1] < np.mean(box[1::2]) < label_box[3]
        if on_label_row and 'CH-' in content:
            contract_no['position'] = box
            contract_no['page'] = pno
            contract_no['words'] = content
    return contract_no
| 828 | |||
def get_dyr_name_id(self):
    """Read the mortgagor's name and ID number below the '抵押人' label.

    The label is the last span, on any page, whose text is exactly
    '抵押人'. Spans on any page are accepted when their vertical centre
    lies between the label's top edge and three label heights below its
    bottom edge.

    Returns:
        ``(name, _id)``, two copies of ``self.item`` filled from the last
        accepted span containing '姓名' and '证件号码' respectively
        (``words`` is the text after the last full-width colon).
    """
    name = self.item.copy()
    _id = self.item.copy()
    located = [
        (pno, span)
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    label_box = None
    for _, span in located:
        if span['text'] == '抵押人':
            label_box = span['bbox']
    if label_box is None:
        return name, _id

    label_height = abs(label_box[1] - label_box[3])
    upper_limit = label_box[1]
    lower_limit = label_box[3] + label_height * 3
    for pno, span in located:
        box, content = span['bbox'], span['text']
        if '姓名' in content and upper_limit < np.mean(box[1::2]) < lower_limit:
            name['position'] = box
            name['page'] = pno
            name['words'] = content.split(':')[-1]
        if '证件号码' in content and upper_limit < np.mean(box[1::2]) < lower_limit:
            _id['position'] = box
            _id['page'] = pno
            _id['words'] = content.split(':')[-1]
    return name, _id
| 864 | |||
def get_key_value_position(self, key):
    """Return the span printed to the right of a label, matched by geometry.

    The label is the last span, on any page, whose text equals ``key``.
    A span on any page is accepted as the value when its vertical centre
    lies strictly within the label's vertical extent, it starts to the
    right of the label's left edge, and the gap between the label's right
    edge and the span's left edge is under ten label heights.

    Args:
        key: Exact text of the label span.

    Returns:
        A copy of ``self.item`` with ``words``, ``page`` and ``position``
        of the last accepted span; unfilled when the label or a value is
        missing.
    """
    value = self.item.copy()
    located = [
        (pno, span)
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    label_box = None
    for _, span in located:
        if span['text'] == key:
            label_box = span['bbox']
    if label_box is None:
        return value

    max_gap = abs(label_box[1] - label_box[3]) * 10
    for pno, span in located:
        box = span['bbox']
        if not (label_box[1] < np.mean(box[1::2]) < label_box[3]):
            continue
        if not (label_box[0] < box[0]):
            continue
        if abs(label_box[2] - box[0]) < max_gap:
            value['position'] = box
            value['page'] = pno
            value['words'] = span['text']
    return value
| 894 | |||
| 895 | def get_info(self): | ||
| 896 | """ | ||
| 897 | block['type'] == 0 : 表示该元素为图片 | ||
| 898 | |||
| 899 | Returns: | ||
| 900 | dict: Description | ||
| 901 | """ | ||
| 902 | if len(self.pdf_info) > 0: | ||
| 903 | # 取 Page 1 上的合同编号 | ||
| 904 | contract_no = self.get_contract_no(page_num='0') | ||
| 905 | self.init_result['合同编号'] = contract_no | ||
| 906 | # 从第一页上取四个角色的姓名和证件号码 | ||
| 907 | name, id_num, representative = self.get_role_info(role_key='承租人:', page_num='0') | ||
| 908 | self.init_result['承租人-姓名'] = name | ||
| 909 | self.init_result['承租人-证件号码'] = id_num | ||
| 910 | self.init_result['承租人-法定代表人或授权代表'] = representative | ||
| 911 | name, id_num, representative = self.get_role_info(role_key='保证人1:', page_num='0') | ||
| 912 | self.init_result['保证人1-姓名'] = name | ||
| 913 | self.init_result['保证人1-证件号码'] = id_num | ||
| 914 | self.init_result['保证人1-法定代表人或授权代表'] = representative | ||
| 915 | name, id_num, representative = self.get_role_info(role_key='保证人2:', page_num='0') | ||
| 916 | self.init_result['保证人2-姓名'] = name | ||
| 917 | self.init_result['保证人2-证件号码'] = id_num | ||
| 918 | self.init_result['保证人2-法定代表人或授权代表'] = representative | ||
| 919 | name, id_num, representative = self.get_role_info(role_key='保证人3:', page_num='0') | ||
| 920 | self.init_result['保证人3-姓名'] = name | ||
| 921 | self.init_result['保证人3-证件号码'] = id_num | ||
| 922 | self.init_result['保证人3-法定代表人或授权代表'] = representative | ||
| 923 | # 在所有页面中找正文中(第二部分 融资租赁主要条款及付款计划)的那个编号,因为存在换行的情况所以暂时不带位置输出 | ||
| 924 | contract_no = self.get_contract_no_one() | ||
| 925 | self.init_result['合同编号(正文)'] = contract_no | ||
| 926 | # 找到车辆识别代码 | ||
| 927 | vin = self.get_key_value(key='车辆识别代码:') | ||
| 928 | self.init_result['车辆识别代码'] = vin | ||
| 929 | # 找到经销商(车辆卖方(经销商)) | ||
| 930 | seller = self.get_key_value(key='车辆卖方(经销商):') | ||
| 931 | self.init_result['车辆卖方(经销商)'] = seller | ||
| 932 | # 找到 —— 车辆原始销售价格 | ||
| 933 | vehicle_price = self.get_key_value(key='车辆原始销售价格(《机动车销售统一发票》所列金额):') | ||
| 934 | self.init_result['车辆原始销售价格(《机动车销售统一发票》所列金额)'] = vehicle_price | ||
| 935 | # 找车辆附加产品明细(表) | ||
| 936 | table_add_product = self.get_table_add_product() | ||
| 937 | self.init_result['车辆附加产品明细表'] = table_add_product | ||
| 938 | # 找融资成本总额 | ||
| 939 | financing_cost = self.get_key_value(key='融资成本总额:') | ||
| 940 | self.init_result['融资成本总额'] = financing_cost | ||
| 941 | # 找租期 | ||
| 942 | lease_term = self.get_key_value(key='租期:') | ||
| 943 | self.init_result['租期'] = lease_term | ||
| 944 | # 找还款计划(表) | ||
| 945 | repayment_schedule = self.get_repayment_schedule() | ||
| 946 | self.init_result['付款计划表'] = repayment_schedule | ||
| 947 | # 找开户行户名、银行账号、银行 | ||
| 948 | name = self.get_key_value(key='户名:') | ||
| 949 | self.init_result['银行账户-户名'] = name | ||
| 950 | account = self.get_key_value(key='银行账号:') | ||
| 951 | self.init_result['银行账户-银行账号'] = account | ||
| 952 | bank = self.get_key_value(key='开户银行:') | ||
| 953 | self.init_result['银行账户-开户行'] = bank | ||
| 954 | # 找签字页上的系列信息 | ||
| 955 | # 承租人姓名、签章 | ||
| 956 | name = self.get_key_value(key='承租人姓名:') | ||
| 957 | electronic_signature = self.get_electronic_signature(top='承租人姓名:', bottom='保证人1姓名:') | ||
| 958 | self.init_result['签字页-承租人姓名'] = name | ||
| 959 | self.init_result['签字页-承租人签章'] = electronic_signature | ||
| 960 | # 保证人1姓名、签章 | ||
| 961 | name = self.get_key_value(key='保证人1姓名:') | ||
| 962 | electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:') | ||
| 963 | self.init_result['签字页-保证人1姓名'] = name | ||
| 964 | self.init_result['签字页-保证人1签章'] = electronic_signature | ||
| 965 | # 保证人2姓名、签章 | ||
| 966 | name = self.get_key_value(key='保证人2姓名:') | ||
| 967 | electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:') | ||
| 968 | self.init_result['签字页-保证人2姓名'] = name | ||
| 969 | self.init_result['签字页-保证人2签章'] = electronic_signature | ||
| 970 | # 保证人3姓名、签章 | ||
| 971 | name = self.get_key_value(key='保证人3姓名:') | ||
| 972 | electronic_signature = self.get_electronic_signature(top='保证人3姓名:', bottom='日期:') | ||
| 973 | self.init_result['签字页-保证人3姓名'] = name | ||
| 974 | self.init_result['签字页-保证人3签章'] = electronic_signature | ||
| 975 | |||
| 976 | return self.init_result | ||
| 977 | |||
| 978 | # results['is_shhz_contract'] = True | ||
| 979 | # results['pdf_info'] = self.init_result | ||
| 980 | |||
| 981 | # return results | ||
| 982 | |||
def get_info_1(self):
    """Populate ``self.init_result_1`` from the parsed PDF and return it.

    When ``self.pdf_info`` is empty nothing is looked up and the result
    container is returned as it is. Otherwise every field is obtained
    through the instance's lookup helpers, in the order written below, and
    stored under its result key.

    Returns:
        ``self.init_result_1`` after any extraction.
    """
    if len(self.pdf_info) == 0:
        return self.init_result_1

    # These lookups all pass page_num='0'.
    self.init_result_1['合同编号'] = self.get_contract_no(page_num='0')
    self.init_result_1['承租人-姓名'] = self.get_key_value(key='承租人:', page_num='0')
    self.init_result_1['承租人-证件号码'] = self.get_key_value(key='证件号码:', page_num='0')
    self.init_result_1['销售经销商'] = self.get_key_value(key='销售经销商:', page_num='0')

    # Contract number as found in the body text.
    self.init_result_1['合同编号(正文)'] = self.get_contract_no_one()

    # Signature-page fields; no page_num is passed for these.
    self.init_result_1['签字页-承租人姓名'] = self.get_key_value(key='姓名/名称:')
    self.init_result_1['签字页-承租人证件号码'] = self.get_key_value(key='自然人身份证件号码/法人执照号码:')
    self.init_result_1['签字页-承租人签章'] = self.get_signature_role_1()
    self.init_result_1['签字页-销售经销商'] = self.get_key_value(key='销售经销商:')

    # The dealer's seal is not extracted here.
    return self.init_result_1
| 1014 | |||
def get_info_2(self):
    """Populate ``self.init_result_2`` from the parsed PDF and return it.

    When ``self.pdf_info`` is empty nothing is looked up and the result
    container is returned as it is. Otherwise every field is obtained
    through the instance's lookup helpers, in the order written below, and
    stored under its result key.

    Returns:
        ``self.init_result_2`` after any extraction.
    """
    if len(self.pdf_info) == 0:
        return self.init_result_2

    # Contract number from the dedicated helper, then the one in the body text.
    self.init_result_2['合同编号'] = self.get_contract_no_dy()
    self.init_result_2['合同编号(正文)'] = self.get_contract_no_one()

    # Mortgagor name and ID number come back together from one helper call.
    mortgagor_name, mortgagor_id = self.get_dyr_name_id()
    self.init_result_2['抵押人姓名/名称'] = mortgagor_name
    self.init_result_2['抵押人证件号码'] = mortgagor_id

    # Vehicle identification number, total rent, and lease term.
    self.init_result_2['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')
    self.init_result_2['租金总额'] = self.get_key_value_position(key='租金总额')
    self.init_result_2['融资租赁期限'] = self.get_key_value_position(key='融资租赁期限')

    # Signature page: mortgagor name and electronic signature. Both lookups
    # run before either result is stored.
    signer = self.get_key_value(key='抵押人姓名:')
    signer_seal = self.get_electronic_signature(top='抵押权人盖章', bottom='抵押人配偶姓名:')
    self.init_result_2['签字页-抵押人姓名'] = signer
    self.init_result_2['签字页-抵押人签章'] = signer_seal

    # Signature page: mortgagor's spouse name and electronic signature.
    spouse = self.get_key_value(key='抵押人配偶姓名:')
    spouse_seal = self.get_electronic_signature(top='抵押人配偶姓名:', bottom='日期')
    self.init_result_2['签字页-抵押人配偶姓名'] = spouse
    self.init_result_2['签字页-抵押人配偶签章'] = spouse_seal

    return self.init_result_2
| 1 | # -*- coding: utf-8 -*- | ||
| 2 | # @Author : lk | ||
| 3 | # @Email : 9428.al@gmail.com | ||
| 4 | # @Created Date : 2021-06-29 17:43:46 | ||
| 5 | # @Last Modified : 2021-11-03 16:07:36 | ||
| 6 | # @Description : | ||
| 7 | |||
| 8 | from .get_char import Finder | ||
| 9 | |||
| 10 | |||
def _pages_containing(pdf_info, marker):
    """Return the page entries of *pdf_info* whose text spans contain *marker*.

    Only blocks whose ``type`` is 0 are inspected (other block types are
    skipped). A page is appended once per matching span, so a page with
    several matching spans is listed several times; callers that count
    pages rely on the marker occurring once per page.
    """
    matches = []
    for page in pdf_info.values():
        for block in page['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if marker in span['text']:
                        matches.append(page)
    return matches


def predict(pdf_info, file_cls):
    """Extract the fields of one HIL e-contract document.

    Args:
        pdf_info: Mapping of page key to the parsed page dictionary. Each
            page must provide ``blocks`` -> ``lines`` -> ``spans`` with a
            ``text`` entry per span.
        file_cls: Document class to extract.
            0: sale-and-leaseback contract (售后回租合同);
            1: vehicle disposal agreement (车辆处置协议);
            2: vehicle lease mortgage contract (车辆租赁抵押合同).

    Returns:
        The dictionary produced by the matching ``Finder`` extractor. For
        every field whose ``page`` is not ``None`` the value is rewritten
        to the 1-based label ``'page_<n>'``.

    Raises:
        ValueError: If ``file_cls`` is not 0, 1 or 2.
    """
    # Fail early with a clear message; otherwise ``results`` would never be
    # bound and the function would die later with an UnboundLocalError.
    if file_cls not in (0, 1, 2):
        raise ValueError(f'unsupported file_cls: {file_cls!r}')

    page_offset = 0
    if file_cls == 1:
        disposal_pages = _pages_containing(pdf_info, '售后回租合同附件一')
        has_leaseback_pages = bool(_pages_containing(pdf_info, '售后回租合同_'))
        # Four annex matches together with leaseback-contract pages mean the
        # input bundles the disposal agreement with the main contract.
        if len(disposal_pages) == 4 and has_leaseback_pages:
            # Extract from the annex pages only, re-keyed from '0'.
            pdf_info = {str(index): page
                        for index, page in enumerate(disposal_pages)}
            # NOTE(review): 6 is presumably the index of the first annex page
            # inside the bundled PDF -- confirm against the contract template.
            page_offset = 6

    finder = Finder(pdf_info)
    if file_cls == 0:
        results = finder.get_info()
    elif file_cls == 1:
        # Vehicle disposal agreement.
        results = finder.get_info_1()
    else:
        # Vehicle lease mortgage contract.
        results = finder.get_info_2()

    # Shift back to positions in the original PDF (if re-keyed above) and
    # convert the 0-based page index into a 1-based 'page_<n>' label.
    for field in results.values():
        if field['page'] is not None:
            field['page'] = 'page_' + str(int(field['page']) + page_offset + 1)
    return results
src/common/tools/mssql_script10.py
0 → 100644
| 1 | import pyodbc | ||
| 2 | |||
# DDL for the AFC contract table and its application_id index.
afc_sql = """
create table afc_contract
(
    id             bigint identity primary key,
    application_id nvarchar(64) not null,
    create_time    datetime     not null
);

create index afc_contract_application_id_index
    on afc_contract (application_id);
"""

# DDL for the HIL contract table and its application_id index.
hil_sql = """
create table hil_contract
(
    id             bigint identity primary key,
    application_id nvarchar(64) not null,
    create_time    datetime     not null
);

create index hil_contract_application_id_index
    on hil_contract (application_id);
"""


def _execute_ddl(connection_string, ddl):
    """Run *ddl* on a fresh autocommit connection and always release it.

    The cursor and the connection are closed even when ``execute`` raises,
    so a failed migration does not leave a dangling server session.
    """
    cnxn = pyodbc.connect(connection_string, autocommit=True)
    try:
        cursor = cnxn.cursor()
        try:
            cursor.execute(ddl)
        finally:
            cursor.close()
    finally:
        cnxn.close()


# NOTE(review): the connection strings below name only the ODBC driver;
# server, database and credentials presumably have to be filled in per
# environment before running -- confirm.
# HIL database first, then AFC, matching the original execution order.
_execute_ddl('DRIVER={ODBC Driver 17 for SQL Server};', hil_sql)
_execute_ddl('DRIVER={ODBC Driver 17 for SQL Server};', afc_sql)
| 1 | import os | 1 | import os |
| 2 | import json | ||
| 2 | import cv2 | 3 | import cv2 |
| 3 | import shutil | 4 | import shutil |
| 4 | import fitz | 5 | import fitz |
| ... | @@ -35,6 +36,8 @@ class PDFHandler: | ... | @@ -35,6 +36,8 @@ class PDFHandler: |
| 35 | self.suffix = self.get_suffix(document_name) | 36 | self.suffix = self.get_suffix(document_name) |
| 36 | self.is_ebank = False | 37 | self.is_ebank = False |
| 37 | self.page_text_list = [] | 38 | self.page_text_list = [] |
| 39 | self.pdf_info = {} | ||
| 40 | self.img_path_pno_list = [] | ||
| 38 | 41 | ||
| 39 | def get_suffix(self, file_name): | 42 | def get_suffix(self, file_name): |
| 40 | if file_name is None: | 43 | if file_name is None: |
| ... | @@ -296,6 +299,17 @@ class PDFHandler: | ... | @@ -296,6 +299,17 @@ class PDFHandler: |
| 296 | self.is_ebank = True | 299 | self.is_ebank = True |
| 297 | self.page_text_list = page_text_list | 300 | self.page_text_list = page_text_list |
| 298 | 301 | ||
def e_contract_process(self):
    """Parse every page of the PDF at ``self.path`` for e-contract handling.

    For each page, the page's JSON text dump is decoded and stored in
    ``self.pdf_info`` under the page index as a string key. The page is
    also rendered to a PNG at the path given by ``get_img_save_path``, and
    ``(image_path, 'page_<n>')`` with a 1-based ``n`` is appended to
    ``self.img_path_pno_list``.
    """
    with fitz.Document(self.path) as document:
        for index in range(document.pageCount):
            current_page = document.loadPage(index)

            # Structured text of the page, decoded from its JSON dump.
            raw_json = current_page.getText('json')
            self.pdf_info[str(index)] = json.loads(raw_json)

            # Render the page; the path is recorded before the PNG is written.
            pixmap = current_page.getPixmap()
            png_path = self.get_img_save_path(current_page.number)
            page_label = 'page_{0}'.format(str(index + 1))
            self.img_path_pno_list.append((png_path, page_label))
            pixmap.writePNG(png_path)
| 312 | |||
| 299 | def extract_image(self, max_img_count=None): | 313 | def extract_image(self, max_img_count=None): |
| 300 | self.img_path_list = [] | 314 | self.img_path_list = [] |
| 301 | self.xref_set = set() | 315 | self.xref_set = set() | ... | ... |
| ... | @@ -13,3 +13,5 @@ EDMS_UPLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/Uploa | ... | @@ -13,3 +13,5 @@ EDMS_UPLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/Uploa |
| 13 | DEALER_CODE = ocr_situ_group | 13 | DEALER_CODE = ocr_situ_group |
| 14 | 14 | ||
| 15 | BASE_URL = https://staging-bmw-ocr.situdata.com | 15 | BASE_URL = https://staging-bmw-ocr.situdata.com |
| 16 | |||
| 17 | DELAY_SECONDS = 60 | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
| ... | @@ -13,3 +13,5 @@ EDMS_UPLOAD_URL = http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/Up | ... | @@ -13,3 +13,5 @@ EDMS_UPLOAD_URL = http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/Up |
| 13 | DEALER_CODE = ocr_situ_group | 13 | DEALER_CODE = ocr_situ_group |
| 14 | 14 | ||
| 15 | BASE_URL = https://li19dkocruat01vm.bmwgroup.net | 15 | BASE_URL = https://li19dkocruat01vm.bmwgroup.net |
| 16 | |||
| 17 | DELAY_SECONDS = 60 | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
-
Please register or sign in to post a comment