e-contract part 1
Showing 15 changed files with 2327 additions and 116 deletions
... | @@ -1773,3 +1773,21 @@ APPLICANT_TYPE_MAP = { | ... | @@ -1773,3 +1773,21 @@ APPLICANT_TYPE_MAP = { |
1773 | } | 1773 | } |
1774 | 1774 | ||
1775 | APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] | 1775 | APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] |
1776 | |||
1777 | FILE_NAME_PREFIX_MAP = { | ||
1778 | AFC_PREFIX: [ | ||
1779 | ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'), | ||
1780 | ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), | ||
1781 | ], | ||
1782 | HIL_PREFIX: [ | ||
1783 | ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'), | ||
1784 | ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'), | ||
1785 | ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), | ||
1786 | ] | ||
1787 | } | ||
1788 | |||
1789 | HIL_CONTRACT_TYPE_MAP = { | ||
1790 | str(HIL_CONTRACT_1_CLASSIFY): 0, | ||
1791 | str(HIL_CONTRACT_2_CLASSIFY): 2, | ||
1792 | str(HIL_CONTRACT_3_CLASSIFY): 1, | ||
1793 | } | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
... | @@ -18,6 +18,8 @@ from settings import conf | ... | @@ -18,6 +18,8 @@ from settings import conf |
18 | from common.mixins import LoggerMixin | 18 | from common.mixins import LoggerMixin |
19 | from common.tools.file_tools import write_zip_file | 19 | from common.tools.file_tools import write_zip_file |
20 | from common.tools.pdf_to_img import PDFHandler | 20 | from common.tools.pdf_to_img import PDFHandler |
21 | from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict | ||
22 | from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict | ||
21 | from apps.doc import consts | 23 | from apps.doc import consts |
22 | # from apps.doc.ocr.edms import EDMS, rh | 24 | # from apps.doc.ocr.edms import EDMS, rh |
23 | from apps.doc.ocr.ecm import ECM, rh | 25 | from apps.doc.ocr.ecm import ECM, rh |
... | @@ -47,6 +49,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -47,6 +49,7 @@ class Command(BaseCommand, LoggerMixin): |
47 | def __init__(self): | 49 | def __init__(self): |
48 | super().__init__() | 50 | super().__init__() |
49 | self.log_base = '[doc ocr process]' | 51 | self.log_base = '[doc ocr process]' |
52 | self.e_log_base = '[e-contract ocr process]' | ||
50 | # 处理文件开关 | 53 | # 处理文件开关 |
51 | self.switch = True | 54 | self.switch = True |
52 | # 睡眠时间 | 55 | # 睡眠时间 |
... | @@ -90,13 +93,20 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -90,13 +93,20 @@ class Command(BaseCommand, LoggerMixin): |
90 | task_str, is_priority = rh.dequeue() | 93 | task_str, is_priority = rh.dequeue() |
91 | if task_str is None: | 94 | if task_str is None: |
92 | self.online_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) | 95 | self.online_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) |
93 | return None, None, None | 96 | return None, None, None, None, None |
94 | 97 | ||
95 | self.online_log.info('{0} [get_doc_info] [task={1}] [is_priority={2}]'.format( | 98 | self.online_log.info('{0} [get_doc_info] [task={1}] [is_priority={2}]'.format( |
96 | self.log_base, task_str, is_priority)) | 99 | self.log_base, task_str, is_priority)) |
97 | try: | 100 | try: |
98 | # doc, business_type = self.get_doc_object(task_str) | 101 | # doc, business_type = self.get_doc_object(task_str) |
99 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | 102 | info_tuple = task_str.split(consts.SPLIT_STR) |
103 | if len(info_tuple) == 2: | ||
104 | business_type, doc_id_str = info_tuple | ||
105 | classify_1_str = classify_2_str = '0' | ||
106 | rebuild_task_str = task_str | ||
107 | else: | ||
108 | business_type, doc_id_str, classify_1_str, classify_2_str = info_tuple | ||
109 | rebuild_task_str = '{0}{1}{2}'.format(business_type, consts.SPLIT_STR, doc_id_str) | ||
100 | doc_id = int(doc_id_str) | 110 | doc_id = int(doc_id_str) |
101 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc | 111 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc |
102 | doc = doc_class.objects.filter(id=doc_id).first() | 112 | doc = doc_class.objects.filter(id=doc_id).first() |
... | @@ -104,11 +114,11 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -104,11 +114,11 @@ class Command(BaseCommand, LoggerMixin): |
104 | if doc is None: | 114 | if doc is None: |
105 | self.online_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format( | 115 | self.online_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format( |
106 | self.log_base, task_str, is_priority)) | 116 | self.log_base, task_str, is_priority)) |
107 | return None, None, None | 117 | return None, None, None, None, None |
108 | elif doc.status != DocStatus.INIT.value: | 118 | elif doc.status != DocStatus.INIT.value: |
109 | self.online_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] ' | 119 | self.online_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] ' |
110 | '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status)) | 120 | '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status)) |
111 | return None, None, None | 121 | return None, None, None, None, None |
112 | doc.status = DocStatus.PROCESSING.value | 122 | doc.status = DocStatus.PROCESSING.value |
113 | doc.start_time = timezone.now() | 123 | doc.start_time = timezone.now() |
114 | doc.save() | 124 | doc.save() |
... | @@ -120,7 +130,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -120,7 +130,7 @@ class Command(BaseCommand, LoggerMixin): |
120 | else: | 130 | else: |
121 | self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format( | 131 | self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format( |
122 | self.log_base, task_str, is_priority)) | 132 | self.log_base, task_str, is_priority)) |
123 | return doc, business_type, task_str | 133 | return doc, business_type, rebuild_task_str, classify_1_str, classify_2_str |
124 | 134 | ||
125 | # def pdf_download(self, doc, pdf_path): | 135 | # def pdf_download(self, doc, pdf_path): |
126 | # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | 136 | # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
... | @@ -212,7 +222,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -212,7 +222,7 @@ class Command(BaseCommand, LoggerMixin): |
212 | 222 | ||
213 | def contract_process(self, classify, ocr_data, contract_result, res_list, pno, ino, part_idx, img_path): | 223 | def contract_process(self, classify, ocr_data, contract_result, res_list, pno, ino, part_idx, img_path): |
214 | contract_dict = ocr_data.get('data') | 224 | contract_dict = ocr_data.get('data') |
215 | if not contract_dict or contract_dict.get('page_num') is None or contract_dict.get('page_info') is None: | 225 | if not contract_dict or contract_dict.get('page_num') is None or contract_dict.get('page_info') is None: |
216 | res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY)) | 226 | res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY)) |
217 | return | 227 | return |
218 | res_list.append((pno, ino, part_idx, consts.RES_SUCCESS)) | 228 | res_list.append((pno, ino, part_idx, consts.RES_SUCCESS)) |
... | @@ -915,11 +925,11 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -915,11 +925,11 @@ class Command(BaseCommand, LoggerMixin): |
915 | # summary['confidence'] = max(summary['confidence']) | 925 | # summary['confidence'] = max(summary['confidence']) |
916 | return merged_bs_summary | 926 | return merged_bs_summary |
917 | 927 | ||
918 | def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock, error_list): | 928 | def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock, error_list, res_dict, finish_queue): |
919 | while self.switch: | 929 | while self.switch: |
920 | try: | 930 | try: |
921 | # 1. 从队列获取文件信息 | 931 | # 1. 从队列获取文件信息 |
922 | doc, business_type, task_str = self.get_doc_info() | 932 | doc, business_type, task_str, classify_1_str, classify_2_str = self.get_doc_info() |
923 | # 队列为空时的处理 | 933 | # 队列为空时的处理 |
924 | if doc is None: | 934 | if doc is None: |
925 | time.sleep(self.sleep_time_doc_get) | 935 | time.sleep(self.sleep_time_doc_get) |
... | @@ -930,55 +940,109 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -930,55 +940,109 @@ class Command(BaseCommand, LoggerMixin): |
930 | error_list.append(1) | 940 | error_list.append(1) |
931 | return | 941 | return |
932 | else: | 942 | else: |
933 | try: | 943 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) |
934 | # 2. 从EDMS获取PDF文件 | 944 | os.makedirs(doc_data_path, exist_ok=True) |
935 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) | 945 | img_save_path = os.path.join(doc_data_path, 'img') |
936 | os.makedirs(doc_data_path, exist_ok=True) | 946 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
937 | img_save_path = os.path.join(doc_data_path, 'img') | ||
938 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | ||
939 | |||
940 | pdf_handler = PDFHandler(pdf_path, img_save_path, doc.document_name) | ||
941 | max_count_obj = Configs.objects.filter(id=2).first() | ||
942 | try: | ||
943 | max_img_count = int(max_count_obj.value) | ||
944 | except Exception as e: | ||
945 | max_img_count = 500 | ||
946 | 947 | ||
947 | for times in range(consts.RETRY_TIMES): | 948 | pdf_handler = PDFHandler(pdf_path, img_save_path, doc.document_name) |
948 | try: | ||
949 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | ||
950 | # self.edms.download(pdf_path, doc.metadata_version_id) | ||
951 | self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme, business_type) | ||
952 | self.online_log.info('{0} [edms download success] [task={1}] [times={2}] ' | ||
953 | '[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path)) | ||
954 | 949 | ||
955 | # 3.PDF文件提取图片 | 950 | if classify_1_str == '0' or classify_1_str == str(consts.HMH_CLASSIFY): |
956 | self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( | 951 | try: |
957 | self.log_base, task_str, times)) | 952 | # 2. 从EDMS获取PDF文件 |
958 | start_time = time.time() | 953 | max_count_obj = Configs.objects.filter(id=2).first() |
959 | pdf_handler.extract_image(max_img_count) | 954 | try: |
960 | end_time = time.time() | 955 | max_img_count = int(max_count_obj.value) |
961 | speed_time = int(end_time - start_time) | ||
962 | self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format( | ||
963 | self.log_base, task_str, times, speed_time)) | ||
964 | except Exception as e: | 956 | except Exception as e: |
965 | self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] ' | 957 | max_img_count = 500 |
966 | '[error={3}]'.format(self.log_base, task_str, times, | 958 | |
967 | traceback.format_exc())) | 959 | for times in range(consts.RETRY_TIMES): |
960 | try: | ||
961 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | ||
962 | # self.edms.download(pdf_path, doc.metadata_version_id) | ||
963 | self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme, business_type) | ||
964 | self.online_log.info('{0} [edms download success] [task={1}] [times={2}] ' | ||
965 | '[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path)) | ||
966 | |||
967 | # 3.PDF文件提取图片 | ||
968 | self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( | ||
969 | self.log_base, task_str, times)) | ||
970 | start_time = time.time() | ||
971 | pdf_handler.extract_image(max_img_count) | ||
972 | end_time = time.time() | ||
973 | speed_time = int(end_time - start_time) | ||
974 | self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format( | ||
975 | self.log_base, task_str, times, speed_time)) | ||
976 | except Exception as e: | ||
977 | self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] ' | ||
978 | '[error={3}]'.format(self.log_base, task_str, times, | ||
979 | traceback.format_exc())) | ||
980 | else: | ||
981 | break | ||
968 | else: | 982 | else: |
969 | break | 983 | raise Exception('download or pdf to img failed') |
970 | else: | ||
971 | raise Exception('download or pdf to img failed') | ||
972 | 984 | ||
973 | if pdf_handler.img_count == 0: | 985 | if pdf_handler.img_count == 0: |
974 | self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format( | 986 | self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format( |
975 | self.log_base, task_str)) | 987 | self.log_base, task_str)) |
976 | raise Exception('pdf img empty') | 988 | raise Exception('pdf img empty') |
977 | elif pdf_handler.img_count >= max_img_count: | 989 | elif pdf_handler.img_count >= max_img_count: |
978 | self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format( | 990 | self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format( |
979 | self.log_base, task_str, pdf_handler.img_count)) | 991 | self.log_base, task_str, pdf_handler.img_count)) |
980 | 992 | ||
993 | try: | ||
994 | report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport | ||
995 | report_table.objects.create( | ||
996 | case_number=doc.application_id, | ||
997 | request_team=RequestTeam.get_value(doc.document_scheme, 0), | ||
998 | request_trigger=RequestTrigger.get_value(doc.data_source, 0), | ||
999 | input_file=doc.document_name, | ||
1000 | transaction_start=doc.start_time, | ||
1001 | transaction_end=doc.start_time, | ||
1002 | successful_at_this_level=False, | ||
1003 | failure_reason=FailureReason.IMG_LIMIT.value, | ||
1004 | process_name=ProcessName.ALL.value, | ||
1005 | notes='pdf page count: {0}'.format(str(pdf_handler.img_count)) | ||
1006 | ) | ||
1007 | except Exception as e: | ||
1008 | self.online_log.error('{0} [process error (report db save)] [error={1}]'.format( | ||
1009 | self.log_base, traceback.format_exc())) | ||
1010 | |||
1011 | try: | ||
1012 | doc.status = DocStatus.PROCESS_FAILED.value | ||
1013 | doc.save() | ||
1014 | except Exception as e: | ||
1015 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( | ||
1016 | self.log_base, traceback.format_exc())) | ||
1017 | else: | ||
1018 | with lock: | ||
1019 | todo_count_dict[task_str] = pdf_handler.img_count | ||
1020 | for img_idx, img_path in enumerate(pdf_handler.img_path_list): | ||
1021 | while img_queue.full(): | ||
1022 | self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | ||
1023 | time.sleep(self.sleep_time_img_put) | ||
1024 | if pdf_handler.is_ebank: | ||
1025 | try: | ||
1026 | text_list = pdf_handler.page_text_list[img_idx].pop('rebuild_text') | ||
1027 | except Exception as e: | ||
1028 | text_list = [] | ||
1029 | else: | ||
1030 | text_list = [] | ||
1031 | img_queue.put((business_type, img_path, text_list)) | ||
1032 | # except EDMSException as e: | ||
1033 | # try: | ||
1034 | # doc.status = DocStatus.PROCESS_FAILED.value | ||
1035 | # doc.save() | ||
1036 | # self.online_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( | ||
1037 | # self.log_base, task_str, traceback.format_exc())) | ||
1038 | # except Exception as e: | ||
1039 | # self.online_log.error('{0} [process error (db save 1)] [error={1}]'.format( | ||
1040 | # self.log_base, traceback.format_exc())) | ||
1041 | # error_list.append(1) | ||
1042 | # return | ||
1043 | except Exception as e: | ||
981 | try: | 1044 | try: |
1045 | end_time = timezone.now() | ||
982 | report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport | 1046 | report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport |
983 | report_table.objects.create( | 1047 | report_table.objects.create( |
984 | case_number=doc.application_id, | 1048 | case_number=doc.application_id, |
... | @@ -986,11 +1050,10 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -986,11 +1050,10 @@ class Command(BaseCommand, LoggerMixin): |
986 | request_trigger=RequestTrigger.get_value(doc.data_source, 0), | 1050 | request_trigger=RequestTrigger.get_value(doc.data_source, 0), |
987 | input_file=doc.document_name, | 1051 | input_file=doc.document_name, |
988 | transaction_start=doc.start_time, | 1052 | transaction_start=doc.start_time, |
989 | transaction_end=doc.start_time, | 1053 | transaction_end=end_time, |
990 | successful_at_this_level=False, | 1054 | successful_at_this_level=False, |
991 | failure_reason=FailureReason.IMG_LIMIT.value, | 1055 | failure_reason=FailureReason.PDF.value, |
992 | process_name=ProcessName.ALL.value, | 1056 | process_name=ProcessName.ALL.value, |
993 | notes='pdf page count: {0}'.format(str(pdf_handler.img_count)) | ||
994 | ) | 1057 | ) |
995 | except Exception as e: | 1058 | except Exception as e: |
996 | self.online_log.error('{0} [process error (report db save)] [error={1}]'.format( | 1059 | self.online_log.error('{0} [process error (report db save)] [error={1}]'.format( |
... | @@ -999,64 +1062,114 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -999,64 +1062,114 @@ class Command(BaseCommand, LoggerMixin): |
999 | try: | 1062 | try: |
1000 | doc.status = DocStatus.PROCESS_FAILED.value | 1063 | doc.status = DocStatus.PROCESS_FAILED.value |
1001 | doc.save() | 1064 | doc.save() |
1065 | self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] ' | ||
1066 | '[error={2}]'.format(self.log_base, task_str, traceback.format_exc())) | ||
1002 | except Exception as e: | 1067 | except Exception as e: |
1003 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( | 1068 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( |
1004 | self.log_base, traceback.format_exc())) | 1069 | self.log_base, traceback.format_exc())) |
1005 | else: | 1070 | error_list.append(1) |
1006 | with lock: | 1071 | return |
1007 | todo_count_dict[task_str] = pdf_handler.img_count | 1072 | else: # e-contract |
1008 | for img_idx, img_path in enumerate(pdf_handler.img_path_list): | ||
1009 | while img_queue.full(): | ||
1010 | self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | ||
1011 | time.sleep(self.sleep_time_img_put) | ||
1012 | if pdf_handler.is_ebank: | ||
1013 | try: | ||
1014 | text_list = pdf_handler.page_text_list[img_idx].pop('rebuild_text') | ||
1015 | except Exception as e: | ||
1016 | text_list = [] | ||
1017 | else: | ||
1018 | text_list = [] | ||
1019 | img_queue.put((business_type, img_path, text_list)) | ||
1020 | # except EDMSException as e: | ||
1021 | # try: | ||
1022 | # doc.status = DocStatus.PROCESS_FAILED.value | ||
1023 | # doc.save() | ||
1024 | # self.online_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( | ||
1025 | # self.log_base, task_str, traceback.format_exc())) | ||
1026 | # except Exception as e: | ||
1027 | # self.online_log.error('{0} [process error (db save 1)] [error={1}]'.format( | ||
1028 | # self.log_base, traceback.format_exc())) | ||
1029 | # error_list.append(1) | ||
1030 | # return | ||
1031 | except Exception as e: | ||
1032 | try: | 1073 | try: |
1033 | end_time = timezone.now() | 1074 | # pdf下载 处理 图片存储 识别 |
1034 | report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport | 1075 | for times in range(consts.RETRY_TIMES): |
1035 | report_table.objects.create( | 1076 | try: |
1036 | case_number=doc.application_id, | 1077 | self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme, business_type) |
1037 | request_team=RequestTeam.get_value(doc.document_scheme, 0), | 1078 | self.online_log.info('{0} [edms download success] [task={1}] [times={2}] ' |
1038 | request_trigger=RequestTrigger.get_value(doc.data_source, 0), | 1079 | '[pdf_path={3}]'.format(self.e_log_base, task_str, times, pdf_path)) |
1039 | input_file=doc.document_name, | 1080 | |
1040 | transaction_start=doc.start_time, | 1081 | self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( |
1041 | transaction_end=end_time, | 1082 | self.e_log_base, task_str, times)) |
1042 | successful_at_this_level=False, | 1083 | pdf_handler.e_contract_process() |
1043 | failure_reason=FailureReason.PDF.value, | 1084 | self.online_log.info( |
1044 | process_name=ProcessName.ALL.value, | 1085 | '{0} [pdf to img end] [task={1}] [times={2}]'.format(self.e_log_base, task_str, times)) |
1045 | ) | 1086 | except Exception as e: |
1046 | except Exception as e: | 1087 | self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] ' |
1047 | self.online_log.error('{0} [process error (report db save)] [error={1}]'.format( | 1088 | '[error={3}]'.format(self.e_log_base, task_str, times, |
1048 | self.log_base, traceback.format_exc())) | 1089 | traceback.format_exc())) |
1090 | else: | ||
1091 | break | ||
1092 | else: | ||
1093 | raise Exception('download or pdf to img failed') | ||
1094 | |||
1095 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): | ||
1096 | ocr_result = afc_predict(pdf_handler.pdf_info) | ||
1097 | page_res = {} | ||
1098 | for page_num, page_info in ocr_result.get('page_info', {}).items(): | ||
1099 | if isinstance(page_num, str) and page_num.startswith('page_'): | ||
1100 | page_res[page_num] = { | ||
1101 | 'classify': int(classify_1_str), | ||
1102 | 'page_num': page_num, | ||
1103 | 'page_info': page_info | ||
1104 | } | ||
1049 | 1105 | ||
1050 | try: | 1106 | else: |
1051 | doc.status = DocStatus.PROCESS_FAILED.value | 1107 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) |
1052 | doc.save() | 1108 | file_type_2 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_2_str) |
1053 | self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] ' | 1109 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) |
1054 | '[error={2}]'.format(self.log_base, task_str, traceback.format_exc())) | 1110 | rebuild_res_1 = {} |
1111 | page_res = {} | ||
1112 | for field_name, field_info in ocr_result_1.items(): | ||
1113 | page_num = field_info.pop('page', 'page_1') | ||
1114 | rebuild_res_1.setdefault(page_num, dict())[field_name] = field_info | ||
1115 | for page_num, page_info in rebuild_res_1.items(): | ||
1116 | if isinstance(page_num, str) and page_num.startswith('page_'): | ||
1117 | page_res[page_num] = { | ||
1118 | 'classify': int(classify_1_str), | ||
1119 | 'page_num': page_num, | ||
1120 | 'page_info': page_info | ||
1121 | } | ||
1122 | if isinstance(file_type_2, int): | ||
1123 | rebuild_res_2 = {} | ||
1124 | ocr_result_2 = hil_predict(pdf_handler.pdf_info, file_type_2) | ||
1125 | for field_name, field_info in ocr_result_2.items(): | ||
1126 | page_num = field_info.pop('page', 'page_1') | ||
1127 | rebuild_res_2.setdefault(page_num, dict())[field_name] = field_info | ||
1128 | for page_num, page_info in ocr_result_2.items(): | ||
1129 | if isinstance(page_num, str) and page_num.startswith('page_'): | ||
1130 | page_res[page_num] = { | ||
1131 | 'classify': int(classify_2_str), | ||
1132 | 'page_num': page_num, | ||
1133 | 'page_info': page_info | ||
1134 | } | ||
1135 | |||
1136 | contract_res = {} | ||
1137 | for img_path_tmp, page_key in pdf_handler.img_path_pno_list: | ||
1138 | if page_key in page_res: | ||
1139 | img_contract_res = { | ||
1140 | 'code': 1, | ||
1141 | 'data': [ | ||
1142 | { | ||
1143 | 'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY), | ||
1144 | 'data': page_res[page_key] | ||
1145 | } | ||
1146 | ] | ||
1147 | } | ||
1148 | else: | ||
1149 | img_contract_res = { | ||
1150 | 'code': 1, | ||
1151 | 'data': [ | ||
1152 | { | ||
1153 | 'classify': int(classify_1_str), | ||
1154 | } | ||
1155 | ] | ||
1156 | } | ||
1157 | contract_res[img_path_tmp] = img_contract_res | ||
1158 | |||
1159 | with lock: | ||
1160 | res_dict[task_str] = contract_res | ||
1161 | finish_queue.put(task_str) | ||
1055 | except Exception as e: | 1162 | except Exception as e: |
1056 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( | 1163 | try: |
1057 | self.log_base, traceback.format_exc())) | 1164 | doc.status = DocStatus.PROCESS_FAILED.value |
1058 | error_list.append(1) | 1165 | doc.save() |
1059 | return | 1166 | self.online_log.warn('{0} [process failed (e-contract)] [task={1}] ' |
1167 | '[error={2}]'.format(self.e_log_base, task_str, traceback.format_exc())) | ||
1168 | except Exception as e: | ||
1169 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( | ||
1170 | self.e_log_base, traceback.format_exc())) | ||
1171 | error_list.append(1) | ||
1172 | return | ||
1060 | 1173 | ||
1061 | def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list): | 1174 | def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list): |
1062 | while len(error_list) == 0 or not img_queue.empty(): | 1175 | while len(error_list) == 0 or not img_queue.empty(): |
... | @@ -1801,7 +1914,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1801,7 +1914,7 @@ class Command(BaseCommand, LoggerMixin): |
1801 | finish_queue = Queue() | 1914 | finish_queue = Queue() |
1802 | 1915 | ||
1803 | process_list = [] | 1916 | process_list = [] |
1804 | pdf_process = Process(target=self.pdf_2_img_2_queue, args=(img_queue, todo_count_dict, lock, error_list)) | 1917 | pdf_process = Process(target=self.pdf_2_img_2_queue, args=(img_queue, todo_count_dict, lock, error_list, res_dict, finish_queue)) |
1805 | process_list.append(pdf_process) | 1918 | process_list.append(pdf_process) |
1806 | 1919 | ||
1807 | for url in self.ocr_1_urls.values(): | 1920 | for url in self.ocr_1_urls.values(): | ... | ... |
... | @@ -789,3 +789,24 @@ class HILCACompareResultRecord(models.Model): | ... | @@ -789,3 +789,24 @@ class HILCACompareResultRecord(models.Model): |
789 | db_table = 'hil_ca_compare_result_record' | 789 | db_table = 'hil_ca_compare_result_record' |
790 | 790 | ||
791 | 791 | ||
792 | class HILContract(models.Model): | ||
793 | id = models.AutoField(primary_key=True, verbose_name="id") # 主键 | ||
794 | application_id = models.CharField(max_length=64, verbose_name="申请id") # 索引 | ||
795 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | ||
796 | |||
797 | class Meta: | ||
798 | managed = False | ||
799 | db_table = 'hil_contract' | ||
800 | |||
801 | |||
802 | class AFCContract(models.Model): | ||
803 | id = models.AutoField(primary_key=True, verbose_name="id") # 主键 | ||
804 | application_id = models.CharField(max_length=64, verbose_name="申请id") # 索引 | ||
805 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | ||
806 | |||
807 | class Meta: | ||
808 | managed = False | ||
809 | db_table = 'afc_contract' | ||
810 | situ_db_label = 'afc' | ||
811 | |||
812 | ... | ... |
1 | import os | ||
1 | import base64 | 2 | import base64 |
2 | import requests | 3 | import requests |
3 | from common.redis_cache import redis_handler as rh | 4 | from common.redis_cache import redis_handler as rh |
... | @@ -44,7 +45,6 @@ class ECM: | ... | @@ -44,7 +45,6 @@ class ECM: |
44 | "b_coborrower_id", "b_coborrower_name", "b_guarantor_id", "b_guarantor_name", | 45 | "b_coborrower_id", "b_coborrower_name", "b_guarantor_id", "b_guarantor_name", |
45 | "b_frontend_partner", "b_dealer_code", "b_dealer_name", "b_input_date", "b_comment", | 46 | "b_frontend_partner", "b_dealer_code", "b_dealer_name", "b_input_date", "b_comment", |
46 | "b_contract_no", "b_location"] | 47 | "b_contract_no", "b_location"] |
47 | self.contract_prefix = '电子' | ||
48 | 48 | ||
49 | def update_oauth_token(self): | 49 | def update_oauth_token(self): |
50 | response = requests.post(self.oauth_url, headers=self.oauth_headers, data=self.oauth_payload, verify=False) | 50 | response = requests.post(self.oauth_url, headers=self.oauth_headers, data=self.oauth_payload, verify=False) |
... | @@ -69,9 +69,9 @@ class ECM: | ... | @@ -69,9 +69,9 @@ class ECM: |
69 | def get_headers(self): | 69 | def get_headers(self): |
70 | return {'Authorization': '{0} {1}'.format(self.token_type, self.get_oauth_token())} | 70 | return {'Authorization': '{0} {1}'.format(self.token_type, self.get_oauth_token())} |
71 | 71 | ||
72 | def search(self, application_id, business_type): | 72 | def search(self, application_id, business_type, prefix): |
73 | sql = "select * from {0} where b_application_no='{1}' and object_name like '{2}%'".format( | 73 | sql = "select * from {0} where b_application_no='{1}' and object_name like '{2}%'".format( |
74 | self.settlement_type, application_id, self.contract_prefix) | 74 | self.settlement_type, application_id, prefix) |
75 | search_args = { | 75 | search_args = { |
76 | "userName": self.username, | 76 | "userName": self.username, |
77 | "password": self.pwd, | 77 | "password": self.pwd, |
... | @@ -96,7 +96,6 @@ class ECM: | ... | @@ -96,7 +96,6 @@ class ECM: |
96 | result.append((object_name, object_id)) | 96 | result.append((object_name, object_id)) |
97 | return result | 97 | return result |
98 | 98 | ||
99 | |||
100 | def download(self, save_path, object_id, document_scheme, business_type): | 99 | def download(self, save_path, object_id, document_scheme, business_type): |
101 | doc_type, _, _ = self.doc_type_map.get(document_scheme) | 100 | doc_type, _, _ = self.doc_type_map.get(document_scheme) |
102 | download_json = { | 101 | download_json = { | ... | ... |
... | @@ -36,12 +36,14 @@ from .models import ( | ... | @@ -36,12 +36,14 @@ from .models import ( |
36 | AFCSECompareResultRecord, | 36 | AFCSECompareResultRecord, |
37 | HILCACompareResultRecord, | 37 | HILCACompareResultRecord, |
38 | HILSECompareResultRecord, | 38 | HILSECompareResultRecord, |
39 | HILContract, | ||
40 | AFCContract, | ||
39 | ) | 41 | ) |
40 | from .named_enum import ErrorType | 42 | from .named_enum import ErrorType |
41 | from .mixins import DocHandler | 43 | from .mixins import DocHandler |
42 | from . import consts | 44 | from . import consts |
43 | from apps.account.authentication import OAuth2AuthenticationWithUser | 45 | from apps.account.authentication import OAuth2AuthenticationWithUser |
44 | from celery_compare.tasks import compare | 46 | from celery_compare.tasks import compare, forwarding_station |
45 | 47 | ||
46 | 48 | ||
47 | class CustomDate(fields.Date): | 49 | class CustomDate(fields.Date): |
... | @@ -1164,5 +1166,11 @@ class SEContractView(GenericView): | ... | @@ -1164,5 +1166,11 @@ class SEContractView(GenericView): |
1164 | # pos上传e-contract信息接口 SE | 1166 | # pos上传e-contract信息接口 SE |
1165 | @use_args(se_contract_args, location='data') | 1167 | @use_args(se_contract_args, location='data') |
1166 | def post(self, request, args): | 1168 | def post(self, request, args): |
1167 | self.running_log.info('e-contract in') | 1169 | contract_info = args.get('content', {}) |
1170 | application_id = contract_info.get('applicationId', '') | ||
1171 | entity = contract_info.get('applicationEntity', '') | ||
1172 | table_class = HILContract if entity == consts.HIL_PREFIX else AFCContract | ||
1173 | table_class.objects.create(application_id=application_id) | ||
1174 | forwarding_station.apply_async((application_id, entity), queue='queue_compare', countdown=conf.DELAY_SECONDS) | ||
1175 | self.running_log.info('[e-contract] [application_id={0}] [entity={1}]'.format(application_id, entity)) | ||
1168 | return response.ok() | 1176 | return response.ok() | ... | ... |
... | @@ -27,10 +27,13 @@ from apps.doc.models import ( | ... | @@ -27,10 +27,13 @@ from apps.doc.models import ( |
27 | AFCCACompareResult, | 27 | AFCCACompareResult, |
28 | HILSECompareResult, | 28 | HILSECompareResult, |
29 | HILCACompareResult, | 29 | HILCACompareResult, |
30 | AFCDoc, | ||
31 | HILDoc | ||
30 | ) | 32 | ) |
31 | from apps.doc import consts | 33 | from apps.doc import consts |
32 | from apps.doc.ocr.gcap import gcap | 34 | from apps.doc.ocr.gcap import gcap |
33 | from apps.doc.ocr.cms import cms | 35 | from apps.doc.ocr.cms import cms |
36 | from apps.doc.ocr.ecm import ECM, rh | ||
34 | from apps.doc.exceptions import GCAPException | 37 | from apps.doc.exceptions import GCAPException |
35 | from apps.doc.named_enum import RequestTeam, RequestTrigger, ProcessName, ErrorType | 38 | from apps.doc.named_enum import RequestTeam, RequestTrigger, ProcessName, ErrorType |
36 | from common.tools.comparison import cp | 39 | from common.tools.comparison import cp |
... | @@ -38,9 +41,11 @@ from common.tools.des import decode_des | ... | @@ -38,9 +41,11 @@ from common.tools.des import decode_des |
38 | 41 | ||
39 | compare_log = logging.getLogger('compare') | 42 | compare_log = logging.getLogger('compare') |
40 | log_base = '[Compare]' | 43 | log_base = '[Compare]' |
44 | e_log_base = '[e-contract]' | ||
41 | empty_str = '' | 45 | empty_str = '' |
42 | empty_error_type = 1000 | 46 | empty_error_type = 1000 |
43 | des_key = conf.CMS_DES_KEY | 47 | des_key = conf.CMS_DES_KEY |
48 | ecm = ECM() | ||
44 | 49 | ||
45 | 50 | ||
46 | def rotate_bound(image, angle): | 51 | def rotate_bound(image, angle): |
... | @@ -1867,4 +1872,32 @@ def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True | ... | @@ -1867,4 +1872,32 @@ def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True |
1867 | se_compare(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, is_cms) | 1872 | se_compare(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, is_cms) |
1868 | 1873 | ||
1869 | 1874 | ||
@app.task
def forwarding_station(application_id, entity):
    """Find the signed e-contract files of an application and queue them for OCR.

    For every (classify pair, file-name template) configured for the entity,
    searches ECM for matching files, creates one doc row per file found and
    enqueues a task string ``<entity prefix><SPLIT_STR><doc id><SPLIT_STR>
    <classify_1><SPLIT_STR><classify_2>`` through ``rh.enqueue``.

    Args:
        application_id: Application identifier used in the search and on the
            created doc rows.
        entity: Business entity of the application; membership in
            ``consts.HIL_SET`` selects the HIL tables/prefix, otherwise AFC.
    """
    compare_log.info('{0} [forward start] [application_id={1}] [entity={2}]'.format(e_log_base, application_id, entity))
    is_hil = entity in consts.HIL_SET
    doc_class = HILDoc if is_hil else AFCDoc
    entity_prefix = consts.HIL_PREFIX if is_hil else consts.AFC_PREFIX

    # FILE_NAME_PREFIX_MAP is keyed by the AFC/HIL prefixes. Looking it up with
    # the raw entity returns None for any entity that is not itself a key, and
    # iterating None raises TypeError; fall back to the normalized prefix.
    name_rules = (consts.FILE_NAME_PREFIX_MAP.get(entity)
                  or consts.FILE_NAME_PREFIX_MAP.get(entity_prefix, []))

    for (classify_1, classify_2), prefix in name_rules:
        try:
            file_list = ecm.search(application_id, entity, prefix.format(application_id))  # TODO: fetch the latest file
        except Exception:
            # Best effort: a failed search for one template must not stop the others.
            compare_log.error('{0} [search failed] [application_id={1}] [entity={2}] [error={3}]'.format(
                e_log_base, application_id, entity, traceback.format_exc()))
        else:
            compare_log.info('{0} [search end] [application_id={1}] [entity={2}] [file_list={3}]'.format(
                e_log_base, application_id, entity, file_list))
            for object_name, object_id in file_list:
                doc = doc_class.objects.create(
                    metadata_version_id=object_id,
                    application_id=application_id,
                    document_name=object_name,
                    document_scheme='SETTLEMENT',
                    data_source='POS',
                    upload_finish_time=datetime.now(),
                )
                task = consts.SPLIT_STR.join([entity_prefix, str(doc.id), str(classify_1), str(classify_2)])
                enqueue_res = rh.enqueue([task], False)
                compare_log.info('{0} [upload success] [res={1}] [application_id={2}] [entity={3}] [object_name={4}] '
                                 '[object_id={5}] [doc_id={6}]'.format(e_log_base, enqueue_res, application_id, entity,
                                                                       object_name, object_id, doc.id))
    compare_log.info('{0} [forward end] [application_id={1}] [entity={2}]'.format(e_log_base, application_id, entity))
1 | # -*- coding: utf-8 -*- | ||
2 | # @Author : lk | ||
3 | # @Email : 9428.al@gmail.com | ||
4 | # @Created Date : 2021-06-29 17:43:46 | ||
5 | # @Last Modified : 2021-09-07 14:11:25 | ||
6 | # @Description : | ||
7 | |||
8 | from .get_char import Finder | ||
9 | |||
10 | |||
def predict(pdf_info):
    """Extract contract fields from the parsed content of a whole PDF.

    Args:
        pdf_info: Parsed information for every page of the PDF.

    Returns:
        Whatever ``Finder(pdf_info).get_info()`` produces.
    """
    return Finder(pdf_info).get_info()
17 | |||
18 |
1 | # -*- coding: utf-8 -*- | ||
2 | # @Author : lk | ||
3 | # @Email : 9428.al@gmail.com | ||
4 | # @Create Date : 2021-07-20 16:42:41 | ||
5 | # @Last Modified : 2021-09-07 19:52:39 | ||
6 | # @Description : | ||
7 | |||
8 | import re | ||
9 | import numpy as np | ||
10 | from fuzzywuzzy import fuzz | ||
11 | |||
12 | |||
13 | class Finder: | ||
14 | |||
15 | def __init__(self, pdf_info): | ||
16 | self.pdf_info = pdf_info | ||
17 | self.is_asp = False | ||
18 | self.item = {"words": None, | ||
19 | "position": None, | ||
20 | } | ||
21 | |||
def gen_init_result(self, is_asp):
    """Build the empty, page-by-page result skeleton in ``self.init_result``.

    Args:
        is_asp: When true the skeleton has a contract-number-only ``page_7``
            and the signature fields on ``page_8``; otherwise the signature
            fields are on ``page_7`` and there is no ``page_8``.

    Fixes over the previous version: the ``is_asp`` argument is honoured
    (it used to be ignored in favour of ``self.is_asp``), and every slot gets
    its own copy of the field template instead of all slots sharing the one
    ``self.item`` dict, so filling one slot in place cannot leak into others.
    """
    def blank():
        # Fresh copy of the {"words", "position"} template for one slot.
        return self.item.copy()

    def loan_principal():
        return {"大写": blank(),
                "小写": blank(),
                "车辆贷款本金金额": blank(),
                "附加产品融资贷款本金总金额": blank(),
                }

    def party():
        return {"name": blank(), "id": blank()}

    def signature_page():
        page = {"合同编号": blank()}
        for role in ("主借人签字", "共借人签字", "保证人1签字", "保证人2签字", "见证人签字"):
            page[role] = {"签字": blank(), "日期": blank()}
        return page

    self.init_result = {
        "page_1": {"合同编号": blank(),
                   "所购车辆价格": blank(),
                   "车架号": blank(),
                   "贷款本金金额": loan_principal(),
                   "贷款期限": blank(),
                   "附加产品融资贷款本金总金额明细": blank(),
                   "借款人签字及时间": blank(),
                   },
        "page_2": {"合同编号": blank(),
                   "借款人及抵押人": party(),
                   "共同借款人及共同抵押人": party(),
                   "保证人1": party(),
                   "保证人2": party(),
                   "所购车辆价格": blank(),
                   "车架号": blank(),
                   "经销商": blank(),
                   "贷款本金金额": loan_principal(),
                   "贷款期限": blank(),
                   "还款账户": {"账号": blank(),
                            "户名": blank(),
                            "开户行": blank(),
                            },
                   },
        "page_3": {"合同编号": blank(),
                   "还款计划表": blank(),
                   },
        "page_4": {"合同编号": blank(),
                   "附加产品融资贷款本金总金额明细": blank(),
                   },
        "page_5": {"合同编号": blank()},
        "page_6": {"合同编号": blank()},
    }
    if not is_asp:
        self.init_result["page_7"] = signature_page()
    else:
        self.init_result["page_7"] = {"合同编号": blank()}
        self.init_result["page_8"] = signature_page()
112 | |||
113 | |||
def get_contract_no(self, page_num):
    """Read the contract number printed on the given page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of the field template; when a span containing
        '合同编号:' exists, 'words' holds the text after the last ':' and
        'position' that span's bbox (the last matching span wins).
    """
    result = self.item.copy()
    spans = (
        (sp['bbox'], sp['text'])
        for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    )
    for box, content in spans:
        if '合同编号:' not in content:
            continue
        result['position'] = box
        result['words'] = content.split(':')[-1]
    return result
136 | |||
def get_vehicle_price(self, page_num='0'):
    """Read the purchased-vehicle price from the given page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of the field template; for the last span containing
        '所购车辆价格为人民币', 'words' is the text after the last '币' and
        'position' is the span's bbox.
    """
    price = self.item.copy()
    spans = (
        (sp['bbox'], sp['text'])
        for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    )
    for box, content in spans:
        if '所购车辆价格为人民币' not in content:
            continue
        price['position'] = box
        price['words'] = content.split('币')[-1]
    return price
150 | |||
def get_vin(self, page_num='0'):
    """Read the vehicle identification number (车架号) from the given page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of the field template; for the last span containing
        '车架号:', 'words' is the text after the last ':' and 'position' is
        the span's bbox.
    """
    frame_no = self.item.copy()
    spans = (
        (sp['bbox'], sp['text'])
        for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    )
    for box, content in spans:
        if '车架号:' not in content:
            continue
        frame_no['position'] = box
        frame_no['words'] = content.split(':')[-1]
    return frame_no
164 | |||
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from the given page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of the field
        template.
        * ``upper``: last span whose fuzzy ratio against the joined
          Chinese numeral keywords exceeds 15 (text after the last ':').
        * ``lower``: last span containing '小写:¥' (text after '¥').
        * ``asp_1`` / ``asp_2``: bracketed amount of a '人民币:小写:' span
          whose vertical centre lies above / below the
          '附加产品融资贷款本金总金额' anchor span; left empty when that
          anchor is not on the page.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    keyword_text = ''.join(chinese_keywords)
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()
    anchor_bbox = None

    def iter_spans():
        # Yield (bbox, text) for every span of the page's type-0 blocks.
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    for bbox, text in iter_spans():
        if fuzz.ratio(keyword_text, text) > 15:
            # NOTE: ``text`` is deliberately rebound; the checks below see
            # the stripped value, exactly as before.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for bbox, text in iter_spans():
            if '人民币:小写:' not in text:
                continue
            amounts = re.findall(r'人民币:小写:\[(.*)\]', text)
            if not amounts:
                # The label is present but the bracketed amount is not;
                # indexing [0] here used to raise IndexError.
                continue
            span_y = np.mean(bbox[1::2])
            if span_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = amounts[0]
            if span_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = amounts[0]
    return upper, lower, asp_1, asp_2
205 | |||
def get_loan_term(self, page_num='0'):
    """Read the loan term in months from the given page.

    The page text is concatenated span by span so that '贷款期限<N>个月' is
    found even when it is split across spans; the bbox reported is that of
    the last span containing '<N>个月'.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of the field template, filled when the term is found and
        a span containing it exists.
    """
    term = self.item.copy()

    def walk():
        for blk in self.pdf_info[page_num]['blocks']:
            if blk['type'] != 0:
                continue
            for ln in blk['lines']:
                for sp in ln['spans']:
                    yield sp['bbox'], sp['text']

    page_text = ''.join(content for _box, content in walk())
    found = re.search(r'贷款期限(\d+)个月', page_text)
    if found is None:
        return term

    months = found.group(1)
    needle = f'{months}个月'
    for box, content in walk():
        if needle in content:
            term['position'] = box
            term['words'] = months
    return term
229 | |||
def get_asp_details(self, page_num):
    """Collect the add-on product financing detail table from a page.

    Span texts are gathered starting at the span that equals
    '附加产品融资贷款本金总金额明细' and stopping at a span containing '第二条'
    or '征信管理'. The first gathered text becomes a one-cell row; the
    following texts are grouped three per row (a trailing incomplete group
    is dropped).

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of the field template with 'words' set to the list of
        rows when any text was gathered; 'position' is left untouched.
    """
    details = self.item.copy()

    cells = []
    collecting = False
    spans = (
        (sp['bbox'], sp['text'])
        for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    )
    for _box, content in spans:
        if content == '附加产品融资贷款本金总金额明细':
            collecting = True
        if '第二条' in content or '征信管理' in content:
            collecting = False
        if collecting:
            cells.append(content)

    row_count = (len(cells) + 2) // 3
    rows = [
        cells[:1] if idx == 0 else cells[idx * 3 - 2:idx * 3 + 1]
        for idx in range(row_count)
    ]

    if rows:
        details['words'] = rows
    return details
263 | |||
def get_signature(self):
    """Find the borrower signature/date line on the first page.

    Returns:
        dict: Copy of the field template; for the last span on page '0'
        containing '签署日期', 'words' is the whole span text and 'position'
        is its bbox.
    """
    signed = self.item.copy()

    spans = (
        (sp['bbox'], sp['text'])
        for blk in self.pdf_info['0']['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    )
    for box, content in spans:
        if '签署日期' not in content:
            continue
        signed['words'] = content
        signed['position'] = box
    return signed
278 | |||
def get_somebody(self, top, bottom):
    """Return the name and id of the party listed between two headings.

    Works on page '1'. The bottom edge (``bbox[3]``) of the last span
    containing ``top`` and of the last span containing ``bottom`` bound a
    vertical band; only spans whose bottom edge lies strictly inside the
    band are inspected.

    Args:
        top (str): Heading text that opens the band.
        bottom (str): Heading text that closes the band.

    Returns:
        tuple: ``(name, id)`` field dicts; each holds the text after the
        last ':' of the matching '姓名/名称' or
        '自然人身份证件号码/法人执照号码' span and that span's bbox.
    """
    name_item = self.item.copy()
    id_item = self.item.copy()

    spans = [
        (sp['bbox'], sp['text'])
        for blk in self.pdf_info['1']['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    ]

    # First pass: locate the band limits (both default to 0 when not found).
    upper_edge = 0
    lower_edge = 0
    for box, content in spans:
        if top in content:
            upper_edge = box[3]
        if bottom in content:
            lower_edge = box[3]

    # Second pass: pick the labelled values that fall inside the band.
    for box, content in spans:
        if not (upper_edge < box[3] < lower_edge):
            continue
        if '姓名/名称' in content:
            name_item['position'] = box
            name_item['words'] = content.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in content:
            id_item['position'] = box
            id_item['words'] = content.split(':')[-1]
    return name_item, id_item
312 | |||
def get_seller(self):
    """Find the dealer (经销商) value on page '1'.

    The span whose text equals '经销商' is the label. The value is the last
    span whose horizontal centre lies between the label's right edge and
    the middle of the page, and whose vertical centre lies between the
    label's top and bottom edges.

    Returns:
        dict: Copy of the field template, filled with the value span's text
        and bbox when both label and value are found.
    """
    dealer = self.item.copy()
    page = self.pdf_info['1']
    spans = [
        (sp['bbox'], sp['text'])
        for blk in page['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    ]

    # Locate the label first; without it there is nothing to align against.
    label_box = None
    for box, content in spans:
        if content == '经销商':
            label_box = box
    if not label_box:
        return dealer

    x_limit = page['width'] * 0.5
    for box, content in spans:
        right_of_label = label_box[2] < np.mean(box[::2]) < x_limit
        if not right_of_label:
            continue
        same_row = label_box[1] < np.mean(box[1::2]) < label_box[3]
        if same_row:
            dealer['position'] = box
            dealer['words'] = content
    return dealer
339 | |||
def get_payback_account(self):
    """Extract the repayment account number, holder and bank from page '1'.

    Only the ticked '☑账号' layout is handled (the "to be notified
    separately" variant is not output). Values are taken by regex from the
    page text with spaces removed; the position of each value is the bbox
    of the last span that contains it.

    Returns:
        tuple: ``(account, account_name, account_bank)`` field dicts.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    def page_spans():
        for blk in self.pdf_info['1']['blocks']:
            if blk['type'] != 0:
                continue
            for ln in blk['lines']:
                for sp in ln['spans']:
                    yield sp['bbox'], sp['text']

    joined = ''.join(content for _box, content in page_spans())
    if '☑账号' not in joined:
        return account, account_name, account_bank
    joined = joined.replace(' ', '')

    def fill(target, pattern, is_hit):
        # Take the first regex hit and attach the bbox of the last span
        # that satisfies ``is_hit(value, span_text)``.
        found = re.findall(pattern, joined)
        if not found:
            return
        value = found[0]
        for box, content in page_spans():
            if is_hit(value, content):
                target['position'] = box
                target['words'] = value

    fill(account, r'账号:(.*)户名', lambda value, content: f'{value}' in content)
    fill(account_name, r'户名:(.*)开户行', lambda value, content: f'{value}' in content)
    fill(account_bank, r'开户行:(.*);',
         lambda value, content: f'开户行:{value};' in content.replace(' ', ''))
    return account, account_name, account_bank
392 | |||
def get_repayment_schedule(self):
    """Collect the repayment schedule table from page '2'.

    Span texts are gathered from the span equal to '序号' until a span
    containing '以上表格中所列的序号并非还款期数'. The texts are cut into rows
    of five columns; row building stops at the first row whose second cell
    equals the 1-based row counter.

    Returns:
        dict: Copy of the field template with 'words' set to the list of
        rows when at least one row was built; 'position' is left untouched.
    """
    schedule = self.item.copy()

    cells = []
    inside_table = False
    spans = (
        (sp['bbox'], sp['text'])
        for blk in self.pdf_info['2']['blocks'] if blk['type'] == 0
        for ln in blk['lines']
        for sp in ln['spans']
    )
    for _box, content in spans:
        if content == '序号':
            inside_table = True
        if '以上表格中所列的序号并非还款期数' in content:
            inside_table = False
        if inside_table:
            cells.append(content)

    column_count = 5
    rows = []
    for row_idx in range(len(cells) // column_count):
        start = row_idx * column_count
        row = cells[start:start + column_count]
        if row[1] == str(row_idx + 1):
            break
        rows.append(row)

    if rows:
        schedule['words'] = rows
    return schedule
427 | |||
def get_signature_role_1(self):
    """Locate the borrower (mortgagor) signature area and report if it is filled.

    Scans every page in ``self.pdf_info`` and collects the spans from the one
    containing '借款人(抵押人)' up to, but excluding, the next span containing
    '日期'.

    Returns:
        dict: Copy of the field template with
        * 'page_num': key of the page of the last collected span, or None;
        * 'position': ``[min_x, min_y, max_x, max_y]`` over the collected
          bboxes, or None when nothing was collected;
        * 'words': '有' when more than four spans were collected, else '无'.
    """
    # Fix: the template is ``self.item``; ``__init__`` defines no
    # ``self.init_item``, so the old lookup raised AttributeError.
    signature_role_1 = self.item.copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    position = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '借款人(抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    if boxes:
        # Fix: min()/max() over an empty array raised ValueError.
        corners = np.array(boxes).reshape((-1, 2))
        position = [corners[:, 0].min(), corners[:, 1].min(),
                    corners[:, 0].max(), corners[:, 1].max()]
    signature_role_1['page_num'] = page_num
    signature_role_1['position'] = position
    signature_role_1['words'] = words
    return signature_role_1
462 | |||
def get_signature_role_2(self):
    """Locate the co-borrower (co-mortgagor) signature area and report if it is filled.

    Scans every page in ``self.pdf_info`` and collects the spans from the one
    containing '共同借款人(共同抵押人)' up to, but excluding, the next span
    containing '日期'.

    Returns:
        dict: Copy of the field template with
        * 'page_num': key of the page of the last collected span, or None;
        * 'position': ``[min_x, min_y, max_x, max_y]`` over the collected
          bboxes, or None when nothing was collected;
        * 'words': '有' when more than four spans were collected, else '无'.
    """
    # Fix: the template is ``self.item``; ``__init__`` defines no
    # ``self.init_item``, so the old lookup raised AttributeError.
    signature_role_2 = self.item.copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    position = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    if boxes:
        # Fix: min()/max() over an empty array raised ValueError.
        corners = np.array(boxes).reshape((-1, 2))
        position = [corners[:, 0].min(), corners[:, 1].min(),
                    corners[:, 0].max(), corners[:, 1].max()]
    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
497 | |||
def get_signature_role_3(self):
    """Locate the first guarantor's signature area and report if it is filled.

    Scans every page in ``self.pdf_info`` and collects the spans from the one
    containing '保证人1' (ignored on page '0') up to, but excluding, the next
    span containing '日期'.

    Returns:
        dict: Copy of the field template with
        * 'page_num': key of the page of the last collected span, or None;
        * 'position': ``[min_x, min_y, max_x, max_y]`` over the collected
          bboxes, or None when nothing was collected;
        * 'words': '有' when more than four spans were collected, else '无'.
    """
    # Fix: the template is ``self.item``; ``__init__`` defines no
    # ``self.init_item``, so the old lookup raised AttributeError.
    signature_role_3 = self.item.copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    position = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(i) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    if boxes:
        # Fix: min()/max() over an empty array raised ValueError.
        corners = np.array(boxes).reshape((-1, 2))
        position = [corners[:, 0].min(), corners[:, 1].min(),
                    corners[:, 0].max(), corners[:, 1].max()]
    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
532 | |||
def get_signature_role_4(self):
    """Locate the second guarantor's signature area and report if it is filled.

    Scans every page in ``self.pdf_info`` and collects the spans from the one
    containing '保证人2' (ignored on page '0') up to, but excluding, the next
    span containing '日期'.

    Returns:
        dict: Copy of the field template with
        * 'page_num': key of the page of the last collected span, or None;
        * 'position': ``[min_x, min_y, max_x, max_y]`` over the collected
          bboxes, or None when nothing was collected;
        * 'words': '有' when more than four spans were collected, else '无'.
    """
    # Fix: the template is ``self.item``; ``__init__`` defines no
    # ``self.init_item``, so the old lookup raised AttributeError.
    signature_role_4 = self.item.copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    position = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(i) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    if boxes:
        # Fix: min()/max() over an empty array raised ValueError.
        corners = np.array(boxes).reshape((-1, 2))
        position = [corners[:, 0].min(), corners[:, 1].min(),
                    corners[:, 0].max(), corners[:, 1].max()]
    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
567 | |||
def get_signature_role_5(self):
    """Locate the witness signature area and report if it is filled.

    Scans every page in ``self.pdf_info`` and collects the spans from the one
    containing '见证人签字' (ignored on page '0') up to, but excluding, the
    next span containing '年'.

    Returns:
        dict: Copy of the field template with
        * 'page_num': key of the page of the last collected span, or None;
        * 'position': ``[min_x, min_y, max_x, max_y]`` over the collected
          bboxes, or None when nothing was collected;
        * 'words': '有' when more than four spans were collected, else '无'.
    """
    # Fix: the template is ``self.item``; ``__init__`` defines no
    # ``self.init_item``, so the old lookup raised AttributeError.
    signature_role_5 = self.item.copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    position = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(i) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)
    # The leftover debug ``print(texts)`` was removed.
    words = '有' if len(texts) > 4 else '无'
    if boxes:
        # Fix: min()/max() over an empty array raised ValueError.
        corners = np.array(boxes).reshape((-1, 2))
        position = [corners[:, 0].min(), corners[:, 1].min(),
                    corners[:, 0].max(), corners[:, 1].max()]
    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
603 | |||
def get_last_page_signature(self, page_num, top, bottom):
    """Extract one signer's name and signing date from the signature page.

    The top edges (``bbox[1]``) of the last span containing ``top`` and of
    the last span containing ``bottom`` bound a vertical band. Within it, a
    span containing '签署日期' is split into the name (text before the first
    space) and the date (text after the last ':').

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.
        top (str): Text marking the upper limit of the band.
        bottom (str): Text marking the lower limit of the band.

    Returns:
        tuple: ``(signature_name, signature_date)`` field dicts; both stay
        empty when either limit is missing or no signature span is inside
        the band.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()
    anchor_top = None
    anchor_bottom = None

    def iter_spans():
        # Yield (bbox, text) for every span of the page's type-0 blocks.
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    for bbox, text in iter_spans():
        if top in text:
            anchor_top = bbox[1]
        if bottom in text:
            anchor_bottom = bbox[1]

    if anchor_top is None or anchor_bottom is None:
        return signature_name, signature_date

    for bbox, text in iter_spans():
        if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
            signature_name['words'] = text.split(' ')[0]
            signature_name['position'] = bbox
            signature_date['words'] = text.split(':')[-1]
            # Fix: the date's position was never set; the name's position
            # was assigned twice instead.
            signature_date['position'] = bbox
    return signature_name, signature_date
634 | |||
635 | def get_info(self): | ||
636 | """ | ||
637 | block['type'] == 0 : 表示该元素为图片 | ||
638 | |||
639 | Returns: | ||
640 | dict: Description | ||
641 | """ | ||
642 | |||
643 | # 先判断是否为 ASP 产品 | ||
644 | # 只看第一页,判断是否有 '附加产品融资贷款本金总金额' 这一句话,若有则为 ASP 产品 | ||
645 | # print(self.pdf_info['0']['blocks']) | ||
646 | for block in self.pdf_info['0']['blocks']: | ||
647 | if block['type'] != 0: | ||
648 | continue | ||
649 | for line in block['lines']: | ||
650 | for span in line['spans']: | ||
651 | bbox, text = span['bbox'], span['text'] | ||
652 | if '附加产品融资贷款本金总金额' == text: | ||
653 | self.is_asp = True | ||
654 | |||
655 | self.gen_init_result(self.is_asp) | ||
656 | |||
657 | # Page 1 | ||
658 | # 找合同编号 | ||
659 | contract_no = self.get_contract_no(page_num='0') | ||
660 | self.init_result['page_1']['合同编号'] = contract_no | ||
661 | # 所购车辆价格 | ||
662 | vehicle_price = self.get_vehicle_price() | ||
663 | self.init_result['page_1']['所购车辆价格'] = vehicle_price | ||
664 | # 车架号 | ||
665 | vin = self.get_vin() | ||
666 | self.init_result['page_1']['车架号'] = vehicle_price | ||
667 | # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目 | ||
668 | upper, lower, asp_1, asp_2 = self.get_loan_principal() | ||
669 | self.init_result['page_1']['贷款本金金额']['大写'] = upper | ||
670 | self.init_result['page_1']['贷款本金金额']['小写'] = lower | ||
671 | self.init_result['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1 | ||
672 | self.init_result['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2 | ||
673 | # 贷款期限 | ||
674 | loan_term = self.get_loan_term() | ||
675 | self.init_result['page_1']['贷款期限'] = loan_term | ||
676 | # 附加产品融资贷款本金总金额明细(ASP-表格) | ||
677 | asp_details_table = self.get_asp_details(page_num='0') | ||
678 | self.init_result['page_1']['附加产品融资贷款本金总金额明细'] = asp_details_table | ||
679 | # 借款人签字及时间 | ||
680 | signature = self.get_signature() | ||
681 | self.init_result['page_1']['借款人签字及时间'] = signature | ||
682 | ####################################### | ||
683 | # Page 2 | ||
684 | # 找合同编号 | ||
685 | contract_no = self.get_contract_no(page_num='0') | ||
686 | self.init_result['page_2']['合同编号'] = contract_no | ||
687 | # 找借款人及抵押人(地址字段原本有空格) | ||
688 | borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:') | ||
689 | self.init_result['page_2']['借款人及抵押人']['name'] = borrower_name | ||
690 | self.init_result['page_2']['借款人及抵押人']['id'] = borrower_id | ||
691 | # 找共同借款人及共同抵押人 | ||
692 | co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人及共同抵押人:', bottom='保证人1:') | ||
693 | self.init_result['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name | ||
694 | self.init_result['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id | ||
695 | # 保证人1 | ||
696 | first_guarantor_name, first_guarantor_id = self.get_somebody(top='保证人1:', bottom='保证人2:') | ||
697 | self.init_result['page_2']['保证人1']['name'] = first_guarantor_name | ||
698 | self.init_result['page_2']['保证人1']['id'] = first_guarantor_id | ||
699 | # 保证人2 | ||
700 | second_guarantor_name, second_guarantor_id = self.get_somebody(top='保证人2:', bottom='第一章') | ||
701 | self.init_result['page_2']['保证人2']['name'] = second_guarantor_name | ||
702 | self.init_result['page_2']['保证人2']['id'] = second_guarantor_id | ||
703 | # 所购车辆价格 | ||
704 | vehicle_price = self.get_vehicle_price(page_num='1') | ||
705 | self.init_result['page_2']['所购车辆价格'] = vehicle_price | ||
706 | # 车架号 | ||
707 | vin = self.get_vin(page_num='1') | ||
708 | self.init_result['page_2']['车架号'] = vin | ||
709 | # 经销商 | ||
710 | seller = self.get_seller() | ||
711 | self.init_result['page_2']['经销商'] = seller | ||
712 | # 贷款本金金额(如果是 ASP产品)则'贷款本金金额'项目中包含'车辆贷款本金金额'和'附加产品融资贷款本金总金额'两个项目 | ||
713 | upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1') | ||
714 | self.init_result['page_2']['贷款本金金额']['大写'] = upper | ||
715 | self.init_result['page_2']['贷款本金金额']['小写'] = lower | ||
716 | self.init_result['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1 | ||
717 | self.init_result['page_2']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2 | ||
718 | # 贷款期限 | ||
719 | loan_term = self.get_loan_term(page_num='1') | ||
720 | self.init_result['page_2']['贷款期限'] = loan_term | ||
721 | # 还款账户 | ||
722 | account, account_name, account_bank = self.get_payback_account() | ||
723 | self.init_result['page_2']['还款账户']['账号'] = account | ||
724 | self.init_result['page_2']['还款账户']['户名'] = account_name | ||
725 | self.init_result['page_2']['还款账户']['开户行'] = account_bank | ||
726 | ####################################### | ||
727 | # Page 3 | ||
728 | # 找合同编号 | ||
729 | contract_no = self.get_contract_no(page_num='2') | ||
730 | self.init_result['page_3']['合同编号'] = contract_no | ||
731 | # 还款计划表(表格) | ||
732 | repayment_schedule_table = self.get_repayment_schedule() | ||
733 | self.init_result['page_3']['还款计划表'] = repayment_schedule_table | ||
734 | ####################################### | ||
735 | # Page 4 | ||
736 | # 找合同编号 | ||
737 | contract_no = self.get_contract_no(page_num='3') | ||
738 | self.init_result['page_4']['合同编号'] = contract_no | ||
739 | # 附加产品融资贷款本金总金额明细(ASP-表格) | ||
740 | asp_details_table = self.get_asp_details(page_num='3') | ||
741 | self.init_result['page_4']['附加产品融资贷款本金总金额明细'] = asp_details_table | ||
742 | ####################################### | ||
743 | # Page 5 | ||
744 | # 找合同编号 | ||
745 | contract_no = self.get_contract_no(page_num='4') | ||
746 | self.init_result['page_5']['合同编号'] = contract_no | ||
747 | ####################################### | ||
748 | # Page 6 | ||
749 | # 找合同编号 | ||
750 | contract_no = self.get_contract_no(page_num='5') | ||
751 | self.init_result['page_6']['合同编号'] = contract_no | ||
752 | if self.is_asp == False: | ||
753 | # Page 7 | ||
754 | # 找合同编号 | ||
755 | contract_no = self.get_contract_no(page_num='6') | ||
756 | self.init_result['page_7']['合同编号'] = contract_no | ||
757 | signature_name, signature_date = self.get_last_page_signature(page_num='6', | ||
758 | top='借款人(抵押人)', bottom='共同借款人(共同抵押人)') | ||
759 | self.init_result['page_7']['主借人签字']['签字'] = signature_name | ||
760 | self.init_result['page_7']['主借人签字']['日期'] = signature_date | ||
761 | signature_name, signature_date = self.get_last_page_signature(page_num='6', | ||
762 | top='共同借款人(共同抵押人)', bottom='保证人1') | ||
763 | self.init_result['page_7']['共借人签字']['签字'] = signature_name | ||
764 | self.init_result['page_7']['共借人签字']['日期'] = signature_date | ||
765 | signature_name, signature_date = self.get_last_page_signature(page_num='6', | ||
766 | top='保证人1', bottom='保证人2') | ||
767 | self.init_result['page_7']['保证人1签字']['签字'] = signature_name | ||
768 | self.init_result['page_7']['保证人1签字']['日期'] = signature_date | ||
769 | signature_name, signature_date = self.get_last_page_signature(page_num='6', | ||
770 | top='保证人2', bottom='在本人面前亲笔签署本合同') | ||
771 | self.init_result['page_7']['保证人2签字']['签字'] = signature_name | ||
772 | self.init_result['page_7']['保证人2签字']['日期'] = signature_date | ||
773 | signature_name, signature_date = self.get_last_page_signature(page_num='6', | ||
774 | top='在本人面前亲笔签署本合同', bottom='(以下无正文)') | ||
775 | self.init_result['page_7']['见证人签字']['签字'] = signature_name | ||
776 | self.init_result['page_7']['见证人签字']['日期'] = signature_date | ||
777 | else: | ||
778 | # Page 7 | ||
779 | # 找合同编号 | ||
780 | contract_no = self.get_contract_no(page_num='6') | ||
781 | self.init_result['page_7']['合同编号'] = contract_no | ||
782 | # Page 8 | ||
783 | # 找合同编号 | ||
784 | contract_no = self.get_contract_no(page_num='7') | ||
785 | self.init_result['page_8']['合同编号'] = contract_no | ||
786 | signature_name, signature_date = self.get_last_page_signature(page_num='7', | ||
787 | top='借款人(抵押人)', bottom='共同借款人(共同抵押人)') | ||
788 | self.init_result['page_8']['主借人签字']['签字'] = signature_name | ||
789 | self.init_result['page_8']['主借人签字']['日期'] = signature_date | ||
790 | signature_name, signature_date = self.get_last_page_signature(page_num='7', | ||
791 | top='共同借款人(共同抵押人)', bottom='保证人1') | ||
792 | self.init_result['page_8']['共借人签字']['签字'] = signature_name | ||
793 | self.init_result['page_8']['共借人签字']['日期'] = signature_date | ||
794 | signature_name, signature_date = self.get_last_page_signature(page_num='7', | ||
795 | top='保证人1', bottom='保证人2') | ||
796 | self.init_result['page_8']['保证人1签字']['签字'] = signature_name | ||
797 | self.init_result['page_8']['保证人1签字']['日期'] = signature_date | ||
798 | signature_name, signature_date = self.get_last_page_signature(page_num='7', | ||
799 | top='保证人2', bottom='在本人面前亲笔签署本合同') | ||
800 | self.init_result['page_8']['保证人2签字']['签字'] = signature_name | ||
801 | self.init_result['page_8']['保证人2签字']['日期'] = signature_date | ||
802 | signature_name, signature_date = self.get_last_page_signature(page_num='7', | ||
803 | top='在本人面前亲笔签署本合同', bottom='(以下无正文)') | ||
804 | self.init_result['page_8']['见证人签字']['签字'] = signature_name | ||
805 | self.init_result['page_8']['见证人签字']['日期'] = signature_date | ||
806 | |||
807 | # 重新定制输出 | ||
808 | new_results = {"is_asp": self.is_asp, | ||
809 | "page_info": self.init_result | ||
810 | } | ||
811 | return new_results |
1 | # -*- coding: utf-8 -*- | ||
2 | # @Author : lk | ||
3 | # @Email : 9428.al@gmail.com | ||
4 | # @Create Date : 2021-07-20 16:42:41 | ||
5 | # @Last Modified : 2021-10-28 17:41:00 | ||
6 | # @Description : | ||
7 | |||
8 | import re | ||
9 | import cv2 | ||
10 | import base64 | ||
11 | import numpy as np | ||
12 | from fuzzywuzzy import fuzz | ||
13 | |||
14 | |||
15 | class Finder: | ||
16 | |||
17 | def __init__(self, pdf_info): | ||
18 | self.pdf_info = pdf_info | ||
19 | self.item = {"words": None, | ||
20 | "page": None, | ||
21 | "position": None, | ||
22 | } | ||
23 | # 格式化算法输出 | ||
24 | self.init_result = {"合同编号": self.item, | ||
25 | "承租人-姓名": self.item, | ||
26 | "承租人-证件号码": self.item, | ||
27 | "承租人-法定代表人或授权代表": self.item, | ||
28 | "保证人1-姓名": self.item, | ||
29 | "保证人1-证件号码": self.item, | ||
30 | "保证人1-法定代表人或授权代表": self.item, | ||
31 | "保证人2-姓名": self.item, | ||
32 | "保证人2-证件号码": self.item, | ||
33 | "保证人2-法定代表人或授权代表": self.item, | ||
34 | "保证人3-姓名": self.item, | ||
35 | "保证人3-证件号码": self.item, | ||
36 | "保证人3-法定代表人或授权代表": self.item, | ||
37 | "合同编号(正文)": self.item, | ||
38 | "车辆识别代码": self.item, | ||
39 | "车辆卖方(经销商)": self.item, | ||
40 | "车辆原始销售价格(《机动车销售统一发票》所列金额)": self.item, | ||
41 | "车辆附加产品明细表": self.item, | ||
42 | "融资成本总额": self.item, | ||
43 | "租期": self.item, | ||
44 | "付款计划表": self.item, | ||
45 | "银行账户-户名": self.item, | ||
46 | "银行账户-银行账号": self.item, | ||
47 | "银行账户-开户行": self.item, | ||
48 | "签字页-承租人姓名": self.item, | ||
49 | "签字页-承租人签章": self.item, | ||
50 | "签字页-保证人1姓名": self.item, | ||
51 | "签字页-保证人1签章": self.item, | ||
52 | "签字页-保证人2姓名": self.item, | ||
53 | "签字页-保证人2签章": self.item, | ||
54 | "签字页-保证人3姓名": self.item, | ||
55 | "签字页-保证人3签章": self.item, | ||
56 | } | ||
57 | |||
58 | # 格式化输出 车辆处置协议 要是别的字段 | ||
59 | self.init_result_1 = {"合同编号": self.item, | ||
60 | "承租人-姓名": self.item, | ||
61 | "承租人-证件号码": self.item, | ||
62 | "销售经销商": self.item, | ||
63 | "合同编号(正文)": self.item, | ||
64 | "签字页-承租人姓名": self.item, | ||
65 | "签字页-承租人证件号码": self.item, | ||
66 | "签字页-承租人签章": self.item, | ||
67 | "签字页-销售经销商": self.item, | ||
68 | "签字页-销售经销商签章": self.item, | ||
69 | |||
70 | } | ||
71 | |||
72 | # 格式化输出 车辆租赁抵押合同 | ||
73 | self.init_result_2 = {"合同编号": self.item, | ||
74 | "合同编号(正文)": self.item, | ||
75 | "抵押人姓名/名称": self.item, | ||
76 | "抵押人证件号码": self.item, | ||
77 | "车辆识别代码": self.item, | ||
78 | "租金总额": self.item, | ||
79 | "融资租赁期限": self.item, | ||
80 | "签字页-抵押人姓名": self.item, | ||
81 | "签字页-抵押人签章": self.item, | ||
82 | "签字页-抵押人配偶姓名": self.item, | ||
83 | "签字页-抵押人配偶签章": self.item, | ||
84 | } | ||
85 | |||
def get_contract_no(self, page_num):
    """Read the contract number that follows the ``合同编号:`` label on a page.

    Args:
        page_num (str): key of the page inside ``self.pdf_info``.

    Returns:
        dict: a copy of ``self.item``; ``words``, ``page`` and ``position``
        are filled in when the label (and a number) is found, otherwise the
        copy is returned unchanged.
    """
    contract_no = self.item.copy()
    # Only blocks whose 'type' is 0 carry 'lines'/'spans' that are scanned
    # (presumably text blocks of a PyMuPDF-style page dict - confirm).
    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info[page_num]['blocks']
             if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]

    for bbox, text in spans:
        if '合同编号:' in text:
            contract_no['position'] = bbox
            contract_no['page'] = page_num
            contract_no['words'] = text.split(':')[-1]

    if contract_no['words'] == '':
        # The label has no number in its own span. Look for a span holding
        # 'CH' that starts above the bottom edge of the label. The edge is
        # captured once, so a hit does not move the reference line for the
        # spans that follow it.
        label_bottom = contract_no['position'][3]
        for bbox, text in spans:
            if bbox[1] < label_bottom and 'CH' in text:
                contract_no['position'] = bbox
                contract_no['page'] = page_num
                contract_no['words'] = text
    return contract_no
120 | |||
def get_vehicle_price(self, page_num='0'):
    """Return the text following ``所购车辆价格为人民币`` on the given page.

    Args:
        page_num (str): key of the page inside ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item`` with ``words`` (text after the last
        ``币``) and ``position`` set from the last matching span; ``page``
        is not filled in.
    """
    price = self.item.copy()
    marker = '所购车辆价格为人民币'
    scanned_blocks = (blk for blk in self.pdf_info[page_num]['blocks']
                      if blk['type'] == 0)
    for blk in scanned_blocks:
        for row in blk['lines']:
            for piece in row['spans']:
                box, content = piece['bbox'], piece['text']
                if marker not in content:
                    continue
                price['position'] = box
                price['words'] = content.split('币')[-1]
    return price
134 | |||
135 | def get_contract_no_one(self): | ||
136 | # 查找正文中的合同编号,有可能存在换行的情况 | ||
137 | contract_no = self.item.copy() | ||
138 | for pno in self.pdf_info: | ||
139 | all_text = '' | ||
140 | for block in self.pdf_info[pno]['blocks']: | ||
141 | if block['type'] != 0: | ||
142 | continue | ||
143 | for line in block['lines']: | ||
144 | for span in line['spans']: | ||
145 | bbox, text = span['bbox'], span['text'] | ||
146 | all_text += text | ||
147 | all_text = all_text.replace(' ', '') | ||
148 | matchObj = re.search(r'(合同编号:\[(.*?)\])', all_text) | ||
149 | if matchObj: | ||
150 | words = matchObj.group(1) | ||
151 | contract_no['position'] = None | ||
152 | contract_no['page'] = pno | ||
153 | contract_no['words'] = words | ||
154 | return contract_no | ||
155 | |||
156 | matchObj = re.search(r'编号为(.*?)的', all_text) | ||
157 | if matchObj: | ||
158 | words = matchObj.group(1).strip() | ||
159 | contract_no['position'] = None | ||
160 | contract_no['page'] = pno | ||
161 | contract_no['words'] = words | ||
162 | return contract_no | ||
163 | |||
164 | matchObj = re.search(r'编号为(.*?))的', all_text) | ||
165 | if matchObj: | ||
166 | words = matchObj.group(1).strip() | ||
167 | contract_no['position'] = None | ||
168 | contract_no['page'] = pno | ||
169 | contract_no['words'] = words | ||
170 | return contract_no | ||
171 | |||
def get_key_value(self, key, page_num=None):
    """Return the value written after ``key`` in a span.

    Args:
        key (str): text that must occur in the span.
        page_num: page key to restrict the search to; with ``None`` every
            page of ``self.pdf_info`` is scanned.

    Returns:
        dict: copy of ``self.item`` filled from the last matching span;
        ``words`` is the span text after its last ``:``.
    """
    found = self.item.copy()
    pages = self.pdf_info if page_num is None else [page_num]
    for page in pages:
        for blk in self.pdf_info[page]['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                for piece in row['spans']:
                    box, content = piece['bbox'], piece['text']
                    if key not in content:
                        continue
                    found['position'] = box
                    found['page'] = page
                    found['words'] = content.split(':')[-1]
    return found
202 | |||
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from a page.

    Args:
        page_num (str): key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, four copies of
        ``self.item`` with ``words``/``position`` filled when found:
        ``upper`` - span that fuzzy-matches the Chinese numeral characters;
        ``lower`` - amount after ``小写:¥``;
        ``asp_1`` / ``asp_2`` - bracketed ``人民币:小写:[...]`` amounts located
        above / below the ``附加产品融资贷款本金总金额`` span.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    keyword_text = ''.join(chinese_keywords)
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()
    anchor_bbox = None
    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info[page_num]['blocks']
             if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]

    for bbox, text in spans:
        if fuzz.ratio(keyword_text, text) > 15:
            # NOTE: as in the original, the checks below see this trimmed
            # text when the span fuzzy-matches.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for bbox, text in spans:
            if '人民币:小写:' not in text:
                continue
            amount = re.search(r'人民币:小写:\[(.*)\]', text)
            if amount is None:
                # Marker without a bracketed amount: skip the span instead
                # of failing on an empty match list.
                continue
            center_y = np.mean(bbox[1::2])
            if center_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = amount.group(1)
            if center_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = amount.group(1)
    return upper, lower, asp_1, asp_2
243 | |||
def get_loan_term(self, page_num='0'):
    """Return the loan term (number of months) stated on a page.

    The page text is concatenated and searched for ``贷款期限<digits>个月``; the
    span that contains ``<digits>个月`` then supplies the position.

    Args:
        page_num (str): key of the page inside ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item``; ``words`` holds the digits and
        ``position`` the bbox of the last span containing them.
    """
    term = self.item.copy()
    pieces = [(piece['bbox'], piece['text'])
              for blk in self.pdf_info[page_num]['blocks']
              if blk['type'] == 0
              for row in blk['lines']
              for piece in row['spans']]
    joined = ''.join(content for _, content in pieces)
    hit = re.search(r'贷款期限(\d+)个月', joined)
    if not hit:
        return term
    months = hit.group(1)
    needle = f'{months}个月'
    for box, content in pieces:
        if needle in content:
            term['position'] = box
            term['words'] = months
    return term
267 | |||
def get_asp_details(self, page_num):
    """Collect the add-on product detail table from a page.

    Spans are gathered from the one equal to ``附加产品融资贷款本金总金额明细`` up to
    (not including) the first span containing ``第二条`` or ``征信管理``. The
    first gathered span becomes a one-cell title row, the rest are grouped
    three per row.

    Args:
        page_num (str): key of the page inside ``self.pdf_info``.

    Returns:
        dict: copy of ``self.item``; ``words`` holds the list of rows when
        at least one row was built.
    """
    details = self.item.copy()

    cells = []
    inside_table = False
    for blk in self.pdf_info[page_num]['blocks']:
        if blk['type'] != 0:
            continue
        for row in blk['lines']:
            for piece in row['spans']:
                box, content = piece['bbox'], piece['text']
                if content == '附加产品融资贷款本金总金额明细':
                    inside_table = True
                if '第二条' in content or '征信管理' in content:
                    inside_table = False
                if inside_table:
                    cells.append(content)

    # Row 0 is the title cell; row r (r >= 1) covers cells 3r-2 .. 3r.
    row_count = (len(cells) + 2) // 3
    rows = [cells[:1] if r == 0 else cells[3 * r - 2:3 * r + 1]
            for r in range(row_count)]

    if rows:
        details['words'] = rows
    return details
301 | |||
def get_signature(self):
    """Return the last span on page ``'0'`` that contains ``签署日期``.

    Returns:
        dict: copy of ``self.item`` with ``words`` (the full span text) and
        ``position`` set when such a span exists; ``page`` is not filled in.
    """
    result = self.item.copy()

    scanned_blocks = (blk for blk in self.pdf_info['0']['blocks']
                      if blk['type'] == 0)
    for blk in scanned_blocks:
        for row in blk['lines']:
            for piece in row['spans']:
                box, content = piece['bbox'], piece['text']
                if '签署日期' not in content:
                    continue
                result['words'] = content
                result['position'] = box
    return result
316 | |||
def get_somebody(self, top, bottom):
    """Return name and ID of the party listed between two marker texts.

    Works on page ``'1'`` only. The bottom edges (``bbox[3]``) of the last
    spans containing ``top`` and ``bottom`` delimit a vertical band; spans
    whose bottom edge lies strictly inside it are inspected.

    Args:
        top (str): text marking the upper boundary.
        bottom (str): text marking the lower boundary.

    Returns:
        tuple: ``(name, id)`` copies of ``self.item``; ``words`` is the span
        text after its last ``:`` and ``position`` its bbox.
    """
    person_name = self.item.copy()
    person_id = self.item.copy()
    pieces = [(piece['bbox'], piece['text'])
              for blk in self.pdf_info['1']['blocks']
              if blk['type'] == 0
              for row in blk['lines']
              for piece in row['spans']]

    # First pass: fix the band limits (both default to 0 when not found).
    band_start = 0
    band_end = 0
    for box, content in pieces:
        if top in content:
            band_start = box[3]
        if bottom in content:
            band_end = box[3]

    # Second pass: pick the labelled values inside the band.
    for box, content in pieces:
        if not (band_start < box[3] < band_end):
            continue
        if '姓名/名称' in content:
            person_name['position'] = box
            person_name['words'] = content.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in content:
            person_id['position'] = box
            person_id['words'] = content.split(':')[-1]
    return person_name, person_id
350 | |||
def get_seller(self):
    """Return the value printed to the right of the ``经销商`` label on page ``'1'``.

    The value is the last span whose horizontal centre lies between the
    label's right edge and half of the page ``width``, and whose vertical
    centre lies between the label's top and bottom edges.

    Returns:
        dict: copy of ``self.item`` with ``words``/``position`` set when a
        value span is found; ``page`` is not filled in.
    """
    seller = self.item.copy()
    pieces = [(piece['bbox'], piece['text'])
              for blk in self.pdf_info['1']['blocks']
              if blk['type'] == 0
              for row in blk['lines']
              for piece in row['spans']]

    # Locate the label first (last exact match wins).
    label_box = None
    for box, content in pieces:
        if content == '经销商':
            label_box = box

    # With a label in hand, match the value by geometry.
    if label_box:
        mid_page = self.pdf_info['1']['width'] * 0.5
        for box, content in pieces:
            in_columns = label_box[2] < np.mean(box[::2]) < mid_page
            if in_columns and label_box[1] < np.mean(box[1::2]) < label_box[3]:
                seller['position'] = box
                seller['words'] = content
    return seller
377 | |||
def get_payback_account(self):
    """Extract the repayment account details from page ``'1'``.

    Only the ticked ``☑账号`` layout is handled; for any other layout the
    three records are returned empty.

    Returns:
        tuple: ``(account, account_name, account_bank)`` copies of
        ``self.item`` with ``words``/``position`` filled when both the
        value and the span holding it are found.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()
    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info['1']['blocks']
             if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]
    all_text = ''.join(text for _, text in spans)

    # Decide which account layout is used; only the variant that is not
    # "to be notified separately" is output.
    if '☑账号' not in all_text:
        return account, account_name, account_bank
    all_text = all_text.replace(' ', '')

    def fill(target, pattern, needle_template):
        """Extract a value from the page text, then locate its span."""
        found = re.findall(pattern, all_text)
        if not found:
            return
        words = found[0]
        needle = needle_template.format(words)
        for bbox, text in spans:
            # The value comes from space-free text, so the span is compared
            # space-free too; a number printed as "6222 0000" is still found.
            if needle in text.replace(' ', ''):
                target['position'] = bbox
                target['words'] = words

    fill(account, r'账号:(.*)户名', '{}')
    fill(account_name, r'户名:(.*)开户行', '{}')
    fill(account_bank, r'开户行:(.*);', '开户行:{};')
    return account, account_name, account_bank
430 | |||
def get_repayment_schedule(self):
    """Build the payment schedule table from the whole document.

    Collection starts with the span after one containing ``61.`` and stops
    at a span containing ``以上表格中所列序号``. The collected cells are grouped
    four per row and each row is prefixed with a running number.

    Returns:
        dict: copy of ``self.item``; ``words`` is the table (header row
        first) and ``page`` the key of the last page where ``61.`` was seen.
    """
    schedule = self.item.copy()

    cells = []
    collecting = False
    start_page = None
    for pno in self.pdf_info:
        for blk in self.pdf_info[pno]['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                for piece in row['spans']:
                    box, content = piece['bbox'], piece['text']
                    if '以上表格中所列序号' in content:
                        collecting = False
                    if collecting:
                        cells.append(content)
                    # Checked last so the trigger span itself is not stored.
                    if '61.' in content:
                        start_page = pno
                        collecting = True

    columns = 4  # data columns per row, the serial number is generated
    table = [['序号', '融资租赁成本', '融资租赁费用', '租金', '剩余融资租赁成本']]
    for idx in range(len(cells) // columns):
        first = idx * columns
        table.append([f'{idx+1}.'] + cells[first:first + columns])

    schedule['words'] = table
    schedule['page'] = start_page
    return schedule
464 | |||
def get_signature_role_1(self):
    """Return the last span in the document that contains ``签署日期``.

    Returns:
        dict: copy of ``self.item`` with ``words`` (full span text),
        ``page`` and ``position`` set when such a span exists.
    """
    result = self.item.copy()
    for pno in self.pdf_info:
        scanned_blocks = (blk for blk in self.pdf_info[pno]['blocks']
                          if blk['type'] == 0)
        for blk in scanned_blocks:
            for row in blk['lines']:
                for piece in row['spans']:
                    box, content = piece['bbox'], piece['text']
                    if '签署日期' not in content:
                        continue
                    result['position'] = box
                    result['page'] = pno
                    result['words'] = content
    return result
479 | |||
def get_signature_role_2(self):
    """Report whether the co-borrower signature area contains a signature.

    Spans are collected from one containing ``共同借款人(共同抵押人)`` up to (not
    including) the next span containing ``日期``. More than four collected
    spans count as "signed".

    Returns:
        dict: copy of ``self.item`` with ``words`` set to ``'有'`` or
        ``'无'``, ``page`` (also mirrored under the legacy ``page_num`` key)
        set to the key of the last page that contributed a span, and
        ``position`` set to the ``[x0, y0, x1, y1]`` hull of the collected
        spans (``None`` when nothing was collected).
    """
    # The blank record is ``self.item``; ``self.init_item`` is never defined
    # on this class and raised AttributeError.
    signature_role_2 = self.item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Every bbox contributes two (x, y) corners; the hull is their
        # min/max. Skipped when empty, where min() would raise.
        corners = np.array(boxes).reshape((-1, 2))
        position = corners.min(axis=0).tolist() + corners.max(axis=0).tolist()
    signature_role_2['page'] = page_num
    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
514 | |||
def get_signature_role_3(self):
    """Report whether the guarantor-1 signature area contains a signature.

    Spans are collected from one containing ``保证人1`` (ignored on the page
    whose key converts to 0) up to, not including, the next span containing
    ``日期``. More than four collected spans count as "signed".

    Returns:
        dict: copy of ``self.item`` with ``words`` set to ``'有'`` or
        ``'无'``, ``page`` (also mirrored under the legacy ``page_num`` key)
        set to the key of the last page that contributed a span, and
        ``position`` set to the ``[x0, y0, x1, y1]`` hull of the collected
        spans (``None`` when nothing was collected).
    """
    # The blank record is ``self.item``; ``self.init_item`` is never defined
    # on this class and raised AttributeError.
    signature_role_3 = self.item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(i) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Every bbox contributes two (x, y) corners; the hull is their
        # min/max. Skipped when empty, where min() would raise.
        corners = np.array(boxes).reshape((-1, 2))
        position = corners.min(axis=0).tolist() + corners.max(axis=0).tolist()
    signature_role_3['page'] = page_num
    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
549 | |||
def get_signature_role_4(self):
    """Report whether the guarantor-2 signature area contains a signature.

    Spans are collected from one containing ``保证人2`` (ignored on the page
    whose key converts to 0) up to, not including, the next span containing
    ``日期``. More than four collected spans count as "signed".

    Returns:
        dict: copy of ``self.item`` with ``words`` set to ``'有'`` or
        ``'无'``, ``page`` (also mirrored under the legacy ``page_num`` key)
        set to the key of the last page that contributed a span, and
        ``position`` set to the ``[x0, y0, x1, y1]`` hull of the collected
        spans (``None`` when nothing was collected).
    """
    # The blank record is ``self.item``; ``self.init_item`` is never defined
    # on this class and raised AttributeError.
    signature_role_4 = self.item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(i) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Every bbox contributes two (x, y) corners; the hull is their
        # min/max. Skipped when empty, where min() would raise.
        corners = np.array(boxes).reshape((-1, 2))
        position = corners.min(axis=0).tolist() + corners.max(axis=0).tolist()
    signature_role_4['page'] = page_num
    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
584 | |||
def get_signature_role_5(self):
    """Report whether the witness signature area contains a signature.

    Spans are collected from one containing ``见证人签字`` (ignored on the page
    whose key converts to 0) up to, not including, the next span containing
    ``年``. More than four collected spans count as "signed".

    Returns:
        dict: copy of ``self.item`` with ``words`` set to ``'有'`` or
        ``'无'``, ``page`` (also mirrored under the legacy ``page_num`` key)
        set to the key of the last page that contributed a span, and
        ``position`` set to the ``[x0, y0, x1, y1]`` hull of the collected
        spans (``None`` when nothing was collected).
    """
    # The blank record is ``self.item``; ``self.init_item`` is never defined
    # on this class and raised AttributeError.
    signature_role_5 = self.item.copy()
    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for i in self.pdf_info:
        for block in self.pdf_info[i]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(i) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = i
                        texts.append(text)
                        boxes.append(bbox)
    words = '有' if len(texts) > 4 else '无'
    position = None
    if boxes:
        # Every bbox contributes two (x, y) corners; the hull is their
        # min/max. Skipped when empty, where min() would raise.
        corners = np.array(boxes).reshape((-1, 2))
        position = corners.min(axis=0).tolist() + corners.max(axis=0).tolist()
    signature_role_5['page'] = page_num
    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
620 | |||
def get_last_page_signature(self, page_num, top, bottom):
    """Read signer name and signing date between two marker texts on a page.

    The top edges (``bbox[1]``) of the last spans containing ``top`` and
    ``bottom`` delimit a vertical band. Within it, a span containing
    ``签署日期`` provides the name (text before the first space) and the
    date (text after the last ``:``).

    Args:
        page_num (str): key of the page inside ``self.pdf_info``.
        top (str): text marking the upper boundary.
        bottom (str): text marking the lower boundary.

    Returns:
        tuple: ``(signature_name, signature_date)`` copies of ``self.item``
        with ``words`` and ``position`` filled when found; both are left
        empty when either marker is missing.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()
    spans = [(span['bbox'], span['text'])
             for block in self.pdf_info[page_num]['blocks']
             if block['type'] == 0
             for line in block['lines']
             for span in line['spans']]

    anchor_top = None
    anchor_bottom = None
    for bbox, text in spans:
        if top in text:
            anchor_top = bbox[1]
        if bottom in text:
            anchor_bottom = bbox[1]
    if anchor_top is None or anchor_bottom is None:
        return signature_name, signature_date

    for bbox, text in spans:
        if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
            signature_name['words'] = text.split(' ')[0]
            signature_name['position'] = bbox
            signature_date['words'] = text.split(':')[-1]
            # The date shares the span with the name, so it gets the same
            # box. It was previously never set: the name position was
            # assigned twice instead.
            signature_date['position'] = bbox
    return signature_name, signature_date
651 | |||
def get_electronic_signature(self, top, bottom):
    """Return the ``签署日期`` span located between two marker texts.

    Markers and candidates are searched over every page. The top edges
    (``bbox[1]``) of the last spans containing ``top`` / ``bottom`` form a
    vertical band, and the last ``签署日期`` span whose vertical centre lies
    inside it is returned.

    Args:
        top (str): text marking the upper boundary.
        bottom (str): text marking the lower boundary.

    Returns:
        dict: copy of ``self.item`` with ``words`` (full span text),
        ``page`` and ``position`` set when a span qualifies.
    """
    signature = self.item.copy()
    pieces = [(pno, piece['bbox'], piece['text'])
              for pno in self.pdf_info
              for blk in self.pdf_info[pno]['blocks']
              if blk['type'] == 0
              for row in blk['lines']
              for piece in row['spans']]

    band_start = None
    band_end = None
    for _, box, content in pieces:
        if top in content:
            band_start = box[1]
        if bottom in content:
            band_end = box[1]

    if band_start is None or band_end is None:
        return signature

    for pno, box, content in pieces:
        if '签署日期' not in content:
            continue
        if int(band_start) < np.mean(box[1::2]) < int(band_end):
            signature['words'] = content
            signature['page'] = pno
            signature['position'] = box
    return signature
681 | |||
def get_role_info(self, role_key, page_num='0'):
    """Return name, ID number and representative of one party on a page.

    The top-left corner of the last span starting with ``保证人3`` is used as
    anchor. Each of the four known roles owns one quadrant around it; the
    ``证件号码:`` / ``法定代表人或授权代表:`` spans whose centre falls strictly inside
    the role's quadrant belong to that role. Without the anchor nothing is
    extracted.

    Args:
        role_key (str): pattern passed to ``re.match`` to find the name
            span, e.g. ``'承租人:'``.
        page_num (str): key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(name, id_num, representative)`` copies of ``self.item``;
        ``words`` is the span text after its last ``:``.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()
    pieces = [(piece['bbox'], piece['text'])
              for blk in self.pdf_info[page_num]['blocks']
              if blk['type'] == 0
              for row in blk['lines']
              for piece in row['spans']]

    # The top-left corner of guarantor 3 is the reference point.
    anchor = None
    for box, content in pieces:
        if re.match('保证人3', content) is not None:
            anchor = [box[0], box[1]]
    if anchor is None:
        return name, id_num, representative

    # role -> (centre is right of anchor, centre is below anchor)
    quadrants = {
        '承租人:': (False, False),
        '保证人1:': (False, True),
        '保证人2:': (True, False),
        '保证人3:': (True, True),
    }
    quadrant = quadrants.get(role_key)

    for box, content in pieces:
        # The party's own name line.
        if re.match(role_key, content) is not None:
            name['words'] = content.split(':')[-1]
            name['page'] = page_num
            name['position'] = box
        if quadrant is None:
            continue
        holds_id = re.match('证件号码:', content) is not None
        holds_rep = re.match('法定代表人或授权代表:', content) is not None
        if not (holds_id or holds_rep):
            continue
        want_right, want_below = quadrant
        center_x = np.mean(box[::2])
        center_y = np.mean(box[1::2])
        x_ok = center_x > anchor[0] if want_right else center_x < anchor[0]
        y_ok = center_y > anchor[1] if want_below else center_y < anchor[1]
        if not (x_ok and y_ok):
            continue
        value = content.split(':')[-1]
        if holds_id:
            id_num['words'] = value
            id_num['page'] = page_num
            id_num['position'] = box
        if holds_rep:
            representative['words'] = value
            representative['page'] = page_num
            representative['position'] = box
    return name, id_num, representative
765 | |||
def get_table_add_product(self):
    """Build the add-on product table from the whole document.

    Spans are collected from one containing ``总计`` up to (not including)
    one containing ``注:出租人向承租人购买租赁车辆的对价``. The first two collected
    cells are the total label and the total amount; the remaining cells are
    grouped three per row and the total is appended as the last row.

    Returns:
        dict: copy of ``self.item``; ``words`` is the table (header row
        first), ``page`` the key of the page holding the closing note and
        ``position`` is ``None``.
    """
    table_add_product = self.item.copy()
    items = []
    start = False
    page = None
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    text = span['text']
                    if '总计' in text:
                        start = True
                    if '注:出租人向承租人购买租赁车辆的对价' in text:
                        page = pno
                        start = False
                    if start:
                        items.append(text)

    lines = [['项目', '购买价格', '实际融资金额']]
    # Product cells follow the two total cells. Only complete triples become
    # rows; counting rows over the full list read past its end whenever the
    # cell count was not 2 + 3k.
    product_cells = items[2:]
    for i in range(len(product_cells) // 3):
        lines.append(product_cells[i * 3:i * 3 + 3])

    if items:
        # Total row; the amount cell may be missing on a truncated table.
        total_amount = items[1] if len(items) > 1 else ''
        lines.append([items[0], '', total_amount])

    table_add_product['words'] = lines
    table_add_product['page'] = page
    table_add_product['position'] = None
    return table_add_product
799 | |||
def get_contract_no_dy(self):
    """Locate the mortgage contract number printed next to its label.

    The label is the last text span (over all pages) containing
    '抵押合同编号'. The value is the last span on that same page that
    contains 'CH-' and whose vertical centre lies strictly inside the
    label's vertical extent. The search is limited to the label's page so
    that an unrelated span at the same height on another page cannot be
    picked up.

    Assumes each bbox is (x0, y0, x1, y1), so ``bbox[1::2]`` are the two
    vertical coordinates.

    Returns:
        A copy of ``self.item`` with ``words``/``page``/``position`` filled
        in when a match exists, otherwise the unmodified copy.
    """
    contract_no = self.item.copy()

    key_box = None
    key_page = None
    for pno, page_info in self.pdf_info.items():
        for block in page_info['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if '抵押合同编号' in span['text']:
                        key_box, key_page = span['bbox'], pno

    if key_box is None:
        return contract_no

    for block in self.pdf_info[key_page]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                same_row = key_box[1] < np.mean(bbox[1::2]) < key_box[3]
                if same_row and 'CH-' in text:
                    contract_no['position'] = bbox
                    contract_no['page'] = key_page
                    contract_no['words'] = text
    return contract_no
828 | |||
def get_dyr_name_id(self):
    """Locate the mortgagor's name and ID number below the '抵押人' label.

    The label is the last text span (over all pages) whose text is exactly
    '抵押人'. On that same page, spans whose vertical centre lies between
    the label's top and three label-heights below its bottom are examined:
    one containing '姓名' supplies the name, one containing '证件号码'
    supplies the ID number. In both cases the value is the text after the
    last ':'; the last matching span wins. The search is limited to the
    label's page so that spans at the same height on other pages are not
    matched.

    Assumes each bbox is (x0, y0, x1, y1).

    Returns:
        tuple: ``(name, _id)``, each a copy of ``self.item`` that is filled
        in only when a match exists.
    """
    name = self.item.copy()
    _id = self.item.copy()

    key_box = None
    key_page = None
    for pno, page_info in self.pdf_info.items():
        for block in page_info['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if span['text'] == '抵押人':
                        key_box, key_page = span['bbox'], pno

    if key_box is None:
        return name, _id

    row_height = abs(key_box[1] - key_box[3])
    top, bottom = key_box[1], key_box[3] + row_height * 3
    for block in self.pdf_info[key_page]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if not top < np.mean(bbox[1::2]) < bottom:
                    continue
                if '姓名' in text:
                    name['position'] = bbox
                    name['page'] = key_page
                    name['words'] = text.split(':')[-1]
                if '证件号码' in text:
                    _id['position'] = bbox
                    _id['page'] = key_page
                    _id['words'] = text.split(':')[-1]
    return name, _id
864 | |||
def get_key_value_position(self, key):
    """Find the value printed to the right of a label, by geometry.

    The label is the last text span (over all pages) whose text equals
    ``key``. A span on that same page is accepted as the value when its
    vertical centre lies strictly inside the label's vertical extent, its
    left edge is to the right of the label's left edge, and the gap between
    the label's right edge and the span's left edge is smaller than ten
    label-heights. The last accepted span wins. The search is limited to
    the label's page so that spans at the same position on other pages are
    not matched.

    Assumes each bbox is (x0, y0, x1, y1).

    Args:
        key: Exact text of the label span.

    Returns:
        A copy of ``self.item`` with ``words``/``page``/``position`` filled
        in when a value is found, otherwise the unmodified copy.
    """
    value = self.item.copy()

    key_box = None
    key_page = None
    for pno, page_info in self.pdf_info.items():
        for block in page_info['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if span['text'] == key:
                        key_box, key_page = span['bbox'], pno

    if key_box is None:
        return value

    row_height = abs(key_box[1] - key_box[3])
    for block in self.pdf_info[key_page]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                same_row = key_box[1] < np.mean(bbox[1::2]) < key_box[3]
                right_of_key = key_box[0] < bbox[0]
                close_enough = abs(key_box[2] - bbox[0]) < row_height * 10
                if same_row and right_of_key and close_enough:
                    value['position'] = bbox
                    value['page'] = key_page
                    value['words'] = text
    return value
894 | |||
def get_info(self):
    """Fill ``self.init_result`` with the fields of the main contract.

    Nothing is extracted when ``self.pdf_info`` is empty. Otherwise the
    individual ``get_*`` helpers are called in a fixed order and their
    return values are stored under the corresponding result keys.

    Returns:
        dict: ``self.init_result`` (the same object, updated in place).
    """
    result = self.init_result
    if len(self.pdf_info) == 0:
        return result

    # Contract number printed on the first page.
    result['合同编号'] = self.get_contract_no(page_num='0')

    # Name, ID number and representative of the lessee and the three
    # guarantors, all read from the first page.
    for role in ('承租人', '保证人1', '保证人2', '保证人3'):
        name, id_num, representative = self.get_role_info(role_key=role + ':', page_num='0')
        result[role + '-姓名'] = name
        result[role + '-证件号码'] = id_num
        result[role + '-法定代表人或授权代表'] = representative

    # Contract number inside the body text; searched on all pages and, since
    # it may wrap across lines, reported without a position for now.
    result['合同编号(正文)'] = self.get_contract_no_one()

    # Vehicle identification number, dealer and original sales price.
    for label in ('车辆识别代码',
                  '车辆卖方(经销商)',
                  '车辆原始销售价格(《机动车销售统一发票》所列金额)'):
        result[label] = self.get_key_value(key=label + ':')

    # Table of add-on products.
    result['车辆附加产品明细表'] = self.get_table_add_product()

    # Total financing cost and lease term.
    for label in ('融资成本总额', '租期'):
        result[label] = self.get_key_value(key=label + ':')

    # Repayment schedule table.
    result['付款计划表'] = self.get_repayment_schedule()

    # Bank account: holder name, account number and bank.
    bank_fields = (
        ('银行账户-户名', '户名:'),
        ('银行账户-银行账号', '银行账号:'),
        ('银行账户-开户行', '开户银行:'),
    )
    for field, label in bank_fields:
        result[field] = self.get_key_value(key=label)

    # Signature page: each signer's printed name plus the electronic
    # signature found between that signer's label and the next label.
    signers = ('承租人', '保证人1', '保证人2', '保证人3')
    lower_bounds = ('保证人1姓名:', '保证人2姓名:', '保证人3姓名:', '日期:')
    for signer, lower in zip(signers, lower_bounds):
        upper = signer + '姓名:'
        name = self.get_key_value(key=upper)
        electronic_signature = self.get_electronic_signature(top=upper, bottom=lower)
        result['签字页-' + signer + '姓名'] = name
        result['签字页-' + signer + '签章'] = electronic_signature

    return result
977 | |||
978 | # results['is_shhz_contract'] = True | ||
979 | # results['pdf_info'] = self.init_result | ||
980 | |||
981 | # return results | ||
982 | |||
def get_info_1(self):
    """Fill ``self.init_result_1`` with the fields read by this extractor.

    Nothing is extracted when ``self.pdf_info` is empty.

    Returns:
        dict: ``self.init_result_1`` (the same object, updated in place).
    """
    result = self.init_result_1
    if len(self.pdf_info) == 0:
        return result

    # Contract number on the first page.
    result['合同编号'] = self.get_contract_no(page_num='0')

    # Lessee name, lessee ID number and selling dealer, all looked up on the
    # first page only.
    first_page_fields = (
        ('承租人-姓名', '承租人:'),
        ('承租人-证件号码', '证件号码:'),
        ('销售经销商', '销售经销商:'),
    )
    for field, label in first_page_fields:
        result[field] = self.get_key_value(key=label, page_num='0')

    # Contract number inside the body text.
    result['合同编号(正文)'] = self.get_contract_no_one()

    # Signature page: lessee name, lessee ID number, lessee signature and
    # selling dealer (looked up without a page restriction).
    result['签字页-承租人姓名'] = self.get_key_value(key='姓名/名称:')
    result['签字页-承租人证件号码'] = self.get_key_value(key='自然人身份证件号码/法人执照号码:')
    result['签字页-承租人签章'] = self.get_signature_role_1()
    result['签字页-销售经销商'] = self.get_key_value(key='销售经销商:')

    # The dealer's signature is not extracted.
    return result
1014 | |||
def get_info_2(self):
    """Fill ``self.init_result_2`` with the fields of the mortgage contract.

    Nothing is extracted when ``self.pdf_info`` is empty.

    Returns:
        dict: ``self.init_result_2`` (the same object, updated in place).
    """
    result = self.init_result_2
    if len(self.pdf_info) == 0:
        return result

    # Mortgage contract number and the contract number in the body text.
    result['合同编号'] = self.get_contract_no_dy()
    result['合同编号(正文)'] = self.get_contract_no_one()

    # Mortgagor name and ID number.
    mortgagor_name, mortgagor_id = self.get_dyr_name_id()
    result['抵押人姓名/名称'] = mortgagor_name
    result['抵押人证件号码'] = mortgagor_id

    # Vehicle identification number.
    result['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')

    # Total rent and lease period, located by the position of their labels.
    for label in ('租金总额', '融资租赁期限'):
        result[label] = self.get_key_value_position(key=label)

    # Signature page: printed name and electronic signature of the
    # mortgagor, then of the mortgagor's spouse.
    signature_blocks = (
        ('抵押人', '抵押人姓名:', '抵押权人盖章', '抵押人配偶姓名:'),
        ('抵押人配偶', '抵押人配偶姓名:', '抵押人配偶姓名:', '日期'),
    )
    for signer, name_label, upper, lower in signature_blocks:
        name = self.get_key_value(key=name_label)
        electronic_signature = self.get_electronic_signature(top=upper, bottom=lower)
        result['签字页-' + signer + '姓名'] = name
        result['签字页-' + signer + '签章'] = electronic_signature

    return result
1 | # -*- coding: utf-8 -*- | ||
2 | # @Author : lk | ||
3 | # @Email : 9428.al@gmail.com | ||
4 | # @Created Date : 2021-06-29 17:43:46 | ||
5 | # @Last Modified : 2021-11-03 16:07:36 | ||
6 | # @Description : | ||
7 | |||
8 | from .get_char import Finder | ||
9 | |||
10 | |||
def predict(pdf_info, file_cls):
    """Extract the structured fields of one HIL e-contract document.

    Args:
        pdf_info (dict): Per-page text structure keyed by the page index as
            a string ('0', '1', ...); each value has a 'blocks' list.
        file_cls (int): Document type. 0: sale-and-leaseback contract;
            1: vehicle disposal agreement; 2: vehicle lease mortgage
            contract.

    Returns:
        dict: The result mapping produced by the matching ``Finder``
        extractor. Every entry whose ``page`` is not ``None`` has it
        rewritten to the 1-based label ``'page_<n>'``.

    Raises:
        ValueError: If ``file_cls`` is not 0, 1 or 2.
    """
    if file_cls not in (0, 1, 2):
        raise ValueError('unsupported file_cls: {!r}'.format(file_cls))

    # When a disposal agreement is requested and the input is a combined
    # document (it also contains sale-and-leaseback pages and exactly four
    # pages carry the appendix marker), only those four pages are handed to
    # the Finder, renumbered from '0'. Their original page keys are kept so
    # that the reported pages refer to the input document.
    original_pages = None
    if file_cls == 1:
        agreement_pages = _pages_containing(pdf_info, '售后回租合同附件一')
        if len(agreement_pages) == 4 and _pages_containing(pdf_info, '售后回租合同_'):
            original_pages = agreement_pages
            pdf_info = {
                str(index): pdf_info[pno]
                for index, pno in enumerate(agreement_pages)
            }

    f = Finder(pdf_info)
    if file_cls == 0:
        results = f.get_info()
    elif file_cls == 1:
        # Vehicle disposal agreement.
        results = f.get_info_1()
    else:
        # Vehicle lease mortgage contract.
        results = f.get_info_2()

    for field in results.values():
        if field['page'] is None:
            continue
        page_index = int(field['page'])
        if original_pages is not None:
            # Map the renumbered index back to the page of the input file
            # instead of assuming the agreement starts at a fixed offset.
            page_index = int(original_pages[page_index])
        field['page'] = 'page_' + str(page_index + 1)
    return results


def _pages_containing(pdf_info, marker):
    """Return the keys of pages having a type-0 block span that contains marker.

    Keys are returned in the iteration order of ``pdf_info`` and each page is
    listed at most once, however many of its spans contain the marker.
    """
    return [
        pno
        for pno, page in pdf_info.items()
        if any(
            marker in span['text']
            for block in page['blocks']
            if block['type'] == 0
            for line in block['lines']
            for span in line['spans']
        )
    ]
src/common/tools/mssql_script10.py
0 → 100644
import pyodbc

# DDL for the AFC contract table and its application_id index.
afc_sql = """
    create table afc_contract
    (
        id bigint identity primary key,
        application_id nvarchar(64) not null,
        create_time datetime not null
    );

    create index afc_contract_application_id_index
        on afc_contract (application_id);
"""

# DDL for the HIL contract table and its application_id index.
hil_sql = """
    create table hil_contract
    (
        id bigint identity primary key,
        application_id nvarchar(64) not null,
        create_time datetime not null
    );

    create index hil_contract_application_id_index
        on hil_contract (application_id);
"""


def _execute_ddl(connection_string, sql):
    """Run one SQL batch on a fresh autocommit connection.

    The cursor and the connection are closed even when ``execute`` raises,
    so a failing statement does not leave a connection open.
    """
    cnxn = pyodbc.connect(connection_string, autocommit=True)
    try:
        cursor = cnxn.cursor()
        try:
            cursor.execute(sql)
        finally:
            cursor.close()
    finally:
        cnxn.close()


# Same order as before: HIL table first, then AFC table.
_execute_ddl('DRIVER={ODBC Driver 17 for SQL Server};', hil_sql)
_execute_ddl('DRIVER={ODBC Driver 17 for SQL Server};', afc_sql)
1 | import os | 1 | import os |
2 | import json | ||
2 | import cv2 | 3 | import cv2 |
3 | import shutil | 4 | import shutil |
4 | import fitz | 5 | import fitz |
... | @@ -35,6 +36,8 @@ class PDFHandler: | ... | @@ -35,6 +36,8 @@ class PDFHandler: |
35 | self.suffix = self.get_suffix(document_name) | 36 | self.suffix = self.get_suffix(document_name) |
36 | self.is_ebank = False | 37 | self.is_ebank = False |
37 | self.page_text_list = [] | 38 | self.page_text_list = [] |
39 | self.pdf_info = {} | ||
40 | self.img_path_pno_list = [] | ||
38 | 41 | ||
39 | def get_suffix(self, file_name): | 42 | def get_suffix(self, file_name): |
40 | if file_name is None: | 43 | if file_name is None: |
... | @@ -296,6 +299,17 @@ class PDFHandler: | ... | @@ -296,6 +299,17 @@ class PDFHandler: |
296 | self.is_ebank = True | 299 | self.is_ebank = True |
297 | self.page_text_list = page_text_list | 300 | self.page_text_list = page_text_list |
298 | 301 | ||
def e_contract_process(self):
    """Read the text structure of every page and render each page to PNG.

    For each page of the document at ``self.path``:
      * the page's JSON text output is parsed and stored in
        ``self.pdf_info`` under the zero-based page index as a string;
      * the page is rendered to a pixmap and written to the path given by
        ``self.get_img_save_path``;
      * ``(image path, 'page_<n>')`` with a one-based ``n`` is appended to
        ``self.img_path_pno_list``.
    """
    with fitz.Document(self.path) as document:
        for index in range(document.pageCount):
            current_page = document.loadPage(index)
            page_json = current_page.getText('json')
            self.pdf_info[str(index)] = json.loads(page_json)

            pixmap = current_page.getPixmap()
            target_path = self.get_img_save_path(current_page.number)
            page_label = 'page_{0}'.format(index + 1)
            self.img_path_pno_list.append((target_path, page_label))
            pixmap.writePNG(target_path)
312 | |||
299 | def extract_image(self, max_img_count=None): | 313 | def extract_image(self, max_img_count=None): |
300 | self.img_path_list = [] | 314 | self.img_path_list = [] |
301 | self.xref_set = set() | 315 | self.xref_set = set() | ... | ... |
... | @@ -12,4 +12,6 @@ EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/Dow | ... | @@ -12,4 +12,6 @@ EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/Dow |
12 | EDMS_UPLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/UploadHandler.ashx | 12 | EDMS_UPLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/UploadHandler.ashx |
13 | DEALER_CODE = ocr_situ_group | 13 | DEALER_CODE = ocr_situ_group |
14 | 14 | ||
15 | BASE_URL = https://staging-bmw-ocr.situdata.com | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
15 | BASE_URL = https://staging-bmw-ocr.situdata.com | ||
16 | |||
17 | DELAY_SECONDS = 60 | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
... | @@ -12,4 +12,6 @@ EDMS_DOWNLOAD_URL = http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/ | ... | @@ -12,4 +12,6 @@ EDMS_DOWNLOAD_URL = http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/ |
12 | EDMS_UPLOAD_URL = http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/UploadHandler.ashx | 12 | EDMS_UPLOAD_URL = http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/UploadHandler.ashx |
13 | DEALER_CODE = ocr_situ_group | 13 | DEALER_CODE = ocr_situ_group |
14 | 14 | ||
15 | BASE_URL = https://li19dkocruat01vm.bmwgroup.net | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
15 | BASE_URL = https://li19dkocruat01vm.bmwgroup.net | ||
16 | |||
17 | DELAY_SECONDS = 60 | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
-
Please register or sign in to post a comment