f77b2322 by 周伟奇

e-contract part 1

1 parent cc6c63c8
...@@ -1773,3 +1773,21 @@ APPLICANT_TYPE_MAP = { ...@@ -1773,3 +1773,21 @@ APPLICANT_TYPE_MAP = {
1773 } 1773 }
1774 1774
1775 APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] 1775 APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager']
1776
# ECM file-name prefixes of electronically signed documents, grouped by
# business prefix. Each entry is ((classify_1, classify_2), name_template);
# the '{0}' placeholder is filled with str.format by the code that searches
# ECM. A classify value of 0 appears to act as "no second classify" --
# NOTE(review): confirm against the queue consumer.
FILE_NAME_PREFIX_MAP = {
    AFC_PREFIX: [
        ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'),
        ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
    ],
    HIL_PREFIX: [
        ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'),
        ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'),
        ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
    ],
}
1788
# Translates a HIL contract classify code (stringified) into the integer
# file type handed to the HIL contract predictor.
HIL_CONTRACT_TYPE_MAP = dict(
    (str(classify), file_type)
    for classify, file_type in (
        (HIL_CONTRACT_1_CLASSIFY, 0),
        (HIL_CONTRACT_2_CLASSIFY, 2),
        (HIL_CONTRACT_3_CLASSIFY, 1),
    )
)
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -18,6 +18,8 @@ from settings import conf ...@@ -18,6 +18,8 @@ from settings import conf
18 from common.mixins import LoggerMixin 18 from common.mixins import LoggerMixin
19 from common.tools.file_tools import write_zip_file 19 from common.tools.file_tools import write_zip_file
20 from common.tools.pdf_to_img import PDFHandler 20 from common.tools.pdf_to_img import PDFHandler
21 from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict
22 from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict
21 from apps.doc import consts 23 from apps.doc import consts
22 # from apps.doc.ocr.edms import EDMS, rh 24 # from apps.doc.ocr.edms import EDMS, rh
23 from apps.doc.ocr.ecm import ECM, rh 25 from apps.doc.ocr.ecm import ECM, rh
...@@ -47,6 +49,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -47,6 +49,7 @@ class Command(BaseCommand, LoggerMixin):
47 def __init__(self): 49 def __init__(self):
48 super().__init__() 50 super().__init__()
49 self.log_base = '[doc ocr process]' 51 self.log_base = '[doc ocr process]'
52 self.e_log_base = '[e-contract ocr process]'
50 # 处理文件开关 53 # 处理文件开关
51 self.switch = True 54 self.switch = True
52 # 睡眠时间 55 # 睡眠时间
...@@ -90,13 +93,20 @@ class Command(BaseCommand, LoggerMixin): ...@@ -90,13 +93,20 @@ class Command(BaseCommand, LoggerMixin):
90 task_str, is_priority = rh.dequeue() 93 task_str, is_priority = rh.dequeue()
91 if task_str is None: 94 if task_str is None:
92 self.online_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) 95 self.online_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
93 return None, None, None 96 return None, None, None, None, None
94 97
95 self.online_log.info('{0} [get_doc_info] [task={1}] [is_priority={2}]'.format( 98 self.online_log.info('{0} [get_doc_info] [task={1}] [is_priority={2}]'.format(
96 self.log_base, task_str, is_priority)) 99 self.log_base, task_str, is_priority))
97 try: 100 try:
98 # doc, business_type = self.get_doc_object(task_str) 101 # doc, business_type = self.get_doc_object(task_str)
99 business_type, doc_id_str = task_str.split(consts.SPLIT_STR) 102 info_tuple = task_str.split(consts.SPLIT_STR)
103 if len(info_tuple) == 2:
104 business_type, doc_id_str = info_tuple
105 classify_1_str = classify_2_str = '0'
106 rebuild_task_str = task_str
107 else:
108 business_type, doc_id_str, classify_1_str, classify_2_str = info_tuple
109 rebuild_task_str = '{0}{1}{2}'.format(business_type, consts.SPLIT_STR, doc_id_str)
100 doc_id = int(doc_id_str) 110 doc_id = int(doc_id_str)
101 doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc 111 doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
102 doc = doc_class.objects.filter(id=doc_id).first() 112 doc = doc_class.objects.filter(id=doc_id).first()
...@@ -104,11 +114,11 @@ class Command(BaseCommand, LoggerMixin): ...@@ -104,11 +114,11 @@ class Command(BaseCommand, LoggerMixin):
104 if doc is None: 114 if doc is None:
105 self.online_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format( 115 self.online_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
106 self.log_base, task_str, is_priority)) 116 self.log_base, task_str, is_priority))
107 return None, None, None 117 return None, None, None, None, None
108 elif doc.status != DocStatus.INIT.value: 118 elif doc.status != DocStatus.INIT.value:
109 self.online_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] ' 119 self.online_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
110 '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status)) 120 '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status))
111 return None, None, None 121 return None, None, None, None, None
112 doc.status = DocStatus.PROCESSING.value 122 doc.status = DocStatus.PROCESSING.value
113 doc.start_time = timezone.now() 123 doc.start_time = timezone.now()
114 doc.save() 124 doc.save()
...@@ -120,7 +130,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -120,7 +130,7 @@ class Command(BaseCommand, LoggerMixin):
120 else: 130 else:
121 self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format( 131 self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format(
122 self.log_base, task_str, is_priority)) 132 self.log_base, task_str, is_priority))
123 return doc, business_type, task_str 133 return doc, business_type, rebuild_task_str, classify_1_str, classify_2_str
124 134
125 # def pdf_download(self, doc, pdf_path): 135 # def pdf_download(self, doc, pdf_path):
126 # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): 136 # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
...@@ -212,7 +222,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -212,7 +222,7 @@ class Command(BaseCommand, LoggerMixin):
212 222
213 def contract_process(self, classify, ocr_data, contract_result, res_list, pno, ino, part_idx, img_path): 223 def contract_process(self, classify, ocr_data, contract_result, res_list, pno, ino, part_idx, img_path):
214 contract_dict = ocr_data.get('data') 224 contract_dict = ocr_data.get('data')
215 if not contract_dict or contract_dict.get('page_num') is None or contract_dict.get('page_info') is None: 225 if not contract_dict or contract_dict.get('page_num') is None or contract_dict.get('page_info') is None:
216 res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY)) 226 res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY))
217 return 227 return
218 res_list.append((pno, ino, part_idx, consts.RES_SUCCESS)) 228 res_list.append((pno, ino, part_idx, consts.RES_SUCCESS))
...@@ -915,11 +925,11 @@ class Command(BaseCommand, LoggerMixin): ...@@ -915,11 +925,11 @@ class Command(BaseCommand, LoggerMixin):
915 # summary['confidence'] = max(summary['confidence']) 925 # summary['confidence'] = max(summary['confidence'])
916 return merged_bs_summary 926 return merged_bs_summary
917 927
918 def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock, error_list): 928 def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock, error_list, res_dict, finish_queue):
919 while self.switch: 929 while self.switch:
920 try: 930 try:
921 # 1. 从队列获取文件信息 931 # 1. 从队列获取文件信息
922 doc, business_type, task_str = self.get_doc_info() 932 doc, business_type, task_str, classify_1_str, classify_2_str = self.get_doc_info()
923 # 队列为空时的处理 933 # 队列为空时的处理
924 if doc is None: 934 if doc is None:
925 time.sleep(self.sleep_time_doc_get) 935 time.sleep(self.sleep_time_doc_get)
...@@ -930,55 +940,109 @@ class Command(BaseCommand, LoggerMixin): ...@@ -930,55 +940,109 @@ class Command(BaseCommand, LoggerMixin):
930 error_list.append(1) 940 error_list.append(1)
931 return 941 return
932 else: 942 else:
933 try: 943 doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id))
934 # 2. 从EDMS获取PDF文件 944 os.makedirs(doc_data_path, exist_ok=True)
935 doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) 945 img_save_path = os.path.join(doc_data_path, 'img')
936 os.makedirs(doc_data_path, exist_ok=True) 946 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
937 img_save_path = os.path.join(doc_data_path, 'img')
938 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
939
940 pdf_handler = PDFHandler(pdf_path, img_save_path, doc.document_name)
941 max_count_obj = Configs.objects.filter(id=2).first()
942 try:
943 max_img_count = int(max_count_obj.value)
944 except Exception as e:
945 max_img_count = 500
946 947
947 for times in range(consts.RETRY_TIMES): 948 pdf_handler = PDFHandler(pdf_path, img_save_path, doc.document_name)
948 try:
949 if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
950 # self.edms.download(pdf_path, doc.metadata_version_id)
951 self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme, business_type)
952 self.online_log.info('{0} [edms download success] [task={1}] [times={2}] '
953 '[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path))
954 949
955 # 3.PDF文件提取图片 950 if classify_1_str == '0' or classify_1_str == str(consts.HMH_CLASSIFY):
956 self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( 951 try:
957 self.log_base, task_str, times)) 952 # 2. 从EDMS获取PDF文件
958 start_time = time.time() 953 max_count_obj = Configs.objects.filter(id=2).first()
959 pdf_handler.extract_image(max_img_count) 954 try:
960 end_time = time.time() 955 max_img_count = int(max_count_obj.value)
961 speed_time = int(end_time - start_time)
962 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
963 self.log_base, task_str, times, speed_time))
964 except Exception as e: 956 except Exception as e:
965 self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] ' 957 max_img_count = 500
966 '[error={3}]'.format(self.log_base, task_str, times, 958
967 traceback.format_exc())) 959 for times in range(consts.RETRY_TIMES):
960 try:
961 if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
962 # self.edms.download(pdf_path, doc.metadata_version_id)
963 self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme, business_type)
964 self.online_log.info('{0} [edms download success] [task={1}] [times={2}] '
965 '[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path))
966
967 # 3.PDF文件提取图片
968 self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
969 self.log_base, task_str, times))
970 start_time = time.time()
971 pdf_handler.extract_image(max_img_count)
972 end_time = time.time()
973 speed_time = int(end_time - start_time)
974 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
975 self.log_base, task_str, times, speed_time))
976 except Exception as e:
977 self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
978 '[error={3}]'.format(self.log_base, task_str, times,
979 traceback.format_exc()))
980 else:
981 break
968 else: 982 else:
969 break 983 raise Exception('download or pdf to img failed')
970 else:
971 raise Exception('download or pdf to img failed')
972 984
973 if pdf_handler.img_count == 0: 985 if pdf_handler.img_count == 0:
974 self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format( 986 self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
975 self.log_base, task_str)) 987 self.log_base, task_str))
976 raise Exception('pdf img empty') 988 raise Exception('pdf img empty')
977 elif pdf_handler.img_count >= max_img_count: 989 elif pdf_handler.img_count >= max_img_count:
978 self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format( 990 self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
979 self.log_base, task_str, pdf_handler.img_count)) 991 self.log_base, task_str, pdf_handler.img_count))
980 992
993 try:
994 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
995 report_table.objects.create(
996 case_number=doc.application_id,
997 request_team=RequestTeam.get_value(doc.document_scheme, 0),
998 request_trigger=RequestTrigger.get_value(doc.data_source, 0),
999 input_file=doc.document_name,
1000 transaction_start=doc.start_time,
1001 transaction_end=doc.start_time,
1002 successful_at_this_level=False,
1003 failure_reason=FailureReason.IMG_LIMIT.value,
1004 process_name=ProcessName.ALL.value,
1005 notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
1006 )
1007 except Exception as e:
1008 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
1009 self.log_base, traceback.format_exc()))
1010
1011 try:
1012 doc.status = DocStatus.PROCESS_FAILED.value
1013 doc.save()
1014 except Exception as e:
1015 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1016 self.log_base, traceback.format_exc()))
1017 else:
1018 with lock:
1019 todo_count_dict[task_str] = pdf_handler.img_count
1020 for img_idx, img_path in enumerate(pdf_handler.img_path_list):
1021 while img_queue.full():
1022 self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
1023 time.sleep(self.sleep_time_img_put)
1024 if pdf_handler.is_ebank:
1025 try:
1026 text_list = pdf_handler.page_text_list[img_idx].pop('rebuild_text')
1027 except Exception as e:
1028 text_list = []
1029 else:
1030 text_list = []
1031 img_queue.put((business_type, img_path, text_list))
1032 # except EDMSException as e:
1033 # try:
1034 # doc.status = DocStatus.PROCESS_FAILED.value
1035 # doc.save()
1036 # self.online_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
1037 # self.log_base, task_str, traceback.format_exc()))
1038 # except Exception as e:
1039 # self.online_log.error('{0} [process error (db save 1)] [error={1}]'.format(
1040 # self.log_base, traceback.format_exc()))
1041 # error_list.append(1)
1042 # return
1043 except Exception as e:
981 try: 1044 try:
1045 end_time = timezone.now()
982 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport 1046 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
983 report_table.objects.create( 1047 report_table.objects.create(
984 case_number=doc.application_id, 1048 case_number=doc.application_id,
...@@ -986,11 +1050,10 @@ class Command(BaseCommand, LoggerMixin): ...@@ -986,11 +1050,10 @@ class Command(BaseCommand, LoggerMixin):
986 request_trigger=RequestTrigger.get_value(doc.data_source, 0), 1050 request_trigger=RequestTrigger.get_value(doc.data_source, 0),
987 input_file=doc.document_name, 1051 input_file=doc.document_name,
988 transaction_start=doc.start_time, 1052 transaction_start=doc.start_time,
989 transaction_end=doc.start_time, 1053 transaction_end=end_time,
990 successful_at_this_level=False, 1054 successful_at_this_level=False,
991 failure_reason=FailureReason.IMG_LIMIT.value, 1055 failure_reason=FailureReason.PDF.value,
992 process_name=ProcessName.ALL.value, 1056 process_name=ProcessName.ALL.value,
993 notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
994 ) 1057 )
995 except Exception as e: 1058 except Exception as e:
996 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format( 1059 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
...@@ -999,64 +1062,114 @@ class Command(BaseCommand, LoggerMixin): ...@@ -999,64 +1062,114 @@ class Command(BaseCommand, LoggerMixin):
999 try: 1062 try:
1000 doc.status = DocStatus.PROCESS_FAILED.value 1063 doc.status = DocStatus.PROCESS_FAILED.value
1001 doc.save() 1064 doc.save()
1065 self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
1066 '[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
1002 except Exception as e: 1067 except Exception as e:
1003 self.online_log.error('{0} [process error (db save)] [error={1}]'.format( 1068 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1004 self.log_base, traceback.format_exc())) 1069 self.log_base, traceback.format_exc()))
1005 else: 1070 error_list.append(1)
1006 with lock: 1071 return
1007 todo_count_dict[task_str] = pdf_handler.img_count 1072 else: # e-contract
1008 for img_idx, img_path in enumerate(pdf_handler.img_path_list):
1009 while img_queue.full():
1010 self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
1011 time.sleep(self.sleep_time_img_put)
1012 if pdf_handler.is_ebank:
1013 try:
1014 text_list = pdf_handler.page_text_list[img_idx].pop('rebuild_text')
1015 except Exception as e:
1016 text_list = []
1017 else:
1018 text_list = []
1019 img_queue.put((business_type, img_path, text_list))
1020 # except EDMSException as e:
1021 # try:
1022 # doc.status = DocStatus.PROCESS_FAILED.value
1023 # doc.save()
1024 # self.online_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
1025 # self.log_base, task_str, traceback.format_exc()))
1026 # except Exception as e:
1027 # self.online_log.error('{0} [process error (db save 1)] [error={1}]'.format(
1028 # self.log_base, traceback.format_exc()))
1029 # error_list.append(1)
1030 # return
1031 except Exception as e:
1032 try: 1073 try:
1033 end_time = timezone.now() 1074 # pdf下载 处理 图片存储 识别
1034 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport 1075 for times in range(consts.RETRY_TIMES):
1035 report_table.objects.create( 1076 try:
1036 case_number=doc.application_id, 1077 self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme, business_type)
1037 request_team=RequestTeam.get_value(doc.document_scheme, 0), 1078 self.online_log.info('{0} [edms download success] [task={1}] [times={2}] '
1038 request_trigger=RequestTrigger.get_value(doc.data_source, 0), 1079 '[pdf_path={3}]'.format(self.e_log_base, task_str, times, pdf_path))
1039 input_file=doc.document_name, 1080
1040 transaction_start=doc.start_time, 1081 self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
1041 transaction_end=end_time, 1082 self.e_log_base, task_str, times))
1042 successful_at_this_level=False, 1083 pdf_handler.e_contract_process()
1043 failure_reason=FailureReason.PDF.value, 1084 self.online_log.info(
1044 process_name=ProcessName.ALL.value, 1085 '{0} [pdf to img end] [task={1}] [times={2}]'.format(self.e_log_base, task_str, times))
1045 ) 1086 except Exception as e:
1046 except Exception as e: 1087 self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
1047 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format( 1088 '[error={3}]'.format(self.e_log_base, task_str, times,
1048 self.log_base, traceback.format_exc())) 1089 traceback.format_exc()))
1090 else:
1091 break
1092 else:
1093 raise Exception('download or pdf to img failed')
1094
1095 if classify_1_str == str(consts.CONTRACT_CLASSIFY):
1096 ocr_result = afc_predict(pdf_handler.pdf_info)
1097 page_res = {}
1098 for page_num, page_info in ocr_result.get('page_info', {}).items():
1099 if isinstance(page_num, str) and page_num.startswith('page_'):
1100 page_res[page_num] = {
1101 'classify': int(classify_1_str),
1102 'page_num': page_num,
1103 'page_info': page_info
1104 }
1049 1105
1050 try: 1106 else:
1051 doc.status = DocStatus.PROCESS_FAILED.value 1107 file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str)
1052 doc.save() 1108 file_type_2 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_2_str)
1053 self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] ' 1109 ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1)
1054 '[error={2}]'.format(self.log_base, task_str, traceback.format_exc())) 1110 rebuild_res_1 = {}
1111 page_res = {}
1112 for field_name, field_info in ocr_result_1.items():
1113 page_num = field_info.pop('page', 'page_1')
1114 rebuild_res_1.setdefault(page_num, dict())[field_name] = field_info
1115 for page_num, page_info in rebuild_res_1.items():
1116 if isinstance(page_num, str) and page_num.startswith('page_'):
1117 page_res[page_num] = {
1118 'classify': int(classify_1_str),
1119 'page_num': page_num,
1120 'page_info': page_info
1121 }
if isinstance(file_type_2, int):
    # A second HIL contract type is attached to the same PDF: run the
    # predictor again and regroup its flat, field-keyed result by page,
    # exactly as is done for the first contract type.
    rebuild_res_2 = {}
    ocr_result_2 = hil_predict(pdf_handler.pdf_info, file_type_2)
    for field_name, field_info in ocr_result_2.items():
        page_num = field_info.pop('page', 'page_1')
        rebuild_res_2.setdefault(page_num, dict())[field_name] = field_info
    # Iterate the per-page regrouping, not the raw predictor output:
    # ocr_result_2 is keyed by field name, and field names are not expected
    # to start with 'page_', so looping over it would silently drop every
    # page of the second contract.
    for page_num, page_info in rebuild_res_2.items():
        if isinstance(page_num, str) and page_num.startswith('page_'):
            page_res[page_num] = {
                'classify': int(classify_2_str),
                'page_num': page_num,
                'page_info': page_info
            }
1135
1136 contract_res = {}
1137 for img_path_tmp, page_key in pdf_handler.img_path_pno_list:
1138 if page_key in page_res:
1139 img_contract_res = {
1140 'code': 1,
1141 'data': [
1142 {
1143 'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY),
1144 'data': page_res[page_key]
1145 }
1146 ]
1147 }
1148 else:
1149 img_contract_res = {
1150 'code': 1,
1151 'data': [
1152 {
1153 'classify': int(classify_1_str),
1154 }
1155 ]
1156 }
1157 contract_res[img_path_tmp] = img_contract_res
1158
1159 with lock:
1160 res_dict[task_str] = contract_res
1161 finish_queue.put(task_str)
1055 except Exception as e: 1162 except Exception as e:
1056 self.online_log.error('{0} [process error (db save)] [error={1}]'.format( 1163 try:
1057 self.log_base, traceback.format_exc())) 1164 doc.status = DocStatus.PROCESS_FAILED.value
1058 error_list.append(1) 1165 doc.save()
1059 return 1166 self.online_log.warn('{0} [process failed (e-contract)] [task={1}] '
1167 '[error={2}]'.format(self.e_log_base, task_str, traceback.format_exc()))
1168 except Exception as e:
1169 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1170 self.e_log_base, traceback.format_exc()))
1171 error_list.append(1)
1172 return
1060 1173
1061 def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list): 1174 def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list):
1062 while len(error_list) == 0 or not img_queue.empty(): 1175 while len(error_list) == 0 or not img_queue.empty():
...@@ -1801,7 +1914,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1801,7 +1914,7 @@ class Command(BaseCommand, LoggerMixin):
1801 finish_queue = Queue() 1914 finish_queue = Queue()
1802 1915
1803 process_list = [] 1916 process_list = []
1804 pdf_process = Process(target=self.pdf_2_img_2_queue, args=(img_queue, todo_count_dict, lock, error_list)) 1917 pdf_process = Process(target=self.pdf_2_img_2_queue, args=(img_queue, todo_count_dict, lock, error_list, res_dict, finish_queue))
1805 process_list.append(pdf_process) 1918 process_list.append(pdf_process)
1806 1919
1807 for url in self.ocr_1_urls.values(): 1920 for url in self.ocr_1_urls.values():
......
...@@ -789,3 +789,24 @@ class HILCACompareResultRecord(models.Model): ...@@ -789,3 +789,24 @@ class HILCACompareResultRecord(models.Model):
789 db_table = 'hil_ca_compare_result_record' 789 db_table = 'hil_ca_compare_result_record'
790 790
791 791
# Unmanaged model over the existing `hil_contract` table: Django neither
# creates nor migrates it.
class HILContract(models.Model):
    # Primary key.
    id = models.AutoField(verbose_name='id', primary_key=True)
    # Application id (noted as indexed in the original comment).
    application_id = models.CharField(verbose_name='申请id', max_length=64)
    # Creation time, filled in automatically when the row is first saved.
    create_time = models.DateTimeField(verbose_name='创建时间', auto_now_add=True)

    class Meta:
        db_table = 'hil_contract'
        managed = False
800
801
# Unmanaged model over the existing `afc_contract` table: Django neither
# creates nor migrates it. `situ_db_label` tags the model with the 'afc'
# label -- presumably consumed by the project's DB router; verify there.
class AFCContract(models.Model):
    # Primary key.
    id = models.AutoField(verbose_name='id', primary_key=True)
    # Application id (noted as indexed in the original comment).
    application_id = models.CharField(verbose_name='申请id', max_length=64)
    # Creation time, filled in automatically when the row is first saved.
    create_time = models.DateTimeField(verbose_name='创建时间', auto_now_add=True)

    class Meta:
        situ_db_label = 'afc'
        db_table = 'afc_contract'
        managed = False
811
812
......
1 import os
1 import base64 2 import base64
2 import requests 3 import requests
3 from common.redis_cache import redis_handler as rh 4 from common.redis_cache import redis_handler as rh
...@@ -44,7 +45,6 @@ class ECM: ...@@ -44,7 +45,6 @@ class ECM:
44 "b_coborrower_id", "b_coborrower_name", "b_guarantor_id", "b_guarantor_name", 45 "b_coborrower_id", "b_coborrower_name", "b_guarantor_id", "b_guarantor_name",
45 "b_frontend_partner", "b_dealer_code", "b_dealer_name", "b_input_date", "b_comment", 46 "b_frontend_partner", "b_dealer_code", "b_dealer_name", "b_input_date", "b_comment",
46 "b_contract_no", "b_location"] 47 "b_contract_no", "b_location"]
47 self.contract_prefix = '电子'
48 48
49 def update_oauth_token(self): 49 def update_oauth_token(self):
50 response = requests.post(self.oauth_url, headers=self.oauth_headers, data=self.oauth_payload, verify=False) 50 response = requests.post(self.oauth_url, headers=self.oauth_headers, data=self.oauth_payload, verify=False)
...@@ -69,9 +69,9 @@ class ECM: ...@@ -69,9 +69,9 @@ class ECM:
69 def get_headers(self): 69 def get_headers(self):
70 return {'Authorization': '{0} {1}'.format(self.token_type, self.get_oauth_token())} 70 return {'Authorization': '{0} {1}'.format(self.token_type, self.get_oauth_token())}
71 71
72 def search(self, application_id, business_type): 72 def search(self, application_id, business_type, prefix):
73 sql = "select * from {0} where b_application_no='{1}' and object_name like '{2}%'".format( 73 sql = "select * from {0} where b_application_no='{1}' and object_name like '{2}%'".format(
74 self.settlement_type, application_id, self.contract_prefix) 74 self.settlement_type, application_id, prefix)
75 search_args = { 75 search_args = {
76 "userName": self.username, 76 "userName": self.username,
77 "password": self.pwd, 77 "password": self.pwd,
...@@ -96,7 +96,6 @@ class ECM: ...@@ -96,7 +96,6 @@ class ECM:
96 result.append((object_name, object_id)) 96 result.append((object_name, object_id))
97 return result 97 return result
98 98
99
100 def download(self, save_path, object_id, document_scheme, business_type): 99 def download(self, save_path, object_id, document_scheme, business_type):
101 doc_type, _, _ = self.doc_type_map.get(document_scheme) 100 doc_type, _, _ = self.doc_type_map.get(document_scheme)
102 download_json = { 101 download_json = {
......
...@@ -36,12 +36,14 @@ from .models import ( ...@@ -36,12 +36,14 @@ from .models import (
36 AFCSECompareResultRecord, 36 AFCSECompareResultRecord,
37 HILCACompareResultRecord, 37 HILCACompareResultRecord,
38 HILSECompareResultRecord, 38 HILSECompareResultRecord,
39 HILContract,
40 AFCContract,
39 ) 41 )
40 from .named_enum import ErrorType 42 from .named_enum import ErrorType
41 from .mixins import DocHandler 43 from .mixins import DocHandler
42 from . import consts 44 from . import consts
43 from apps.account.authentication import OAuth2AuthenticationWithUser 45 from apps.account.authentication import OAuth2AuthenticationWithUser
44 from celery_compare.tasks import compare 46 from celery_compare.tasks import compare, forwarding_station
45 47
46 48
47 class CustomDate(fields.Date): 49 class CustomDate(fields.Date):
...@@ -1164,5 +1166,11 @@ class SEContractView(GenericView): ...@@ -1164,5 +1166,11 @@ class SEContractView(GenericView):
# Endpoint used by POS to upload e-contract information (SE).
@use_args(se_contract_args, location='data')
def post(self, request, args):
    payload = args.get('content', {})
    app_id = payload.get('applicationId', '')
    entity_name = payload.get('applicationEntity', '')

    # Store the notification in the contract table of the matching entity;
    # anything that is not the HIL prefix is treated as AFC.
    if entity_name == consts.HIL_PREFIX:
        contract_model = HILContract
    else:
        contract_model = AFCContract
    contract_model.objects.create(application_id=app_id)

    # Schedule forwarding_station on the compare queue, delayed by
    # conf.DELAY_SECONDS.
    forwarding_station.apply_async(
        (app_id, entity_name),
        queue='queue_compare',
        countdown=conf.DELAY_SECONDS,
    )
    self.running_log.info('[e-contract] [application_id={0}] [entity={1}]'.format(app_id, entity_name))
    return response.ok()
......
...@@ -27,10 +27,13 @@ from apps.doc.models import ( ...@@ -27,10 +27,13 @@ from apps.doc.models import (
27 AFCCACompareResult, 27 AFCCACompareResult,
28 HILSECompareResult, 28 HILSECompareResult,
29 HILCACompareResult, 29 HILCACompareResult,
30 AFCDoc,
31 HILDoc
30 ) 32 )
31 from apps.doc import consts 33 from apps.doc import consts
32 from apps.doc.ocr.gcap import gcap 34 from apps.doc.ocr.gcap import gcap
33 from apps.doc.ocr.cms import cms 35 from apps.doc.ocr.cms import cms
36 from apps.doc.ocr.ecm import ECM, rh
34 from apps.doc.exceptions import GCAPException 37 from apps.doc.exceptions import GCAPException
35 from apps.doc.named_enum import RequestTeam, RequestTrigger, ProcessName, ErrorType 38 from apps.doc.named_enum import RequestTeam, RequestTrigger, ProcessName, ErrorType
36 from common.tools.comparison import cp 39 from common.tools.comparison import cp
...@@ -38,9 +41,11 @@ from common.tools.des import decode_des ...@@ -38,9 +41,11 @@ from common.tools.des import decode_des
38 41
39 compare_log = logging.getLogger('compare') 42 compare_log = logging.getLogger('compare')
40 log_base = '[Compare]' 43 log_base = '[Compare]'
44 e_log_base = '[e-contract]'
41 empty_str = '' 45 empty_str = ''
42 empty_error_type = 1000 46 empty_error_type = 1000
43 des_key = conf.CMS_DES_KEY 47 des_key = conf.CMS_DES_KEY
48 ecm = ECM()
44 49
45 50
46 def rotate_bound(image, angle): 51 def rotate_bound(image, angle):
...@@ -1867,4 +1872,32 @@ def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True ...@@ -1867,4 +1872,32 @@ def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True
1867 se_compare(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, is_cms) 1872 se_compare(application_id, application_entity, ocr_res_id, last_obj, ocr_res_dict, is_cms)
1868 1873
1869 1874
@app.task
def forwarding_station(application_id, entity):
    """Look up an application's e-signed files in ECM and queue them for OCR.

    For each ((classify_1, classify_2), name prefix) rule configured for the
    entity's business prefix, ECM is searched for files whose name starts with
    the prefix formatted with the application id. Every file found gets a new
    doc row, and a task string is pushed onto the redis queue.

    Args:
        application_id: application number, used in the ECM search and stored
            on each created doc.
        entity: application entity; members of ``consts.HIL_SET`` select the
            HIL tables and rules, anything else selects AFC.
    """
    compare_log.info('{0} [forward start] [application_id={1}] [entity={2}]'.format(e_log_base, application_id, entity))
    is_hil = entity in consts.HIL_SET
    doc_class = HILDoc if is_hil else AFCDoc
    entity_prefix = consts.HIL_PREFIX if is_hil else consts.AFC_PREFIX
    # FILE_NAME_PREFIX_MAP is keyed by business prefix, so it must be looked up
    # with the normalized prefix rather than the raw entity value. Default to
    # an empty rule list so an unknown key cannot make the loop iterate None.
    prefix_rules = consts.FILE_NAME_PREFIX_MAP.get(entity_prefix, [])
    for (classify_1, classify_2), prefix in prefix_rules:
        try:
            file_list = ecm.search(application_id, entity, prefix.format(application_id))  # TODO: fetch only the latest file
        except Exception:
            # A failed search for one rule must not stop the remaining rules.
            compare_log.error('{0} [search failed] [application_id={1}] [entity={2}] [error={3}]'.format(
                e_log_base, application_id, entity, traceback.format_exc()))
            continue

        compare_log.info('{0} [search end] [application_id={1}] [entity={2}] [file_list={3}]'.format(
            e_log_base, application_id, entity, file_list))
        for object_name, object_id in file_list:
            doc = doc_class.objects.create(
                metadata_version_id=object_id,
                application_id=application_id,
                document_name=object_name,
                document_scheme='SETTLEMENT',
                data_source='POS',
                upload_finish_time=datetime.now(),
            )
            # Task format: prefix, doc id, classify_1, classify_2 joined by SPLIT_STR.
            task = consts.SPLIT_STR.join([entity_prefix, str(doc.id), str(classify_1), str(classify_2)])
            enqueue_res = rh.enqueue([task], False)
            compare_log.info('{0} [upload success] [res={1}] [application_id={2}] [entity={3}] [object_name={4}] '
                             '[object_id={5}] [doc_id={6}]'.format(e_log_base, enqueue_res, application_id, entity,
                                                                   object_name, object_id, doc.id))
    compare_log.info('{0} [forward end] [application_id={1}] [entity={2}]'.format(e_log_base, application_id, entity))
......
1 # -*- coding: utf-8 -*-
2 # @Author : lk
3 # @Email : 9428.al@gmail.com
4 # @Created Date : 2021-06-29 17:43:46
5 # @Last Modified : 2021-09-07 14:11:25
6 # @Description :
7
8 from .get_char import Finder
9
10
def predict(pdf_info):
    """Run the contract field extraction over a whole parsed PDF.

    Args:
        pdf_info: Information for the entire PDF; it is handed to ``Finder`` unchanged.

    Returns:
        Whatever ``Finder.get_info()`` produces for this document.
    """
    return Finder(pdf_info).get_info()
17
18
1 # -*- coding: utf-8 -*-
2 # @Author : lk
3 # @Email : 9428.al@gmail.com
4 # @Create Date : 2021-07-20 16:42:41
5 # @Last Modified : 2021-09-07 19:52:39
6 # @Description :
7
8 import re
9 import numpy as np
10 from fuzzywuzzy import fuzz
11
12
13 class Finder:
14
15 def __init__(self, pdf_info):
16 self.pdf_info = pdf_info
17 self.is_asp = False
18 self.item = {"words": None,
19 "position": None,
20 }
21
def gen_init_result(self, is_asp):
    """Build ``self.init_result``, the empty page-keyed output skeleton.

    Every leaf is a fresh copy of ``self.item``. The skeleton used to reference
    the single ``self.item`` object everywhere, so an in-place edit of one
    field would have shown up in all of them.

    Args:
        is_asp (bool): True for an ASP contract, whose signature block is kept
            under ``page_8`` (``page_7`` then only holds the contract number).
            Otherwise the signature block is kept under ``page_7`` and no
            ``page_8`` entry is created. The argument was previously ignored
            in favour of ``self.is_asp``; the only visible caller passes
            ``self.is_asp``, so its result is unchanged.
    """
    def blank():
        return self.item.copy()

    def loan_principal():
        return {"大写": blank(),
                "小写": blank(),
                "车辆贷款本金金额": blank(),
                "附加产品融资贷款本金总金额": blank(),
                }

    def person():
        return {"name": blank(), "id": blank()}

    def signature_page():
        page = {"合同编号": blank()}
        for role in ("主借人签字", "共借人签字", "保证人1签字", "保证人2签字", "见证人签字"):
            page[role] = {"签字": blank(), "日期": blank()}
        return page

    result = {
        "page_1": {"合同编号": blank(),
                   "所购车辆价格": blank(),
                   "车架号": blank(),
                   "贷款本金金额": loan_principal(),
                   "贷款期限": blank(),
                   "附加产品融资贷款本金总金额明细": blank(),
                   "借款人签字及时间": blank(),
                   },
        "page_2": {"合同编号": blank(),
                   "借款人及抵押人": person(),
                   "共同借款人及共同抵押人": person(),
                   "保证人1": person(),
                   "保证人2": person(),
                   "所购车辆价格": blank(),
                   "车架号": blank(),
                   "经销商": blank(),
                   "贷款本金金额": loan_principal(),
                   "贷款期限": blank(),
                   "还款账户": {"账号": blank(),
                            "户名": blank(),
                            "开户行": blank(),
                            },
                   },
        "page_3": {"合同编号": blank(),
                   "还款计划表": blank(),
                   },
        "page_4": {"合同编号": blank(),
                   "附加产品融资贷款本金总金额明细": blank(),
                   },
        "page_5": {"合同编号": blank()},
        "page_6": {"合同编号": blank()},
    }
    if is_asp:
        result["page_7"] = {"合同编号": blank()}
        result["page_8"] = signature_page()
    else:
        result["page_7"] = signature_page()
    self.init_result = result
112
113
def get_contract_no(self, page_num):
    """Read the contract number printed on the given page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``. When a span contains ``合同编号:``,
        ``words`` is the text after its last ``:`` and ``position`` is the
        span's bbox; the last matching span wins.
    """
    found = self.item.copy()
    # Blocks whose type is not 0 are skipped.
    spans = (
        span
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for span in spans:
        box, content = span['bbox'], span['text']
        if '合同编号:' not in content:
            continue
        found['position'] = box
        found['words'] = content.split(':')[-1]
    return found
136
def get_vehicle_price(self, page_num='0'):
    """Read the purchase price of the vehicle from one page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``. For a span containing
        ``所购车辆价格为人民币``, ``words`` is the text after the last ``币`` and
        ``position`` is the span's bbox; the last matching span wins.
    """
    price = self.item.copy()
    spans = (
        span
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for span in spans:
        box, content = span['bbox'], span['text']
        if '所购车辆价格为人民币' not in content:
            continue
        price['position'] = box
        price['words'] = content.split('币')[-1]
    return price
150
def get_vin(self, page_num='0'):
    """Read the vehicle identification (frame) number from one page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``. For a span containing ``车架号:``,
        ``words`` is the text after the last ``:`` and ``position`` is the
        span's bbox; the last matching span wins.
    """
    frame_no = self.item.copy()
    spans = (
        span
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for span in spans:
        box, content = span['bbox'], span['text']
        if '车架号:' not in content:
            continue
        frame_no['position'] = box
        frame_no['words'] = content.split(':')[-1]
    return frame_no
164
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from one page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of ``self.item``.
        ``upper`` is the span that fuzzy-matches the set of Chinese capital
        numerals, ``lower`` is the text after ``¥`` of a span containing
        ``小写:¥``. ``asp_1`` / ``asp_2`` are the bracketed amounts of the
        ``人民币:小写:`` spans located above / below the
        ``附加产品融资贷款本金总金额`` anchor span; they stay blank when the page
        has no such anchor.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    keyword_string = ''.join(chinese_keywords)
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()
    anchor_bbox = None

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    for bbox, text in spans:
        if fuzz.ratio(keyword_string, text) > 15:
            # text is rebound here, exactly as before, so the two checks
            # below see the stripped value for this span.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for bbox, text in spans:
            if '人民币:小写:' not in text:
                continue
            amounts = re.findall(r'人民币:小写:\[(.*)\]', text)
            if not amounts:
                # Label without a bracketed amount: indexing [0] used to
                # raise IndexError here; such a span is now ignored.
                continue
            span_y = np.mean(bbox[1::2])
            if span_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = amounts[0]
            elif span_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = amounts[0]
    return upper, lower, asp_1, asp_2
205
def get_loan_term(self, page_num='0'):
    """Extract the loan term in months from one page.

    The span texts of the page are concatenated before matching
    ``贷款期限(\\d+)个月``, so the label and the number may sit in different
    spans. The position is then taken from the last span containing
    ``<number>个月``.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``; ``words`` holds the number of months as a
        string when both the pattern and a carrying span are found.
    """
    term = self.item.copy()
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    hit = re.search(r'贷款期限(\d+)个月', ''.join(content for _, content in spans))
    if hit is None:
        return term

    months = hit.group(1)
    needle = f'{months}个月'
    for box, content in spans:
        if needle in content:
            term['position'] = box
            term['words'] = months
    return term
229
def get_asp_details(self, page_num):
    """Collect the additional-product (ASP) financing detail table of a page.

    Collection starts at the span whose text is exactly
    ``附加产品融资贷款本金总金额明细`` and stops at a span containing ``第二条`` or
    ``征信管理``. The first collected text forms a one-cell title row, the
    following texts are grouped three per row; trailing texts that do not fill
    a row are dropped.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item`` whose ``words`` is the list of rows when at
        least one row was built; ``position`` is never set.
    """
    details = self.item.copy()

    collected = []
    inside = False
    spans = (
        span
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for span in spans:
        box, content = span['bbox'], span['text']
        if content == '附加产品融资贷款本金总金额明细':
            inside = True
        if '第二条' in content or '征信管理' in content:
            inside = False
        if inside:
            collected.append(content)

    row_count = (len(collected) + 2) // 3
    rows = [
        collected[:1] if idx == 0 else collected[3 * idx - 2:3 * idx + 1]
        for idx in range(row_count)
    ]
    if rows:
        details['words'] = rows
    return details
263
def get_signature(self):
    """Find the borrower's signature/date line on page ``'0'``.

    Returns:
        dict: Copy of ``self.item``. For a span containing ``签署日期``,
        ``words`` is the full span text and ``position`` its bbox; the last
        matching span wins.
    """
    signed = self.item.copy()

    spans = (
        span
        for block in self.pdf_info['0']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for span in spans:
        box, content = span['bbox'], span['text']
        if '签署日期' not in content:
            continue
        signed['words'] = content
        signed['position'] = box
    return signed
278
def get_somebody(self, top, bottom):
    """Return the customer name and ID found between two headings on page ``'1'``.

    The vertical band is bounded by ``bbox[3]`` of the last span containing
    ``top`` and of the last span containing ``bottom`` (0 when a heading is
    missing). Only spans whose ``bbox[3]`` lies strictly inside the band are
    considered.

    Args:
        top (str): Text marking the upper boundary.
        bottom (str): Text marking the lower boundary.

    Returns:
        tuple: ``(name, id)``, copies of ``self.item`` filled from the spans
        containing ``姓名/名称`` and ``自然人身份证件号码/法人执照号码`` (text after the
        last ``:``).
    """
    name_item = self.item.copy()
    id_item = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # First pass: locate the upper and lower boundaries.
    upper_y = lower_y = 0
    for box, content in spans:
        if top in content:
            upper_y = box[3]
        if bottom in content:
            lower_y = box[3]

    # Second pass: pick the labelled values inside the band.
    for box, content in spans:
        if not upper_y < box[3] < lower_y:
            continue
        if '姓名/名称' in content:
            name_item['position'] = box
            name_item['words'] = content.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in content:
            id_item['position'] = box
            id_item['words'] = content.split(':')[-1]
    return name_item, id_item
312
def get_seller(self):
    """Find the dealer name written next to the ``经销商`` label on page ``'1'``.

    The label span (text exactly ``经销商``) is the anchor. The value is a span
    whose horizontal centre lies between the anchor's right edge and half the
    page width, and whose vertical centre lies between the anchor's top and
    bottom edges; the last such span wins.

    Returns:
        dict: Copy of ``self.item`` with ``words``/``position`` of that span,
        or untouched when the label or the value is missing.
    """
    seller = self.item.copy()
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Locate the key first.
    anchor = None
    for box, content in spans:
        if content == '经销商':
            anchor = box
    if not anchor:
        return seller

    # Then match the value relative to the key.
    half_width = self.pdf_info['1']['width'] * 0.5
    for box, content in spans:
        in_column = anchor[2] < np.mean(box[::2]) < half_width
        if in_column and anchor[1] < np.mean(box[1::2]) < anchor[3]:
            seller['position'] = box
            seller['words'] = content
    return seller
339
def get_payback_account(self):
    """Extract the repayment account number, holder and bank from page ``'1'``.

    Only the ticked-account layout is handled: nothing is extracted unless the
    concatenated page text contains ``☑账号``. Values are matched on the
    concatenated text with spaces removed; each position comes from the last
    span that contains the matched value.

    Returns:
        tuple: ``(account, account_name, account_bank)``, copies of ``self.item``.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    merged = ''.join(content for _, content in spans)
    # Decide which account layout is used; only the ticked one is reported.
    if '☑账号' not in merged:
        return account, account_name, account_bank
    merged = merged.replace(' ', '')

    numbers = re.findall(r'账号:(.*)户名', merged)
    if numbers:
        for box, content in spans:
            if numbers[0] in content:
                account['position'] = box
                account['words'] = numbers[0]

    holders = re.findall(r'户名:(.*)开户行', merged)
    if holders:
        for box, content in spans:
            if holders[0] in content:
                account_name['position'] = box
                account_name['words'] = holders[0]

    banks = re.findall(r'开户行:(.*);', merged)
    if banks:
        needle = f'开户行:{banks[0]};'
        for box, content in spans:
            if needle in content.replace(' ', ''):
                account_bank['position'] = box
                account_bank['words'] = banks[0]
    return account, account_name, account_bank
392
def get_repayment_schedule(self):
    """Collect the repayment schedule table from page ``'2'``.

    Collection starts at the span whose text is exactly ``序号`` and stops at a
    span containing ``以上表格中所列的序号并非还款期数``. The collected texts are
    grouped into rows of five columns; grouping stops at the first row whose
    second cell equals the 1-based row number.

    Returns:
        dict: Copy of ``self.item`` whose ``words`` is the list of rows when at
        least one row was kept; ``position`` is never set.
    """
    schedule = self.item.copy()
    columns = 5

    cells = []
    inside = False
    spans = (
        span
        for block in self.pdf_info['2']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for span in spans:
        box, content = span['bbox'], span['text']
        if content == '序号':
            inside = True
        if '以上表格中所列的序号并非还款期数' in content:
            inside = False
        if inside:
            cells.append(content)

    rows = []
    for idx in range(len(cells) // columns):
        row = cells[idx * columns:(idx + 1) * columns]
        if row[1] == str(idx + 1):
            break
        rows.append(row)

    if rows:
        schedule['words'] = rows
    return schedule
427
def get_signature_role_1(self):
    """Locate the borrower (mortgagor) signature region across all pages.

    Spans are collected from a span containing ``借款人(抵押人)`` up to (not
    including) the next span containing ``日期``; the region may open more than
    once and everything collected is merged.

    Returns:
        dict: Copy of ``self.item`` plus ``page_num`` (page key of the last
        collected span). ``words`` is ``'有'`` when more than four spans were
        collected, otherwise ``'无'``. ``position`` is the bounding box
        ``[x_min, y_min, x_max, y_max]`` of the collected spans, or None when
        nothing was collected.
    """
    # Was ``self.init_item``, which is never assigned on this class and
    # raised AttributeError on every call.
    result = self.item.copy()
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '借款人(抵押人)' in text:
                        in_region = True
                    if '日期' in text:
                        in_region = False
                    if in_region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    position = None
    if boxes:
        # Guarded: min()/max() over an empty array raised ValueError.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]), max(corners[:, 0]), max(corners[:, 1])]
    result['page_num'] = page_num
    result['position'] = position
    result['words'] = '有' if len(texts) > 4 else '无'
    return result
462
def get_signature_role_2(self):
    """Locate the co-borrower (co-mortgagor) signature region across all pages.

    Spans are collected from a span containing ``共同借款人(共同抵押人)`` up to (not
    including) the next span containing ``日期``; the region may open more than
    once and everything collected is merged.

    Returns:
        dict: Copy of ``self.item`` plus ``page_num`` (page key of the last
        collected span). ``words`` is ``'有'`` when more than four spans were
        collected, otherwise ``'无'``. ``position`` is the bounding box
        ``[x_min, y_min, x_max, y_max]`` of the collected spans, or None when
        nothing was collected.
    """
    # Was ``self.init_item``, which is never assigned on this class and
    # raised AttributeError on every call.
    result = self.item.copy()
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        in_region = True
                    if '日期' in text:
                        in_region = False
                    if in_region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    position = None
    if boxes:
        # Guarded: min()/max() over an empty array raised ValueError.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]), max(corners[:, 0]), max(corners[:, 1])]
    result['page_num'] = page_num
    result['position'] = position
    result['words'] = '有' if len(texts) > 4 else '无'
    return result
497
def get_signature_role_3(self):
    """Locate the first guarantor's signature region across all pages.

    Spans are collected from a span containing ``保证人1`` (occurrences on the
    page whose key converts to 0 do not open the region) up to, not including,
    the next span containing ``日期``; everything collected is merged.

    Returns:
        dict: Copy of ``self.item`` plus ``page_num`` (page key of the last
        collected span). ``words`` is ``'有'`` when more than four spans were
        collected, otherwise ``'无'``. ``position`` is the bounding box
        ``[x_min, y_min, x_max, y_max]`` of the collected spans, or None when
        nothing was collected.
    """
    # Was ``self.init_item``, which is never assigned on this class and
    # raised AttributeError on every call.
    result = self.item.copy()
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(pno) != 0:
                        in_region = True
                    if '日期' in text:
                        in_region = False
                    if in_region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    position = None
    if boxes:
        # Guarded: min()/max() over an empty array raised ValueError.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]), max(corners[:, 0]), max(corners[:, 1])]
    result['page_num'] = page_num
    result['position'] = position
    result['words'] = '有' if len(texts) > 4 else '无'
    return result
532
def get_signature_role_4(self):
    """Locate the second guarantor's signature region across all pages.

    Spans are collected from a span containing ``保证人2`` (occurrences on the
    page whose key converts to 0 do not open the region) up to, not including,
    the next span containing ``日期``; everything collected is merged.

    Returns:
        dict: Copy of ``self.item`` plus ``page_num`` (page key of the last
        collected span). ``words`` is ``'有'`` when more than four spans were
        collected, otherwise ``'无'``. ``position`` is the bounding box
        ``[x_min, y_min, x_max, y_max]`` of the collected spans, or None when
        nothing was collected.
    """
    # Was ``self.init_item``, which is never assigned on this class and
    # raised AttributeError on every call.
    result = self.item.copy()
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(pno) != 0:
                        in_region = True
                    if '日期' in text:
                        in_region = False
                    if in_region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    position = None
    if boxes:
        # Guarded: min()/max() over an empty array raised ValueError.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]), max(corners[:, 0]), max(corners[:, 1])]
    result['page_num'] = page_num
    result['position'] = position
    result['words'] = '有' if len(texts) > 4 else '无'
    return result
567
def get_signature_role_5(self):
    """Locate the witness signature region across all pages.

    Spans are collected from a span containing ``见证人签字`` (occurrences on the
    page whose key converts to 0 do not open the region) up to, not including,
    the next span containing ``年``; everything collected is merged.

    Returns:
        dict: Copy of ``self.item`` plus ``page_num`` (page key of the last
        collected span). ``words`` is ``'有'`` when more than four spans were
        collected, otherwise ``'无'``. ``position`` is the bounding box
        ``[x_min, y_min, x_max, y_max]`` of the collected spans, or None when
        nothing was collected.
    """
    # Was ``self.init_item``, which is never assigned on this class and
    # raised AttributeError on every call.
    result = self.item.copy()
    texts = []
    boxes = []
    page_num = None
    in_region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(pno) != 0:
                        in_region = True
                    if '年' in text:
                        in_region = False
                    if in_region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    # The leftover debug ``print(texts)`` was removed.
    position = None
    if boxes:
        # Guarded: min()/max() over an empty array raised ValueError.
        corners = np.array(boxes).reshape((-1, 2))
        position = [min(corners[:, 0]), min(corners[:, 1]), max(corners[:, 0]), max(corners[:, 1])]
    result['page_num'] = page_num
    result['position'] = position
    result['words'] = '有' if len(texts) > 4 else '无'
    return result
603
def get_last_page_signature(self, page_num, top, bottom):
    """Extract one signer's name and signing date from the signature page.

    The signer's band is bounded by ``bbox[1]`` of the last span containing
    ``top`` and of the last span containing ``bottom``. Inside it, a span
    containing ``签署日期`` supplies the name (text before the first space) and
    the date (text after the last ``:``).

    Args:
        page_num (str): Key of the signature page inside ``self.pdf_info``.
        top (str): Text marking the upper boundary of the band.
        bottom (str): Text marking the lower boundary of the band.

    Returns:
        tuple: ``(signature_name, signature_date)``, copies of ``self.item``;
        both stay blank when either boundary or the signature line is missing.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    anchor_top = None
    anchor_bottom = None
    for bbox, text in spans:
        if top in text:
            anchor_top = bbox[1]
        if bottom in text:
            anchor_bottom = bbox[1]
    if anchor_top is None or anchor_bottom is None:
        return signature_name, signature_date

    for bbox, text in spans:
        if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
            signature_name['words'] = text.split(' ')[0]
            signature_name['position'] = bbox
            signature_date['words'] = text.split(':')[-1]
            # Fix: the position was written to signature_name a second time,
            # leaving the date without a position.
            signature_date['position'] = bbox
    return signature_name, signature_date
634
def get_info(self):
    """Extract every supported field of the contract into a page-keyed result.

    Only blocks whose ``type`` is 0 are inspected: those are the blocks this
    class reads text lines/spans from (the earlier note describing type 0 as
    an image contradicted the code).

    The contract is treated as an ASP product when page ``'0'`` has a span
    whose text is exactly ``附加产品融资贷款本金总金额``. For ASP contracts the
    signatures are read from page ``'7'`` into ``page_8``; otherwise from page
    ``'6'`` into ``page_7``.

    NOTE(review): page keys ``'0'``..``'6'`` (and ``'7'`` for ASP) are indexed
    directly, so shorter documents are not handled here.

    Returns:
        dict: ``{"is_asp": bool, "page_info": self.init_result}``.
    """
    # Decide whether this is an ASP product by looking at the first page only.
    for block in self.pdf_info['0']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                if '附加产品融资贷款本金总金额' == span['text']:
                    self.is_asp = True

    self.gen_init_result(self.is_asp)
    pages = self.init_result

    #######################################
    # Page 1
    page_1 = pages['page_1']
    page_1['合同编号'] = self.get_contract_no(page_num='0')
    page_1['所购车辆价格'] = self.get_vehicle_price()
    # Fix: this slot used to receive the vehicle price instead of the VIN.
    page_1['车架号'] = self.get_vin()
    # For ASP products the principal also carries the vehicle part and the
    # additional-product part.
    upper, lower, asp_1, asp_2 = self.get_loan_principal()
    page_1['贷款本金金额']['大写'] = upper
    page_1['贷款本金金额']['小写'] = lower
    page_1['贷款本金金额']['车辆贷款本金金额'] = asp_1
    page_1['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
    page_1['贷款期限'] = self.get_loan_term()
    # Additional-product financing details (ASP table).
    page_1['附加产品融资贷款本金总金额明细'] = self.get_asp_details(page_num='0')
    page_1['借款人签字及时间'] = self.get_signature()

    #######################################
    # Page 2
    page_2 = pages['page_2']
    # Fix: every other page_2 field is read from page '1'; the contract
    # number was read from page '0' again.
    page_2['合同编号'] = self.get_contract_no(page_num='1')
    parties = (
        ('借款人及抵押人', '借款人及抵押人:', '共同借款人及共同抵押人:'),
        ('共同借款人及共同抵押人', '共同借款人及共同抵押人:', '保证人1:'),
        ('保证人1', '保证人1:', '保证人2:'),
        ('保证人2', '保证人2:', '第一章'),
    )
    for party, top, bottom in parties:
        party_name, party_id = self.get_somebody(top=top, bottom=bottom)
        page_2[party]['name'] = party_name
        page_2[party]['id'] = party_id
    page_2['所购车辆价格'] = self.get_vehicle_price(page_num='1')
    page_2['车架号'] = self.get_vin(page_num='1')
    page_2['经销商'] = self.get_seller()
    upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1')
    page_2['贷款本金金额']['大写'] = upper
    page_2['贷款本金金额']['小写'] = lower
    page_2['贷款本金金额']['车辆贷款本金金额'] = asp_1
    page_2['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
    page_2['贷款期限'] = self.get_loan_term(page_num='1')
    account, account_name, account_bank = self.get_payback_account()
    page_2['还款账户']['账号'] = account
    page_2['还款账户']['户名'] = account_name
    page_2['还款账户']['开户行'] = account_bank

    #######################################
    # Page 3: contract number and repayment schedule table.
    pages['page_3']['合同编号'] = self.get_contract_no(page_num='2')
    pages['page_3']['还款计划表'] = self.get_repayment_schedule()

    #######################################
    # Page 4: contract number and ASP detail table.
    pages['page_4']['合同编号'] = self.get_contract_no(page_num='3')
    pages['page_4']['附加产品融资贷款本金总金额明细'] = self.get_asp_details(page_num='3')

    #######################################
    # Pages 5-7: contract number only.
    pages['page_5']['合同编号'] = self.get_contract_no(page_num='4')
    pages['page_6']['合同编号'] = self.get_contract_no(page_num='5')
    pages['page_7']['合同编号'] = self.get_contract_no(page_num='6')

    #######################################
    # Signature page: page_8 (index '7') for ASP, page_7 (index '6') otherwise.
    if self.is_asp:
        sign_key, sign_page = 'page_8', '7'
        pages['page_8']['合同编号'] = self.get_contract_no(page_num='7')
    else:
        sign_key, sign_page = 'page_7', '6'
    signers = (
        ('主借人签字', '借款人(抵押人)', '共同借款人(共同抵押人)'),
        ('共借人签字', '共同借款人(共同抵押人)', '保证人1'),
        ('保证人1签字', '保证人1', '保证人2'),
        ('保证人2签字', '保证人2', '在本人面前亲笔签署本合同'),
        ('见证人签字', '在本人面前亲笔签署本合同', '(以下无正文)'),
    )
    for slot, top, bottom in signers:
        signature_name, signature_date = self.get_last_page_signature(
            page_num=sign_page, top=top, bottom=bottom)
        pages[sign_key][slot]['签字'] = signature_name
        pages[sign_key][slot]['日期'] = signature_date

    # Final output shape.
    new_results = {"is_asp": self.is_asp,
                   "page_info": self.init_result
                   }
    return new_results
1 # -*- coding: utf-8 -*-
2 # @Author : lk
3 # @Email : 9428.al@gmail.com
4 # @Create Date : 2021-07-20 16:42:41
5 # @Last Modified : 2021-10-28 17:41:00
6 # @Description :
7
8 import re
9 import cv2
10 import base64
11 import numpy as np
12 from fuzzywuzzy import fuzz
13
14
15 class Finder:
16
17 def __init__(self, pdf_info):
18 self.pdf_info = pdf_info
19 self.item = {"words": None,
20 "page": None,
21 "position": None,
22 }
23 # 格式化算法输出
24 self.init_result = {"合同编号": self.item,
25 "承租人-姓名": self.item,
26 "承租人-证件号码": self.item,
27 "承租人-法定代表人或授权代表": self.item,
28 "保证人1-姓名": self.item,
29 "保证人1-证件号码": self.item,
30 "保证人1-法定代表人或授权代表": self.item,
31 "保证人2-姓名": self.item,
32 "保证人2-证件号码": self.item,
33 "保证人2-法定代表人或授权代表": self.item,
34 "保证人3-姓名": self.item,
35 "保证人3-证件号码": self.item,
36 "保证人3-法定代表人或授权代表": self.item,
37 "合同编号(正文)": self.item,
38 "车辆识别代码": self.item,
39 "车辆卖方(经销商)": self.item,
40 "车辆原始销售价格(《机动车销售统一发票》所列金额)": self.item,
41 "车辆附加产品明细表": self.item,
42 "融资成本总额": self.item,
43 "租期": self.item,
44 "付款计划表": self.item,
45 "银行账户-户名": self.item,
46 "银行账户-银行账号": self.item,
47 "银行账户-开户行": self.item,
48 "签字页-承租人姓名": self.item,
49 "签字页-承租人签章": self.item,
50 "签字页-保证人1姓名": self.item,
51 "签字页-保证人1签章": self.item,
52 "签字页-保证人2姓名": self.item,
53 "签字页-保证人2签章": self.item,
54 "签字页-保证人3姓名": self.item,
55 "签字页-保证人3签章": self.item,
56 }
57
58 # 格式化输出 车辆处置协议 要是别的字段
59 self.init_result_1 = {"合同编号": self.item,
60 "承租人-姓名": self.item,
61 "承租人-证件号码": self.item,
62 "销售经销商": self.item,
63 "合同编号(正文)": self.item,
64 "签字页-承租人姓名": self.item,
65 "签字页-承租人证件号码": self.item,
66 "签字页-承租人签章": self.item,
67 "签字页-销售经销商": self.item,
68 "签字页-销售经销商签章": self.item,
69
70 }
71
72 # 格式化输出 车辆租赁抵押合同
73 self.init_result_2 = {"合同编号": self.item,
74 "合同编号(正文)": self.item,
75 "抵押人姓名/名称": self.item,
76 "抵押人证件号码": self.item,
77 "车辆识别代码": self.item,
78 "租金总额": self.item,
79 "融资租赁期限": self.item,
80 "签字页-抵押人姓名": self.item,
81 "签字页-抵押人签章": self.item,
82 "签字页-抵押人配偶姓名": self.item,
83 "签字页-抵押人配偶签章": self.item,
84 }
85
def get_contract_no(self, page_num):
    """Read the contract number printed on the given page.

    The number is the text after the last ``:`` of the span containing
    ``合同编号:``. When that text is empty (label and number sit in separate
    spans), a second pass takes a span containing ``CH`` whose top edge lies
    above the bottom edge of the current match; each hit moves the reference
    box, so later spans are compared against the latest hit.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item`` with ``words``, ``page`` and ``position``
        filled when a match exists.
    """
    found = self.item.copy()
    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    for box, content in spans:
        if '合同编号:' in content:
            found['position'] = box
            found['page'] = page_num
            found['words'] = content.split(':')[-1]

    if found['words'] != '':
        return found

    # The label was found but carried no value: look for the number itself.
    for box, content in spans:
        if box[1] < found['position'][3] and 'CH' in content:
            found['position'] = box
            found['page'] = page_num
            found['words'] = content
    return found
120
def get_vehicle_price(self, page_num='0'):
    """Read the purchase price of the vehicle from one page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``. For a span containing
        ``所购车辆价格为人民币``, ``words`` is the text after the last ``币`` and
        ``position`` is the span's bbox (``page`` is left untouched); the last
        matching span wins.
    """
    price = self.item.copy()
    spans = (
        span
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for span in spans:
        box, content = span['bbox'], span['text']
        if '所购车辆价格为人民币' not in content:
            continue
        price['position'] = box
        price['words'] = content.split('币')[-1]
    return price
134
def get_contract_no_one(self):
    """Find the contract number quoted in the body text, page by page.

    The span texts of a page are concatenated and stripped of spaces before
    matching, so a number that wraps across spans is still found. Pages are
    tried in ``self.pdf_info`` order and the first page that matches wins.

    Returns:
        dict | None: Copy of ``self.item`` with ``words`` and ``page`` set and
        ``position`` None, or None when no page matches (unchanged behaviour).
    """
    contract_no = self.item.copy()
    # Tried in this order on every page; the flag says whether the captured
    # text is stripped.
    # NOTE(review): the first pattern returns group(1), i.e. the whole
    # "合同编号:[...]" match including the label - confirm that is intended.
    patterns = (
        (r'(合同编号:\[(.*?)\])', False),
        (r'编号为(.*?)的', True),
        # Fix: this pattern was written with an unbalanced ")" and raised
        # re.error on every page the first two patterns did not match. The
        # previous pattern already accepts anything this one accepts, so it
        # only acts as a fallback.
        (r'编号为(.*?)\)的', True),
    )
    for pno in self.pdf_info:
        all_text = ''.join(
            span['text']
            for block in self.pdf_info[pno]['blocks']
            if block['type'] == 0
            for line in block['lines']
            for span in line['spans']
        ).replace(' ', '')
        for pattern, strip_words in patterns:
            matched = re.search(pattern, all_text)
            if matched is None:
                continue
            words = matched.group(1)
            contract_no['position'] = None
            contract_no['page'] = pno
            contract_no['words'] = words.strip() if strip_words else words
            return contract_no
    return None
171
def get_key_value(self, key, page_num=None):
    """Return the value written after ``key`` on one page or on any page.

    Args:
        key (str): Label text to look for inside a span.
        page_num (str | None): Page key to restrict the search to; when None,
            every page of ``self.pdf_info`` is scanned.

    Returns:
        dict: Copy of ``self.item``. For a span containing ``key``, ``words``
        is the text after the last ``:``, ``page`` the page key and
        ``position`` the span's bbox; the last matching span wins.
    """
    value = self.item.copy()
    page_keys = self.pdf_info if page_num is None else [page_num]
    for pno in page_keys:
        spans = (
            span
            for block in self.pdf_info[pno]['blocks']
            if block['type'] == 0
            for line in block['lines']
            for span in line['spans']
        )
        for span in spans:
            box, content = span['bbox'], span['text']
            if key not in content:
                continue
            value['position'] = box
            value['page'] = pno
            value['words'] = content.split(':')[-1]
    return value
202
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from one page.

    Args:
        page_num (str): Key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of ``self.item``
        (``page`` is left untouched). ``upper`` is the span that fuzzy-matches
        the set of Chinese capital numerals, ``lower`` is the text after ``¥``
        of a span containing ``小写:¥``. ``asp_1`` / ``asp_2`` are the bracketed
        amounts of the ``人民币:小写:`` spans located above / below the
        ``附加产品融资贷款本金总金额`` anchor span; they stay blank when the page
        has no such anchor.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    keyword_string = ''.join(chinese_keywords)
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()
    anchor_bbox = None

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    for bbox, text in spans:
        if fuzz.ratio(keyword_string, text) > 15:
            # text is rebound here, exactly as before, so the two checks
            # below see the stripped value for this span.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for bbox, text in spans:
            if '人民币:小写:' not in text:
                continue
            amounts = re.findall(r'人民币:小写:\[(.*)\]', text)
            if not amounts:
                # Label without a bracketed amount: indexing [0] used to
                # raise IndexError here; such a span is now ignored.
                continue
            span_y = np.mean(bbox[1::2])
            if span_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = amounts[0]
            elif span_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = amounts[0]
    return upper, lower, asp_1, asp_2
243
def get_loan_term(self, page_num='0'):
    """Find the loan term in months on one page.

    The texts of all spans are concatenated and matched against
    ``贷款期限(\\d+)个月``; the number is then located again in an individual
    span to obtain its bbox.

    Args:
        page_num: Key of the page in ``self.pdf_info`` to scan.

    Returns:
        dict: A copy of ``self.item`` whose ``'words'`` is the number of
        months and ``'position'`` the bbox of the last span containing
        ``'<months>个月'``; untouched when nothing matches.
    """
    def text_spans():
        # Yield (bbox, text) for every span of the type-0 blocks on the page.
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] == 0:
                for line in block['lines']:
                    yield from ((s['bbox'], s['text']) for s in line['spans'])

    term = self.item.copy()
    page_text = ''.join(text for _, text in text_spans())
    hit = re.search(r'贷款期限(\d+)个月', page_text)
    if not hit:
        return term

    months = hit.group(1)
    needle = f'{months}个月'
    for box, text in text_spans():
        if needle in text:
            term['position'] = box
            term['words'] = months
    return term
267
def get_asp_details(self, page_num):
    """Collect the add-on product detail table from one page.

    Span texts are gathered from the span equal to
    ``'附加产品融资贷款本金总金额明细'`` (inclusive) until a span containing
    ``'第二条'`` or ``'征信管理'``. The first text forms a one-cell row; the
    remaining texts are grouped three per row.

    Args:
        page_num: Key of the page in ``self.pdf_info`` to scan.

    Returns:
        dict: A copy of ``self.item`` whose ``'words'`` is the list of rows
        when at least one text was collected.
    """
    details = self.item.copy()

    spans = (
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    cells = []
    inside_table = False
    for _box, text in spans:
        if text == '附加产品融资贷款本金总金额明细':
            inside_table = True
        if '第二条' in text or '征信管理' in text:
            inside_table = False
        if inside_table:
            cells.append(text)

    rows = []
    for idx in range((len(cells) + 2) // 3):
        if idx == 0:
            rows.append([cells[0]])
        else:
            # Rows after the title start at index 1 and hold three cells each.
            start = idx * 3 - 2
            rows.append(cells[start:start + 3])

    if rows:
        details['words'] = rows
    return details
301
def get_signature(self):
    """Return the signing-date span found on page ``'0'``.

    Returns:
        dict: A copy of ``self.item`` whose ``'words'`` and ``'position'``
        are the text and bbox of the last span containing ``'签署日期'``.
    """
    result = self.item.copy()
    spans = (
        span
        for block in self.pdf_info['0']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for span in spans:
        box, content = span['bbox'], span['text']
        if '签署日期' not in content:
            continue
        result['words'] = content
        result['position'] = box
    return result
316
def get_somebody(self, top, bottom):
    """Return the customer name and ID found between two marker spans.

    Only page ``'1'`` is inspected. The bottom edge (``bbox[3]``) of the last
    span containing ``top`` and of the last span containing ``bottom`` bound
    the search band; both bounds default to 0 when a marker is missing.

    Args:
        top: Text marking the upper boundary.
        bottom: Text marking the lower boundary.

    Returns:
        tuple: ``(_name, _id)`` copies of ``self.item`` filled from spans
        containing ``'姓名/名称'`` and ``'自然人身份证件号码/法人执照号码'`` whose
        bottom edge lies strictly inside the band.
    """
    person_name = self.item.copy()
    person_id = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # First pass: fix the vertical band.
    upper_y = 0
    lower_y = 0
    for box, text in spans:
        if top in text:
            upper_y = box[3]
        if bottom in text:
            lower_y = box[3]

    # Second pass: pick the fields inside the band.
    for box, text in spans:
        if not (upper_y < box[3] < lower_y):
            continue
        if '姓名/名称' in text:
            person_name['position'] = box
            person_name['words'] = text.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in text:
            person_id['position'] = box
            person_id['words'] = text.split(':')[-1]
    return person_name, person_id
350
def get_seller(self):
    """Return the dealer value printed to the right of the ``'经销商'`` label.

    Only page ``'1'`` is inspected. A span qualifies when its horizontal
    centre lies between the label's right edge and half the page width and
    its vertical centre lies inside the label's vertical extent.

    Returns:
        dict: A copy of ``self.item`` with ``'position'`` and ``'words'`` of
        the last qualifying span; untouched when the label is absent.
    """
    seller = self.item.copy()
    page = self.pdf_info['1']
    spans = [
        (span['bbox'], span['text'])
        for block in page['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Locate the label first.
    label_box = None
    for box, text in spans:
        if text == '经销商':
            label_box = box
    if not label_box:
        return seller

    # Then match the value relative to the label.
    half_width = page['width'] * 0.5
    for box, text in spans:
        centre_x = np.mean(box[::2])
        centre_y = np.mean(box[1::2])
        in_columns = label_box[2] < centre_x < half_width
        in_rows = label_box[1] < centre_y < label_box[3]
        if in_columns and in_rows:
            seller['position'] = box
            seller['words'] = text
    return seller
377
def get_payback_account(self):
    """Extract the repayment account number, holder name and bank.

    Only page ``'1'`` is inspected, and only when the concatenated page text
    contains the ticked option ``'☑账号'``. Each value is captured by a regex
    over the space-stripped page text and then located again in a single
    span to obtain its bbox.

    Returns:
        tuple: ``(account, account_name, account_bank)`` copies of
        ``self.item`` with ``'position'`` and ``'words'`` filled for every
        value that could be located.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    all_text = ''.join(text for _, text in spans)

    # Only the explicit-account layout is extracted, not the
    # "to be notified separately" one.
    if '☑账号' not in all_text:
        return account, account_name, account_bank

    compact = all_text.replace(' ', '')
    lookups = (
        (r'账号:(.*)户名', account,
         lambda words, text: f'{words}' in text),
        (r'户名:(.*)开户行', account_name,
         lambda words, text: f'{words}' in text),
        (r'开户行:(.*);', account_bank,
         lambda words, text: f'开户行:{words};' in text.replace(' ', '')),
    )
    for pattern, target, is_hit in lookups:
        captured = re.findall(pattern, compact)
        if not captured:
            continue
        words = captured[0]
        for box, text in spans:
            if is_hit(words, text):
                target['position'] = box
                target['words'] = words
    return account, account_name, account_bank
430
def get_repayment_schedule(self):
    """Collect the payment schedule table across all pages.

    Collection starts after a span containing ``'61.'`` and stops at a span
    containing ``'以上表格中所列序号'``. The collected texts are grouped four
    per row and each row is prefixed with a running ``'<n>.'`` label.

    Returns:
        dict: A copy of ``self.item`` whose ``'words'`` is the table (header
        row first) and ``'page'`` the key of the last page on which the
        start marker was seen (``None`` if never seen).
    """
    schedule = self.item.copy()

    cells = []
    collecting = False
    page = None
    for pno in self.pdf_info:
        spans = (
            span
            for block in self.pdf_info[pno]['blocks']
            if block['type'] == 0
            for line in block['lines']
            for span in line['spans']
        )
        for span in spans:
            _box, text = span['bbox'], span['text']
            if '以上表格中所列序号' in text:
                collecting = False
            if collecting:
                cells.append(text)
            # The start marker is checked last so it is not collected itself.
            if '61.' in text:
                page = pno
                collecting = True

    rows = [['序号', '融资租赁成本', '融资租赁费用', '租金', '剩余融资租赁成本']]
    columns = 4
    for row_no in range(1, len(cells) // columns + 1):
        start = (row_no - 1) * columns
        rows.append([f'{row_no}.'] + cells[start:start + columns])

    schedule['words'] = rows
    schedule['page'] = page
    return schedule
464
def get_signature_role_1(self):
    """Return the last signing-date span found on any page.

    Returns:
        dict: A copy of ``self.item`` with ``'position'``, ``'page'`` and
        ``'words'`` taken from the last span containing ``'签署日期'``.
    """
    result = self.item.copy()
    spans = (
        (pno, span)
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for pno, span in spans:
        box, content = span['bbox'], span['text']
        if '签署日期' not in content:
            continue
        result['position'] = box
        result['page'] = pno
        result['words'] = content
    return result
479
def get_signature_role_2(self):
    """Report whether the co-borrower signing region appears to be signed.

    Spans are collected from the one containing ``'共同借款人(共同抵押人)'``
    until a span containing ``'日期'``. More than four collected spans is
    reported as ``'有'``, otherwise ``'无'``.

    Returns:
        dict: A copy of ``self.init_item`` with ``'words'``, ``'page_num'``
        (key of the last page contributing a span) and ``'position'`` (the
        ``[x0, y0, x1, y1]`` box enclosing all collected spans). When the
        region is never found ``'position'`` and ``'page_num'`` are ``None``
        instead of the method raising ``ValueError`` on an empty box list.
    """
    signature_role_2 = self.init_item.copy()
    # Locate the signing region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    position = None
    if boxes:
        # Treat every bbox as two (x, y) corners and take the overall extent.
        corners = np.array(boxes).reshape((-1, 2))
        xs, ys = corners[:, 0], corners[:, 1]
        position = [min(xs), min(ys), max(xs), max(ys)]

    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
514
def get_signature_role_3(self):
    """Report whether the guarantor-1 signing region appears to be signed.

    Spans are collected from the one containing ``'保证人1'`` on any page
    whose key is not 0 until a span containing ``'日期'``. More than four
    collected spans is reported as ``'有'``, otherwise ``'无'``.

    Returns:
        dict: A copy of ``self.init_item`` with ``'words'``, ``'page_num'``
        (key of the last page contributing a span) and ``'position'`` (the
        ``[x0, y0, x1, y1]`` box enclosing all collected spans). When the
        region is never found ``'position'`` and ``'page_num'`` are ``None``
        instead of the method raising ``ValueError`` on an empty box list.
    """
    signature_role_3 = self.init_item.copy()
    # Locate the signing region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    # The first page is skipped as a start of the region.
                    if '保证人1' in text and int(pno) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    position = None
    if boxes:
        # Treat every bbox as two (x, y) corners and take the overall extent.
        corners = np.array(boxes).reshape((-1, 2))
        xs, ys = corners[:, 0], corners[:, 1]
        position = [min(xs), min(ys), max(xs), max(ys)]

    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
549
def get_signature_role_4(self):
    """Report whether the guarantor-2 signing region appears to be signed.

    Spans are collected from the one containing ``'保证人2'`` on any page
    whose key is not 0 until a span containing ``'日期'``. More than four
    collected spans is reported as ``'有'``, otherwise ``'无'``.

    Returns:
        dict: A copy of ``self.init_item`` with ``'words'``, ``'page_num'``
        (key of the last page contributing a span) and ``'position'`` (the
        ``[x0, y0, x1, y1]`` box enclosing all collected spans). When the
        region is never found ``'position'`` and ``'page_num'`` are ``None``
        instead of the method raising ``ValueError`` on an empty box list.
    """
    signature_role_4 = self.init_item.copy()
    # Locate the signing region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    # The first page is skipped as a start of the region.
                    if '保证人2' in text and int(pno) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    position = None
    if boxes:
        # Treat every bbox as two (x, y) corners and take the overall extent.
        corners = np.array(boxes).reshape((-1, 2))
        xs, ys = corners[:, 0], corners[:, 1]
        position = [min(xs), min(ys), max(xs), max(ys)]

    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
584
def get_signature_role_5(self):
    """Report whether the witness signing region appears to be signed.

    Spans are collected from the one containing ``'见证人签字'`` on any page
    whose key is not 0 until a span containing ``'年'``. More than four
    collected spans is reported as ``'有'``, otherwise ``'无'``.

    Returns:
        dict: A copy of ``self.init_item`` with ``'words'``, ``'page_num'``
        (key of the last page contributing a span) and ``'position'`` (the
        ``[x0, y0, x1, y1]`` box enclosing all collected spans). When the
        region is never found ``'position'`` and ``'page_num'`` are ``None``
        instead of the method raising ``ValueError`` on an empty box list.
    """
    signature_role_5 = self.init_item.copy()
    # Locate the signing region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    # The first page is skipped as a start of the region.
                    if '见证人签字' in text and int(pno) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    # The stray debugging print of the collected texts was removed here.
    words = '有' if len(texts) > 4 else '无'

    position = None
    if boxes:
        # Treat every bbox as two (x, y) corners and take the overall extent.
        corners = np.array(boxes).reshape((-1, 2))
        xs, ys = corners[:, 0], corners[:, 1]
        position = [min(xs), min(ys), max(xs), max(ys)]

    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
620
def get_last_page_signature(self, page_num, top, bottom):
    """Extract the signer name and signing date between two marker spans.

    The top edge (``bbox[1]``) of the last span containing ``top`` and of
    the last span containing ``bottom`` bound the search band. A span
    containing ``'签署日期'`` whose vertical centre lies inside the band is
    split into the name (text before the first space) and the date (text
    after the last ``':'``).

    Args:
        page_num: Key of the page in ``self.pdf_info`` to scan.
        top: Text marking the upper boundary.
        bottom: Text marking the lower boundary.

    Returns:
        tuple: ``(signature_name, signature_date)`` copies of ``self.item``.
        Both receive the span bbox as ``'position'``; previously the date
        item never got its position because the name item was assigned
        twice.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    anchor_top = None
    anchor_bottom = None
    for bbox, text in spans:
        if top in text:
            anchor_top = bbox[1]
        if bottom in text:
            anchor_bottom = bbox[1]
    if anchor_top is None or anchor_bottom is None:
        return signature_name, signature_date

    for bbox, text in spans:
        if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
            signature_name['words'] = text.split(' ')[0]
            signature_name['position'] = bbox
            signature_date['words'] = text.split(':')[-1]
            signature_date['position'] = bbox
    return signature_name, signature_date
651
def get_electronic_signature(self, top, bottom):
    """Return the electronic signature stamp lying between two marker spans.

    All pages are scanned. The top edge (``bbox[1]``) of the last span
    containing ``top`` and of the last span containing ``bottom`` bound the
    search band; a span containing ``'签署日期'`` whose vertical centre lies
    inside the band is taken as the signature.

    Args:
        top: Text marking the upper boundary.
        bottom: Text marking the lower boundary.

    Returns:
        dict: A copy of ``self.item`` with ``'words'``, ``'page'`` and
        ``'position'`` of the last qualifying span; untouched when either
        marker is missing.
    """
    signature = self.item.copy()
    spans = [
        (pno, span['bbox'], span['text'])
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    upper = None
    lower = None
    for _pno, box, text in spans:
        if top in text:
            upper = box[1]
        if bottom in text:
            lower = box[1]
    if upper is None or lower is None:
        return signature

    for pno, box, text in spans:
        if '签署日期' not in text:
            continue
        if int(upper) < np.mean(box[1::2]) < int(lower):
            signature['words'] = text
            signature['page'] = pno
            signature['position'] = box
    return signature
681
def get_role_info(self, role_key, page_num='0'):
    """Extract name, ID number and representative for one contract role.

    The top-left corner of the last span starting with ``'保证人3'`` is used
    as the anchor that splits the page into four quadrants. The name is the
    span matching ``role_key`` at its start; the ID number and the
    representative are the ``'证件号码:'`` / ``'法定代表人或授权代表:'`` spans
    whose centre lies in the quadrant belonging to the role.

    Args:
        role_key: One of ``'承租人:'``, ``'保证人1:'``, ``'保证人2:'`` or
            ``'保证人3:'``; it is passed to ``re.match`` as the pattern.
        page_num: Key of the page in ``self.pdf_info`` to scan.

    Returns:
        tuple: ``(name, id_num, representative)`` copies of ``self.item``;
        all untouched when the anchor span is missing.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # The top-left corner of guarantor 3 is the reference point.
    anchor = None
    for box, text in spans:
        if re.match('保证人3', text) is not None:
            anchor = [box[0], box[1]]
    if anchor is None:
        return name, id_num, representative

    # role -> (centre is left of the anchor, centre is above the anchor)
    quadrants = {
        '承租人:': (True, True),
        '保证人1:': (True, False),
        '保证人2:': (False, True),
        '保证人3:': (False, False),
    }
    quadrant = quadrants.get(role_key)

    def in_quadrant(box):
        left, above = quadrant
        centre_x = np.mean(box[::2])
        centre_y = np.mean(box[1::2])
        horizontal = centre_x < anchor[0] if left else centre_x > anchor[0]
        vertical = centre_y < anchor[1] if above else centre_y > anchor[1]
        return horizontal and vertical

    fields = (
        ('证件号码:', id_num),
        ('法定代表人或授权代表:', representative),
    )
    for box, text in spans:
        # Role name.
        if re.match(role_key, text) is not None:
            name['words'] = text.split(':')[-1]
            name['page'] = page_num
            name['position'] = box
        if quadrant is None:
            continue
        # ID number and representative, disambiguated by position.
        for label, target in fields:
            if re.match(label, text) is not None and in_quadrant(box):
                target['words'] = text.split(':')[-1]
                target['page'] = page_num
                target['position'] = box
    return name, id_num, representative
765
def get_table_add_product(self):
    """Collect the vehicle add-on product table across all pages.

    Span texts are gathered from the span containing ``'总计'`` (inclusive)
    until a span containing ``'注:出租人向承租人购买租赁车辆的对价'``. The first
    two texts are the total label and amount; the rest are grouped three per
    row. The total is appended as the last row.

    Returns:
        dict: A copy of ``self.item`` whose ``'words'`` is the table (header
        row first), ``'page'`` the key of the last page on which the end
        marker was seen and ``'position'`` ``None``.
    """
    table = self.item.copy()
    cells = []
    collecting = False
    page = None
    for pno in self.pdf_info:
        spans = (
            span
            for block in self.pdf_info[f'{pno}']['blocks']
            if block['type'] == 0
            for line in block['lines']
            for span in line['spans']
        )
        for span in spans:
            _box, text = span['bbox'], span['text']
            if '总计' in text:
                collecting = True
            if '注:出租人向承租人购买租赁车辆的对价' in text:
                page = pno
                collecting = False
            if collecting:
                cells.append(text)

    rows = [['项目', '购买价格', '实际融资金额']]
    for row_idx in range(len(cells) // 3):
        # Detail cells start after the two cells of the total.
        base = 2 + row_idx * 3
        rows.append([cells[base], cells[base + 1], cells[base + 2]])

    if len(cells) > 0:
        rows.append([cells[0], '', cells[1]])

    table['words'] = rows
    table['page'] = page
    table['position'] = None
    return table
799
def get_contract_no_dy(self):
    """Find the mortgage contract number.

    The last span containing ``'抵押合同编号'`` on any page is the label. The
    number is a span containing ``'CH-'`` whose vertical centre lies inside
    the label's vertical extent.

    Returns:
        dict: A copy of ``self.item`` with ``'position'``, ``'page'`` and
        ``'words'`` of the last qualifying span; untouched when the label is
        missing.
    """
    contract_no = self.item.copy()
    spans = [
        (pno, span['bbox'], span['text'])
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    label_box = None
    for _pno, box, text in spans:
        if '抵押合同编号' in text:
            label_box = box
    if label_box is None:
        return contract_no

    for pno, box, text in spans:
        same_row = label_box[1] < np.mean(box[1::2]) < label_box[3]
        if same_row and 'CH-' in text:
            contract_no['position'] = box
            contract_no['page'] = pno
            contract_no['words'] = text
    return contract_no
828
def get_dyr_name_id(self):
    """Find the mortgagor's name and ID number.

    The last span equal to ``'抵押人'`` on any page is the label. Spans whose
    vertical centre lies between the label's top edge and three label
    heights below its bottom edge are inspected for ``'姓名'`` and
    ``'证件号码'``.

    Returns:
        tuple: ``(name, _id)`` copies of ``self.item`` with ``'position'``,
        ``'page'`` and ``'words'`` (text after the last ``':'``) filled in
        when found.
    """
    name = self.item.copy()
    id_item = self.item.copy()
    spans = [
        (pno, span['bbox'], span['text'])
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    label_box = None
    for _pno, box, text in spans:
        if text == '抵押人':
            label_box = box
    if label_box is None:
        return name, id_item

    row_height = abs(label_box[1] - label_box[3])
    lower_limit = label_box[3] + row_height * 3
    for pno, box, text in spans:
        for marker, target in (('姓名', name), ('证件号码', id_item)):
            if label_box[1] < np.mean(box[1::2]) < lower_limit and marker in text:
                value = text.split(':')[-1]
                target['position'] = box
                target['page'] = pno
                target['words'] = value
    return name, id_item
864
def get_key_value_position(self, key):
    """Find the value printed to the right of a label span, by position.

    The last span whose text equals ``key`` on any page is the label. A span
    qualifies as the value when its vertical centre lies inside the label's
    vertical extent, its left edge is right of the label's left edge and the
    gap between the label's right edge and its left edge is below ten label
    heights.

    Args:
        key: Exact text of the label span.

    Returns:
        dict: A copy of ``self.item`` with ``'position'``, ``'page'`` and
        ``'words'`` of the last qualifying span; untouched when the label is
        missing.
    """
    value = self.item.copy()
    spans = [
        (pno, span['bbox'], span['text'])
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    label_box = None
    for _pno, box, text in spans:
        if text == key:
            label_box = box
    if label_box is None:
        return value

    row_height = abs(label_box[1] - label_box[3])
    for pno, box, text in spans:
        if not (label_box[1] < np.mean(box[1::2]) < label_box[3]):
            continue
        if not (label_box[0] < box[0]):
            continue
        if abs(label_box[2] - box[0]) < row_height * 10:
            value['position'] = box
            value['page'] = pno
            value['words'] = text
    return value
894
def get_info(self):
    """Fill ``self.init_result`` with every field of the leaseback contract.

    Nothing is extracted when ``self.pdf_info`` is empty.

    Returns:
        dict: ``self.init_result``.
    """
    if len(self.pdf_info) > 0:
        # Contract number on page 1.
        self.init_result['合同编号'] = self.get_contract_no(page_num='0')

        # Name, ID number and representative of the four roles on page 1.
        for role in ('承租人', '保证人1', '保证人2', '保证人3'):
            name, id_num, representative = self.get_role_info(role_key=f'{role}:', page_num='0')
            self.init_result[f'{role}-姓名'] = name
            self.init_result[f'{role}-证件号码'] = id_num
            self.init_result[f'{role}-法定代表人或授权代表'] = representative

        # Contract number in the body text (part two: main leasing terms and
        # payment plan), searched on all pages; it may wrap across lines, so
        # no position is reported for now.
        self.init_result['合同编号(正文)'] = self.get_contract_no_one()

        # VIN, dealer and original vehicle sales price.
        self.init_result['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')
        self.init_result['车辆卖方(经销商)'] = self.get_key_value(key='车辆卖方(经销商):')
        self.init_result['车辆原始销售价格(《机动车销售统一发票》所列金额)'] = self.get_key_value(
            key='车辆原始销售价格(《机动车销售统一发票》所列金额):')

        # Add-on product table.
        self.init_result['车辆附加产品明细表'] = self.get_table_add_product()

        # Total financing cost and lease term.
        self.init_result['融资成本总额'] = self.get_key_value(key='融资成本总额:')
        self.init_result['租期'] = self.get_key_value(key='租期:')

        # Payment schedule table.
        self.init_result['付款计划表'] = self.get_repayment_schedule()

        # Bank account holder, number and bank.
        self.init_result['银行账户-户名'] = self.get_key_value(key='户名:')
        self.init_result['银行账户-银行账号'] = self.get_key_value(key='银行账号:')
        self.init_result['银行账户-开户行'] = self.get_key_value(key='开户银行:')

        # Signature page: name and electronic seal of each signer; every
        # signer's band ends at the next signer's name label.
        signers = ('承租人', '保证人1', '保证人2', '保证人3')
        lower_bounds = ('保证人1姓名:', '保证人2姓名:', '保证人3姓名:', '日期:')
        for signer, lower in zip(signers, lower_bounds):
            label = f'{signer}姓名:'
            name = self.get_key_value(key=label)
            electronic_signature = self.get_electronic_signature(top=label, bottom=lower)
            self.init_result[f'签字页-{signer}姓名'] = name
            self.init_result[f'签字页-{signer}签章'] = electronic_signature

    # NOTE(review): the return is assumed to sit outside the guard above;
    # confirm against the original indentation.
    return self.init_result
977
978 # results['is_shhz_contract'] = True
979 # results['pdf_info'] = self.init_result
980
981 # return results
982
def get_info_1(self):
    """Fill ``self.init_result_1`` with the fields of the second document type.

    Nothing is extracted when ``self.pdf_info`` is empty. The dealer seal is
    not extracted yet.

    Returns:
        dict: ``self.init_result_1``.
    """
    if len(self.pdf_info) > 0:
        # (result key, getter, keyword arguments), in extraction order.
        steps = (
            ('合同编号', self.get_contract_no, {'page_num': '0'}),
            # Lessee name and ID number on the first page.
            ('承租人-姓名', self.get_key_value, {'key': '承租人:', 'page_num': '0'}),
            ('承租人-证件号码', self.get_key_value, {'key': '证件号码:', 'page_num': '0'}),
            # Selling dealer on the first page.
            ('销售经销商', self.get_key_value, {'key': '销售经销商:', 'page_num': '0'}),
            # Contract number in the body text.
            ('合同编号(正文)', self.get_contract_no_one, {}),
            # Signature page: lessee name, ID number and seal.
            ('签字页-承租人姓名', self.get_key_value, {'key': '姓名/名称:'}),
            ('签字页-承租人证件号码', self.get_key_value, {'key': '自然人身份证件号码/法人执照号码:'}),
            ('签字页-承租人签章', self.get_signature_role_1, {}),
            # Signature page: selling dealer.
            ('签字页-销售经销商', self.get_key_value, {'key': '销售经销商:'}),
        )
        for result_key, getter, kwargs in steps:
            self.init_result_1[result_key] = getter(**kwargs)
    # NOTE(review): the return is assumed to sit outside the guard above;
    # confirm against the original indentation.
    return self.init_result_1
1014
def get_info_2(self):
    """Fill ``self.init_result_2`` with the fields of the mortgage contract.

    Nothing is extracted when ``self.pdf_info`` is empty.

    Returns:
        dict: ``self.init_result_2``.
    """
    if len(self.pdf_info) > 0:
        # Contract number from the header and from the body text.
        self.init_result_2['合同编号'] = self.get_contract_no_dy()
        self.init_result_2['合同编号(正文)'] = self.get_contract_no_one()

        # Mortgagor name and ID number.
        name, _id = self.get_dyr_name_id()
        self.init_result_2['抵押人姓名/名称'] = name
        self.init_result_2['抵押人证件号码'] = _id

        # VIN, total rent and lease term.
        self.init_result_2['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')
        self.init_result_2['租金总额'] = self.get_key_value_position(key='租金总额')
        self.init_result_2['融资租赁期限'] = self.get_key_value_position(key='融资租赁期限')

        # Signature page: name and electronic seal of the mortgagor and of
        # the mortgagor's spouse, each with its own vertical band.
        signers = (
            ('抵押人', '抵押权人盖章', '抵押人配偶姓名:'),
            ('抵押人配偶', '抵押人配偶姓名:', '日期'),
        )
        for signer, upper, lower in signers:
            name = self.get_key_value(key=f'{signer}姓名:')
            electronic_signature = self.get_electronic_signature(top=upper, bottom=lower)
            self.init_result_2[f'签字页-{signer}姓名'] = name
            self.init_result_2[f'签字页-{signer}签章'] = electronic_signature
    # NOTE(review): the return is assumed to sit outside the guard above;
    # confirm against the original indentation.
    return self.init_result_2
1 # -*- coding: utf-8 -*-
2 # @Author : lk
3 # @Email : 9428.al@gmail.com
4 # @Created Date : 2021-06-29 17:43:46
5 # @Last Modified : 2021-11-03 16:07:36
6 # @Description :
7
8 from .get_char import Finder
9
10
def predict(pdf_info, file_cls):
    """Extract the structured fields of one HIL electronic contract.

    Args:
        pdf_info (dict): Per-page text layout keyed by the zero-based page
            index as a string; each page holds a ``'blocks'`` list.
        file_cls (int): Document type. 0: 售后回租合同 (sale-and-leaseback
            contract); 1: 车辆处置协议 (vehicle disposal agreement);
            2: 车辆租赁抵押合同 (vehicle lease mortgage contract).

    Returns:
        dict: The result of the matching ``Finder`` extractor, with every
        non-``None`` ``'page'`` rewritten to ``'page_<n>'`` (one-based).

    Raises:
        ValueError: If ``file_cls`` is not 0, 1 or 2. Previously this case
            only surfaced as an ``UnboundLocalError`` after all pages had
            been scanned.
    """
    extractor_names = {0: 'get_info', 1: 'get_info_1', 2: 'get_info_2'}
    if file_cls not in extractor_names:
        raise ValueError(f'unsupported file_cls: {file_cls!r}')

    # Group pages by the marker text they carry. A page is appended once per
    # matching span, exactly as before.
    leaseback_pages = []
    disposal_pages = []
    for pno in pdf_info:
        for block in pdf_info[f'{pno}']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    text = span['text']
                    if '售后回租合同_' in text:
                        leaseback_pages.append(pdf_info[pno])
                    if '售后回租合同附件一' in text:
                        disposal_pages.append(pdf_info[pno])

    # Four disposal-agreement pages next to leaseback pages mean the input
    # bundles both documents; only the agreement pages are then analysed,
    # renumbered from zero.
    is_clczxy = len(disposal_pages) == 4 and file_cls == 1 and len(leaseback_pages) != 0
    if is_clczxy:
        pdf_info = {str(pno): page_info for pno, page_info in enumerate(disposal_pages)}

    finder = Finder(pdf_info)
    results = getattr(finder, extractor_names[file_cls])()

    # Pages become one-based. In the bundled case they are additionally
    # shifted by the hard-coded offset of 6 (presumably the number of pages
    # preceding the agreement -- TODO confirm).
    offset = 1 + (6 if is_clczxy else 0)
    for key in results:
        if results[key]['page'] is not None:
            results[key]['page'] = 'page_' + str(int(results[key]['page']) + offset)
    return results
1 import pyodbc
2
# DDL for the AFC contract table and its application_id index.
afc_sql = """
create table afc_contract
(
    id             bigint identity primary key,
    application_id nvarchar(64) not null,
    create_time    datetime not null
);

create index afc_contract_application_id_index
    on afc_contract (application_id);
"""

# DDL for the HIL contract table and its application_id index.
hil_sql = """
create table hil_contract
(
    id             bigint identity primary key,
    application_id nvarchar(64) not null,
    create_time    datetime not null
);

create index hil_contract_application_id_index
    on hil_contract (application_id);
"""

# Each connection and cursor is closed in a ``finally`` so a failing
# statement no longer leaves them open.
hil_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True)
try:
    hil_cursor = hil_cnxn.cursor()
    try:
        hil_cursor.execute(hil_sql)
    finally:
        hil_cursor.close()
finally:
    hil_cnxn.close()

afc_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True)
try:
    afc_cursor = afc_cnxn.cursor()
    try:
        afc_cursor.execute(afc_sql)
    finally:
        afc_cursor.close()
finally:
    afc_cnxn.close()
1 import os 1 import os
2 import json
2 import cv2 3 import cv2
3 import shutil 4 import shutil
4 import fitz 5 import fitz
...@@ -35,6 +36,8 @@ class PDFHandler: ...@@ -35,6 +36,8 @@ class PDFHandler:
35 self.suffix = self.get_suffix(document_name) 36 self.suffix = self.get_suffix(document_name)
36 self.is_ebank = False 37 self.is_ebank = False
37 self.page_text_list = [] 38 self.page_text_list = []
39 self.pdf_info = {}
40 self.img_path_pno_list = []
38 41
39 def get_suffix(self, file_name): 42 def get_suffix(self, file_name):
40 if file_name is None: 43 if file_name is None:
...@@ -296,6 +299,17 @@ class PDFHandler: ...@@ -296,6 +299,17 @@ class PDFHandler:
296 self.is_ebank = True 299 self.is_ebank = True
297 self.page_text_list = page_text_list 300 self.page_text_list = page_text_list
298 301
def e_contract_process(self):
    """Collect the JSON text layout and a PNG rendering of every page.

    For each page of the PDF at ``self.path`` the parsed result of
    ``page.getText('json')`` is stored in ``self.pdf_info`` under the
    zero-based page index as a string. The page is rendered to the path
    returned by ``self.get_img_save_path`` and ``(path, 'page_<n>')`` with a
    one-based ``n`` is appended to ``self.img_path_pno_list``.
    """
    with fitz.Document(self.path) as document:
        for index in range(document.pageCount):
            current = document.loadPage(index)
            layout = json.loads(current.getText('json'))
            self.pdf_info[str(index)] = layout

            pixmap = current.getPixmap()
            target = self.get_img_save_path(current.number)
            label = 'page_{0}'.format(str(index+1))
            # The entry is recorded before the image is written, as before.
            self.img_path_pno_list.append((target, label))
            pixmap.writePNG(target)
312
299 def extract_image(self, max_img_count=None): 313 def extract_image(self, max_img_count=None):
300 self.img_path_list = [] 314 self.img_path_list = []
301 self.xref_set = set() 315 self.xref_set = set()
......
...@@ -14,3 +14,5 @@ DEALER_CODE = ocr_group ...@@ -14,3 +14,5 @@ DEALER_CODE = ocr_group
14 14
15 BASE_URL = https://li19dkocruat02vm.bmwgroup.net 15 BASE_URL = https://li19dkocruat02vm.bmwgroup.net
16 16
17 DELAY_SECONDS = 60
18
......
...@@ -12,4 +12,6 @@ EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/Dow ...@@ -12,4 +12,6 @@ EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/Dow
12 EDMS_UPLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/UploadHandler.ashx 12 EDMS_UPLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/UploadHandler.ashx
13 DEALER_CODE = ocr_situ_group 13 DEALER_CODE = ocr_situ_group
14 14
15 BASE_URL = https://staging-bmw-ocr.situdata.com
...\ No newline at end of file ...\ No newline at end of file
15 BASE_URL = https://staging-bmw-ocr.situdata.com
16
17 DELAY_SECONDS = 60
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -12,4 +12,6 @@ EDMS_DOWNLOAD_URL = http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/ ...@@ -12,4 +12,6 @@ EDMS_DOWNLOAD_URL = http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/
12 EDMS_UPLOAD_URL = http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/UploadHandler.ashx 12 EDMS_UPLOAD_URL = http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/UploadHandler.ashx
13 DEALER_CODE = ocr_situ_group 13 DEALER_CODE = ocr_situ_group
14 14
15 BASE_URL = https://li19dkocruat01vm.bmwgroup.net
...\ No newline at end of file ...\ No newline at end of file
15 BASE_URL = https://li19dkocruat01vm.bmwgroup.net
16
17 DELAY_SECONDS = 60
...\ No newline at end of file ...\ No newline at end of file
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!