9f4b3645 by 周伟奇

Merge branch 'feature/uat-tmp' into 'master'

Feature/uat tmp

See merge request !18
2 parents 284b3e1d 66e1870b
...@@ -10,8 +10,8 @@ PAGE_SIZE_DEFAULT = 10 ...@@ -10,8 +10,8 @@ PAGE_SIZE_DEFAULT = 10
10 10
11 FIXED_APPLICATION_ID_PREFIX = 'CH-S' 11 FIXED_APPLICATION_ID_PREFIX = 'CH-S'
12 12
13 DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT'] 13 DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT', 'INSURANCE']
14 DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT'] 14 DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT', 'OVP']
15 COMPARE_DOC_SCHEME_LIST = ['CA', 'SE'] 15 COMPARE_DOC_SCHEME_LIST = ['CA', 'SE']
16 16
17 HIL_PREFIX = 'HIL' 17 HIL_PREFIX = 'HIL'
...@@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44 ...@@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44
1057 HIL_CONTRACT_3_CN_NAME = '车辆处置协议' 1057 HIL_CONTRACT_3_CN_NAME = '车辆处置协议'
1058 HIL_CONTRACT_3_CLASSIFY = 45 1058 HIL_CONTRACT_3_CLASSIFY = 45
1059 1059
1060 CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY} 1060 FSM_CONTRACT_WEP_CN_NAME = '延长保修合同'
1061 FSM_CONTRACT_WEP_CLASSIFY = 51
1062
1063 FSM_CONTRACT_MSI_CN_NAME = '长悦保养合同'
1064 FSM_CONTRACT_MSI_CLASSIFY = 52
1065
1066 FSM_CONTRACT_SC_CN_NAME = '汽车销售合同'
1067 FSM_CONTRACT_SC_CLASSIFY = 53
1068
1069 CONTRACT_SET = {
1070 CONTRACT_QRS_CLASSIFY,
1071 CONTRACT_CLASSIFY,
1072 HIL_CONTRACT_1_CLASSIFY,
1073 HIL_CONTRACT_2_CLASSIFY,
1074 HIL_CONTRACT_3_CLASSIFY,
1075 FSM_CONTRACT_WEP_CLASSIFY,
1076 FSM_CONTRACT_MSI_CLASSIFY,
1077 FSM_CONTRACT_SC_CLASSIFY,
1078 }
1061 1079
1062 CONTRACT_MAP = { 1080 CONTRACT_MAP = {
1063 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, 1081 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME,
...@@ -1065,8 +1083,13 @@ CONTRACT_MAP = { ...@@ -1065,8 +1083,13 @@ CONTRACT_MAP = {
1065 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, 1083 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME,
1066 CONTRACT_CLASSIFY: CONTRACT_CN_NAME, 1084 CONTRACT_CLASSIFY: CONTRACT_CN_NAME,
1067 CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME, 1085 CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME,
1086 FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_CN_NAME,
1087 FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_CN_NAME,
1088 FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_CN_NAME,
1068 } 1089 }
1069 1090
1091 FSM_CONTRACT_CLASSIFY_SET = {FSM_CONTRACT_WEP_CLASSIFY, FSM_CONTRACT_MSI_CLASSIFY, FSM_CONTRACT_SC_CLASSIFY}
1092
1070 # 保单 1093 # 保单
1071 INSURANCE_CN_NAME = '保单' 1094 INSURANCE_CN_NAME = '保单'
1072 INSURANCE_CLASSIFY = 42 1095 INSURANCE_CLASSIFY = 42
...@@ -1215,6 +1238,11 @@ BS_FIELD = 'bss_ocr' ...@@ -1215,6 +1238,11 @@ BS_FIELD = 'bss_ocr'
1215 HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' 1238 HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr'
1216 HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr' 1239 HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr'
1217 HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr' 1240 HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr'
1241 FSM_CONTRACT_WEP_FIELD = 'fsm_wep_ocr'
1242 FSM_CONTRACT_MSI_FIELD = 'fsm_msi_ocr'
1243 FSM_CONTRACT_SC_FIELD = 'fsm_sc_ocr'
1244
1245
1218 BS_CLASSIFY = 10089 1246 BS_CLASSIFY = 10089
1219 1247
1220 RESULT_MAPPING = { 1248 RESULT_MAPPING = {
...@@ -1239,6 +1267,9 @@ RESULT_MAPPING = { ...@@ -1239,6 +1267,9 @@ RESULT_MAPPING = {
1239 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, 1267 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD,
1240 HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD, 1268 HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD,
1241 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD, 1269 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD,
1270 FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_FIELD,
1271 FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_FIELD,
1272 FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_FIELD,
1242 } 1273 }
1243 1274
1244 CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD) 1275 CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD)
...@@ -1511,6 +1542,9 @@ SE_AFC_CON_MAP = { ...@@ -1511,6 +1542,9 @@ SE_AFC_CON_MAP = {
1511 '还款账号': (2, 2, '还款账户', '账号'), 1542 '还款账号': (2, 2, '还款账户', '账号'),
1512 '户名': (2, 2, '还款账户', '户名'), 1543 '户名': (2, 2, '还款账户', '户名'),
1513 '开户行': (2, 2, '还款账户', '开户行'), 1544 '开户行': (2, 2, '还款账户', '开户行'),
1545 '收款账号': (2, 2, '借款人收款账户', '账号'),
1546 '收款户名': (2, 2, '借款人收款账户', '户名'),
1547 '收款开户行': (2, 2, '借款人收款账户', '开户行'),
1514 1548
1515 '借款人签字及时间': (1, 1, '借款人签字及时间', None), 1549 '借款人签字及时间': (1, 1, '借款人签字及时间', None),
1516 1550
...@@ -1550,9 +1584,12 @@ SE_HIL_CON_1_MAP = { ...@@ -1550,9 +1584,12 @@ SE_HIL_CON_1_MAP = {
1550 '融资成本总额': (5, 4, 7, '融资成本总额', None), 1584 '融资成本总额': (5, 4, 7, '融资成本总额', None),
1551 '租期': (5, 4, 7, '租期', None), 1585 '租期': (5, 4, 7, '租期', None),
1552 '还款计划表': (5, 5, 7, '付款计划表', None), 1586 '还款计划表': (5, 5, 7, '付款计划表', None),
1553 '还款账号': (5, 5, 7, '银行账户-银行账号', None), 1587 '还款账号': (5, 6, 7, '银行账户-银行账号', None),
1554 '户名': (5, 5, 7, '银行账户-户名', None), 1588 '户名': (5, 6, 7, '银行账户-户名', None),
1555 '开户行': (5, 5, 7, '银行账户-开户行', None), 1589 '开户行': (5, 6, 7, '银行账户-开户行', None),
1590 '收款账号': (5, 5, 7, '收款银行账户-银行账号', None),
1591 '收款户名': (5, 5, 7, '收款银行账户-户名', None),
1592 '收款开户行': (5, 5, 7, '收款银行账户-开户行', None),
1556 'ASP项目详情': (5, 4, 7, '车辆附加产品明细表', None), 1593 'ASP项目详情': (5, 4, 7, '车辆附加产品明细表', None),
1557 '承租人法定代表人或授权代表': (1, 1, 7, '承租人-法定代表人或授权代表', None), 1594 '承租人法定代表人或授权代表': (1, 1, 7, '承租人-法定代表人或授权代表', None),
1558 '共同承租人法定代表人或授权代表': (1, 1, 7, '共同承租人-法定代表人或授权代表', None), 1595 '共同承租人法定代表人或授权代表': (1, 1, 7, '共同承租人-法定代表人或授权代表', None),
...@@ -1608,6 +1645,39 @@ SE_HIL_CON_MAP = { ...@@ -1608,6 +1645,39 @@ SE_HIL_CON_MAP = {
1608 HIL_CONTRACT_3_CLASSIFY: SE_HIL_CON_3_MAP, 1645 HIL_CONTRACT_3_CLASSIFY: SE_HIL_CON_3_MAP,
1609 } 1646 }
1610 1647
1648 SE_FSM_WEP_MAP = {
1649 '客户姓名': (1, '客户姓名'),
1650 '证件类型': (1, '证件类型'),
1651 '证件号码': (1, '证件号码'),
1652 '合同价格(小写)': (1, '合同价格(小写)'),
1653 '客户签名': (1, '客户签名'),
1654 '签单日期': (1, '签单日期'),
1655 }
1656
1657 SE_FSM_MSI_MAP = {
1658 '客户姓名': (1, '客户姓名'),
1659 '证件类型': (1, '证件类型'),
1660 '证件号码': (1, '证件号码'),
1661 '合同价格(小写)': (1, '合同价格(小写)'),
1662 '客户签名': (2, '客户签名'),
1663 '签单日期': (2, '签单日期'),
1664 }
1665
1666 SE_FSM_SC_MAP = {
1667 '姓名': (1, '姓名'),
1668 '证件类型': (1, '证件类型'),
1669 '证件号码': (1, '证件号码'),
1670 '总价': (1, '总价'),
1671 '客户签名': (12, '客户签名'),
1672 '签单日期': (12, '签单日期'),
1673 }
1674
1675 SE_FSM_CON_MAP = {
1676 FSM_CONTRACT_WEP_CLASSIFY: SE_FSM_WEP_MAP,
1677 FSM_CONTRACT_MSI_CLASSIFY: SE_FSM_MSI_MAP,
1678 FSM_CONTRACT_SC_CLASSIFY: SE_FSM_SC_MAP,
1679 }
1680
1611 SE_AFC_CON_QRS_FIELD = ['合同编号'] 1681 SE_AFC_CON_QRS_FIELD = ['合同编号']
1612 SE_AFC_CON_FIELD = ['合同编号-每页', '所购车辆价格-小写-重要条款', '车架号-重要条款', '贷款本金金额-重要条款', '贷款期限-重要条款', 1682 SE_AFC_CON_FIELD = ['合同编号-每页', '所购车辆价格-小写-重要条款', '车架号-重要条款', '贷款本金金额-重要条款', '贷款期限-重要条款',
1613 '车辆贷款本金金额-重要条款', '附加产品融资贷款本金总额-重要条款', '所购车辆价格', '车架号', '经销商', 1683 '车辆贷款本金金额-重要条款', '附加产品融资贷款本金总额-重要条款', '所购车辆价格', '车架号', '经销商',
...@@ -2314,29 +2384,42 @@ APPLICANT_TYPE_MAP = { ...@@ -2314,29 +2384,42 @@ APPLICANT_TYPE_MAP = {
2314 2384
2315 APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] 2385 APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager']
2316 2386
2317 FILE_NAME_PREFIX_MAP = { 2387 # FILE_NAME_PREFIX_MAP = {
2318 AFC_PREFIX: [ 2388 # AFC_PREFIX: [
2319 ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'), 2389 # ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'),
2320 ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), 2390 # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
2321 ], 2391 # ],
2322 HIL_PREFIX: [ 2392 # HIL_PREFIX: [
2323 ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'), 2393 # ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'),
2324 ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'), 2394 # ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'),
2325 ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), 2395 # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
2326 ] 2396 # ]
2327 } 2397 # }
2328 2398
2329 ECONTRACT_KEYWORDS_MAP = { 2399 ECONTRACT_KEYWORDS_MAP = {
2330 AFC_PREFIX: [ 2400 AFC_PREFIX: [
2331 ('抵押贷款合同', CONTRACT_CLASSIFY), 2401 ('抵押贷款合同', CONTRACT_CLASSIFY),
2332 ('送达地址确认书', CONTRACT_QRS_CLASSIFY), 2402 ('送达地址确认书', CONTRACT_QRS_CLASSIFY),
2333 # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), 2403 ('抵押登记豁免函', HMH_CLASSIFY),
2334 ], 2404 ],
2335 HIL_PREFIX: [ 2405 HIL_PREFIX: [
2336 ('售后回租合同', HIL_CONTRACT_1_CLASSIFY), 2406 ('售后回租合同', HIL_CONTRACT_1_CLASSIFY),
2337 ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY), 2407 ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY),
2338 ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY), 2408 ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY),
2339 # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), 2409 ('抵押登记豁免函', HMH_CLASSIFY),
2410 ]
2411 }
2412
2413 FSM_ECONTRACT_KEYWORDS_MAP = {
2414 AFC_PREFIX: [
2415 ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY),
2416 ('长悦保养套餐服务合约', FSM_CONTRACT_MSI_CLASSIFY),
2417 ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
2418 ],
2419 HIL_PREFIX: [
2420 ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY),
2421 ('长悦保养套餐服务合同', FSM_CONTRACT_MSI_CLASSIFY),
2422 ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
2340 ] 2423 ]
2341 } 2424 }
2342 2425
...@@ -2346,6 +2429,12 @@ HIL_CONTRACT_TYPE_MAP = { ...@@ -2346,6 +2429,12 @@ HIL_CONTRACT_TYPE_MAP = {
2346 str(HIL_CONTRACT_3_CLASSIFY): 1, 2429 str(HIL_CONTRACT_3_CLASSIFY): 1,
2347 } 2430 }
2348 2431
2432 FSM_CONTRACT_TYPE_MAP = {
2433 str(FSM_CONTRACT_WEP_CLASSIFY): 0,
2434 str(FSM_CONTRACT_MSI_CLASSIFY): 1,
2435 str(FSM_CONTRACT_SC_CLASSIFY): 2,
2436 }
2437
2349 RESULT_MAP = { 2438 RESULT_MAP = {
2350 0: None, 2439 0: None,
2351 1: True, 2440 1: True,
...@@ -2379,3 +2468,26 @@ MPOS_MAP = { ...@@ -2379,3 +2468,26 @@ MPOS_MAP = {
2379 } 2468 }
2380 2469
2381 FOLDER_WSC_CLASSIFY = 199 2470 FOLDER_WSC_CLASSIFY = 199
2471
2472
2473 FSM_BEFORE_ACTIVITED_STATUS = {
2474 "APSVD": "Saved",
2475 "APEAE": "E-app Editing",
2476 "APADA": "Awaiting Dealer Action",
2477 "APAPR": "Acceptance Processing",
2478 "APPSB": "Pre-submit Processed",
2479 "APSBT": "Submitted",
2480 "APAPP": "Approved",
2481 "APHOC": "Held Offer-Docs",
2482 "APHOD": "Held Offer-Data",
2483 "APINI": "Initiated",
2484 "APSEP": "Settlement Processing"
2485 }
2486
2487 FSM_ACTIVITED_STATUS = {
2488 "APADF": "Activated-Document Follow up",
2489 "APASC": "Activated-Awaiting Settlement Check",
2490 "APIPN": "Activated-Invoice Passed-Non PT",
2491 "APIPP": "Activated-Invoice Passed-PT Doc Required",
2492 "APARD": "Activated-Review done",
2493 }
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g ...@@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g
20 from common.tools.pdf_to_img import PDFHandler 20 from common.tools.pdf_to_img import PDFHandler
21 from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict 21 from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict
22 from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict 22 from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict
23 from common.fsm_econtract.fsm_contract_ocr import predict as fsm_predict
24 from common.fsm_econtract.hmh_ocr import predict as hmh_predict
23 from apps.doc import consts 25 from apps.doc import consts
24 # from apps.doc.ocr.edms import EDMS, rh 26 # from apps.doc.ocr.edms import EDMS, rh
25 from apps.doc.ocr.ecm import ECM, rh 27 from apps.doc.ocr.ecm import ECM, rh
...@@ -40,8 +42,10 @@ from apps.doc.models import ( ...@@ -40,8 +42,10 @@ from apps.doc.models import (
40 DDARecords, 42 DDARecords,
41 IDBCRecords, 43 IDBCRecords,
42 Configs, 44 Configs,
45 AFCCmsStatusInfo,
46 HILCmsStatusInfo,
43 ) 47 )
44 from celery_compare.tasks import compare 48 from celery_compare.tasks import compare, fsm_compare
45 49
46 50
47 class Command(BaseCommand, LoggerMixin): 51 class Command(BaseCommand, LoggerMixin):
...@@ -996,7 +1000,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -996,7 +1000,7 @@ class Command(BaseCommand, LoggerMixin):
996 res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( 1000 res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get(
997 consts.ALL_POSITION_KEY, {}).get(key1, []) 1001 consts.ALL_POSITION_KEY, {}).get(key1, [])
998 license_summary[classify] = [res] 1002 license_summary[classify] = [res]
999 else: 1003 elif classify in consts.SE_HIL_CON_MAP:
1000 res = {} 1004 res = {}
1001 for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): 1005 for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items():
1002 if pno1 is None: 1006 if pno1 is None:
...@@ -1020,7 +1024,14 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1020,7 +1024,14 @@ class Command(BaseCommand, LoggerMixin):
1020 res[key] = tmp_res 1024 res[key] = tmp_res
1021 res.setdefault(consts.IMG_PATH_KEY, dict())[key] = page_info_dict.get(str(img_pno), {}).get( 1025 res.setdefault(consts.IMG_PATH_KEY, dict())[key] = page_info_dict.get(str(img_pno), {}).get(
1022 consts.IMG_PATH_KEY, '') 1026 consts.IMG_PATH_KEY, '')
1027 license_summary[classify] = [res]
1023 1028
1029 elif classify in consts.SE_FSM_CON_MAP:
1030 res = {}
1031 for key, (pno1, key1) in consts.SE_FSM_CON_MAP[classify].items():
1032 res[key] = page_info_dict.get(str(pno1), {}).get(key1)
1033 res.setdefault(consts.IMG_PATH_KEY, dict())[key] = page_info_dict.get(str(pno1), {}).get(
1034 consts.IMG_PATH_KEY, '')
1024 license_summary[classify] = [res] 1035 license_summary[classify] = [res]
1025 1036
1026 def rebuild_bs_summary(self, bs_summary, unknown_summary): 1037 def rebuild_bs_summary(self, bs_summary, unknown_summary):
...@@ -1442,7 +1453,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1442,7 +1453,7 @@ class Command(BaseCommand, LoggerMixin):
1442 self.log_base, traceback.format_exc())) 1453 self.log_base, traceback.format_exc()))
1443 error_list.append(1) 1454 error_list.append(1)
1444 return 1455 return
1445 else: # e-contract 1456 else: # e-contract or e-fsm-contract or e-hmh
1446 try: 1457 try:
1447 # pdf下载 处理 图片存储 识别 1458 # pdf下载 处理 图片存储 识别
1448 for times in range(consts.RETRY_TIMES): 1459 for times in range(consts.RETRY_TIMES):
...@@ -1472,8 +1483,10 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1472,8 +1483,10 @@ class Command(BaseCommand, LoggerMixin):
1472 self.online_log.error('{0} [process error (db save)] [error={1}]'.format( 1483 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1473 self.log_base, traceback.format_exc())) 1484 self.log_base, traceback.format_exc()))
1474 1485
1486 # AFC合同
1475 if classify_1_str == str(consts.CONTRACT_CLASSIFY): 1487 if classify_1_str == str(consts.CONTRACT_CLASSIFY):
1476 ocr_result = afc_predict(pdf_handler.pdf_info) 1488 is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3]
1489 ocr_result = afc_predict(pdf_handler.pdf_info, is_fsm=is_fsm)
1477 page_res = {} 1490 page_res = {}
1478 for page_num, page_info in ocr_result.get('page_info', {}).items(): 1491 for page_num, page_info in ocr_result.get('page_info', {}).items():
1479 if isinstance(page_num, str) and page_num.startswith('page_'): 1492 if isinstance(page_num, str) and page_num.startswith('page_'):
...@@ -1483,6 +1496,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1483,6 +1496,7 @@ class Command(BaseCommand, LoggerMixin):
1483 'page_num': page_num, 1496 'page_num': page_num,
1484 'page_info': page_info 1497 'page_info': page_info
1485 } 1498 }
1499 # 送达地址确认书
1486 elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY): 1500 elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY):
1487 ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True) 1501 ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True)
1488 page_num = 'page_1' 1502 page_num = 'page_1'
...@@ -1493,9 +1507,11 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1493,9 +1507,11 @@ class Command(BaseCommand, LoggerMixin):
1493 'page_info': ocr_result.pop(page_num, {}) 1507 'page_info': ocr_result.pop(page_num, {})
1494 } 1508 }
1495 } 1509 }
1496 else: 1510 # HIL合同
1511 elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP:
1512 is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3]
1497 file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) 1513 file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str)
1498 ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) 1514 ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1, is_fsm=is_fsm)
1499 rebuild_res_1 = {} 1515 rebuild_res_1 = {}
1500 page_res = {} 1516 page_res = {}
1501 for field_name, field_info in ocr_result_1.items(): 1517 for field_name, field_info in ocr_result_1.items():
...@@ -1508,9 +1524,36 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1508,9 +1524,36 @@ class Command(BaseCommand, LoggerMixin):
1508 'page_num': page_num, 1524 'page_num': page_num,
1509 'page_info': page_info 1525 'page_info': page_info
1510 } 1526 }
1527 # FSM合同 WEP MSI SC
1528 elif classify_1_str in consts.FSM_CONTRACT_TYPE_MAP:
1529 file_type = consts.FSM_CONTRACT_TYPE_MAP.get(classify_1_str)
1530 ocr_result = fsm_predict(pdf_handler.pdf_info, file_type)
1531 page_res = {}
1532 for page_num, page_info in ocr_result.items():
1533 if isinstance(page_num, str) and page_num.startswith('page_'):
1534 page_res[page_num] = {
1535 'classify': int(classify_1_str),
1536 'page_num': page_num,
1537 'page_info': page_info
1538 }
1539 # hmh
1540 # else:
1541 # pass
1542
1511 1543
1512 contract_res = {} 1544 contract_res = {}
1513 for img_path_tmp, page_key in pdf_handler.img_path_pno_list: 1545 for img_path_tmp, page_key in pdf_handler.img_path_pno_list:
1546 if classify_1_str == str(consts.HMH_CLASSIFY):
1547 img_contract_res = {
1548 'code': 1,
1549 'data': [
1550 {
1551 'classify': consts.HMH_CLASSIFY,
1552 'data': hmh_predict(pdf_handler.pdf_info)
1553 }
1554 ]
1555 }
1556 else:
1514 if page_key in page_res: 1557 if page_key in page_res:
1515 img_contract_res = { 1558 img_contract_res = {
1516 'code': 1, 1559 'code': 1,
...@@ -1966,6 +2009,9 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1966,6 +2009,9 @@ class Command(BaseCommand, LoggerMixin):
1966 report_list[5] = BSCheckResult.CHECK_FAILED.value 2009 report_list[5] = BSCheckResult.CHECK_FAILED.value
1967 2010
1968 finally: 2011 finally:
2012 self.online_log.info('{0} [task={1}] [license_summary={2}] '
2013 '[contract_result_compare={3}]'.format(self.log_base, task_str,
2014 license_summary, contract_result_compare))
1969 self.rebuild_contract(license_summary, contract_result_compare) 2015 self.rebuild_contract(license_summary, contract_result_compare)
1970 2016
1971 bs_rebuild = self.rebuild_bs(merged_bs_summary) 2017 bs_rebuild = self.rebuild_bs(merged_bs_summary)
...@@ -2015,6 +2061,16 @@ class Command(BaseCommand, LoggerMixin): ...@@ -2015,6 +2061,16 @@ class Command(BaseCommand, LoggerMixin):
2015 self.log_base, task_str, res_obj.id)) 2061 self.log_base, task_str, res_obj.id))
2016 # 触发比对 2062 # 触发比对
2017 try: 2063 try:
2064 # 是否fsm
2065 cms_status_class = HILCmsStatusInfo if business_type in consts.HIL_SET else AFCCmsStatusInfo
2066 cms_status_info = cms_status_class.objects.filter(application_id=doc.application_id).first()
2067 is_fsm = cms_status_info is not None and cms_status_info.is_fsm == 1
2068 self.online_log.info('{0} [isfsm] [task={1}] [true or false={2}]'.format(
2069 self.log_base, task_str, is_fsm))
2070 if is_fsm:
2071 fsm_compare.apply_async((doc.application_id, business_type, None, res_obj.id, is_ca, True),
2072 queue='queue_compare')
2073 else:
2018 # pass 2074 # pass
2019 compare.apply_async((doc.application_id, business_type, None, res_obj.id, 2075 compare.apply_async((doc.application_id, business_type, None, res_obj.id,
2020 is_ca, True), queue='queue_compare') 2076 is_ca, True), queue='queue_compare')
......
...@@ -329,6 +329,11 @@ class AFCOCRResult(models.Model): ...@@ -329,6 +329,11 @@ class AFCOCRResult(models.Model):
329 hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") 329 hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2")
330 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") 330 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3")
331 qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书") 331 qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书")
332 fsm_wep_ocr = models.TextField(null=True, verbose_name="延长保修合同")
333 fsm_msi_ocr = models.TextField(null=True, verbose_name="长悦保养合同")
334 fsm_sc_ocr = models.TextField(null=True, verbose_name="汽车销售合同")
335 fsm_activited = models.IntegerField(null=False, default=0, verbose_name="fsm激活状态 1:激活")
336
332 337
333 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') 338 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
334 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') 339 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
...@@ -366,6 +371,11 @@ class HILOCRResult(models.Model): ...@@ -366,6 +371,11 @@ class HILOCRResult(models.Model):
366 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") 371 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3")
367 qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书") 372 qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书")
368 373
374 fsm_wep_ocr = models.TextField(null=True, verbose_name="延长保修合同")
375 fsm_msi_ocr = models.TextField(null=True, verbose_name="长悦保养合同")
376 fsm_sc_ocr = models.TextField(null=True, verbose_name="汽车销售合同")
377 fsm_activited = models.IntegerField(null=False, default=0, verbose_name="fsm激活状态 1:激活")
378
369 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') 379 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
370 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') 380 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
371 381
...@@ -401,6 +411,11 @@ class AFCSEOCRResult(models.Model): ...@@ -401,6 +411,11 @@ class AFCSEOCRResult(models.Model):
401 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") 411 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3")
402 qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书") 412 qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书")
403 413
414 fsm_wep_ocr = models.TextField(null=True, verbose_name="延长保修合同")
415 fsm_msi_ocr = models.TextField(null=True, verbose_name="长悦保养合同")
416 fsm_sc_ocr = models.TextField(null=True, verbose_name="汽车销售合同")
417 fsm_activited = models.IntegerField(null=False, default=0, verbose_name="fsm激活状态 1:激活")
418
404 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') 419 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
405 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') 420 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
406 421
...@@ -436,6 +451,10 @@ class HILSEOCRResult(models.Model): ...@@ -436,6 +451,10 @@ class HILSEOCRResult(models.Model):
436 hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") 451 hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2")
437 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") 452 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3")
438 qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书") 453 qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书")
454 fsm_wep_ocr = models.TextField(null=True, verbose_name="延长保修合同")
455 fsm_msi_ocr = models.TextField(null=True, verbose_name="长悦保养合同")
456 fsm_sc_ocr = models.TextField(null=True, verbose_name="汽车销售合同")
457 fsm_activited = models.IntegerField(null=False, default=0, verbose_name="fsm激活状态 1:激活")
439 458
440 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') 459 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
441 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') 460 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
...@@ -1042,3 +1061,41 @@ class AFCCompareReportNew(models.Model): ...@@ -1042,3 +1061,41 @@ class AFCCompareReportNew(models.Model):
1042 managed = False 1061 managed = False
1043 db_table = 'afc_compare_report_new' 1062 db_table = 'afc_compare_report_new'
1044 situ_db_label = 'afc' 1063 situ_db_label = 'afc'
1064
1065
1066 class NscInvoice(models.Model):
1067 id = models.AutoField(primary_key=True, verbose_name="id") # 主键
1068 vin = models.CharField(max_length=64, verbose_name="车架号") # 索引
1069 content = models.TextField(null=True, verbose_name="nsc发票信息")
1070 create_time = models.DateTimeField(verbose_name='创建时间')
1071
1072 class Meta:
1073 managed = False
1074 db_table = 'nsc_invoice'
1075
1076
1077 class AFCCmsStatusInfo(models.Model):
1078 id = models.AutoField(primary_key=True, verbose_name="id") # 主键
1079 application_id = models.CharField(max_length=64, verbose_name="订单id") # 索引
1080 business_type = models.CharField(max_length=64, verbose_name="业务类型")
1081 is_fsm = models.SmallIntegerField(null=False, default=0, verbose_name="是否fsm流程 1:是")
1082 update_time = models.DateTimeField(verbose_name='更新时间')
1083 create_time = models.DateTimeField(verbose_name='创建时间')
1084
1085 class Meta:
1086 managed = False
1087 db_table = 'afc_cms_status_info'
1088 situ_db_label = 'afc'
1089
1090
1091 class HILCmsStatusInfo(models.Model):
1092 id = models.AutoField(primary_key=True, verbose_name="id") # 主键
1093 application_id = models.CharField(max_length=64, verbose_name="订单id") # 索引
1094 business_type = models.CharField(max_length=64, verbose_name="业务类型")
1095 is_fsm = models.SmallIntegerField(null=False, default=0, verbose_name="是否fsm流程 1:是")
1096 update_time = models.DateTimeField(verbose_name='更新时间')
1097 create_time = models.DateTimeField(verbose_name='创建时间')
1098
1099 class Meta:
1100 managed = False
1101 db_table = 'hil_cms_status_info'
......
...@@ -27,6 +27,7 @@ class RequestTeam(NamedEnum): ...@@ -27,6 +27,7 @@ class RequestTeam(NamedEnum):
27 SETTLEMENT = (1, 'SETTLEMENT') 27 SETTLEMENT = (1, 'SETTLEMENT')
28 CONTRACTMANAGEMENT = (2, 'CONTRACTMANAGEMENT') 28 CONTRACTMANAGEMENT = (2, 'CONTRACTMANAGEMENT')
29 CONTROLLING = (3, 'CONTROLLING') 29 CONTROLLING = (3, 'CONTROLLING')
30 INSURANCE = (4, 'INSURANCE')
30 31
31 32
32 class RequestTrigger(NamedEnum): 33 class RequestTrigger(NamedEnum):
...@@ -36,6 +37,7 @@ class RequestTrigger(NamedEnum): ...@@ -36,6 +37,7 @@ class RequestTrigger(NamedEnum):
36 DOCUPLOAD = (3, 'Document Upload') 37 DOCUPLOAD = (3, 'Document Upload')
37 SUBMITING = (4, 'Submiting') 38 SUBMITING = (4, 'Submiting')
38 UPLOADING = (5, 'Uploading') 39 UPLOADING = (5, 'Uploading')
40 OVP = (6, 'OVP')
39 41
40 42
41 class FailureReason(NamedEnum): 43 class FailureReason(NamedEnum):
......
...@@ -34,6 +34,7 @@ class ECM: ...@@ -34,6 +34,7 @@ class ECM:
34 'ACCEPTANCE': ('acceptance', conf.ECM_FOLDER_CA, conf.ECM_FOLDER_CA_HIL), 34 'ACCEPTANCE': ('acceptance', conf.ECM_FOLDER_CA, conf.ECM_FOLDER_CA_HIL),
35 'SETTLEMENT': (self.settlement_type, conf.ECM_FOLDER_SE, conf.ECM_FOLDER_SE_HIL), 35 'SETTLEMENT': (self.settlement_type, conf.ECM_FOLDER_SE, conf.ECM_FOLDER_SE_HIL),
36 'CONTRACTMANAGEMENT': ('contract_management', conf.ECM_FOLDER_CA, conf.ECM_FOLDER_CA_HIL), 36 'CONTRACTMANAGEMENT': ('contract_management', conf.ECM_FOLDER_CA, conf.ECM_FOLDER_CA_HIL),
37 'INSURANCE': ('insurance', conf.ECM_FOLDER_SE, conf.ECM_FOLDER_SE_HIL),
37 } 38 }
38 self.doc_base_map = { 39 self.doc_base_map = {
39 'AFC': 'SF5_CN', 40 'AFC': 'SF5_CN',
......
...@@ -808,10 +808,12 @@ class BSWorkbook(Workbook): ...@@ -808,10 +808,12 @@ class BSWorkbook(Workbook):
808 if field_str is not None: 808 if field_str is not None:
809 count_list.append((field_str, count)) 809 count_list.append((field_str, count))
810 810
811 def contract_rebuild(self, contract_result_dict): 811 def contract_rebuild(self, contract_result_dict, is_ca=False):
812 for classify, contract_result in contract_result_dict.items(): 812 for classify, contract_result in contract_result_dict.items():
813 if len(contract_result) == 0: 813 if len(contract_result) == 0:
814 continue 814 continue
815 if is_ca and classify not in consts.FSM_CONTRACT_CLASSIFY_SET:
816 continue
815 ws = self.create_sheet(consts.CONTRACT_MAP.get(classify)) 817 ws = self.create_sheet(consts.CONTRACT_MAP.get(classify))
816 for i in range(30): 818 for i in range(30):
817 if str(i) in contract_result: 819 if str(i) in contract_result:
...@@ -934,6 +936,7 @@ class BSWorkbook(Workbook): ...@@ -934,6 +936,7 @@ class BSWorkbook(Workbook):
934 else: 936 else:
935 self.bs_rebuild(bs_summary, res_count_tuple, metadata) 937 self.bs_rebuild(bs_summary, res_count_tuple, metadata)
936 self.license_rebuild(license_summary, document_scheme, count_list) 938 self.license_rebuild(license_summary, document_scheme, count_list)
939 self.contract_rebuild(contract_result, True)
937 self.move_res_sheet() 940 self.move_res_sheet()
938 self.remove_base_sheet() 941 self.remove_base_sheet()
939 return count_list, self.need_follow 942 return count_list, self.need_follow
......
...@@ -48,14 +48,23 @@ from .models import ( ...@@ -48,14 +48,23 @@ from .models import (
48 MposReport, 48 MposReport,
49 GenericOCRReport, 49 GenericOCRReport,
50 InterfaceReport, 50 InterfaceReport,
51 HILOCRResult,
52 HILSEOCRResult,
53 AFCOCRResult,
54 AFCSEOCRResult,
55 HILCmsStatusInfo,
56 AFCCmsStatusInfo
51 ) 57 )
52 from .named_enum import ErrorType, AutoResult, WholeResult, RPAResult, SystemName 58 from .named_enum import ErrorType, AutoResult, WholeResult, RPAResult, SystemName, RequestTeam
53 from .mixins import DocHandler, MPOSHandler, PreSEHandler 59 from .mixins import DocHandler, MPOSHandler, PreSEHandler
54 from . import consts 60 from . import consts
55 from apps.account.authentication import OAuth2AuthenticationWithUser 61 from apps.account.authentication import OAuth2AuthenticationWithUser
56 from celery_compare.tasks import compare 62 from celery_compare.tasks import compare, fsm_compare
63 from prese.compare import get_empty_result
57 64
58 import time 65 import time
66
67
59 class CustomDate(fields.Date): 68 class CustomDate(fields.Date):
60 69
61 def _deserialize(self, value, attr, data, **kwargs): 70 def _deserialize(self, value, attr, data, **kwargs):
...@@ -248,6 +257,7 @@ se_compare_content = { ...@@ -248,6 +257,7 @@ se_compare_content = {
248 'fsmSpecialCar': fields.Boolean(required=False), 257 'fsmSpecialCar': fields.Boolean(required=False),
249 'fsmBestPrice': fields.Boolean(required=False), 258 'fsmBestPrice': fields.Boolean(required=False),
250 'isAutoSettlement': fields.Boolean(required=False), 259 'isAutoSettlement': fields.Boolean(required=False),
260 'fsmLandingDealer': fields.Str(required=False, validate=validate.Length(max=1024)),
251 261
252 'individualCusInfo': fields.List(fields.Nested(se_individual_args), 262 'individualCusInfo': fields.List(fields.Nested(se_individual_args),
253 required=True, validate=validate.Length(min=1, max=4)), 263 required=True, validate=validate.Length(min=1, max=4)),
...@@ -551,6 +561,7 @@ class UploadDocView(GenericView, DocHandler): ...@@ -551,6 +561,7 @@ class UploadDocView(GenericView, DocHandler):
551 # authentication_classes = [] 561 # authentication_classes = []
552 permission_classes = [IsAuthenticated] 562 permission_classes = [IsAuthenticated]
553 authentication_classes = [OAuth2AuthenticationWithUser] 563 authentication_classes = [OAuth2AuthenticationWithUser]
564
554 # required_scopes = ['write'] 565 # required_scopes = ['write']
555 566
556 # 上传(接收)文件接口 567 # 上传(接收)文件接口
...@@ -563,6 +574,8 @@ class UploadDocView(GenericView, DocHandler): ...@@ -563,6 +574,8 @@ class UploadDocView(GenericView, DocHandler):
563 document = args.get('document') 574 document = args.get('document')
564 business_type = document.get('businessType') 575 business_type = document.get('businessType')
565 application_id = application_data.get('applicationId') 576 application_id = application_data.get('applicationId')
577 # 包含FSM 激活状态
578 application_status = application_data.get('applicationStatus', '')
566 document_scheme = document.get('documentScheme') 579 document_scheme = document.get('documentScheme')
567 data_source = document.get('dataSource') 580 data_source = document.get('dataSource')
568 document_name = document.get('documentName', '') 581 document_name = document.get('documentName', '')
...@@ -571,6 +584,34 @@ class UploadDocView(GenericView, DocHandler): ...@@ -571,6 +584,34 @@ class UploadDocView(GenericView, DocHandler):
571 data_source = self.fix_data_source(data_source) 584 data_source = self.fix_data_source(data_source)
572 document_scheme = self.fix_scheme(document_scheme) 585 document_scheme = self.fix_scheme(document_scheme)
573 586
587 # fsm激活状态, 更新ocr_result 表fsm状态
588 self.running_log.info('[doc upload applicationId-{0}] [applicationStatus-{1}, activated-{2}]'
589 .format(application_id, application_status,
590 True if consts.FSM_ACTIVITED_STATUS.get(application_status) else False))
591 if consts.FSM_ACTIVITED_STATUS.get(application_status):
592 result_class = None
593 if business_type == consts.HIL_PREFIX:
594 if document_scheme == RequestTeam.ACCEPTANCE.name:
595 result_class = HILOCRResult
596 elif document_scheme == RequestTeam.SETTLEMENT.name or document_scheme == RequestTeam.INSURANCE.name:
597 result_class = HILSEOCRResult
598 elif business_type == consts.AFC_PREFIX:
599 if document_scheme == RequestTeam.ACCEPTANCE.name:
600 result_class = AFCOCRResult
601 elif document_scheme == RequestTeam.SETTLEMENT.name or document_scheme == RequestTeam.INSURANCE.name:
602 result_class = AFCSEOCRResult
603
604 ocr_result_obj = result_class.objects.filter(application_id=application_id).first()
605 if ocr_result_obj:
606 ocr_result_obj.fsm_activited = 1
607 ocr_result_obj.save()
608 else:
609 ocr_result_obj = result_class()
610 ocr_result_obj.application_id = application_id
611 ocr_result_obj.fsm_activited = 1
612 ocr_result_obj.save()
613
614 self.running_log.info('[doc upload applicationId-{0}] [ocr result saved]'.format(application_id))
574 if data_source == consts.DATA_SOURCE_LIST[1]: 615 if data_source == consts.DATA_SOURCE_LIST[1]:
575 if document_name.endswith('-证书.pdf') or document_name.endswith('-证书'): 616 if document_name.endswith('-证书.pdf') or document_name.endswith('-证书'):
576 self.running_log.info('[doc upload success] [eapp license skip] [args={0}]'.format(args)) 617 self.running_log.info('[doc upload success] [eapp license skip] [args={0}]'.format(args))
...@@ -602,13 +643,22 @@ class UploadDocView(GenericView, DocHandler): ...@@ -602,13 +643,22 @@ class UploadDocView(GenericView, DocHandler):
602 is_zip = False 643 is_zip = False
603 644
604 classify_1 = 0 645 classify_1 = 0
605 # 电子合同 646 # 电子合同 Econtract or OVP(FSM)
606 if data_source == consts.DATA_SOURCE_LIST[-1] and document_scheme == consts.DOC_SCHEME_LIST[1]: 647 if data_source == consts.DATA_SOURCE_LIST[2] or data_source == consts.DATA_SOURCE_LIST[3]:
648 if document_scheme == consts.DOC_SCHEME_LIST[1]:
607 for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix): 649 for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix):
608 if keyword in document_name: 650 if keyword in document_name:
609 classify_1 = classify_1_tmp 651 classify_1 = classify_1_tmp
610 break 652 break
611 elif document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ 653 # FSM合同:WEP/MSI/SC
654 elif data_source == consts.DATA_SOURCE_LIST[0] and document_scheme == consts.DOC_SCHEME_LIST[0]:
655 for keyword, classify_1_tmp in consts.FSM_ECONTRACT_KEYWORDS_MAP.get(prefix):
656 if keyword in document_name:
657 classify_1 = classify_1_tmp
658 break
659
660
661 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
612 or document_name.endswith('.RAR'): 662 or document_name.endswith('.RAR'):
613 is_zip = True 663 is_zip = True
614 664
...@@ -809,6 +859,9 @@ class CompareView(GenericView): ...@@ -809,6 +859,9 @@ class CompareView(GenericView):
809 ''' 859 '''
810 860
811 861
862 pre_fsm_url = conf.PRE_FSM_URL
863
864
812 class SECompareView(GenericView, PreSEHandler): 865 class SECompareView(GenericView, PreSEHandler):
813 permission_classes = [IsAuthenticated] 866 permission_classes = [IsAuthenticated]
814 authentication_classes = [OAuth2AuthenticationWithUser] 867 authentication_classes = [OAuth2AuthenticationWithUser]
...@@ -829,7 +882,52 @@ class SECompareView(GenericView, PreSEHandler): ...@@ -829,7 +882,52 @@ class SECompareView(GenericView, PreSEHandler):
829 fsm_flag = content.get('fsmFlag', False) 882 fsm_flag = content.get('fsmFlag', False)
830 fsm_special_car = content.get('fsmSpecialCar', False) 883 fsm_special_car = content.get('fsmSpecialCar', False)
831 fsm_best_price = content.get('fsmBestPrice', False) 884 fsm_best_price = content.get('fsmBestPrice', False)
885 fsm_landing_dealer = content.get('fsmLandingDealer')
832 886
887 if fsm_special_car:
888 compare_result = {
889 "is_pass": False,
890 "particulars": [{
891 "object_name": "",
892 "fields": [{
893 "input": "",
894 "ocr": "",
895 "field_is_pass": False,
896 "comments": "此申请为FSM 特殊申请,暂不支持预放款流程"
897 }]
898 }]
899 }
900 elif fsm_best_price:
901 compare_result = {
902 "is_pass": False,
903 "particulars": [{
904 "object_name": "",
905 "fields": [{
906 "input": "",
907 "ocr": "",
908 "field_is_pass": False,
909 "comments": "此申请为FSM 特殊申请,暂不支持预放款流程"
910 }]
911 }]
912 }
913 elif fsm_flag and (not fsm_special_car or not fsm_best_price):
914 # 调用Java pre fsm接口
915 try:
916 self.running_log.info("{0} request java pre fsm api, url:{1}, body:{2}".format(log_base, pre_fsm_url, json.dumps(content)))
917 headers = {
918 'Content-Type': 'application/json'
919 }
920 resp = requests.post(pre_fsm_url, headers=headers, json=content)
921 self.running_log.info("{0} response from java pre fsm api, resp:{1}".format(log_base, resp.text))
922 result = json.loads(resp.text)
923 compare_result = result.get("result")
924 if not compare_result:
925 compare_result = get_empty_result()
926 except Exception as e:
927 self.running_log.error("{0} pre fsm request to java error, url:{1}, param:{2}, errorMsg:{3}".format(
928 log_base, pre_fsm_url, json.dumps(content), traceback.format_exc()))
929 compare_result = get_empty_result()
930 elif not fsm_flag:
833 # 存库, 用于银行卡比对 931 # 存库, 用于银行卡比对
834 try: 932 try:
835 bank_class = HILbankVerification if business_type in consts.HIL_SET else AFCbankVerification 933 bank_class = HILbankVerification if business_type in consts.HIL_SET else AFCbankVerification
...@@ -853,7 +951,8 @@ class SECompareView(GenericView, PreSEHandler): ...@@ -853,7 +951,8 @@ class SECompareView(GenericView, PreSEHandler):
853 # preSettlement比对 951 # preSettlement比对
854 compare_result = self.pre_compare_entrance(content) 952 compare_result = self.pre_compare_entrance(content)
855 self.running_log.info('{0} [prese completed] [applicationEntity={1}] [application_id={2}] [uniq_seq={3}] ' 953 self.running_log.info('{0} [prese completed] [applicationEntity={1}] [application_id={2}] [uniq_seq={3}] '
856 '[result={4}]'.format(log_base, business_type, application_id, uniq_seq, compare_result)) 954 '[result={4}]'.format(log_base, business_type, application_id, uniq_seq,
955 compare_result))
857 956
858 try: 957 try:
859 end_time = time.time() 958 end_time = time.time()
...@@ -956,10 +1055,10 @@ class DocView(GenericView, DocHandler): ...@@ -956,10 +1055,10 @@ class DocView(GenericView, DocHandler):
956 application_id_query = Q(application_id__contains=application_id) if application_id is not None else Q() 1055 application_id_query = Q(application_id__contains=application_id) if application_id is not None else Q()
957 data_source_query = Q(data_source=data_source) if data_source is not None else Q() 1056 data_source_query = Q(data_source=data_source) if data_source is not None else Q()
958 upload_finish_time_query = Q(upload_finish_time__gte=upload_time_start, 1057 upload_finish_time_query = Q(upload_finish_time__gte=upload_time_start,
959 upload_finish_time__lt=upload_time_end + datetime.timedelta(days=1))\ 1058 upload_finish_time__lt=upload_time_end + datetime.timedelta(days=1)) \
960 if upload_time_start is not None and upload_time_end is not None else Q() 1059 if upload_time_start is not None and upload_time_end is not None else Q()
961 create_time_query = Q(create_time__gte=create_time_start, 1060 create_time_query = Q(create_time__gte=create_time_start,
962 create_time__lt=create_time_end + datetime.timedelta(days=1))\ 1061 create_time__lt=create_time_end + datetime.timedelta(days=1)) \
963 if create_time_start is not None and create_time_end is not None else Q() 1062 if create_time_start is not None and create_time_end is not None else Q()
964 query = application_id_query & status_query & data_source_query & upload_finish_time_query & create_time_query 1063 query = application_id_query & status_query & data_source_query & upload_finish_time_query & create_time_query
965 val_tuple = ('id', 'application_id', 'upload_finish_time', 'create_time', 'document_scheme', 'data_source', 1064 val_tuple = ('id', 'application_id', 'upload_finish_time', 'create_time', 'document_scheme', 'data_source',
...@@ -971,10 +1070,11 @@ class DocView(GenericView, DocHandler): ...@@ -971,10 +1070,11 @@ class DocView(GenericView, DocHandler):
971 if start_index >= total > 0: 1070 if start_index >= total > 0:
972 raise self.invalid_params('页数不存在') 1071 raise self.invalid_params('页数不存在')
973 1072
974 doc_queryset = doc_class.objects.filter(query).values(*val_tuple).order_by('-create_time')[start_index: end_index] 1073 doc_queryset = doc_class.objects.filter(query).values(*val_tuple).order_by('-create_time')[
1074 start_index: end_index]
975 # doc_list = self.get_doc_list(doc_queryset, prefix) 1075 # doc_list = self.get_doc_list(doc_queryset, prefix)
976 for doc_dict in doc_queryset: 1076 for doc_dict in doc_queryset:
977 tmp_scheme = consts.COMPARE_DOC_SCHEME_LIST[0] if doc_dict['document_scheme'] == consts.DOC_SCHEME_LIST[0]\ 1077 tmp_scheme = consts.COMPARE_DOC_SCHEME_LIST[0] if doc_dict['document_scheme'] == consts.DOC_SCHEME_LIST[0] \
978 else consts.COMPARE_DOC_SCHEME_LIST[1] 1078 else consts.COMPARE_DOC_SCHEME_LIST[1]
979 application_link = '{0}/showList/showList?entity={1}&scheme={2}&case_id={3}'.format( 1079 application_link = '{0}/showList/showList?entity={1}&scheme={2}&case_id={3}'.format(
980 conf.BASE_URL, prefix, tmp_scheme, doc_dict['application_id']) 1080 conf.BASE_URL, prefix, tmp_scheme, doc_dict['application_id'])
...@@ -1021,7 +1121,6 @@ class DocView(GenericView, DocHandler): ...@@ -1021,7 +1121,6 @@ class DocView(GenericView, DocHandler):
1021 # os.remove(tmp_save_path) 1121 # os.remove(tmp_save_path)
1022 # raise self.invalid_params(msg='invalid params: PDF file XSS') 1122 # raise self.invalid_params(msg='invalid params: PDF file XSS')
1023 1123
1024
1025 file.close() 1124 file.close()
1026 # 1. 上传信息记录 1125 # 1. 上传信息记录
1027 application_id = '{0}{1}'.format(consts.FIXED_APPLICATION_ID_PREFIX, metadata_version_id) 1126 application_id = '{0}{1}'.format(consts.FIXED_APPLICATION_ID_PREFIX, metadata_version_id)
...@@ -1104,7 +1203,8 @@ class CompareResultView(GenericView): ...@@ -1104,7 +1203,8 @@ class CompareResultView(GenericView):
1104 latest_compared_time = '' 1203 latest_compared_time = ''
1105 else: 1204 else:
1106 whole_result = consts.RESULT_Y if result_obj.ocr_auto_result_pass else consts.RESULT_N 1205 whole_result = consts.RESULT_Y if result_obj.ocr_auto_result_pass else consts.RESULT_N
1107 latest_compared_time = '' if result_obj.ocr_latest_comparison_time is None else result_obj.ocr_latest_comparison_time.strftime('%Y-%m-%d %H:%M') 1206 latest_compared_time = '' if result_obj.ocr_latest_comparison_time is None else result_obj.ocr_latest_comparison_time.strftime(
1207 '%Y-%m-%d %H:%M')
1108 1208
1109 source = consts.INFO_SOURCE[1] 1209 source = consts.INFO_SOURCE[1]
1110 version = comments = '' 1210 version = comments = ''
...@@ -1120,7 +1220,8 @@ class CompareResultView(GenericView): ...@@ -1120,7 +1220,8 @@ class CompareResultView(GenericView):
1120 'source': source, 1220 'source': source,
1121 'version': version, 1221 'version': version,
1122 'comments': comments, 1222 'comments': comments,
1123 'result': [] if result_obj is None or not result_obj.ocr_auto_result else json.loads(result_obj.ocr_auto_result) 1223 'result': [] if result_obj is None or not result_obj.ocr_auto_result else json.loads(
1224 result_obj.ocr_auto_result)
1124 } 1225 }
1125 1226
1126 return response.ok(data=compare_result) 1227 return response.ok(data=compare_result)
...@@ -1155,7 +1256,8 @@ class CompareResultView(GenericView): ...@@ -1155,7 +1256,8 @@ class CompareResultView(GenericView):
1155 'id': 0 if result_obj is None else result_obj.id, 1256 'id': 0 if result_obj is None else result_obj.id,
1156 'application_id': case_id, 1257 'application_id': case_id,
1157 'entity': entity, 1258 'entity': entity,
1158 'scheme': consts.DOC_SCHEME_LIST[0] if scheme == consts.COMPARE_DOC_SCHEME_LIST[0] else consts.DOC_SCHEME_LIST[1], 1259 'scheme': consts.DOC_SCHEME_LIST[0] if scheme == consts.COMPARE_DOC_SCHEME_LIST[0] else
1260 consts.DOC_SCHEME_LIST[1],
1159 'whole_result': whole_result, 1261 'whole_result': whole_result,
1160 'latest_compared_time': '' if result_obj is None else result_obj.update_time.strftime('%Y-%m-%d %H:%M'), 1262 'latest_compared_time': '' if result_obj is None else result_obj.update_time.strftime('%Y-%m-%d %H:%M'),
1161 'source': source, 1263 'source': source,
...@@ -1328,7 +1430,8 @@ class SECMSView(GenericView): ...@@ -1328,7 +1430,8 @@ class SECMSView(GenericView):
1328 1430
1329 args = request.data 1431 args = request.data
1330 cms_info = args.get('content', {}) 1432 cms_info = args.get('content', {})
1331 business_type = consts.AFC_PREFIX if cms_info.get('financeCompany', '').startswith('宝马') else consts.HIL_PREFIX 1433 business_type = consts.AFC_PREFIX if cms_info.get('financeCompany', '').startswith(
1434 '宝马') else consts.HIL_PREFIX
1332 src_application_id = cms_info.get('settlemnetVerification', {}).get('applicationNo', '') 1435 src_application_id = cms_info.get('settlemnetVerification', {}).get('applicationNo', '')
1333 application_id = src_application_id[:src_application_id.rfind('-')] 1436 application_id = src_application_id[:src_application_id.rfind('-')]
1334 1437
...@@ -1363,6 +1466,32 @@ class SECMSView(GenericView): ...@@ -1363,6 +1466,32 @@ class SECMSView(GenericView):
1363 content=content_str, 1466 content=content_str,
1364 ) 1467 )
1365 1468
1469 # 检查是否fsm流程(SE)
1470 fsm_contract = cms_info.get('FSMContract', False)
1471 fsm_best_price = cms_info.get('FSMBestPrice', False)
1472 if fsm_contract:
1473 # 记录fsm 流程的cms 提交
1474 try:
1475 cms_status_class = HILCmsStatusInfo if business_type in consts.HIL_SET else AFCCmsStatusInfo
1476 cms_status_info = cms_status_class.objects.filter(application_id=application_id).first()
1477 if cms_status_info:
1478 cms_status_info.is_fsm = 1
1479 cms_status_info.update_time = datetime.datetime.now()
1480 cms_status_info.save()
1481 else:
1482 cms_status_info = cms_status_class()
1483 cms_status_info.application_id = application_id
1484 cms_status_info.business_type = business_type
1485 cms_status_info.is_fsm = 1
1486 cms_status_info.update_time = datetime.datetime.now()
1487 cms_status_info.create_time = datetime.datetime.now()
1488 cms_status_info.save()
1489 except Exception as e:
1490 self.exception_log.exception(
1491 '[cms view] [cms_status_info db save failed] [error={0}]'.format(traceback.format_exc()))
1492 fsm_compare.apply_async((application_id, business_type, None, None, False, True),
1493 queue='queue_compare')
1494 else:
1366 # 触发比对 1495 # 触发比对
1367 compare.apply_async((application_id, business_type, None, None, False, True), 1496 compare.apply_async((application_id, business_type, None, None, False, True),
1368 queue='queue_compare') 1497 queue='queue_compare')
...@@ -1458,7 +1587,7 @@ class AutoSettlementView(GenericView): ...@@ -1458,7 +1587,7 @@ class AutoSettlementView(GenericView):
1458 whole_result_query = Q(ocr_whole_result_pass=whole_result) if not isinstance(whole_result, str) else Q() 1587 whole_result_query = Q(ocr_whole_result_pass=whole_result) if not isinstance(whole_result, str) else Q()
1459 rpa_result_query = Q(rpa_result=rpa_result) if not isinstance(rpa_result, str) else Q() 1588 rpa_result_query = Q(rpa_result=rpa_result) if not isinstance(rpa_result, str) else Q()
1460 time1_query = Q(rpa_get_case_from_ocr_time__gte=get_case_from_ocr_time_start, 1589 time1_query = Q(rpa_get_case_from_ocr_time__gte=get_case_from_ocr_time_start,
1461 rpa_get_case_from_ocr_time__lt=get_case_from_ocr_time_end + datetime.timedelta(days=1))\ 1590 rpa_get_case_from_ocr_time__lt=get_case_from_ocr_time_end + datetime.timedelta(days=1)) \
1462 if get_case_from_ocr_time_start is not None and get_case_from_ocr_time_end is not None else Q() 1591 if get_case_from_ocr_time_start is not None and get_case_from_ocr_time_end is not None else Q()
1463 time2_query = Q(rpa_activated_time__gte=activated_time_start, 1592 time2_query = Q(rpa_activated_time__gte=activated_time_start,
1464 rpa_activated_time__lt=activated_time_end + datetime.timedelta(days=1)) \ 1593 rpa_activated_time__lt=activated_time_end + datetime.timedelta(days=1)) \
......
...@@ -7,6 +7,9 @@ import traceback ...@@ -7,6 +7,9 @@ import traceback
7 import numpy as np 7 import numpy as np
8 from datetime import datetime, timedelta 8 from datetime import datetime, timedelta
9 from collections import OrderedDict 9 from collections import OrderedDict
10
11 import requests
12
10 from . import app 13 from . import app
11 from settings import conf 14 from settings import conf
12 from apps.doc.models import ( 15 from apps.doc.models import (
...@@ -3266,6 +3269,33 @@ def se_compare(application_id, application_entity, ocr_res_id, last_obj, ocr_res ...@@ -3266,6 +3269,33 @@ def se_compare(application_id, application_entity, ocr_res_id, last_obj, ocr_res
3266 3269
3267 3270
3268 @app.task 3271 @app.task
def fsm_compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True, is_cms=False):
    """Hand an FSM comparison job over to the Java FSM service via HTTP POST.

    The request body carries ``applicationId``, ``businessType``, ``ocrResId``,
    ``isCa`` and ``isCms``; ``uniq_seq`` is only logged. Any failure while
    calling the service is logged and swallowed, so the task never raises.

    Args:
        application_id: Application identifier forwarded as ``applicationId``.
        application_entity: Business type forwarded as ``businessType``.
        uniq_seq: Sequence identifier, used for logging only.
        ocr_res_id: OCR result id forwarded as ``ocrResId``.
        is_ca: Forwarded as ``isCa``.
        is_cms: Forwarded as ``isCms``.
    """
    compare_log.info('{0} [receive fsm task] [entity={1}] [id={2}] [uniq_seq={3}] [ocr_res_id={4}] [is_ca={5}] '
                     '[is_cms={6}]'.format(log_base, application_entity, application_id, uniq_seq, ocr_res_id,
                                           is_ca, is_cms))
    # Call the Java FSM comparison endpoint over HTTP. FSM is an SE flow; the
    # CA case can be ignored for now (per the original author's note).
    url = conf.FSM_URL
    body = {
        'applicationId': application_id,
        'businessType': application_entity,
        'ocrResId': ocr_res_id,
        'isCa': is_ca,
        'isCms': is_cms
    }
    # requests waits forever when no timeout is given, which would pin this
    # worker if the Java side stalls. The limit is read from settings when
    # present. NOTE(review): the 60 second fallback is an assumption - confirm
    # against the real latency of the FSM service.
    timeout = getattr(conf, 'FSM_REQUEST_TIMEOUT', 60)
    try:
        compare_log.info("request java fsm api, url:{0}, body:{1}".format(url, json.dumps(body)))
        headers = {
            'Content-Type': 'application/json'
        }
        resp = requests.post(url, headers=headers, json=body, timeout=timeout)
        compare_log.info("response from fsm api, resp:{0}".format(resp.text))
    except Exception:
        # Best effort: the failure (including a timeout) is recorded, not re-raised.
        compare_log.error("fsm full request to java error, url:{0}, param:{1}, errorMsg:{2}".format(
            url, json.dumps(body), traceback.format_exc()))
3296
3297
3298 @app.task
3269 def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True, is_cms=False): 3299 def compare(application_id, application_entity, uniq_seq, ocr_res_id, is_ca=True, is_cms=False):
3270 # POS: application_id, application_entity, uniq_seq, None 3300 # POS: application_id, application_entity, uniq_seq, None
3271 # OCR: application_id, business_type(application_entity), None, ocr_res_id 3301 # OCR: application_id, business_type(application_entity), None, ocr_res_id
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
6 # @Description : 6 # @Description :
7 7
8 from .get_char import Finder 8 from .get_char import Finder
9 from .get_char_fsm import Finder as FSMFinder
9 import numpy as np 10 import numpy as np
10 11
11 12
...@@ -23,7 +24,7 @@ def extract_info(ocr_results): ...@@ -23,7 +24,7 @@ def extract_info(ocr_results):
23 return {'page_1': {'合同编号': contract_no}} 24 return {'page_1': {'合同编号': contract_no}}
24 25
25 26
26 def predict(pdf_info, is_qrs=False): 27 def predict(pdf_info, is_qrs=False, is_fsm=False):
27 ocr_results = {} 28 ocr_results = {}
28 for pno in pdf_info: 29 for pno in pdf_info:
29 ocr_results[pno] = {} 30 ocr_results[pno] = {}
...@@ -50,6 +51,9 @@ def predict(pdf_info, is_qrs=False): ...@@ -50,6 +51,9 @@ def predict(pdf_info, is_qrs=False):
50 results = extract_info(ocr_results) 51 results = extract_info(ocr_results)
51 else: 52 else:
52 # 输入是整个 PDF 中的信息 53 # 输入是整个 PDF 中的信息
54 if is_fsm:
55 f = FSMFinder(pdf_info, ocr_results=ocr_results)
56 else:
53 f = Finder(pdf_info, ocr_results=ocr_results) 57 f = Finder(pdf_info, ocr_results=ocr_results)
54 results = f.get_info() 58 results = f.get_info()
55 return results 59 return results
......
1 import re
2 import numpy as np
3 from fuzzywuzzy import fuzz
4 from shapely.geometry import Polygon
5
6
7 class Finder:
8
9 def __init__(self, pdf_info, ocr_results):
10 self.pdf_info = pdf_info
11 self.ocr_results = ocr_results
12 self.is_asp = False
13 self.item = {"words": None,
14 "position": None,
15 }
16
def gen_init_result(self, is_asp):
    """Create ``self.init_result``, the empty per-page template of contract fields.

    Every leaf value is the shared ``self.item`` placeholder object. When
    ``self.is_asp`` is true the signature block is stored under ``page_8`` and
    ``page_7`` only holds the contract number; otherwise the signature block
    is stored under ``page_7`` and there is no ``page_8``.

    Args:
        is_asp: Not read by this method; the layout is selected from
            ``self.is_asp``.
    """
    blank = self.item

    def principal():
        return {"大写": blank,
                "小写": blank,
                "车辆贷款本金金额": blank,
                "附加产品融资贷款本金总金额": blank}

    def party():
        return {"name": blank, "id": blank}

    def account():
        return {"账号": blank, "户名": blank, "开户行": blank}

    def signed():
        return {"签字": blank, "日期": blank}

    template = {
        "page_1": {
            "合同编号": blank,
            "所购车辆价格": blank,
            "车架号": blank,
            "贷款本金金额": principal(),
            "贷款期限": blank,
            "附加产品融资贷款本金总金额明细": blank,
            "借款人签字及时间": blank,
        },
        "page_2": {
            "合同编号": blank,
            "借款人及抵押人": party(),
            "共同借款人及共同抵押人": party(),
            "保证人1": party(),
            "保证人2": party(),
            "所购车辆价格": blank,
            "车架号": blank,
            "经销商": blank,
            "贷款本金金额": principal(),
            "贷款期限": blank,
            "标准利率": blank,
            "借款人收款账户": account(),
            "还款账户": account(),
        },
        "page_3": {
            "合同编号": blank,
            "还款计划表": blank,
            "车辆代理商": blank,
        },
        "page_4": {
            "合同编号": blank,
            "附加产品融资贷款本金总金额明细": blank,
        },
        "page_5": {"合同编号": blank},
        "page_6": {"合同编号": blank},
    }

    # The signature block is identical in both layouts; only its page differs.
    signature_page = {
        "合同编号": blank,
        "主借人签字": signed(),
        "共借人签字": signed(),
        "保证人1签字": signed(),
        "保证人2签字": signed(),
        "见证人签字": signed(),
    }
    if self.is_asp:
        template["page_7"] = {"合同编号": blank}
        template["page_8"] = signature_page
    else:
        template["page_7"] = signature_page

    # Formatted skeleton of the algorithm output.
    self.init_result = template
113
114
def get_top_iou(self, poly, ocr_result):
    """Find the OCR box that overlaps the given polygon the most.

    Args:
        poly: Vertex coordinates of the query polygon; reshaped to (-1, 2).
        ocr_result: Mapping of key -> (bbox, text); each bbox is reshaped to
            (-1, 2) vertices in the same way.

    Returns:
        ``[iou, key]`` for the entry with the highest intersection-over-union
        (the later entry wins a tie), or ``(-1, -1)`` when no entry could be
        compared.
    """
    if not ocr_result:
        return -1, -1

    # The query polygon does not change between entries, so build it once.
    target = Polygon(np.array(poly).reshape((-1, 2)))
    if not target.is_valid:
        return -1, -1

    best = None
    for key in ocr_result:
        bbox, _text = ocr_result[key]
        candidate = Polygon(np.array(bbox).reshape((-1, 2)))
        if not candidate.is_valid:
            continue
        inter = candidate.intersection(target).area
        union = candidate.area + target.area - inter
        if union <= 0:
            # Both shapes have no area; an IoU is undefined, so skip the entry
            # instead of dividing by zero.
            continue
        iou = inter / union
        # ">=" keeps the last of several equally good entries, matching the
        # result of a stable ascending sort followed by taking the final item.
        if best is None or iou >= best[0]:
            best = [iou, key]

    if best is None:
        return -1, -1
    return best
136
def poly_to_rectangle(self, poly):
    """Reduce an 8-value quadrilateral to a 4-value ``[left, top, right, bottom]`` box.

    ``poly`` must unpack into exactly eight values. The box is taken from
    fixed positions: left = index 6, top = index 3, right = index 4 and
    bottom = index 7.
    """
    _, _, _, top, right, _, left, bottom = poly
    return [left, top, right, bottom]
141
def get_contract_no(self, page_num):
    """Read the contract number from the OCR results of one page.

    Args:
        page_num (string): Key of the page inside ``self.ocr_results``.

    Returns:
        dict: Copy of ``self.item``. When an OCR entry contains '合同编号:',
        ``words`` is the text after the last ':' and ``position`` is the
        entry's box converted by ``poly_to_rectangle``; if several entries
        match, the last one is kept.
    """
    found = self.item.copy()
    for bbox, text in self.ocr_results[page_num].values():
        if '合同编号:' not in text:
            continue
        found['words'] = text.split(':')[-1]
        found['position'] = self.poly_to_rectangle(bbox)
    return found
163
def get_vehicle_price(self, page_num='0'):
    """Read the purchased-vehicle price from the OCR results of one page.

    Args:
        page_num: Key of the page inside ``self.ocr_results``.

    Returns:
        dict: Copy of ``self.item``. For an OCR entry containing
        '所购车辆价格为人民币', ``words`` is the text after the last '币' and
        ``position`` is the entry's box converted by ``poly_to_rectangle``;
        the last matching entry wins.
    """
    price = self.item.copy()
    entries = self.ocr_results[page_num]
    for entry in entries.values():
        bbox, text = entry
        if '所购车辆价格为人民币' in text:
            amount = text.split('币')[-1]
            price['words'], price['position'] = amount, self.poly_to_rectangle(bbox)
    return price
176
def get_vin(self, page_num='0'):
    """Read the vehicle identification number from the OCR results of one page.

    Args:
        page_num: Key of the page inside ``self.ocr_results``.

    Returns:
        dict: Copy of ``self.item``. For an OCR entry containing '车架号:',
        ``words`` is the text after the last ':' and ``position`` is the
        entry's box converted by ``poly_to_rectangle``; the last matching
        entry wins.
    """
    vin = self.item.copy()
    matches = [
        (bbox, text)
        for bbox, text in self.ocr_results[page_num].values()
        if '车架号:' in text
    ]
    for bbox, text in matches:
        vin['words'] = text.split(':')[-1]
        vin['position'] = self.poly_to_rectangle(bbox)
    return vin
189
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from the text spans of one PDF page.

    Only blocks with ``type == 0`` are inspected.

    Args:
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of ``self.item``.
        ``upper`` comes from the last span whose fuzzy ratio against the
        joined capital-numeral characters exceeds 15, ``lower`` from the last
        span containing '小写:¥'. ``asp_1`` / ``asp_2`` are the bracketed
        amounts of spans containing '人民币:小写:' located above / below the
        span whose text equals '附加产品融资贷款本金总金额'; they stay blank when
        that anchor span is missing.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    # Joined once instead of once per span.
    keyword_text = ''.join(chinese_keywords)
    amount_pattern = re.compile(r'人民币:小写:\[(.*)\]')

    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()

    text_spans = [
        span
        for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    anchor_bbox = None
    for span in text_spans:
        bbox, text = span['bbox'], span['text']
        if fuzz.ratio(keyword_text, text) > 15:
            # The checks below deliberately see the trimmed text, as before.
            text = text.split(':')[-1].strip()
            upper['position'] = bbox
            upper['words'] = text
        if '小写:¥' in text:
            lower['position'] = bbox
            lower['words'] = text.split('¥')[-1].strip()
        if '附加产品融资贷款本金总金额' == text:
            anchor_bbox = bbox

    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for span in text_spans:
            bbox, text = span['bbox'], span['text']
            if '人民币:小写:' not in text:
                continue
            amount = amount_pattern.search(text)
            if amount is None:
                # Marker present but no "[...]" part: indexing the empty
                # findall result used to raise IndexError here.
                continue
            span_y = np.mean(bbox[1::2])
            if span_y < anchor_y:
                asp_1['position'] = bbox
                asp_1['words'] = amount.group(1)
            if span_y > anchor_y:
                asp_2['position'] = bbox
                asp_2['words'] = amount.group(1)
    return upper, lower, asp_1, asp_2
230
def get_loan_term(self, page_num='0'):
    """Extract the loan term in months from the text spans of one PDF page.

    The text of all spans in ``type == 0`` blocks is concatenated and searched
    for '贷款期限<digits>个月', so the phrase may be split across spans.

    Args:
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``. When the phrase is found and a span
        contains '<digits>个月', ``words`` is the digit string and
        ``position`` is the bbox of the last such span.
    """
    loan_term = self.item.copy()
    text_spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    page_text = ''.join(text for _, text in text_spans)
    found = re.search(r'贷款期限(\d+)个月', page_text)
    if found is None:
        return loan_term

    months = found.group(1)
    needle = f'{months}个月'
    for bbox, text in text_spans:
        if needle in text:
            loan_term['position'] = bbox
            loan_term['words'] = months
    return loan_term
254
def get_standard_rate(self, page_num='0'):
    """Extract the standard annual interest rate from one PDF page.

    Args:
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        dict: Copy of ``self.item``. For a span in a ``type == 0`` block that
        matches '本合同当期的标准利率为<rate>%/年', ``words`` is the captured
        rate and ``position`` the span's bbox; the last matching span wins.
    """
    standard_rate = self.item.copy()
    text_blocks = (block for block in self.pdf_info[page_num]['blocks'] if block['type'] == 0)
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                hit = re.search(r'本合同当期的标准利率为(\S+)%/年', text)
                if not hit:
                    continue
                standard_rate['position'] = bbox
                standard_rate['words'] = hit.group(1)
    return standard_rate
268
def mergelist(self, text_list):
    """Join entries containing '所购' with a following entry that continues them.

    An entry containing '所购' is concatenated with the next entry when that
    next entry contains at least one Chinese character. On each pass the last
    such pair in the list is merged, and passes repeat until no pair
    qualifies.

    Args:
        text_list: List of strings.

    Returns:
        The original list object when nothing had to be merged, otherwise a
        new list with the merged entries.
    """
    non_chinese = re.compile("[^\u4e00-\u9fa5]")  # matches every character that is not Chinese
    while True:
        merge_at = -1
        # Stop one short of the end: the final entry has no successor, and
        # looking one past it raised IndexError when it contained '所购'.
        for idx in range(len(text_list) - 1):
            if '所购' in text_list[idx] and non_chinese.sub('', text_list[idx + 1]):
                merge_at = idx
        if merge_at == -1:
            return text_list
        text_list = (text_list[:merge_at]
                     + [text_list[merge_at] + text_list[merge_at + 1]]
                     + text_list[merge_at + 2:])
281
282 def get_asp_details(self, page_num):
283 asp_details_table_term = self.item.copy()
284
285 asp_details_table = [['附加产品融资贷款本金总金额及贷款利率明细'], ['项目1', '用途总金额2', '贷款本金3']]
286
287 bbox_xm = None
288 bbox_ytzje = None
289 bbox_dkbj = None
290 bbox_total = None
291 for key in self.ocr_results[page_num]:
292 bbox, text = self.ocr_results[page_num][key]
293 if text == '项目1':
294 bbox_xm = bbox
295 if text == '用途总金额2':
296 bbox_ytzje = bbox
297 if text == '贷款本金3':
298 bbox_dkbj = bbox
299 if text in ['附加产品融资贷款本', '附加产品融资贷款本金', '附加产品融资贷']:
300 bbox_total = bbox
301
302 if bbox_xm:
303 for i in range(10):
304 rh = abs(bbox_xm[1]-bbox_xm[-1])
305 anchor = np.array(bbox_xm).reshape((-1 ,2))
306 anchor[:, 1] += int(rh*1.4)
307 _iou, _key = self.get_top_iou(poly=anchor, ocr_result=self.ocr_results[page_num])
308 if _iou > 0:
309 bbox, xm_text = self.ocr_results[page_num][_key]
310 bbox_xm = bbox
311 # 解决项目内容是两行的问题
312 if not '所购' in xm_text:
313 line = asp_details_table[-1]
314 line[0] += xm_text
315 asp_details_table[-1] = line
316 continue
317 # print(xm_text)
318 anchor_1 = [bbox_ytzje[0], bbox[1], bbox_ytzje[2], bbox[3],
319 bbox_ytzje[4], bbox[5], bbox_ytzje[6], bbox[7]]
320 _iou, _key = self.get_top_iou(poly=anchor_1, ocr_result=self.ocr_results[page_num])
321 bbox, ytzje_text = self.ocr_results[page_num][_key]
322 # print(ytzje_text)
323 anchor_2 = [bbox_dkbj[0], bbox[1], bbox_dkbj[2], bbox[3],
324 bbox_dkbj[4], bbox[5], bbox_dkbj[6], bbox[7]]
325 _iou, _key = self.get_top_iou(poly=anchor_2, ocr_result=self.ocr_results[page_num])
326 bbox, dkbj_text = self.ocr_results[page_num][_key]
327 # print(dkbj_text)
328 if xm_text == ytzje_text:
329 xm_text, ytzje_text = xm_text.split(' ')
330 line = [xm_text, ytzje_text, dkbj_text]
331 asp_details_table.append(line)
332 else:
333 break
334
335 if bbox_total:
336 anchor = [bbox_dkbj[0], bbox_total[1], bbox_dkbj[2], bbox_total[3],
337 bbox_dkbj[4], bbox_total[5], bbox_dkbj[6], bbox_total[7]]
338 _iou, _key = self.get_top_iou(poly=anchor, ocr_result=self.ocr_results[page_num])
339 bbox, total_text = self.ocr_results[page_num][_key]
340 asp_details_table.append(['附加产品融资贷款本金总金额:', '', total_text])
341 asp_details_table_term['words'] = asp_details_table
342
343 return asp_details_table_term
344
def get_signature(self):
    """Locate the signing-date text on page '0' of the PDF.

    Returns:
        dict: Copy of ``self.item``. For a span in a ``type == 0`` block whose
        text contains '签署日期', ``words`` is the whole span text and
        ``position`` its bbox; the last matching span wins.
    """
    signature = self.item.copy()
    text_spans = (
        span
        for block in self.pdf_info['0']['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    )
    for span in text_spans:
        bbox, text = span['bbox'], span['text']
        if '签署日期' not in text:
            continue
        signature['words'] = text
        signature['position'] = bbox
    return signature
359
def get_somebody(self, top, bottom):
    """Return the name and id found between two marker texts on page '1'.

    The vertical band is bounded by ``bbox[3]`` of the last span containing
    ``top`` and ``bbox[3]`` of the last span containing ``bottom`` (both
    default to 0 when not found). Only ``type == 0`` blocks are read.

    Args:
        top: Text marking the upper edge of the band.
        bottom: Text marking the lower edge of the band.

    Returns:
        tuple: ``(name, id)``, each a copy of ``self.item``. Inside the band,
        a span containing '姓名/名称' fills ``name`` and a span containing
        '自然人身份证件号码/法人执照号码' fills ``id``; ``words`` is the text
        after the last ':' and ``position`` the span's bbox.
    """
    _name = self.item.copy()
    _id = self.item.copy()

    text_spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks'] if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # First pass: fix the upper and lower edge of the band.
    y_top = y_bottom = 0
    for bbox, text in text_spans:
        if top in text:
            y_top = bbox[3]
        if bottom in text:
            y_bottom = bbox[3]

    # Second pass: pick up the customer fields that lie strictly inside it.
    for bbox, text in text_spans:
        if not (y_top < bbox[3] < y_bottom):
            continue
        if '姓名/名称' in text:
            _name['position'] = bbox
            _name['words'] = text.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in text:
            _id['position'] = bbox
            _id['words'] = text.split(':')[-1]
    return _name, _id
394
def get_seller(self):
    """Read the value printed to the right of the dealer label on page '1'.

    The label is the last span whose text is exactly '经销商' or '车辆销售方'.
    A span counts as the value when its horizontal centre lies between the
    label's right edge and the middle of the page, and its vertical centre
    lies within the label's vertical extent. The last such span wins.

    Returns:
        dict: A copy of ``self.item`` with ``position``/``words`` filled in
        when a value is found.
    """
    seller = self.item.copy()
    page = self.pdf_info['1']
    spans = [
        piece
        for blk in page['blocks'] if blk['type'] == 0
        for row in blk['lines']
        for piece in row['spans']
    ]

    # Step 1: locate the label (key).
    label_box = None
    for piece in spans:
        if piece['text'] in ('经销商', '车辆销售方'):
            label_box = piece['bbox']
    if not label_box:
        return seller

    # Step 2: pick the span sitting on the label's row in the left half.
    mid_x = page['width'] * 0.5
    for piece in spans:
        box, content = piece['bbox'], piece['text']
        on_the_right = label_box[2] < np.mean(box[::2]) < mid_x
        if on_the_right and label_box[1] < np.mean(box[1::2]) < label_box[3]:
            seller['position'] = box
            seller['words'] = content
    return seller
421
def get_cldls(self):
    """Read the value printed to the right of the '车辆代理商' label on page '2'.

    The label search stops at the first line that contains a span whose
    stripped text equals '车辆代理商'. The first span whose centre lies to the
    right of the label, left of the page middle and within the label's
    vertical extent is returned immediately.

    Returns:
        dict: A copy of ``self.item`` with ``position``/``words`` filled in
        when a value is found.
    """
    result = self.item.copy()
    page = self.pdf_info['2']

    # Step 1: locate the label (key); stop after the first line that has one.
    label_box = None
    for blk in page['blocks']:
        if blk['type'] != 0:
            continue
        for row in blk['lines']:
            for piece in row['spans']:
                if piece['text'].strip() == '车辆代理商':
                    label_box = piece['bbox']
            if label_box is not None:
                break
        if label_box is not None:
            break

    if not label_box:
        return result

    # Step 2: the first span on the label's row, in the left half, is the value.
    mid_x = page['width'] * 0.5
    for blk in page['blocks']:
        if blk['type'] != 0:
            continue
        for row in blk['lines']:
            for piece in row['spans']:
                box, content = piece['bbox'], piece['text']
                if not label_box[2] < np.mean(box[::2]) < mid_x:
                    continue
                if label_box[1] < np.mean(box[1::2]) < label_box[3]:
                    result['position'] = box
                    result['words'] = content
                    return result
    return result
454
def get_borrower_collection_account(self):
    """Extract the borrower collection account (number, holder, bank) from page '1'.

    All span texts of page '1' are concatenated. Only when the result contains
    '借款人收款账户' are the three values pulled out with regular expressions;
    each value is then matched back to the last span containing it to obtain a
    position.

    Returns:
        tuple: ``(account, account_name, account_bank)``, each a copy of
        ``self.item``.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    def iter_spans():
        # Every span of the type-0 blocks on page '1', in document order.
        for blk in self.pdf_info['1']['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                yield from row['spans']

    def capture(target, pattern, haystack):
        # Take the first regex hit and attach the bbox of the last span holding it.
        hits = re.findall(pattern, haystack)
        if not hits:
            return
        value = hits[0]
        for piece in iter_spans():
            if value in piece['text']:
                target['position'] = piece['bbox']
                target['words'] = value

    page_text = ''.join(piece['text'] for piece in iter_spans())
    # Only the explicit account layout is handled (not the "to be notified" one).
    if '借款人收款账户' in page_text:
        # NOTE(review): both replace() targets look identical in this view;
        # presumably one is a different whitespace character - verify.
        page_text = page_text.replace(' ', '').replace(' ', '')
        capture(account, r'账号:(.*?)户名', page_text)
        capture(account_name, r'户名:(.*?)开户行', page_text)
        capture(account_bank, r'开户行:(.*?)借款人', page_text)
    return account, account_name, account_bank
507
def get_payback_account(self):
    """Extract the repayment account (number, holder, bank) from page '1'.

    All span texts of page '1' are concatenated. When the result contains
    '(13) 还款账户', only the text after its last occurrence is searched. The
    account number and holder are matched back to the last span containing the
    value; the bank is matched back to the last span whose space-stripped text
    contains '开户行:<bank>;'.

    Returns:
        tuple: ``(account, account_name, account_bank)``, each a copy of
        ``self.item``.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    def iter_spans():
        # Every span of the type-0 blocks on page '1', in document order.
        for blk in self.pdf_info['1']['blocks']:
            if blk['type'] != 0:
                continue
            for row in blk['lines']:
                yield from row['spans']

    def capture(target, pattern, haystack, is_hit):
        # Take the first regex hit and attach the bbox of the last matching span.
        hits = re.findall(pattern, haystack)
        if not hits:
            return
        value = hits[0]
        for piece in iter_spans():
            if is_hit(value, piece['text']):
                target['position'] = piece['bbox']
                target['words'] = value

    def contains(value, content):
        return value in content

    def labelled_bank(value, content):
        return f'开户行:{value};' in content.replace(' ', '')

    page_text = ''.join(piece['text'] for piece in iter_spans())
    # Only the explicit account layout is handled (not the "to be notified" one).
    if '(13) 还款账户' in page_text:
        page_text = page_text.split('(13) 还款账户')[-1]
        # NOTE(review): both replace() targets look identical in this view;
        # presumably one is a different whitespace character - verify.
        page_text = page_text.replace(' ', '').replace(' ', '')
        capture(account, r'账号:(.*?)户名', page_text, contains)
        capture(account_name, r'户名:(.*?)开户行', page_text, contains)
        capture(account_bank, r'开户行:(.*?);', page_text, labelled_bank)
    return account, account_name, account_bank
561
def get_repayment_schedule(self):
    """Collect the repayment schedule table from page '2'.

    Span texts are gathered from the span whose text equals '序号' up to (not
    including) the first span containing '以上表格中所列的序号并非还款期数'.
    The gathered cells are cut into rows of five; cutting stops at the first
    row whose second cell equals the row's 1-based index.

    Returns:
        dict: A copy of ``self.item`` whose ``words`` is the list of rows, or
        the template value when no row was collected.
    """
    repayment_schedule = self.item.copy()
    columns = 5  # the table has five columns

    cells = []
    inside_table = False
    for blk in self.pdf_info['2']['blocks']:
        if blk['type'] != 0:
            continue
        for row in blk['lines']:
            for piece in row['spans']:
                content = piece['text']
                if content == '序号':
                    inside_table = True
                if '以上表格中所列的序号并非还款期数' in content:
                    inside_table = False
                if inside_table:
                    cells.append(content)

    rows = []
    for idx in range(len(cells) // columns):
        current = cells[idx * columns:(idx + 1) * columns]
        if current[1] == str(idx + 1):
            break
        rows.append(current)

    if rows:
        repayment_schedule['words'] = rows
    return repayment_schedule
596
def get_signature_role_1(self):
    """Report whether the '借款人(抵押人)' signature region appears to be signed.

    Across all pages, spans are collected from a span containing
    '借款人(抵押人)' up to (not including) the next span containing '日期'.
    More than four collected spans is reported as '有', otherwise '无'.

    Returns:
        dict: A copy of ``self.init_item`` with ``page_num`` (key of the last
        page that contributed a span), ``position`` ([min x, min y, max x,
        max y] over the collected bboxes, or None when nothing was collected)
        and ``words`` ('有' / '无').
    """
    signature_role_1 = self.init_item.copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key in self.pdf_info:
        for block in self.pdf_info[page_key]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '借款人(抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Without any collected box, min()/max() would raise on an empty sequence.
    position = None
    if boxes:
        points = np.array(boxes).reshape((-1, 2))
        position = [min(points[:, 0]), min(points[:, 1]),
                    max(points[:, 0]), max(points[:, 1])]

    signature_role_1['page_num'] = page_num
    signature_role_1['position'] = position
    signature_role_1['words'] = words
    return signature_role_1
631
def get_signature_role_2(self):
    """Report whether the '共同借款人(共同抵押人)' signature region appears to be signed.

    Across all pages, spans are collected from a span containing
    '共同借款人(共同抵押人)' up to (not including) the next span containing
    '日期'. More than four collected spans is reported as '有', otherwise '无'.

    Returns:
        dict: A copy of ``self.init_item`` with ``page_num`` (key of the last
        page that contributed a span), ``position`` ([min x, min y, max x,
        max y] over the collected bboxes, or None when nothing was collected)
        and ``words`` ('有' / '无').
    """
    signature_role_2 = self.init_item.copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key in self.pdf_info:
        for block in self.pdf_info[page_key]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Without any collected box, min()/max() would raise on an empty sequence.
    position = None
    if boxes:
        points = np.array(boxes).reshape((-1, 2))
        position = [min(points[:, 0]), min(points[:, 1]),
                    max(points[:, 0]), max(points[:, 1])]

    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
666
def get_signature_role_3(self):
    """Report whether the '保证人1' signature region appears to be signed.

    On every page whose key is not 0 (as an integer), spans are collected from
    a span containing '保证人1' up to (not including) the next span containing
    '日期'. More than four collected spans is reported as '有', otherwise '无'.

    Returns:
        dict: A copy of ``self.init_item`` with ``page_num`` (key of the last
        page that contributed a span), ``position`` ([min x, min y, max x,
        max y] over the collected bboxes, or None when nothing was collected)
        and ``words`` ('有' / '无').
    """
    signature_role_3 = self.init_item.copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key in self.pdf_info:
        for block in self.pdf_info[page_key]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(page_key) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Without any collected box, min()/max() would raise on an empty sequence.
    position = None
    if boxes:
        points = np.array(boxes).reshape((-1, 2))
        position = [min(points[:, 0]), min(points[:, 1]),
                    max(points[:, 0]), max(points[:, 1])]

    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
701
def get_signature_role_4(self):
    """Report whether the '保证人2' signature region appears to be signed.

    On every page whose key is not 0 (as an integer), spans are collected from
    a span containing '保证人2' up to (not including) the next span containing
    '日期'. More than four collected spans is reported as '有', otherwise '无'.

    Returns:
        dict: A copy of ``self.init_item`` with ``page_num`` (key of the last
        page that contributed a span), ``position`` ([min x, min y, max x,
        max y] over the collected bboxes, or None when nothing was collected)
        and ``words`` ('有' / '无').
    """
    signature_role_4 = self.init_item.copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key in self.pdf_info:
        for block in self.pdf_info[page_key]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(page_key) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Without any collected box, min()/max() would raise on an empty sequence.
    position = None
    if boxes:
        points = np.array(boxes).reshape((-1, 2))
        position = [min(points[:, 0]), min(points[:, 1]),
                    max(points[:, 0]), max(points[:, 1])]

    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
736
def get_signature_role_5(self):
    """Report whether the witness ('见证人签字') signature region appears to be signed.

    On every page whose key is not 0 (as an integer), spans are collected from
    a span containing '见证人签字' up to (not including) the next span
    containing '年'. More than four collected spans is reported as '有',
    otherwise '无'.

    Returns:
        dict: A copy of ``self.init_item`` with ``page_num`` (key of the last
        page that contributed a span), ``position`` ([min x, min y, max x,
        max y] over the collected bboxes, or None when nothing was collected)
        and ``words`` ('有' / '无').
    """
    signature_role_5 = self.init_item.copy()
    # Locate the signature region first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for page_key in self.pdf_info:
        for block in self.pdf_info[page_key]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(page_key) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = page_key
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Without any collected box, min()/max() would raise on an empty sequence.
    position = None
    if boxes:
        points = np.array(boxes).reshape((-1, 2))
        position = [min(points[:, 0]), min(points[:, 1]),
                    max(points[:, 0]), max(points[:, 1])]

    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
772
def get_last_page_signature(self, page_num, top, bottom):
    """Read a signer name and signing date between two markers on one page.

    The window is bounded by ``bbox[1]`` of the last span containing ``top``
    and of the last span containing ``bottom``; nothing is extracted unless
    both markers are present. Within the window, a span containing '签署日期'
    yields the name (text before the first space) and the date (text after the
    last ':'). The last matching span wins.

    Args:
        page_num: Key of the page in ``self.pdf_info``.
        top: Marker text for the upper boundary.
        bottom: Marker text for the lower boundary.

    Returns:
        tuple: ``(signature_name, signature_date)``, each a copy of
        ``self.item``; both share the matching span's bbox as ``position``.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()

    spans = [
        piece
        for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0
        for row in blk['lines']
        for piece in row['spans']
    ]

    y_upper = None
    y_lower = None
    for piece in spans:
        box, content = piece['bbox'], piece['text']
        if top in content:
            y_upper = box[1]
        if bottom in content:
            y_lower = box[1]

    if y_upper is None or y_lower is None:
        return signature_name, signature_date

    for piece in spans:
        box, content = piece['bbox'], piece['text']
        if '签署日期' not in content:
            continue
        if int(y_upper) < np.mean(box[1::2]) < int(y_lower):
            signature_name['words'] = content.split(' ')[0]
            signature_name['position'] = box
            signature_date['words'] = content.split(':')[-1]
            signature_date['position'] = box
    return signature_name, signature_date
804
def get_info(self):
    """Run every field extractor and assemble the per-page result.

    The contract is flagged as an ASP product when any OCR text of page '0'
    contains '附加产品融资贷款本金总金额'. ``self.gen_init_result`` is then
    called with that flag, and, provided ``self.ocr_results`` has at most 8
    pages, the ``page_N`` sections of ``self.init_result`` are filled in.

    Note: in ``self.pdf_info`` the blocks whose ``type`` is 0 are the ones the
    extractors read ``lines``/``spans`` from; all other block types are
    skipped.

    Returns:
        dict: ``{"is_asp": bool, "page_info": self.init_result}``.
    """
    # A contract is an ASP product when page '0' mentions the add-on principal total.
    for _bbox, text in self.ocr_results['0'].values():
        if '附加产品融资贷款本金总金额' in text:
            self.is_asp = True

    self.gen_init_result(self.is_asp)

    # Samples of the 8.5 template spill onto extra pages and cannot be parsed yet.
    if len(self.ocr_results) <= 8:
        pages = self.init_result

        #######################################
        # Page 1
        pages['page_1']['合同编号'] = self.get_contract_no(page_num='0')
        pages['page_1']['所购车辆价格'] = self.get_vehicle_price()
        pages['page_1']['车架号'] = self.get_vin()
        # For ASP products the principal also carries the vehicle / add-on split.
        upper, lower, asp_1, asp_2 = self.get_loan_principal()
        pages['page_1']['贷款本金金额']['大写'] = upper
        pages['page_1']['贷款本金金额']['小写'] = lower
        pages['page_1']['贷款本金金额']['车辆贷款本金金额'] = asp_1
        pages['page_1']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
        pages['page_1']['贷款期限'] = self.get_loan_term()
        # Add-on principal breakdown (ASP table).
        pages['page_1']['附加产品融资贷款本金总金额明细'] = self.get_asp_details(page_num='0')
        pages['page_1']['借款人签字及时间'] = self.get_signature()

        #######################################
        # Page 2
        # NOTE(review): the page-2 contract number is read from page '0' while
        # every other page-2 field uses page '1' - confirm this is intended.
        pages['page_2']['合同编号'] = self.get_contract_no(page_num='0')
        # Borrower / mortgagor, with fallbacks for other template versions.
        borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人:')
        # Fallback for the 8.1 template.
        if borrower_name['words'] is None:
            borrower_name, borrower_id = self.get_somebody(top='借款人及抵押人:', bottom='共同借款人及共同抵押人:')
        # Fallback for the separated vehicle-loan template.
        if borrower_name['words'] is None:
            borrower_name, borrower_id = self.get_somebody(top='借款人:', bottom='共同借款人及抵押人:')
        pages['page_2']['借款人及抵押人']['name'] = borrower_name
        pages['page_2']['借款人及抵押人']['id'] = borrower_id
        # Co-borrower / co-mortgagor.
        co_borrower_name, co_borrower_id = self.get_somebody(top='共同借款人:', bottom='保证人1:')
        pages['page_2']['共同借款人及共同抵押人']['name'] = co_borrower_name
        pages['page_2']['共同借款人及共同抵押人']['id'] = co_borrower_id
        # Guarantor 1.
        first_guarantor_name, first_guarantor_id = self.get_somebody(top='保证人1:', bottom='保证人2:')
        pages['page_2']['保证人1']['name'] = first_guarantor_name
        pages['page_2']['保证人1']['id'] = first_guarantor_id
        # Guarantor 2.
        second_guarantor_name, second_guarantor_id = self.get_somebody(top='保证人2:', bottom='第一章')
        pages['page_2']['保证人2']['name'] = second_guarantor_name
        pages['page_2']['保证人2']['id'] = second_guarantor_id
        pages['page_2']['所购车辆价格'] = self.get_vehicle_price(page_num='1')
        pages['page_2']['车架号'] = self.get_vin(page_num='1')
        pages['page_2']['经销商'] = self.get_seller()
        upper, lower, asp_1, asp_2 = self.get_loan_principal(page_num='1')
        pages['page_2']['贷款本金金额']['大写'] = upper
        pages['page_2']['贷款本金金额']['小写'] = lower
        pages['page_2']['贷款本金金额']['车辆贷款本金金额'] = asp_1
        pages['page_2']['贷款本金金额']['附加产品融资贷款本金总金额'] = asp_2
        pages['page_2']['贷款期限'] = self.get_loan_term(page_num='1')
        # Standard interest rate of the contract for the current period.
        pages['page_2']['标准利率'] = self.get_standard_rate(page_num='1')
        # Borrower collection account (added in the 202212 release).
        account, account_name, account_bank = self.get_borrower_collection_account()
        pages['page_2']['借款人收款账户']['账号'] = account
        pages['page_2']['借款人收款账户']['户名'] = account_name
        pages['page_2']['借款人收款账户']['开户行'] = account_bank
        # Repayment account.
        account, account_name, account_bank = self.get_payback_account()
        pages['page_2']['还款账户']['账号'] = account
        pages['page_2']['还款账户']['户名'] = account_name
        pages['page_2']['还款账户']['开户行'] = account_bank

        #######################################
        # Page 3
        pages['page_3']['合同编号'] = self.get_contract_no(page_num='2')
        pages['page_3']['还款计划表'] = self.get_repayment_schedule()
        pages['page_3']['车辆代理商'] = self.get_cldls()

        #######################################
        # Page 4
        pages['page_4']['合同编号'] = self.get_contract_no(page_num='3')
        pages['page_4']['附加产品融资贷款本金总金额明细'] = self.get_asp_details(page_num='3')

        #######################################
        # Page 5 and Page 6
        pages['page_5']['合同编号'] = self.get_contract_no(page_num='4')
        pages['page_6']['合同编号'] = self.get_contract_no(page_num='5')

        #######################################
        # Page 7 (and Page 8 for ASP products)
        pages['page_7']['合同编号'] = self.get_contract_no(page_num='6')
        if self.is_asp:
            pages['page_8']['合同编号'] = self.get_contract_no(page_num='7')
            sign_page, sign_key = '7', 'page_8'
        else:
            sign_page, sign_key = '6', 'page_7'

        # (result field, top marker, bottom marker, fallback markers or None).
        # The signature page has the same layout in both variants; only its
        # page index and result section differ.
        signature_slots = [
            ('主借人签字', '合同编号', '共同借款人', ('合同编号', '共同借款人(抵押人)')),
            ('共借人签字', '共同借款人', '保证人1', ('共同借款人(抵押人)', '保证人1')),
            ('保证人1签字', '保证人1', '保证人2', None),
            ('保证人2签字', '保证人2', '在本人面前亲笔签署本合同', None),
            ('见证人签字', '在本人面前亲笔签署本合同', '以下无正文', None),
        ]
        for field, top, bottom, fallback in signature_slots:
            signature_name, signature_date = self.get_last_page_signature(
                page_num=sign_page, top=top, bottom=bottom)
            if fallback is not None and signature_name['words'] is None:
                signature_name, signature_date = self.get_last_page_signature(
                    page_num=sign_page, top=fallback[0], bottom=fallback[1])
            pages[sign_key][field]['签字'] = signature_name
            pages[sign_key][field]['日期'] = signature_date

    # Final output shape.
    new_results = {"is_asp": self.is_asp,
                   "page_info": self.init_result
                   }
    return new_results
...\ No newline at end of file ...\ No newline at end of file
1 import re
2 import numpy as np
3 from fuzzywuzzy import fuzz
4 from shapely.geometry import Polygon
5
def caculate_iou(g, p):
    """Return the intersection-over-union of two polygons.

    Args:
        g: Flat coordinate sequence (x0, y0, x1, y1, ...) of the first polygon.
        p: Flat coordinate sequence of the second polygon.

    Returns:
        float: ``intersection area / union area``; 0.0 when the union has zero
        area (both polygons degenerate), where the plain division would raise
        ZeroDivisionError.
    """
    g = Polygon(np.array(g).reshape((-1, 2)))
    p = Polygon(np.array(p).reshape((-1, 2)))
    inter = g.intersection(p).area
    union = g.area + p.area - inter
    if union == 0:
        return 0.0
    return inter / union
12
def get_table_info(bbox_1, bbox_2, ocr_result):
    """Return the OCR text found at the crossing of one box's row and another's column.

    The probe polygon takes its x coordinates (even indices) from ``bbox_2``
    and its y coordinates (odd indices) from ``bbox_1``. Every OCR entry whose
    polygon overlaps the probe (IoU > 0) is a candidate; the last one wins.

    Args:
        bbox_1: 8-value polygon supplying the y coordinates.
        bbox_2: 8-value polygon supplying the x coordinates.
        ocr_result: Iterable of entries indexable as ``(polygon, text)``.

    Returns:
        The text of the last overlapping entry, or '' when none overlaps.
    """
    probe = [(bbox_2 if idx % 2 == 0 else bbox_1)[idx] for idx in range(8)]
    overlapping = [entry[1] for entry in ocr_result if caculate_iou(probe, entry[0]) > 0]
    return overlapping[-1] if overlapping else ''
22
23 class Finder:
24
def __init__(self, pdf_info):
    """Store the parsed PDF data and build the empty result templates.

    Args:
        pdf_info: Per-page parsed PDF structure; the other methods index it
            as ``pdf_info[page]['blocks']``.

    Every template field gets its own copy of ``self.item``. Sharing a single
    dict between all keys would make an in-place write to one field show up
    in every other field.
    """
    self.pdf_info = pdf_info
    self.item = {"words": None,
                 "page": None,
                 "position": None,
                 }

    # Output template of the main contract.
    init_result_keys = [
        "合同编号",
        "承租人-姓名",
        "承租人-证件号码",
        "承租人-法定代表人或授权代表",

        "共同承租人-姓名",
        "共同承租人-证件号码",
        "共同承租人-法定代表人或授权代表",

        "保证人1-姓名",
        "保证人1-证件号码",
        "保证人1-法定代表人或授权代表",

        "保证人2-姓名",
        "保证人2-证件号码",
        "保证人2-法定代表人或授权代表",
        "保证人3-姓名",
        "保证人3-证件号码",
        "保证人3-法定代表人或授权代表",
        "合同编号(正文)",
        "车辆识别代码",
        "车辆卖方(经销商)",
        "车辆代理商",
        "车辆原始销售价格(《机动车销售统一发票》所列金额)",
        "车辆附加产品明细表",
        "融资成本总额",
        "租期",
        "付款计划表",
        "收款银行账户-户名",
        "收款银行账户-银行账号",
        "收款银行账户-开户行",
        "银行账户-户名",
        "银行账户-银行账号",
        "银行账户-开户行",
        "签字页-承租人姓名",
        "签字页-承租人签章",

        "签字页-共同承租人姓名",
        "签字页-共同承租人签章",

        "签字页-保证人1姓名",
        "签字页-保证人1签章",

        "签字页-保证人2姓名",
        "签字页-保证人2签章",
        "签字页-保证人3姓名",
        "签字页-保证人3签章",
    ]
    self.init_result = {key: self.item.copy() for key in init_result_keys}

    # Output template of the vehicle disposal agreement (different fields).
    init_result_1_keys = [
        "合同编号",
        "承租人-姓名",
        "承租人-证件号码",
        "销售经销商",
        "合同编号(正文)",
        "签字页-承租人姓名",
        "签字页-承租人证件号码",
        "签字页-承租人签章",
        "签字页-销售经销商",
        "签字页-销售经销商签章",
    ]
    self.init_result_1 = {key: self.item.copy() for key in init_result_1_keys}

    # Output template of the vehicle lease mortgage contract.
    init_result_2_keys = [
        "合同编号",
        "合同编号(正文)",
        "抵押人姓名/名称",
        "抵押人证件号码",
        "抵押人配偶姓名/名称",
        "抵押人配偶证件号码",
        "车辆识别代码",
        "租金总额",
        "融资租赁期限",
        "签字页-抵押人姓名",
        "签字页-抵押人签章",
        "签字页-抵押人配偶姓名",
        "签字页-抵押人配偶签章",
    ]
    self.init_result_2 = {key: self.item.copy() for key in init_result_2_keys}
109
def get_contract_no(self, page_num):
    """Read the contract number printed on the given page.

    The text after the last ':' of the last span containing '合同编号:' is
    taken as the number. When that text is empty, the number is looked up in
    a separate span instead: any span containing 'CH' whose ``bbox[1]`` lies
    above the bottom edge (``position[3]``) of the currently recorded span.

    Args:
        page_num (str): Key of the page in ``self.pdf_info``.

    Returns:
        dict: A copy of ``self.item`` with ``words``, ``page`` and ``position``.
    """
    contract_no = self.item.copy()
    spans = [
        piece
        for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0
        for row in blk['lines']
        for piece in row['spans']
    ]

    for piece in spans:
        box, content = piece['bbox'], piece['text']
        if '合同编号:' not in content:
            continue
        contract_no['position'] = box
        contract_no['page'] = page_num
        contract_no['words'] = content.split(':')[-1]

    if contract_no['words'] == '':
        # The label span carried no value; the number sits in its own span.
        for piece in spans:
            box, content = piece['bbox'], piece['text']
            # The reference position moves with every accepted span.
            if box[1] < contract_no['position'][3] and 'CH' in content:
                contract_no['position'] = box
                contract_no['page'] = page_num
                contract_no['words'] = content
    return contract_no
144
def get_vehicle_price(self, page_num='0'):
    """Read the purchased-vehicle price from the given page.

    The last span containing '所购车辆价格为人民币' is used; the price is the
    text after the last '币' in that span.

    Args:
        page_num: Key of the page in ``self.pdf_info`` (default '0').

    Returns:
        dict: A copy of ``self.item`` with ``position`` and ``words`` set when
        a matching span exists.
    """
    vehicle_price = self.item.copy()
    text_blocks = (blk for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0)
    for blk in text_blocks:
        for row in blk['lines']:
            for piece in row['spans']:
                box, content = piece['bbox'], piece['text']
                if '所购车辆价格为人民币' not in content:
                    continue
                vehicle_price['position'] = box
                vehicle_price['words'] = content.split('币')[-1]
    return vehicle_price
158
def get_contract_no_one(self):
    """Find the contract number quoted in the body text of any page.

    The span texts of each page are concatenated (the number may be split
    across spans) and spaces removed. Pages are tried in order; on each page
    the pattern '合同编号:[...]' is tried first (the whole bracketed phrase is
    kept), then '编号为...的'. The first page that matches decides the result.

    Returns:
        dict: A copy of ``self.item`` with ``words`` (whitespace and closing
        parentheses removed), ``page`` and ``position`` (always None, since
        the match spans several text spans). When no page matches, the
        untouched copy is returned so callers always receive an item dict.
    """
    contract_no = self.item.copy()
    for pno in self.pdf_info:
        page_text = ''.join(
            span['text']
            for block in self.pdf_info[pno]['blocks'] if block['type'] == 0
            for line in block['lines']
            for span in line['spans']
        ).replace(' ', '')

        match = re.search(r'(合同编号:\[(.*?)\])', page_text)
        if match is None:
            # A trailing ')' captured by this lazy group is stripped below, so
            # a separate '...)的' pattern can never match where this one fails.
            match = re.search(r'编号为(.*?)的', page_text)
        if match is None:
            continue

        words = re.sub(r"\s", "", match.group(1))
        contract_no['position'] = None
        contract_no['page'] = pno
        # Drop closing parentheses in both ASCII and full-width form.
        contract_no['words'] = words.replace(")", "").replace(")", "")
        return contract_no
    return contract_no
198
def get_key_value(self, key, page_num=None):
    """Return the value printed after ``key`` in a span.

    Every span containing ``key`` is considered and the last one scanned wins.
    The value is the text after the last ':' of the span, with '。' and all
    whitespace removed.

    Args:
        key: Text to look for inside span texts.
        page_num: Key of a single page to search; when None, every page of
            ``self.pdf_info`` is searched in order.

    Returns:
        dict: A copy of ``self.item`` with ``position``, ``page`` and ``words``
        set when a span matched.
    """
    value = self.item.copy()
    pages = self.pdf_info if page_num is None else [page_num]
    for pno in pages:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if key not in text:
                        continue
                    words = text.split(':')[-1].replace("。", "")
                    value['position'] = bbox
                    value['page'] = pno
                    value['words'] = re.sub(r"\s", "", words)
    return value
231
def get_loan_principal(self, page_num='0'):
    """Extract the loan principal amounts from the given page.

    Args:
        page_num: Key of the page in ``self.pdf_info`` (default '0').

    Returns:
        tuple: ``(upper, lower, asp_1, asp_2)``, each a copy of ``self.item``:
        ``upper`` is the amount in Chinese numerals (a span whose fuzzy ratio
        against the numeral keywords exceeds 15), ``lower`` the text after
        '¥' of a span containing '小写:¥', and ``asp_1`` / ``asp_2`` the
        bracketed '人民币:小写:[...]' values located above / below the span
        whose text is exactly '附加产品融资贷款本金总金额'.
    """
    chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
                        '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
    keyword_string = ''.join(chinese_keywords)  # loop-invariant
    upper = self.item.copy()
    lower = self.item.copy()
    asp_1 = self.item.copy()
    asp_2 = self.item.copy()
    anchor_bbox = None
    for block in self.pdf_info[page_num]['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                bbox, text = span['bbox'], span['text']
                if fuzz.ratio(keyword_string, text) > 15:
                    # The checks below see this trimmed text for this span.
                    text = text.split(':')[-1].strip()
                    upper['position'] = bbox
                    upper['words'] = text
                if '小写:¥' in text:
                    words = text.split('¥')[-1].strip()
                    lower['position'] = bbox
                    lower['words'] = words
                if '附加产品融资贷款本金总金额' == text:
                    anchor_bbox = bbox
    if anchor_bbox:
        anchor_y = np.mean(anchor_bbox[1::2])
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '人民币:小写:' not in text:
                        continue
                    found = re.search(r'人民币:小写:\[(.*)\]', text)
                    if found is None:
                        # Marker present but no bracketed amount: nothing to
                        # record (indexing an empty findall() would raise).
                        continue
                    span_y = np.mean(bbox[1::2])
                    if span_y < anchor_y:
                        asp_1['position'] = bbox
                        asp_1['words'] = found.group(1)
                    if span_y > anchor_y:
                        asp_2['position'] = bbox
                        asp_2['words'] = found.group(1)
    return upper, lower, asp_1, asp_2
272
def get_loan_term(self, page_num='0'):
    """Read the loan term (in months) from the given page.

    The span texts of the page are concatenated and searched for
    '贷款期限<digits>个月'. The digits become ``words``; ``position`` is the
    bbox of the last span containing '<digits>个月'.

    Args:
        page_num: Key of the page in ``self.pdf_info`` (default '0').

    Returns:
        dict: A copy of ``self.item``; filled in only when the pattern matched
        and a span carrying the value was found.
    """
    loan_term = self.item.copy()
    spans = [
        piece
        for blk in self.pdf_info[page_num]['blocks'] if blk['type'] == 0
        for row in blk['lines']
        for piece in row['spans']
    ]
    joined = ''.join(piece['text'] for piece in spans)
    found = re.search(r'贷款期限(\d+)个月', joined)
    if not found:
        return loan_term

    months = found.group(1)
    needle = f'{months}个月'
    for piece in spans:
        if needle in piece['text']:
            loan_term['position'] = piece['bbox']
            loan_term['words'] = months
    return loan_term
296
def get_asp_details(self, page_num):
    """Collect the add-on product principal breakdown table from a page.

    Span texts are gathered from the span whose text equals
    '附加产品融资贷款本金总金额明细' up to (not including) the first span
    containing '第二条' or '征信管理'. The first gathered cell forms a
    one-cell title row; the following cells are cut into rows of three, and
    an incomplete trailing group is dropped.

    Args:
        page_num: Key of the page in ``self.pdf_info``.

    Returns:
        dict: A copy of ``self.item`` whose ``words`` is the list of rows, or
        the template value when no cell was gathered.
    """
    asp_details_table_term = self.item.copy()

    cells = []
    inside_table = False
    for blk in self.pdf_info[page_num]['blocks']:
        if blk['type'] != 0:
            continue
        for row in blk['lines']:
            for piece in row['spans']:
                content = piece['text']
                if content == '附加产品融资贷款本金总金额明细':
                    inside_table = True
                if '第二条' in content or '征信管理' in content:
                    inside_table = False
                if inside_table:
                    cells.append(content)

    rows = []
    row_count = (len(cells) + 2) // 3
    for idx in range(row_count):
        if idx == 0:
            rows.append([cells[0]])
        else:
            start = idx * 3 - 2
            rows.append(cells[start:start + 3])

    if rows:
        asp_details_table_term['words'] = rows
    return asp_details_table_term
330
def get_signature(self):
    """Find the signing-date span on page ``'0'``.

    Returns:
        A copy of ``self.item``; when a span containing ``签署日期`` exists,
        ``'words'`` is that span's full text and ``'position'`` its bbox.
        With several matches the last one wins.
    """
    result = self.item.copy()

    matches = [
        span
        for block in self.pdf_info['0']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
        if '签署日期' in span['text']
    ]
    if matches:
        chosen = matches[-1]
        result['words'] = chosen['text']
        result['position'] = chosen['bbox']
    return result
345
def get_somebody(self, top, bottom):
    """Return the customer name and ID found between two marker texts.

    Only page ``'1'`` is inspected.  The vertical window is delimited by the
    bottom edges (``bbox[3]``) of the last spans containing ``top`` and
    ``bottom``; each boundary stays 0 when its marker is absent.

    Args:
        top: Text identifying the span that marks the upper boundary.
        bottom: Text identifying the span that marks the lower boundary.

    Returns:
        Tuple ``(name, id)`` of copies of ``self.item``.  For spans inside
        the window containing ``姓名/名称`` or ``自然人身份证件号码/法人执照号码``,
        ``'words'`` is the part after the last ``:`` and ``'position'`` the
        span bbox.
    """
    party_name = self.item.copy()
    party_id = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # First fix the upper and lower boundaries of the region.
    upper = 0
    lower = 0
    for bbox, text in spans:
        if top in text:
            upper = bbox[3]
        if bottom in text:
            lower = bbox[3]

    # Then pick the labelled values that sit strictly inside the region.
    for bbox, text in spans:
        if not upper < bbox[3] < lower:
            continue
        if '姓名/名称' in text:
            party_name['position'] = bbox
            party_name['words'] = text.split(':')[-1]
        if '自然人身份证件号码/法人执照号码' in text:
            party_id['position'] = bbox
            party_id['words'] = text.split(':')[-1]
    return party_name, party_id
379
def get_seller(self):
    """Locate the dealer value printed to the right of the ``经销商`` key.

    Only page ``'1'`` is inspected.  The key is the last span whose text is
    exactly ``经销商``.  A span is taken as the value when its horizontal
    centre lies between the key's right edge and half of the page width and
    its vertical centre lies within the key's vertical extent; the last such
    span wins.

    Returns:
        A copy of ``self.item`` with ``'position'`` and ``'words'`` filled
        in when a value span is found.
    """
    seller = self.item.copy()
    page = self.pdf_info['1']

    spans = [
        (span['bbox'], span['text'])
        for block in page['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Find the key first.
    key_bbox = None
    for bbox, text in spans:
        if text == '经销商':
            key_bbox = bbox
    if not key_bbox:
        return seller

    # With the key located, match the value relative to it.
    half_width = page['width'] * 0.5
    for bbox, text in spans:
        right_of_key = key_bbox[2] < np.mean(bbox[::2]) < half_width
        if right_of_key and key_bbox[1] < np.mean(bbox[1::2]) < key_bbox[3]:
            seller['position'] = bbox
            seller['words'] = text
    return seller
406
def get_payback_account(self):
    """Extract the repayment account number, holder name and bank.

    Only page ``'1'`` is inspected.  Nothing is extracted unless the joined
    page text contains ``☑账号`` (the explicitly ticked account option; the
    "to be notified separately" variant is not output).  Values are taken
    from the space-stripped joined text with three regular expressions and
    then located in the individual spans; for each field the last matching
    span supplies the position.

    Returns:
        Tuple ``(account, account_name, account_bank)`` of copies of
        ``self.item`` with ``'position'`` and ``'words'`` filled in where a
        value was located.
    """
    account = self.item.copy()
    account_name = self.item.copy()
    account_bank = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info['1']['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]
    joined = ''.join(text for _, text in spans)

    # Decide which account layout this is; only the ticked one is handled.
    if '☑账号' not in joined:
        return account, account_name, account_bank

    compact = joined.replace(' ', '')

    def fill(target, value, is_hit):
        # Record `value` on `target` at every span accepted by `is_hit`.
        for bbox, text in spans:
            if is_hit(text):
                target['position'] = bbox
                target['words'] = value

    numbers = re.findall(r'账号:(.*)户名', compact)
    if numbers:
        number = numbers[0]
        fill(account, number, lambda text: f'{number}' in text)

    holders = re.findall(r'户名:(.*)开户行', compact)
    if holders:
        holder = holders[0]
        fill(account_name, holder, lambda text: f'{holder}' in text)

    banks = re.findall(r'开户行:(.*);', compact)
    if banks:
        bank = banks[0]
        # The bank span is compared with its own spaces removed as well.
        fill(account_bank, bank, lambda text: f'开户行:{bank};' in text.replace(' ', ''))

    return account, account_name, account_bank
459
def get_repayment_schedule(self):
    """Extract the rent column of the repayment schedule table.

    All pages are scanned in order.  Collection starts after a span whose
    stripped text is ``61.`` (its left edge becomes the left bound and its
    page is recorded) and stops at a span containing ``以上表格中所列序号``.
    The right bound is the right edge of the last span containing
    ``剩余融资``.  While collecting, spans are dropped when they contain a
    CJK character, contain exactly one run of digits (the ``1.``..``61.``
    row labels), or start outside the left/right bounds.  The remaining
    texts are grouped four per row and the third cell of each row is kept.

    Returns:
        A copy of ``self.item`` with ``'words'`` set to
        ``[['序号', '租金'], [row_number, rent], ...]`` and ``'page'`` set to
        the page key where ``61.`` was seen (``None`` if never seen).
    """
    repayment_schedule = self.item.copy()

    # Compiled once instead of per span.  Raw strings also avoid the invalid
    # "\d" escape of the previous non-raw literal, which newer Python
    # versions warn about.
    cjk_char = re.compile(r'[\u4e00-\u9fff]')
    digit_run = re.compile(r'\d+')

    cells = []
    in_table = False
    page = None
    left = 0
    right = 0
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '剩余融资' in text:
                        right = bbox[2]
                    if '以上表格中所列序号' in text:
                        in_table = False
                    if in_table:
                        # Drop anything containing Chinese characters.
                        if cjk_char.search(text):
                            continue
                        # Drop the "1." .. "61." row labels.
                        if len(digit_run.findall(text)) == 1:
                            continue
                        # Drop spans that start outside the table columns.
                        if not left < bbox[0] < right:
                            continue
                        cells.append(text)

                    if text.strip() == "61.":
                        page = pno
                        in_table = True
                        left = bbox[0]

    # Full column set would be: number, financing cost, financing fee, rent,
    # remaining financing cost.  Only the number and the rent are reported.
    table = [['序号', '租金']]
    columns = 4
    for row_index in range(len(cells) // columns):
        row = cells[row_index * columns:(row_index + 1) * columns]
        table.append([str(row_index + 1), row[2]])

    repayment_schedule['words'] = table
    repayment_schedule['page'] = page
    return repayment_schedule
512
def get_signature_role_1(self):
    """Find the signing-date span across all pages.

    Returns:
        A copy of ``self.item``; when a span containing ``签署日期`` exists,
        ``'position'``, ``'page'`` and ``'words'`` describe the last such
        span in page order.
    """
    result = self.item.copy()

    matches = [
        (pno, span)
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
        if '签署日期' in span['text']
    ]
    if matches:
        pno, span = matches[-1]
        result['position'] = span['bbox']
        result['page'] = pno
        result['words'] = span['text']
    return result
527
def get_signature_role_2(self):
    """Report whether the co-borrower signature area holds extra text.

    All pages are scanned in order.  Spans are collected starting at a span
    containing ``共同借款人(共同抵押人)`` and stopping at a span containing
    ``日期`` (the stop span itself is not collected).  More than four
    collected spans is reported as ``'有'``, otherwise ``'无'`` (presumably
    the extra spans are the signature itself; verify against real documents).

    Returns:
        A copy of ``self.init_item`` with:
        ``'page_num'``: page key of the last collected span, or ``None``;
        ``'position'``: ``[xmin, ymin, xmax, ymax]`` enclosing every
        collected span, or ``None`` when nothing was collected;
        ``'words'``: ``'有'`` or ``'无'``.
    """
    signature_role_2 = self.init_item.copy()

    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '共同借款人(共同抵押人)' in text:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Without this guard a document lacking the marker crashed with
    # "min() arg is an empty sequence"; now the position simply stays None.
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        xs, ys = corners[:, 0], corners[:, 1]
        position = [min(xs), min(ys), max(xs), max(ys)]

    signature_role_2['page_num'] = page_num
    signature_role_2['position'] = position
    signature_role_2['words'] = words
    return signature_role_2
562
def get_signature_role_3(self):
    """Report whether the guarantor-1 signature area holds extra text.

    All pages are scanned in order.  Spans are collected starting at a span
    containing ``保证人1`` on any page whose key is not 0 (as an integer) and
    stopping at a span containing ``日期`` (the stop span itself is not
    collected).  More than four collected spans is reported as ``'有'``,
    otherwise ``'无'``.

    Returns:
        A copy of ``self.init_item`` with:
        ``'page_num'``: page key of the last collected span, or ``None``;
        ``'position'``: ``[xmin, ymin, xmax, ymax]`` enclosing every
        collected span, or ``None`` when nothing was collected;
        ``'words'``: ``'有'`` or ``'无'``.
    """
    signature_role_3 = self.init_item.copy()

    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人1' in text and int(pno) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Without this guard a document lacking the marker crashed with
    # "min() arg is an empty sequence"; now the position simply stays None.
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        xs, ys = corners[:, 0], corners[:, 1]
        position = [min(xs), min(ys), max(xs), max(ys)]

    signature_role_3['page_num'] = page_num
    signature_role_3['position'] = position
    signature_role_3['words'] = words
    return signature_role_3
597
def get_signature_role_4(self):
    """Report whether the guarantor-2 signature area holds extra text.

    All pages are scanned in order.  Spans are collected starting at a span
    containing ``保证人2`` on any page whose key is not 0 (as an integer) and
    stopping at a span containing ``日期`` (the stop span itself is not
    collected).  More than four collected spans is reported as ``'有'``,
    otherwise ``'无'``.

    Returns:
        A copy of ``self.init_item`` with:
        ``'page_num'``: page key of the last collected span, or ``None``;
        ``'position'``: ``[xmin, ymin, xmax, ymax]`` enclosing every
        collected span, or ``None`` when nothing was collected;
        ``'words'``: ``'有'`` or ``'无'``.
    """
    signature_role_4 = self.init_item.copy()

    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '保证人2' in text and int(pno) != 0:
                        region = True
                    if '日期' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Without this guard a document lacking the marker crashed with
    # "min() arg is an empty sequence"; now the position simply stays None.
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        xs, ys = corners[:, 0], corners[:, 1]
        position = [min(xs), min(ys), max(xs), max(ys)]

    signature_role_4['page_num'] = page_num
    signature_role_4['position'] = position
    signature_role_4['words'] = words
    return signature_role_4
632
def get_signature_role_5(self):
    """Report whether the witness signature area holds extra text.

    All pages are scanned in order.  Spans are collected starting at a span
    containing ``见证人签字`` on any page whose key is not 0 (as an integer)
    and stopping at a span containing ``年`` (the stop span itself is not
    collected).  More than four collected spans is reported as ``'有'``,
    otherwise ``'无'``.

    Returns:
        A copy of ``self.init_item`` with:
        ``'page_num'``: page key of the last collected span, or ``None``;
        ``'position'``: ``[xmin, ymin, xmax, ymax]`` enclosing every
        collected span, or ``None`` when nothing was collected;
        ``'words'``: ``'有'`` or ``'无'``.
    """
    signature_role_5 = self.init_item.copy()

    # Locate the signature area first.
    texts = []
    boxes = []
    page_num = None
    region = False
    for pno in self.pdf_info:
        for block in self.pdf_info[pno]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    bbox, text = span['bbox'], span['text']
                    if '见证人签字' in text and int(pno) != 0:
                        region = True
                    if '年' in text:
                        region = False
                    if region:
                        page_num = pno
                        texts.append(text)
                        boxes.append(bbox)

    words = '有' if len(texts) > 4 else '无'

    # Without this guard a document lacking the marker crashed with
    # "min() arg is an empty sequence"; now the position simply stays None.
    position = None
    if boxes:
        corners = np.array(boxes).reshape((-1, 2))
        xs, ys = corners[:, 0], corners[:, 1]
        position = [min(xs), min(ys), max(xs), max(ys)]

    signature_role_5['page_num'] = page_num
    signature_role_5['position'] = position
    signature_role_5['words'] = words
    return signature_role_5
668
def get_last_page_signature(self, page_num, top, bottom):
    """Extract signer name and signing date between two markers on a page.

    The top edges (``bbox[1]``) of the last spans containing ``top`` and
    ``bottom`` delimit a vertical window.  Within it, a span containing
    ``签署日期`` whose vertical centre lies strictly inside the window (the
    bounds are truncated with ``int``) is split into a name (text before
    the first space) and a date (text after the last ``:``).

    Args:
        page_num: Key of the page inside ``self.pdf_info``.
        top: Text identifying the span that marks the upper boundary.
        bottom: Text identifying the span that marks the lower boundary.

    Returns:
        Tuple ``(signature_name, signature_date)`` of copies of
        ``self.item``; both get ``'words'`` and ``'position'`` (the bbox of
        the matched span) when a match is found.
    """
    signature_name = self.item.copy()
    signature_date = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    anchor_top = None
    anchor_bottom = None
    for bbox, text in spans:
        if top in text:
            anchor_top = bbox[1]
        if bottom in text:
            anchor_bottom = bbox[1]

    if anchor_top is None or anchor_bottom is None:
        return signature_name, signature_date

    for bbox, text in spans:
        if '签署日期' in text and int(anchor_top) < np.mean(bbox[1::2]) < int(anchor_bottom):
            signature_name['words'] = text.split(' ')[0]
            signature_name['position'] = bbox
            signature_date['words'] = text.split(':')[-1]
            # Previously the name's position was assigned a second time here
            # and the date item never received its position.
            signature_date['position'] = bbox
    return signature_name, signature_date
699
def get_electronic_signature(self, top, bottom, t_pno=None):
    """Find the signing-date span lying between two marker texts.

    Args:
        top: Text identifying the upper marker; the top edge of the last
            span containing it becomes the upper bound.
        bottom: Text identifying the lower marker.  A span containing it
            only counts when ``top`` has already been seen and the span's
            bottom edge lies below the upper bound; its bottom edge becomes
            the lower bound.
        t_pno: When not ``None``, only the page with this key is scanned.

    Returns:
        A copy of ``self.item``; when a span containing ``签署日期`` has its
        vertical centre strictly between the (``int``-truncated) bounds,
        ``'words'``, ``'page'`` and ``'position'`` describe the last such
        span.
    """
    signature = self.item.copy()

    def scoped_spans():
        # Yield (page key, bbox, text) for every span on the pages in scope.
        for pno in self.pdf_info:
            if t_pno is not None and pno != t_pno:
                continue
            for block in self.pdf_info[pno]['blocks']:
                if block['type'] != 0:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        yield pno, span['bbox'], span['text']

    upper = None
    lower = None
    for _, bbox, text in scoped_spans():
        if top in text:
            upper = bbox[1]
        elif bottom in text and upper is not None and bbox[3] > upper:
            lower = bbox[3]

    if upper is None or lower is None:
        return signature

    for pno, bbox, text in scoped_spans():
        if '签署日期' not in text:
            continue
        if int(upper) < np.mean(bbox[1::2]) < int(lower):
            signature['words'] = text
            signature['page'] = pno
            signature['position'] = bbox
    return signature
738
def get_role_info(self, role_key, page_num='0'):
    """Extract name, ID number and representative for one contract role.

    The top-left corner of the last span starting with ``保证人3`` is the
    anchor.  The four supported roles map to the four quadrants around it,
    judged by the centre of each candidate span:

        ``承租人:``  left / above      ``保证人2:``  right / above
        ``保证人1:`` left / below      ``保证人3:``  right / below

    Nothing is extracted when the anchor is missing.

    Args:
        role_key: Role label; it is used as a regular expression that must
            match at the start of the name span.
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        Tuple ``(name, id_num, representative)`` of copies of ``self.item``
        with ``'words'`` (text after the last ``:``), ``'page'`` and
        ``'position'`` filled in where found.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    # Use the top-left corner of guarantor 3 as the reference point.
    anchor = None
    for bbox, text in spans:
        if re.match('保证人3', text) is not None:
            anchor = [bbox[0], bbox[1]]
    if anchor is None:
        return name, id_num, representative

    # role -> (expected left of anchor?, expected above anchor?)
    quadrants = {
        '承租人:': (True, True),
        '保证人1:': (True, False),
        '保证人2:': (False, True),
        '保证人3:': (False, False),
    }
    quadrant = quadrants.get(role_key)

    def in_quadrant(bbox):
        want_left, want_above = quadrant
        centre_x = np.mean(bbox[::2])
        x_ok = centre_x < anchor[0] if want_left else centre_x > anchor[0]
        if not x_ok:
            return False
        centre_y = np.mean(bbox[1::2])
        return centre_y < anchor[1] if want_above else centre_y > anchor[1]

    def record(target, bbox, text):
        target['words'] = text.split(':')[-1]
        target['page'] = page_num
        target['position'] = bbox

    labelled_fields = (
        ('证件号码:', id_num),                     # ID number
        ('法定代表人或授权代表:', representative),  # legal / authorised representative
    )
    for bbox, text in spans:
        # Role name.
        if re.match(role_key, text) is not None:
            record(name, bbox, text)
        if quadrant is None:
            continue
        # Labelled fields that sit in this role's quadrant.
        for label, target in labelled_fields:
            if re.match(label, text) is not None and in_quadrant(bbox):
                record(target, bbox, text)
    return name, id_num, representative
822
def get_table_add_product(self):
    """Rebuild the vehicle add-on product table.

    The table page is the last page with a span containing
    ``车辆附加产品(明细见下表)``.  Every span of that page is turned into an
    eight-value corner polygon plus its text.  Starting from a box derived
    from the ``项目`` header cell (shifted right by two row heights and down
    by one), rows are followed downwards: each span overlapping the current
    box (IoU above 0.01, text not just ``所购``) yields a row whose price and
    financed amount come from ``get_table_info`` against the ``购买价格`` and
    ``实际融资金额`` header cells.  A ``总计`` row is appended, rows not starting
    with ``所购``/``项目``/``总计`` are discarded and product names are replaced
    by their full canonical form.

    Returns:
        A copy of ``self.item`` with ``'words'`` (list of
        ``[item, price, financed amount]`` rows), ``'page'`` (table page
        key) and ``'position'`` (always ``None``).
    """
    table_add_product = self.item.copy()

    table_page = None
    for pno in self.pdf_info:
        for block in self.pdf_info[f'{pno}']['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    if '车辆附加产品(明细见下表)' in span['text']:
                        table_page = pno

    ocr_results = []
    for block in self.pdf_info[f'{table_page}']['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                xmin, ymin, xmax, ymax = span['bbox']
                polygon = [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]
                ocr_results.append([polygon, span['text']])

    rows = [['项目', '购买价格', '实际融资金额']]

    # Index of the last cell carrying each header / total label.
    label_index = {}
    for index, (_, label) in enumerate(ocr_results):
        if label in ('项目', '购买价格', '实际融资金额', '总计'):
            label_index[label] = index
    item_idx = label_index.get('项目')
    price_idx = label_index.get('购买价格')
    financed_idx = label_index.get('实际融资金额')
    total_idx = label_index.get('总计')

    header_polygon = ocr_results[item_idx][0]
    row_height = abs(header_polygon[1] - header_polygon[-1])
    anchor = np.array(header_polygon).reshape((-1, 2))
    anchor[:, 0] += 2*row_height
    anchor[:, 1] += row_height

    # Several passes over the spans so that rows are picked up regardless of
    # where they appear in the span order.
    for _ in range(5):
        for polygon, label in ocr_results:
            overlap = caculate_iou(anchor, polygon)
            if overlap > 0.01 and label.strip() != '所购':
                price = get_table_info(polygon, ocr_results[price_idx][0], ocr_results)
                financed = get_table_info(polygon, ocr_results[financed_idx][0], ocr_results)
                rows.append([label.replace('\u3000', ' '), price, financed])
                # Continue the search one row below the cell just consumed.
                anchor = np.array(polygon).reshape((-1, 2))
                anchor[:, 1] += row_height

    total = get_table_info(ocr_results[total_idx][0], ocr_results[financed_idx][0], ocr_results)
    rows.append(['总计', '', total])

    # Product names can be wrapped across two lines in the document, so only
    # a leading fragment may have been captured; map it to the full name.
    full_names = (
        ('BMW悦然', '所购 BMW悦然焕新服务'),
        ('BMW5年10', '所购 BMW5年10万公里长悦保养套餐'),
        ('事故维修补', '所购 事故维修补偿方案'),
        ('MINI4年6万公里长悦', '所购 MINI4年6万公里长悦保养套餐'),
    )
    kept_rows = []
    for row in rows:
        if row[0][:2] not in ['所购', '项目', '总计']:
            continue
        for fragment, full_name in full_names:
            if fragment in row[0]:
                row[0] = full_name
        kept_rows.append(row)

    table_add_product['words'] = kept_rows
    table_add_product['page'] = table_page
    table_add_product['position'] = None
    return table_add_product
917
def get_contract_no_dy(self):
    """Find the mortgage contract number.

    The key is the last span (over all pages) containing ``抵押合同编号``.  The
    value is the last span, again over all pages, that contains ``CH-`` and
    whose vertical centre lies strictly within the key's vertical extent.

    Returns:
        A copy of ``self.item`` with ``'position'``, ``'page'`` and
        ``'words'`` filled in when a value span is found.
    """
    contract_no = self.item.copy()

    spans = [
        (pno, span['bbox'], span['text'])
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    key_box = None
    for _, bbox, text in spans:
        if '抵押合同编号' in text:
            key_box = bbox
    if key_box is None:
        return contract_no

    for pno, bbox, text in spans:
        same_row = key_box[1] < np.mean(bbox[1::2]) < key_box[3]
        if same_row and 'CH-' in text:
            contract_no['position'] = bbox
            contract_no['page'] = pno
            contract_no['words'] = text
    return contract_no
946
def get_dyr_name_id(self):
    """Extract the mortgagor's name and ID number.

    The key is the last span (over all pages) whose text is exactly
    ``抵押人``.  Spans on any page whose vertical centre lies between the
    key's top edge and three key-heights below its bottom edge are
    inspected: one containing ``姓名`` gives the name, one containing
    ``证件号码`` gives the ID.  Values are the text after the last ``:``.

    Returns:
        Tuple ``(name, _id)`` of copies of ``self.item`` with
        ``'position'``, ``'page'`` and ``'words'`` filled in where found.
    """
    name = self.item.copy()
    _id = self.item.copy()

    spans = [
        (pno, span['bbox'], span['text'])
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    key_box = None
    for _, bbox, text in spans:
        if text == '抵押人':
            key_box = bbox
    if key_box is None:
        return name, _id

    row_height = abs(key_box[1]-key_box[3])
    for pno, bbox, text in spans:
        for label, target in (('姓名', name), ('证件号码', _id)):
            if key_box[1] < np.mean(bbox[1::2]) < key_box[3]+row_height*3 and label in text:
                target['position'] = bbox
                target['page'] = pno
                target['words'] = text.split(':')[-1]
    return name, _id
982
def get_dyrpo_name_id(self):
    """Extract the name and ID number of the mortgagor's spouse.

    The key is the last span (over all pages) whose text is exactly
    ``抵押人配偶(如适``.  Spans on any page whose vertical centre lies between
    the key's top edge and three key-heights below its bottom edge are
    inspected: one containing ``姓名`` gives the name, one containing
    ``证件号码`` gives the ID.  Values are the text after the last ``:``; the
    ID value is additionally stripped of surrounding whitespace.

    Returns:
        Tuple ``(name, _id)`` of copies of ``self.item`` with
        ``'position'``, ``'page'`` and ``'words'`` filled in where found.
    """
    name = self.item.copy()
    _id = self.item.copy()

    def all_spans():
        # Yield (page key, bbox, text) for every span of every page.
        for pno in self.pdf_info:
            for block in self.pdf_info[pno]['blocks']:
                if block['type'] != 0:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        yield pno, span['bbox'], span['text']

    key_box = None
    for _, bbox, text in all_spans():
        if text == '抵押人配偶(如适':
            key_box = bbox
    if key_box is None:
        return name, _id

    row_height = abs(key_box[1]-key_box[3])
    lower_limit = key_box[3]+row_height*3
    for pno, bbox, text in all_spans():
        if key_box[1] < np.mean(bbox[1::2]) < lower_limit and '姓名' in text:
            name['position'] = bbox
            name['page'] = pno
            name['words'] = text.split(':')[-1]
        if key_box[1] < np.mean(bbox[1::2]) < lower_limit and '证件号码' in text:
            _id['position'] = bbox
            _id['page'] = pno
            _id['words'] = text.split(':')[-1].strip()
    return name, _id
1018
def get_key_value_position(self, key):
    """Find the value printed on the same row, right after a key span.

    The key is the last span (over all pages) whose text equals ``key``.  A
    span on any page is accepted as the value when its vertical centre lies
    strictly within the key's vertical extent, it starts to the right of
    the key's left edge, and the gap between the key's right edge and the
    span's left edge is below ten key-heights.  The last accepted span
    wins.

    Args:
        key: Exact text of the key span.

    Returns:
        A copy of ``self.item`` with ``'position'``, ``'page'`` and
        ``'words'`` (the span's full text) filled in when found.
    """
    value = self.item.copy()

    spans = [
        (pno, span['bbox'], span['text'])
        for pno in self.pdf_info
        for block in self.pdf_info[pno]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    key_box = None
    for _, bbox, text in spans:
        if text == key:
            key_box = bbox
    if key_box is None:
        return value

    max_gap = abs(key_box[1]-key_box[3]) * 10
    for pno, bbox, text in spans:
        if not key_box[1] < np.mean(bbox[1::2]) < key_box[3]:
            continue
        if key_box[0] < bbox[0] and abs(key_box[2]-bbox[0]) < max_gap:
            value['position'] = bbox
            value['page'] = pno
            value['words'] = text
    return value
1048
def get_role_info_3_3(self, role_key, page_num='0'):
    """Extract name, ID number and representative for one contract role.

    Variant of the role lookup whose anchor is the top-left corner of the
    last span starting with ``保证人2``.  The four supported roles map to the
    four quadrants around it, judged by the centre of each candidate span:

        ``承租人一:``   left / above      ``保证人1:``  right / above
        ``共同承租人:`` left / below      ``保证人2:``  right / below

    Nothing is extracted when the anchor is missing.

    Args:
        role_key: Role label; it is used as a regular expression that must
            match at the start of the name span.
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        Tuple ``(name, id_num, representative)`` of copies of ``self.item``
        with ``'words'`` (text after the last ``:``), ``'page'`` and
        ``'position'`` filled in where found.
    """
    name = self.item.copy()
    id_num = self.item.copy()
    representative = self.item.copy()

    def page_spans():
        # Yield (bbox, text) for every span of the requested page.
        for block in self.pdf_info[page_num]['blocks']:
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    yield span['bbox'], span['text']

    # Use the top-left corner of guarantor 2 as the reference point.
    anchor = None
    for bbox, text in page_spans():
        if re.match('保证人2', text) is not None:
            anchor = [bbox[0], bbox[1]]
    if anchor is None:
        return name, id_num, representative

    # role -> (horizontal side, vertical side) relative to the anchor
    sides = {
        '承租人一:': ('left', 'above'),
        '共同承租人:': ('left', 'below'),
        '保证人1:': ('right', 'above'),
        '保证人2:': ('right', 'below'),
    }
    side = sides.get(role_key)

    def belongs_to_role(bbox):
        horizontal, vertical = side
        centre_x = np.mean(bbox[::2])
        if horizontal == 'left':
            if not centre_x < anchor[0]:
                return False
        elif not centre_x > anchor[0]:
            return False
        centre_y = np.mean(bbox[1::2])
        if vertical == 'above':
            return centre_y < anchor[1]
        return centre_y > anchor[1]

    def store(target, bbox, text):
        target['words'] = text.split(':')[-1]
        target['page'] = page_num
        target['position'] = bbox

    for bbox, text in page_spans():
        # Role name.
        if re.match(role_key, text) is not None:
            store(name, bbox, text)
        if side is None:
            continue
        # ID number located in this role's quadrant.
        if re.match('证件号码:', text) is not None and belongs_to_role(bbox):
            store(id_num, bbox, text)
        # Legal / authorised representative located in this role's quadrant.
        if re.match('法定代表人或授权代表:', text) is not None and belongs_to_role(bbox):
            store(representative, bbox, text)
    return name, id_num, representative
1132
def get_value_by_findall(self, prefix, suffix, page_num):
    """Extract the text found between a prefix and a suffix on one page.

    The text of every span of the page is concatenated and searched with
    the pattern ``{prefix}(.*?){suffix}``; both arguments are inserted into
    the regular expression as-is.  The first captured value is then located
    in the individual spans, and the last span containing it supplies the
    position.

    Args:
        prefix: Regular-expression fragment preceding the value.
        suffix: Regular-expression fragment following the value.
        page_num: Key of the page inside ``self.pdf_info``.

    Returns:
        A copy of ``self.item`` with ``'position'``, ``'page'`` and
        ``'words'`` filled in when the value is found inside a span.
    """
    value = self.item.copy()

    spans = [
        (span['bbox'], span['text'])
        for block in self.pdf_info[page_num]['blocks']
        if block['type'] == 0
        for line in block['lines']
        for span in line['spans']
    ]

    captured = re.findall(f"{prefix}(.*?){suffix}", ''.join(text for _, text in spans))
    if len(captured) == 0:
        return value

    wanted = captured[0]
    for bbox, text in spans:
        if wanted in text:
            value['position'] = bbox
            value['page'] = page_num
            value['words'] = wanted
    return value
1157
1158 def get_info(self):
1159 """
1160 block['type'] == 0 : 表示该元素为图片
1161
1162 Returns:
1163 dict: Description
1164 """
1165 if len(self.pdf_info) > 0:
1166 # 取 Page 1 上的合同编号
1167 contract_no = self.get_contract_no(page_num='0')
1168 self.init_result['合同编号'] = contract_no
1169
1170 # 粗略判断是否是 ‘车贷分离版本’ 的合同
1171 is_cdfl = False
1172 for block in self.pdf_info['0']['blocks']:
1173 if block['type'] != 0:
1174 continue
1175 for line in block['lines']:
1176 for span in line['spans']:
1177 bbox, text = span['bbox'], span['text']
1178 if '共同承租人:' in text:
1179 is_cdfl = True
1180
1181 if is_cdfl == False:
1182 # 从第一页上取四个角色的姓名和证件号码
1183 name, id_num, representative = self.get_role_info(role_key='承租人:', page_num='0')
1184
1185 if name["words"] == None:
1186 name, id_num, representative = self.get_role_info_3_3(role_key='承租人一:', page_num='0')
1187 self.init_result['承租人-姓名'] = name
1188 self.init_result['承租人-证件号码'] = id_num
1189 self.init_result['承租人-法定代表人或授权代表'] = representative
1190
1191 name, id_num, representative = self.get_role_info(role_key='保证人1:', page_num='0')
1192 self.init_result['保证人1-姓名'] = name
1193 self.init_result['保证人1-证件号码'] = id_num
1194 self.init_result['保证人1-法定代表人或授权代表'] = representative
1195 # if条件判别 对应3_3版本
1196 if name["words"] == None:
1197 name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人:', page_num='0')
1198 self.init_result['共同承租人-姓名'] = name
1199 self.init_result['共同承租人-证件号码'] = id_num
1200 self.init_result['共同承租人-法定代表人或授权代表'] = representative
1201
1202 name, id_num, representative = self.get_role_info(role_key='保证人2:', page_num='0')
1203 self.init_result['保证人2-姓名'] = name
1204 self.init_result['保证人2-证件号码'] = id_num
1205 self.init_result['保证人2-法定代表人或授权代表'] = representative
1206 # if条件判别 对应3_3版本
1207 if name["words"] == None:
1208 name, id_num, representative = self.get_role_info_3_3(role_key='保证人1:', page_num='0')
1209 self.init_result['保证人2-姓名'] = name
1210 self.init_result['保证人2-证件号码'] = id_num
1211 self.init_result['保证人2-法定代表人或授权代表'] = representative
1212
1213 name, id_num, representative = self.get_role_info(role_key='保证人3:', page_num='0')
1214 self.init_result['保证人3-姓名'] = name
1215 self.init_result['保证人3-证件号码'] = id_num
1216 self.init_result['保证人3-法定代表人或授权代表'] = representative
1217 if name["words"] == None:
1218 name, id_num, representative = self.get_role_info_3_3(role_key='保证人2:', page_num='0')
1219 self.init_result['保证人3-姓名'] = name
1220 self.init_result['保证人3-证件号码'] = id_num
1221 self.init_result['保证人3-法定代表人或授权代表'] = representative
1222 else:
1223 name, id_num, representative = self.get_role_info_3_3(role_key='承租人一:', page_num='0')
1224 self.init_result['承租人-姓名'] = name
1225 self.init_result['承租人-证件号码'] = id_num
1226 self.init_result['承租人-法定代表人或授权代表'] = representative
1227
1228 name, id_num, representative = self.get_role_info_3_3(role_key='共同承租人:', page_num='0')
1229 self.init_result['共同承租人-姓名'] = name
1230 self.init_result['共同承租人-证件号码'] = id_num
1231 self.init_result['共同承租人-法定代表人或授权代表'] = representative
1232
1233 name, id_num, representative = self.get_role_info_3_3(role_key='保证人1:', page_num='0')
1234 self.init_result['保证人1-姓名'] = name
1235 self.init_result['保证人1-证件号码'] = id_num
1236 self.init_result['保证人1-法定代表人或授权代表'] = representative
1237
1238 name, id_num, representative = self.get_role_info_3_3(role_key='保证人2:', page_num='0')
1239 self.init_result['保证人2-姓名'] = name
1240 self.init_result['保证人2-证件号码'] = id_num
1241 self.init_result['保证人2-法定代表人或授权代表'] = representative
1242
1243 # 在所有页面中找正文中(第二部分 融资租赁主要条款及付款计划)的那个编号,因为存在换行的情况所以暂时不带位置输出
1244 contract_no = self.get_contract_no_one()
1245 self.init_result['合同编号(正文)'] = contract_no
1246 # 找到车辆识别代码
1247 vin = self.get_key_value(key='车辆识别代码:')
1248 self.init_result['车辆识别代码'] = vin
1249 # 找到经销商(车辆卖方(经销商))
1250 seller = self.get_key_value(key='车辆卖方(经销商):')
1251 if seller['words'] == None:
1252 seller = self.get_key_value(key='车辆卖方:')
1253 self.init_result['车辆卖方(经销商)'] = seller
1254 # 找到车辆代理商
1255 cldls = self.get_key_value(key='车辆代理商', page_num='4')
1256 self.init_result['车辆代理商'] = cldls
1257 # 找到 —— 车辆原始销售价格
1258 vehicle_price = self.get_key_value(key='车辆原始销售价格(《机动车销售统一发票》所列金额):')
1259 self.init_result['车辆原始销售价格(《机动车销售统一发票》所列金额)'] = vehicle_price
1260 # 找车辆附加产品明细(表)
1261 table_add_product = self.get_table_add_product()
1262 self.init_result['车辆附加产品明细表'] = table_add_product
1263 # 找融资成本总额
1264 financing_cost = self.get_key_value(key='融资成本总额:')
1265 self.init_result['融资成本总额'] = financing_cost
1266 # 找租期
1267 lease_term = self.get_key_value(key='租期:')
1268 self.init_result['租期'] = lease_term
1269 # 找还款计划(表)
1270 repayment_schedule = self.get_repayment_schedule()
1271 self.init_result['付款计划表'] = repayment_schedule
1272 # 找承租人收款账户户名、银行账号、银行
1273 name = self.get_key_value(key='户名:', page_num='4')
1274 self.init_result['收款银行账户-户名'] = name
1275 account = self.get_key_value(key='银行账号:', page_num='4')
1276 self.init_result['收款银行账户-银行账号'] = account
1277 bank = self.get_key_value(key='开户银行:', page_num='4')
1278 self.init_result['收款银行账户-开户行'] = bank
1279 # 找承租人扣款账户户名、银行账号、银行
1280 name = self.get_key_value(key='户名:', page_num='5')
1281 self.init_result['银行账户-户名'] = name
1282 account = self.get_key_value(key='银行账号:', page_num='5')
1283 self.init_result['银行账户-银行账号'] = account
1284 bank = self.get_key_value(key='开户银行:', page_num='5')
1285 self.init_result['银行账户-开户行'] = bank
1286
1287 # 找签字页上的系列信息
1288 # 承租人姓名、签章
1289 if is_cdfl == False:
1290 name = self.get_key_value(key='承租人姓名:')
1291 electronic_signature = self.get_electronic_signature(top='承租人姓名:', bottom='保证人1姓名:', t_pno='5')
1292
1293 if name["words"] == None:
1294 name = self.get_key_value(key='承租人一姓名:')
1295 electronic_signature = self.get_electronic_signature(top='承租人一姓名:', bottom='共同承租人名称:', t_pno='5')
1296
1297 self.init_result['签字页-承租人姓名'] = name
1298 self.init_result['签字页-承租人签章'] = electronic_signature
1299 # 保证人1姓名、签章
1300 name = self.get_key_value(key='保证人1姓名:')
1301 electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:', t_pno='5')
1302 self.init_result['签字页-保证人1姓名'] = name
1303 self.init_result['签字页-保证人1签章'] = electronic_signature
1304 # 这里用的是 name["words"] == ""
1305 if name["words"] == "":
1306 name = self.get_key_value(key='共同承租人名称:')
1307 electronic_signature = self.get_electronic_signature(top='共同承租人名称:', bottom='保证人1姓名:', t_pno='5')
1308 self.init_result['签字页-共同承租人姓名'] = name
1309 self.init_result['签字页-共同承租人签章'] = electronic_signature
1310 # 保证人2姓名、签章
1311 name = self.get_key_value(key='保证人2姓名:')
1312 electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:', t_pno='5')
1313 self.init_result['签字页-保证人2姓名'] = name
1314 self.init_result['签字页-保证人2签章'] = electronic_signature
1315 # if判断条件对应3_3版本
1316 if name["words"] == "":
1317 name = self.get_key_value(key='保证人1姓名:')
1318 electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:', t_pno='5')
1319 self.init_result['签字页-保证人1姓名'] = name
1320 self.init_result['签字页-保证人1签章'] = electronic_signature
1321 # 保证人3姓名、签章
1322 name = self.get_key_value(key='保证人3姓名:')
1323 electronic_signature = self.get_electronic_signature(top='保证人3姓名:', bottom='日期:', t_pno='5')
1324 self.init_result['签字页-保证人3姓名'] = name
1325 self.init_result['签字页-保证人3签章'] = electronic_signature
1326 # if判断条件对应3_3版本
1327 if name["words"] == None:
1328 name = self.get_key_value(key='保证人2姓名:')
1329 electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='日期:', t_pno='5')
1330 self.init_result['签字页-保证人2姓名'] = name
1331 self.init_result['签字页-保证人2签章'] = electronic_signature
1332 else:
1333 name = self.get_key_value(key='承租人一姓名:')
1334 electronic_signature = self.get_electronic_signature(top='承租人一姓名:', bottom='共同承租人名称:', t_pno='5')
1335 self.init_result['签字页-承租人姓名'] = name
1336 self.init_result['签字页-承租人签章'] = electronic_signature
1337
1338 name = self.get_key_value(key='共同承租人名称:')
1339 electronic_signature = self.get_electronic_signature(top='共同承租人名称:', bottom='保证人1姓名:', t_pno='5')
1340 self.init_result['签字页-共同承租人姓名'] = name
1341 self.init_result['签字页-共同承租人签章'] = electronic_signature
1342
1343 name = self.get_key_value(key='保证人1姓名:')
1344 electronic_signature = self.get_electronic_signature(top='保证人1姓名:', bottom='保证人2姓名:', t_pno='5')
1345 self.init_result['签字页-保证人1姓名'] = name
1346 self.init_result['签字页-保证人1签章'] = electronic_signature
1347
1348 name = self.get_key_value(key='保证人2姓名:')
1349 electronic_signature = self.get_electronic_signature(top='保证人2姓名:', bottom='保证人3姓名:', t_pno='5')
1350 self.init_result['签字页-保证人2姓名'] = name
1351 self.init_result['签字页-保证人2签章'] = electronic_signature
1352
1353 return self.init_result
1354
def get_info_1(self):
    """Fill ``self.init_result_1`` from the parsed PDF and return it.

    When ``self.pdf_info`` is empty no lookup is performed and the result
    dict is returned exactly as it currently stands.
    """
    if len(self.pdf_info) == 0:
        return self.init_result_1

    # Page '0': contract number, lessee name and lessee ID number.
    self.init_result_1['合同编号'] = self.get_contract_no(page_num='0')
    self.init_result_1['承租人-姓名'] = self.get_key_value(key='承租人:', page_num='0')
    self.init_result_1['承租人-证件号码'] = self.get_key_value(key='证件号码:', page_num='0')

    # Page '0': selling dealer. An empty hit is retried through
    # get_value_by_findall, using the address label as the second anchor.
    cover_dealer = self.get_key_value(key='销售经销商:', page_num='0')
    if cover_dealer['words'] == "":
        cover_dealer = self.get_value_by_findall('销售经销商:', '地址:', page_num='0')
    self.init_result_1['销售经销商'] = cover_dealer

    # Contract number as printed in the body text.
    self.init_result_1['合同编号(正文)'] = self.get_contract_no_one()

    # Signature page: lessee name, lessee ID number and lessee signature.
    self.init_result_1['签字页-承租人姓名'] = self.get_key_value(key='姓名/名称:')
    self.init_result_1['签字页-承租人证件号码'] = self.get_key_value(
        key='自然人身份证件号码/法人执照号码:')
    self.init_result_1['签字页-承租人签章'] = self.get_signature_role_1()

    # Signature page: selling dealer. Example of the line being parsed:
    # 销售经销商:深圳市宝创汽车贸易有限公司南山分公司(请授权代表签字并请盖章)
    sign_dealer = self.get_key_value(key='销售经销商:')
    if sign_dealer['words'] == "":
        sign_dealer = self.get_value_by_findall('销售经销商:', '(请授权代表签字并请盖章)', page_num='3')
    self.init_result_1['签字页-销售经销商'] = sign_dealer

    # The dealer signature is not extracted (the original left a placeholder here).
    return self.init_result_1
1391
def get_info_2(self):
    """Fill ``self.init_result_2`` from the parsed PDF and return it.

    When ``self.pdf_info`` is empty no lookup is performed and the result
    dict is returned exactly as it currently stands.
    """
    if len(self.pdf_info) == 0:
        return self.init_result_2

    # Contract number from the dedicated lookup, then the one in the body text.
    self.init_result_2['合同编号'] = self.get_contract_no_dy()
    self.init_result_2['合同编号(正文)'] = self.get_contract_no_one()

    # Mortgagor name and ID number.
    mortgagor_name, mortgagor_id = self.get_dyr_name_id()
    self.init_result_2['抵押人姓名/名称'] = mortgagor_name
    self.init_result_2['抵押人证件号码'] = mortgagor_id

    # Mortgagor's spouse name and ID number.
    spouse_name, spouse_id = self.get_dyrpo_name_id()
    self.init_result_2['抵押人配偶姓名/名称'] = spouse_name
    self.init_result_2['抵押人配偶证件号码'] = spouse_id

    # Vehicle identification number, total rent and lease term.
    self.init_result_2['车辆识别代码'] = self.get_key_value(key='车辆识别代码:')
    self.init_result_2['租金总额'] = self.get_key_value_position(key='租金总额')
    self.init_result_2['融资租赁期限'] = self.get_key_value_position(key='融资租赁期限')

    # Signature page: mortgagor name and signature.
    signer = self.get_key_value(key='抵押人姓名:')
    seal = self.get_electronic_signature(top='抵押权人盖章', bottom='抵押人配偶姓名:', t_pno='1')
    self.init_result_2['签字页-抵押人姓名'] = signer
    self.init_result_2['签字页-抵押人签章'] = seal

    # Signature page: spouse name and signature.
    signer = self.get_key_value(key='抵押人配偶姓名:')
    seal = self.get_electronic_signature(top='抵押人配偶姓名:', bottom='日期', t_pno='1')
    self.init_result_2['签字页-抵押人配偶姓名'] = signer
    self.init_result_2['签字页-抵押人配偶签章'] = seal

    return self.init_result_2
...\ No newline at end of file ...\ No newline at end of file
...@@ -6,9 +6,10 @@ ...@@ -6,9 +6,10 @@
6 # @Description : 6 # @Description :
7 7
8 from .get_char import Finder 8 from .get_char import Finder
9 from .get_char_fsm import Finder as FSMFinder
9 10
10 11
11 def predict(pdf_info, file_cls): 12 def predict(pdf_info, file_cls, is_fsm=False):
12 """Summary 13 """Summary
13 14
14 Args: 15 Args:
...@@ -58,6 +59,10 @@ def predict(pdf_info, file_cls): ...@@ -58,6 +59,10 @@ def predict(pdf_info, file_cls):
58 pdf_info = dict() 59 pdf_info = dict()
59 for pno, page_info in enumerate(pdf_info_1): 60 for pno, page_info in enumerate(pdf_info_1):
60 pdf_info[str(pno)] = page_info 61 pdf_info[str(pno)] = page_info
62
63 if is_fsm:
64 f = FSMFinder(pdf_info)
65 else:
61 f = Finder(pdf_info) 66 f = Finder(pdf_info)
62 if file_cls == 0: 67 if file_cls == 0:
63 results = f.get_info() 68 results = f.get_info()
......
# Field specification consumed by Retriever.get_target_fields.
#
# Top-level keys are page numbers as strings ("-1" selects the last page).
# Per page:
#   'keys'  : field -> list of (key_text, regex tuple, key direction, kwargs).
#             Each regex is tried with re.match against every text span; the
#             direction names a Retriever.key_<direction> method.
#   'value' : field -> (source, value direction, kwargs, default).
#             source 'text' searches the text spans, anything else the image
#             blocks; the direction names a Retriever.value_<direction>
#             method; default is reported when nothing is found.
# 'offset_tuple' is (offset_xmin, offset_xmax, offset_ymin, offset_ymax),
# expressed in multiples of the key box's width / height
# (see Retriever.get_target_bbox).
WEP_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [('客户姓名', (r'^姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '合同价格(小写)': [('人民币', (r'^人民币¥.?$', ), 'top1', {})],
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.1, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
            # Signature and signing date are looked up among image blocks.
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }

}
22
# Field specification consumed by Retriever.get_target_fields; same layout as
# described by the code that reads it: page number string ->
# {'keys': field -> [(key_text, regex tuple, key direction, kwargs)],
#  'value': field -> (source, value direction, kwargs, default)}.
# Here the text fields are read from page "0" and the signature / signing
# date image fields from page "1".
MSI_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '合同价格(小写)': [('人民币', (r'^人民币¥.?$', ), 'top1', {})],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.2, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
        },
    },
    "1": {
        'keys': {
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }
}
49
# Field specification consumed by Retriever.get_target_fields: page number
# string -> {'keys': field -> [(key_text, regex tuple, key direction, kwargs)],
#            'value': field -> (source, value direction, kwargs, default)}.
# The "-1" entry is resolved by the retriever to the highest page number
# present in the extracted text.
SC_FIELD = {
    "0": {
        'keys': {
            '姓名': [('姓名', (r'^姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '总价': [('总价', (r'^总价.?$', ), 'top1', {})],
        },
        'value': {
            '姓名': ('text', 'right', {'offset_tuple': (-2, 8, 0.5, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '总价': ('text', 'right', {'offset_tuple': (-2, 12, 0.5, 0)}, ''),
        },
    },
    "-1": {
        'keys': {
            # NOTE(review): both patterns below are identical as written;
            # possibly one was meant to be a variant spelling - confirm.
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', r'^客户签名/盖章.*$'), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户签名': ('img', 'under', {'offset_tuple': (1.5, 1, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }
}
1 from .retriever import Retriever
2 from .const import WEP_FIELD, MSI_FIELD, SC_FIELD
3 from .tools import pdf_info_rebuild
4
# One retriever per field specification; the list index is the ``file_type``
# accepted by predict(): 0 -> WEP_FIELD, 1 -> MSI_FIELD, 2 -> SC_FIELD.
retriever_list = [Retriever(field_spec) for field_spec in (WEP_FIELD, MSI_FIELD, SC_FIELD)]


def predict(pdf_info, file_type=0):
    """Extract the configured fields from ``pdf_info``.

    Args:
        pdf_info: page mapping accepted by ``pdf_info_rebuild``.
        file_type: index into ``retriever_list`` choosing the field spec.

    Returns:
        Whatever ``Retriever.get_target_fields`` returns for the rebuilt
        text and image lists.
    """
    selected = retriever_list[file_type]
    text_by_page, img_by_page = pdf_info_rebuild(pdf_info)
    return selected.get_target_fields(text_by_page, img_by_page)
11
12
1 from .retriever import HMHRetriever
2 from .tools import pdf_info_rebuild
3
# Single shared retriever instance, created at import time.
hmh_retriever = HMHRetriever()


def predict(pdf_info):
    """Extract the HMH fields from ``pdf_info``.

    The text spans are rebuilt with ``fix_bbox=False`` and the image list
    returned by ``pdf_info_rebuild`` is ignored.
    """
    rebuilt = pdf_info_rebuild(pdf_info, fix_bbox=False)
    text_by_page = rebuilt[0]
    return hmh_retriever.get_target_fields(text_by_page)
9
10
1 import re
2
3
class HMHRetriever:
    """Regex-based extractor for five fixed fields on page ``'0'``.

    The fields and their not-found defaults are listed in
    ``search_fields_list``. Each result entry has the shape
    ``{'words': <str>, 'location': <bbox>}``.
    """

    # Characters removed from the captured ID number: the ideographic space
    # plus a closing parenthesis in both ASCII and full-width form. The
    # original code applied the same ASCII replace twice, so the full-width
    # form was never stripped.
    _ID_STRIP_TABLE = str.maketrans('', '', '\u3000)\uff09')

    def __init__(self):
        self.words_str = 'words'
        self.position_str = 'location'
        self.fix_hava_str = '有'
        self.default_position = [0, 0, 0, 0]
        # (output field name, value reported when the field is not found)
        self.search_fields_list = [
            ('借款/承租人姓名', ''),
            ('证件号码', ''),
            ('渠道', ''),
            ('合同编号', ''),
            ('借款人签字/盖章', '无'),
        ]

    def _entry(self, words, position):
        """Build one result record."""
        return {self.words_str: words, self.position_str: position}

    def get_target_fields(self, pdf_text_list):
        """Scan the spans of page ``'0'`` and return the extracted fields.

        Args:
            pdf_text_list: mapping of page number string to a list of
                ``(bbox, text)`` pairs. It is only read, never modified.

        Returns:
            ``{"words_result": {field: {'words': ..., 'location': ...}}}``
            containing every field of ``search_fields_list``; fields that
            were not found carry their default value and a copy of
            ``default_position``.
        """
        name_key, id_key, channel_key, contract_key, sign_key = (
            field for field, _ in self.search_fields_list)

        result = dict()
        # .get() rather than .pop(): the caller's mapping must keep page '0'.
        for bbox, text in pdf_text_list.get(str(0), []):
            # Name, ID number and channel come from one sentence; only the
            # first span that matches is used.
            if name_key not in result:
                hit = re.search(r'姓名(.*)证件号码(.*)与(.*公司)', text)
                if hit is not None:
                    raw_name, raw_id, channel = hit.groups()
                    result[name_key] = self._entry(
                        raw_name.replace('\u3000', '').strip(), bbox)
                    result[id_key] = self._entry(
                        raw_id.translate(self._ID_STRIP_TABLE).strip(), bbox)
                    result[channel_key] = self._entry(channel, bbox)

            # Contract number: accepted only when the span yields exactly
            # one match.
            if contract_key not in result:
                application_no_list = re.findall(r'合同编号.*(CH-B\d*-\d*).*', text)
                if len(application_no_list) == 1:
                    result[contract_key] = self._entry(application_no_list[0], bbox)

            # Signature: a span containing a signing date counts as signed;
            # the captured text itself is not reported.
            if sign_key not in result:
                if re.search(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', text) is not None:
                    result[sign_key] = self._entry(self.fix_hava_str, bbox)

        for find_key, default_value in self.search_fields_list:
            if find_key not in result:
                # Copy so that results never alias the shared default list.
                result[find_key] = self._entry(default_value, list(self.default_position))

        return {"words_result": result}
74
class Retriever:
    """Locate key labels on a page and read the values positioned around them.

    ``target_fields`` maps a page number string (``'-1'`` meaning the highest
    page number present in the text) to a dict with two entries:

    * ``'keys'``: field -> list of ``(key_text, regex_tuple, direction, kwargs)``;
      ``direction`` selects the ``key_<direction>`` method.
    * ``'value'``: field -> ``(source, direction, kwargs, default_value)``;
      ``source == 'text'`` searches text spans, anything else image blocks,
      and ``direction`` selects the ``value_<direction>`` method.
    """

    def __init__(self, target_fields):
        self.keys_str = 'keys'
        self.value_str = 'value'
        self.text_str = 'text'
        self.words_str = 'words'
        self.position_str = 'position'
        self.default_position = [-1, -1, -1, -1]
        self.target_fields = target_fields
        # value_type -> single-character replacements applied to found values
        self.replace_map = {
            'int': {
                '(': '0'
            }
        }

    @staticmethod
    def _is_eligible(bbox, target_bbox, rigorous):
        """Tell whether ``bbox`` lies inside ``target_bbox``.

        With ``rigorous`` the whole box must be strictly inside; otherwise
        only its centre point has to be.
        """
        x0, y0, x1, y1 = bbox
        x_min, y_min, x_max, y_max = target_bbox
        if rigorous:
            return x_min < x0 and x1 < x_max and y_min < y0 and y1 < y_max
        cent_x = x0 + ((x1 - x0) / 2)
        cent_y = y0 + ((y1 - y0) / 2)
        return x_min < cent_x < x_max and y_min < cent_y < y_max

    def _apply_replace_map(self, value, value_type):
        """Apply the ``replace_map`` entry for ``value_type`` to a string value."""
        if isinstance(value_type, str) and value_type in self.replace_map and isinstance(value, str):
            return value.translate(str.maketrans(self.replace_map[value_type]))
        return value

    @staticmethod
    def key_top1(coordinates_list, key_coordinates):
        """Key direction 'top1': return the box with the smallest ``y0``.

        ``key_coordinates`` is accepted for signature compatibility and not
        used. ``min`` keeps the first of equal candidates, like the stable
        sort it replaces, but leaves ``coordinates_list`` untouched.
        """
        return min(coordinates_list, key=lambda box: box[1])

    def key_right(self, coordinates_list, key_coordinates, offset_tuple, rigorous=False):
        """Key direction 'right': leftmost candidate inside the target area.

        The target area is derived from the previous key's box
        (``key_coordinates``) and ``offset_tuple``. Falls back to ``key_top1``
        when there is no previous key or no candidate qualifies; a single
        candidate is returned as-is.
        """
        if len(coordinates_list) == 1:
            return coordinates_list[0]

        # Without the previous key's coordinates, use the topmost candidate.
        if key_coordinates is None:
            return self.key_top1(coordinates_list, key_coordinates)

        target_bbox = self.get_target_bbox(key_coordinates, offset_tuple)

        x_min_find, find_key_coordinates = None, None
        for x0, y0, x1, y1 in coordinates_list:
            if not self._is_eligible((x0, y0, x1, y1), target_bbox, rigorous):
                continue
            if x_min_find is None or x0 < x_min_find:
                x_min_find = x0
                find_key_coordinates = (x0, y0, x1, y1)

        if find_key_coordinates is None:
            return self.key_top1(coordinates_list, key_coordinates)
        return find_key_coordinates

    def value_right(self, search_list, key_coordinates, offset_tuple, value_type=None, rigorous=False):
        """Value direction 'right': leftmost non-blank entry in the target area.

        Returns:
            ``(value, coordinates)``, or ``(None, None)`` when nothing
            qualifies. ``coordinates`` is an ``(x0, y0, x1, y1)`` tuple.
        """
        target_bbox = self.get_target_bbox(key_coordinates, offset_tuple)

        x_min_find, value, coordinates = None, None, None
        for (x0, y0, x1, y1), text in search_list:
            if not self._is_eligible((x0, y0, x1, y1), target_bbox, rigorous):
                continue
            if (x_min_find is None or x0 < x_min_find) and len(text.strip()) > 0:
                x_min_find = x0
                value = text
                coordinates = (x0, y0, x1, y1)

        return self._apply_replace_map(value, value_type), coordinates

    def value_under(self, search_list, key_coordinates, offset_tuple, value_type=None, append=False, rigorous=False):
        """Value direction 'under': first non-blank entry in the target area.

        Entries are ordered by ``(y0, x0)``. With ``append`` the texts of all
        qualifying entries are concatenated in that order; the returned
        coordinates are always those of the first entry.

        Returns:
            ``(value, coordinates)``, or ``(None, None)`` when nothing qualifies.
        """
        target_bbox = self.get_target_bbox(key_coordinates, offset_tuple)

        find_list = [
            (x0, y0, x1, y1, text)
            for (x0, y0, x1, y1), text in search_list
            if self._is_eligible((x0, y0, x1, y1), target_bbox, rigorous)
            and len(text.strip()) > 0
        ]
        if len(find_list) == 0:
            return None, None

        find_list.sort(key=lambda x: (x[1], x[0]))
        coordinates = find_list[0][:-1]
        if append:
            value = ''.join([text for _, _, _, _, text in find_list])
        else:
            value = find_list[0][-1]

        return self._apply_replace_map(value, value_type), coordinates

    @staticmethod
    def get_target_bbox(key_coordinates, offset_tuple):
        """Grow or shift the key box into the search area.

        ``offset_tuple`` is ``(offset_xmin, offset_xmax, offset_ymin,
        offset_ymax)`` in multiples of the key box's width (x) or height (y).
        Positive values push the corresponding edge outwards; a negative
        ``offset_xmin`` / ``offset_ymin`` moves the left / top edge inwards
        (e.g. ``-1`` puts the left edge on the key box's right edge).
        """
        offset_xmin, offset_xmax, offset_ymin, offset_ymax = offset_tuple

        width = key_coordinates[2] - key_coordinates[0]
        height = key_coordinates[-1] - key_coordinates[1]

        x_min = key_coordinates[0] - (width * offset_xmin)
        x_max = key_coordinates[2] + (width * offset_xmax)
        y_min = key_coordinates[1] - (height * offset_ymin)
        y_max = key_coordinates[-1] + (height * offset_ymax)
        return x_min, y_min, x_max, y_max

    def get_target_fields(self, pdf_text_list, pdf_img_list):
        """Extract every configured field.

        Args:
            pdf_text_list: page number string -> list of ``(bbox, text)``.
            pdf_img_list: page number string -> list of ``(bbox, text)`` for
                image blocks.

        Returns:
            ``{page_key: {field: {'words': ..., 'position': [...]}}}``. Fields
            that cannot be resolved carry their default value and
            ``default_position``. An input without any page yields defaults
            for every field instead of raising.
        """
        pdf_result = dict()

        for pno_str, fields_dict in self.target_fields.items():
            is_last_pno = False
            if pno_str == '-1':
                is_last_pno = True
                page_numbers = [int(page_no) for page_no in pdf_text_list]
                # With no text pages at all keep '-1': every lookup below
                # then misses and the defaults are reported.
                if page_numbers:
                    pno_str = str(max(page_numbers))

            page_text = pdf_text_list.get(pno_str, [])

            # Step 1: collect the boxes of every span matching a key pattern.
            key_text_info = dict()
            for key_text_list in fields_dict[self.keys_str].values():
                for key_text, key_re_tuple, _, _ in key_text_list:
                    for (x0, y0, x1, y1), text in page_text:
                        if not any(re.match(key_re, text) for key_re in key_re_tuple):
                            continue
                        # Record each box once, even if several patterns (or
                        # several fields sharing the key text) match it;
                        # duplicates would defeat key_right's single-candidate
                        # shortcut.
                        boxes = key_text_info.setdefault(key_text, list())
                        box = (x0, y0, x1, y1)
                        if box not in boxes:
                            boxes.append(box)

            # Step 2: resolve each field's key chain to a single key box.
            key_coordinates_info = dict()
            for field, key_text_list in fields_dict[self.keys_str].items():
                last_key_coordinates = None
                for key_text, _, direction, kwargs in key_text_list:
                    if key_text not in key_text_info:
                        last_key_coordinates = None
                        continue
                    last_key_coordinates = getattr(self, 'key_{0}'.format(direction))(
                        key_text_info[key_text],
                        last_key_coordinates,
                        **kwargs)

                key_coordinates_info[field] = last_key_coordinates

            # Step 3: read the value next to each resolved key box.
            page_result = dict()
            for field, (source, direction, kwargs, default_value) in fields_dict[self.value_str].items():
                value, coordinates = None, None
                if isinstance(key_coordinates_info.get(field), tuple):
                    search_list = page_text if source == self.text_str else pdf_img_list.get(pno_str, [])
                    value, coordinates = getattr(self, 'value_{0}'.format(direction))(
                        search_list,
                        key_coordinates_info[field],
                        **kwargs
                    )

                if isinstance(value, str):
                    page_result[field] = {
                        self.words_str: value,
                        self.position_str: list(coordinates),
                    }
                else:
                    page_result[field] = {
                        self.words_str: default_value,
                        self.position_str: self.default_position,
                    }

            # NOTE(review): the last-page spec is always reported under the
            # fixed key 'page_12', whatever the real page count - presumably
            # what downstream consumers expect; confirm before changing.
            page_key = 'page_12' if is_last_pno else 'page_{0}'.format(int(pno_str) + 1)
            pdf_result[page_key] = page_result

        return pdf_result
def pdf_info_rebuild(pdf_info, fix_bbox=True):
    """Flatten per-page block data into text-span and image-block lists.

    Args:
        pdf_info: mapping of page number string to a page dict with a
            ``'blocks'`` list. Blocks of ``type`` 0 carry ``lines`` ->
            ``spans`` (each span with ``bbox``, ``text`` and ``size``);
            blocks of ``type`` 1 carry a ``bbox``.
        fix_bbox: when true, a span whose box height is smaller than its
            ``size`` gets its bottom edge extended by ``size``.

    Returns:
        ``(pdf_text_info, pdf_img_info)``:
        ``pdf_text_info[page]`` is a list of ``[bbox, text]`` with the text
        stripped; blank texts and texts repeated within the same block are
        dropped. ``pdf_img_info[page]`` is a list of ``(bbox, '有')``.
        Pages without any entry of a kind are absent from that mapping.

    The input is never modified: an adjusted bbox is a new list, so tuple
    bboxes are accepted as well.
    """
    pdf_text_info = dict()
    pdf_img_info = dict()
    for pno_str, page_info in pdf_info.items():
        for block in page_info['blocks']:
            if block['type'] == 0:
                # The same text can occur repeatedly inside one block; keep
                # only its first occurrence.
                seen_texts = set()
                for line in block['lines']:
                    for span in line['spans']:
                        bbox, text = span['bbox'], span['text'].strip()
                        if len(text) == 0 or text in seen_texts:
                            continue
                        seen_texts.add(text)
                        # The reported box height is unreliable: extend the
                        # bottom edge when the box is lower than the span size.
                        if fix_bbox and bbox[-1] - bbox[1] < span['size']:
                            bbox = [*bbox[:-1], bbox[-1] + span['size']]
                        pdf_text_info.setdefault(pno_str, list()).append([bbox, text])
            elif block['type'] == 1:
                pdf_img_info.setdefault(pno_str, list()).append((block['bbox'], '有'))

    return pdf_text_info, pdf_img_info
...\ No newline at end of file ...\ No newline at end of file
from contextlib import closing

import pyodbc

# Adds the three FSM OCR result columns to the HIL result tables.
hil_sql = """
ALTER TABLE hil_ocr_result ADD fsm_wep_ocr nvarchar(max);
ALTER TABLE hil_ocr_result ADD fsm_msi_ocr nvarchar(max);
ALTER TABLE hil_ocr_result ADD fsm_sc_ocr nvarchar(max);
ALTER TABLE hil_se_ocr_result ADD fsm_wep_ocr nvarchar(max);
ALTER TABLE hil_se_ocr_result ADD fsm_msi_ocr nvarchar(max);
ALTER TABLE hil_se_ocr_result ADD fsm_sc_ocr nvarchar(max);
"""

# Adds the same three columns to the AFC result tables.
afc_sql = """
ALTER TABLE afc_ocr_result ADD fsm_wep_ocr nvarchar(max);
ALTER TABLE afc_ocr_result ADD fsm_msi_ocr nvarchar(max);
ALTER TABLE afc_ocr_result ADD fsm_sc_ocr nvarchar(max);
ALTER TABLE afc_se_ocr_result ADD fsm_wep_ocr nvarchar(max);
ALTER TABLE afc_se_ocr_result ADD fsm_msi_ocr nvarchar(max);
ALTER TABLE afc_se_ocr_result ADD fsm_sc_ocr nvarchar(max);
"""


def _run_migration(connection_string, sql):
    """Execute ``sql`` on a fresh autocommit connection.

    ``closing`` guarantees that the cursor and the connection are closed
    even when ``execute`` raises (for example because a column already
    exists on a second run).
    """
    with closing(pyodbc.connect(connection_string, autocommit=True)) as cnxn:
        with closing(cnxn.cursor()) as cursor:
            cursor.execute(sql)


# NOTE(review): both connection strings only name the driver as written here;
# server / database / credentials presumably have to be filled in before use.
_run_migration('DRIVER={ODBC Driver 17 for SQL Server};', hil_sql)
_run_migration('DRIVER={ODBC Driver 17 for SQL Server};', afc_sql)
...@@ -8,13 +8,16 @@ from common.tools.comparison import cp ...@@ -8,13 +8,16 @@ from common.tools.comparison import cp
8 from common.mixins import LoggerMixin 8 from common.mixins import LoggerMixin
9 from rest_framework.permissions import IsAuthenticated 9 from rest_framework.permissions import IsAuthenticated
10 from apps.account.authentication import OAuth2AuthenticationWithUser 10 from apps.account.authentication import OAuth2AuthenticationWithUser
11 from apps.doc.models import NscInvoice
12 import json
13 from datetime import datetime
11 14
12 params = { 15 params = {
13 'invoiceCode': fields.Str(required=True, validate=validate.Length(max=128)), 16 'invoiceCode': fields.Str(required=True, validate=validate.Length(max=128)),
14 'invoiceNumber': fields.Str(required=True, validate=validate.Length(max=64)), 17 'invoiceNumber': fields.Str(required=True, validate=validate.Length(max=64)),
15 'issueDate': CustomDate(required=True), 18 'issueDate': CustomDate(required=True),
16 'buyerName': fields.Str(required=True, validate=validate.Length(max=64)), 19 'buyerName': fields.Str(required=True, validate=validate.Length(max=64)),
17 "buyerId": fields.Int(required=True), 20 "buyerId": fields.Str(required=True, validate=validate.Length(max=64)),
18 'vin': fields.Str(required=True, validate=validate.Length(max=128)), 21 'vin': fields.Str(required=True, validate=validate.Length(max=128)),
19 'dealer': fields.Str(required=False, validate=validate.Length(max=64)), 22 'dealer': fields.Str(required=False, validate=validate.Length(max=64)),
20 'priceWithVat': CustomDecimal(required=True), 23 'priceWithVat': CustomDecimal(required=True),
...@@ -29,7 +32,7 @@ input_args = { ...@@ -29,7 +32,7 @@ input_args = {
29 } 32 }
30 33
31 34
32 # poss 接口接收NSC 发票信息 35 # pos 接口接收NSC 发票信息
33 class NSCInvoiceView(GenericView): 36 class NSCInvoiceView(GenericView):
34 permission_classes = [IsAuthenticated] 37 permission_classes = [IsAuthenticated]
35 authentication_classes = [OAuth2AuthenticationWithUser] 38 authentication_classes = [OAuth2AuthenticationWithUser]
...@@ -50,6 +53,7 @@ class NSCInvoiceView(GenericView): ...@@ -50,6 +53,7 @@ class NSCInvoiceView(GenericView):
50 vat = content.get('vat', 0.0) 53 vat = content.get('vat', 0.0)
51 vat_rate = content.get('vatRate', 0.0) 54 vat_rate = content.get('vatRate', 0.0)
52 55
56 NscInvoice.objects.create(vin=vin, content=json.dumps(content), create_time=datetime.now())
53 return response.ok() 57 return response.ok()
54 58
55 59
...@@ -90,11 +94,17 @@ class DeMortgageView(GenericView): ...@@ -90,11 +94,17 @@ class DeMortgageView(GenericView):
90 'applicationName': application_name, 94 'applicationName': application_name,
91 'deMortgageDate': de_mortgage_date 95 'deMortgageDate': de_mortgage_date
92 } 96 }
93 de_mortgage_info = {} 97 de_mortgage_info = {'customer_name':'','applicationName':'','deMortgageDate':''}
94 # 绿本必须分开ocr 98 # 绿本必须分开ocr
95 for file_obj in files: 99 for file_obj in files:
96 info = PosHandler.de_mortgage_ocr_process1(file_obj) 100 info = PosHandler.de_mortgage_ocr_process1(file_obj)
97 de_mortgage_info.update(info) 101 if info.get('customerName') is not '':
102 de_mortgage_info['customerName'] = info.get('customerName')
103 if info.get('applicationName') is not '':
104 de_mortgage_info['applicationName'] = info.get('applicationName')
105 if info.get('deMortgageDate') is not '':
106 de_mortgage_info['deMortgageDate'] = info.get('deMortgageDate')
107 #de_mortgage_info.update(info)
98 108
99 request_pass = True 109 request_pass = True
100 fields_result = [] 110 fields_result = []
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!