e0d31a26 by 周伟奇

Merge branch 'fix/report_ca' into feature/uat-tmp

2 parents 4398d1df e2de024d
...@@ -11,7 +11,7 @@ PAGE_SIZE_DEFAULT = 10 ...@@ -11,7 +11,7 @@ PAGE_SIZE_DEFAULT = 10
11 FIXED_APPLICATION_ID_PREFIX = 'CH-S' 11 FIXED_APPLICATION_ID_PREFIX = 'CH-S'
12 12
13 DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT'] 13 DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT']
14 DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT'] 14 DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT', 'OVP']
15 COMPARE_DOC_SCHEME_LIST = ['CA', 'SE'] 15 COMPARE_DOC_SCHEME_LIST = ['CA', 'SE']
16 16
17 HIL_PREFIX = 'HIL' 17 HIL_PREFIX = 'HIL'
...@@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44 ...@@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44
1057 HIL_CONTRACT_3_CN_NAME = '车辆处置协议' 1057 HIL_CONTRACT_3_CN_NAME = '车辆处置协议'
1058 HIL_CONTRACT_3_CLASSIFY = 45 1058 HIL_CONTRACT_3_CLASSIFY = 45
1059 1059
1060 CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY} 1060 FSM_CONTRACT_WEP_CN_NAME = '延长保修合同'
1061 FSM_CONTRACT_WEP_CLASSIFY = 51
1062
1063 FSM_CONTRACT_MSI_CN_NAME = '长悦保养合同'
1064 FSM_CONTRACT_MSI_CLASSIFY = 52
1065
1066 FSM_CONTRACT_SC_CN_NAME = '汽车销售合同'
1067 FSM_CONTRACT_SC_CLASSIFY = 53
1068
1069 CONTRACT_SET = {
1070 CONTRACT_QRS_CLASSIFY,
1071 CONTRACT_CLASSIFY,
1072 HIL_CONTRACT_1_CLASSIFY,
1073 HIL_CONTRACT_2_CLASSIFY,
1074 HIL_CONTRACT_3_CLASSIFY,
1075 FSM_CONTRACT_WEP_CLASSIFY,
1076 FSM_CONTRACT_MSI_CLASSIFY,
1077 FSM_CONTRACT_SC_CLASSIFY,
1078 }
1061 1079
1062 CONTRACT_MAP = { 1080 CONTRACT_MAP = {
1063 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, 1081 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME,
...@@ -1065,8 +1083,13 @@ CONTRACT_MAP = { ...@@ -1065,8 +1083,13 @@ CONTRACT_MAP = {
1065 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, 1083 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME,
1066 CONTRACT_CLASSIFY: CONTRACT_CN_NAME, 1084 CONTRACT_CLASSIFY: CONTRACT_CN_NAME,
1067 CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME, 1085 CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME,
1086 FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_CN_NAME,
1087 FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_CN_NAME,
1088 FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_CN_NAME,
1068 } 1089 }
1069 1090
1091 FSM_CONTRACT_CLASSIFY_SET = {FSM_CONTRACT_WEP_CLASSIFY, FSM_CONTRACT_MSI_CLASSIFY, FSM_CONTRACT_SC_CLASSIFY}
1092
1070 # 保单 1093 # 保单
1071 INSURANCE_CN_NAME = '保单' 1094 INSURANCE_CN_NAME = '保单'
1072 INSURANCE_CLASSIFY = 42 1095 INSURANCE_CLASSIFY = 42
...@@ -1214,6 +1237,11 @@ BS_FIELD = 'bss_ocr' ...@@ -1214,6 +1237,11 @@ BS_FIELD = 'bss_ocr'
1214 HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' 1237 HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr'
1215 HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr' 1238 HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr'
1216 HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr' 1239 HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr'
1240 FSM_CONTRACT_WEP_FIELD = 'fsm_wep_ocr'
1241 FSM_CONTRACT_MSI_FIELD = 'fsm_msi_ocr'
1242 FSM_CONTRACT_SC_FIELD = 'fsm_sc_ocr'
1243
1244
1217 BS_CLASSIFY = 10089 1245 BS_CLASSIFY = 10089
1218 1246
1219 RESULT_MAPPING = { 1247 RESULT_MAPPING = {
...@@ -1238,6 +1266,9 @@ RESULT_MAPPING = { ...@@ -1238,6 +1266,9 @@ RESULT_MAPPING = {
1238 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, 1266 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD,
1239 HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD, 1267 HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD,
1240 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD, 1268 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD,
1269 FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_FIELD,
1270 FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_FIELD,
1271 FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_FIELD,
1241 } 1272 }
1242 1273
1243 CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD) 1274 CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD)
...@@ -2313,29 +2344,42 @@ APPLICANT_TYPE_MAP = { ...@@ -2313,29 +2344,42 @@ APPLICANT_TYPE_MAP = {
2313 2344
2314 APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] 2345 APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager']
2315 2346
2316 FILE_NAME_PREFIX_MAP = { 2347 # FILE_NAME_PREFIX_MAP = {
2317 AFC_PREFIX: [ 2348 # AFC_PREFIX: [
2318 ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'), 2349 # ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'),
2319 ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), 2350 # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
2320 ], 2351 # ],
2321 HIL_PREFIX: [ 2352 # HIL_PREFIX: [
2322 ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'), 2353 # ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'),
2323 ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'), 2354 # ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'),
2324 ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), 2355 # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
2325 ] 2356 # ]
2326 } 2357 # }
2327 2358
2328 ECONTRACT_KEYWORDS_MAP = { 2359 ECONTRACT_KEYWORDS_MAP = {
2329 AFC_PREFIX: [ 2360 AFC_PREFIX: [
2330 ('抵押贷款合同', CONTRACT_CLASSIFY), 2361 ('抵押贷款合同', CONTRACT_CLASSIFY),
2331 ('送达地址确认书', CONTRACT_QRS_CLASSIFY), 2362 ('送达地址确认书', CONTRACT_QRS_CLASSIFY),
2332 # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), 2363 ('抵押登记豁免函', HMH_CLASSIFY),
2333 ], 2364 ],
2334 HIL_PREFIX: [ 2365 HIL_PREFIX: [
2335 ('售后回租合同', HIL_CONTRACT_1_CLASSIFY), 2366 ('售后回租合同', HIL_CONTRACT_1_CLASSIFY),
2336 ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY), 2367 ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY),
2337 ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY), 2368 ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY),
2338 # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), 2369 ('抵押登记豁免函', HMH_CLASSIFY),
2370 ]
2371 }
2372
2373 FSM_ECONTRACT_KEYWORDS_MAP = {
2374 AFC_PREFIX: [
2375 ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY),
2376 ('长悦保养套餐服务合约', FSM_CONTRACT_MSI_CLASSIFY),
2377 ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
2378 ],
2379 HIL_PREFIX: [
2380 ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY),
2381 ('长悦保养套餐服务合同', FSM_CONTRACT_MSI_CLASSIFY),
2382 ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
2339 ] 2383 ]
2340 } 2384 }
2341 2385
...@@ -2345,6 +2389,12 @@ HIL_CONTRACT_TYPE_MAP = { ...@@ -2345,6 +2389,12 @@ HIL_CONTRACT_TYPE_MAP = {
2345 str(HIL_CONTRACT_3_CLASSIFY): 1, 2389 str(HIL_CONTRACT_3_CLASSIFY): 1,
2346 } 2390 }
2347 2391
2392 FSM_CONTRACT_TYPE_MAP = {
2393 str(FSM_CONTRACT_WEP_CLASSIFY): 0,
2394 str(FSM_CONTRACT_MSI_CLASSIFY): 1,
2395 str(FSM_CONTRACT_SC_CLASSIFY): 2,
2396 }
2397
2348 RESULT_MAP = { 2398 RESULT_MAP = {
2349 0: None, 2399 0: None,
2350 1: True, 2400 1: True,
......
...@@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g ...@@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g
20 from common.tools.pdf_to_img import PDFHandler 20 from common.tools.pdf_to_img import PDFHandler
21 from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict 21 from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict
22 from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict 22 from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict
23 from common.fsm_econtract.fsm_contract_ocr import predict as fsm_predict
24 from common.fsm_econtract.hmh_ocr import predict as hmh_predict
23 from apps.doc import consts 25 from apps.doc import consts
24 # from apps.doc.ocr.edms import EDMS, rh 26 # from apps.doc.ocr.edms import EDMS, rh
25 from apps.doc.ocr.ecm import ECM, rh 27 from apps.doc.ocr.ecm import ECM, rh
...@@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin):
996 res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( 998 res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get(
997 consts.ALL_POSITION_KEY, {}).get(key1, []) 999 consts.ALL_POSITION_KEY, {}).get(key1, [])
998 license_summary[classify] = [res] 1000 license_summary[classify] = [res]
999 else: 1001 elif classify in consts.SE_HIL_CON_MAP: # TODO FSM新合同写入数据库用于比对
1000 res = {} 1002 res = {}
1001 for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): 1003 for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items():
1002 if pno1 is None: 1004 if pno1 is None:
...@@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin):
1442 self.log_base, traceback.format_exc())) 1444 self.log_base, traceback.format_exc()))
1443 error_list.append(1) 1445 error_list.append(1)
1444 return 1446 return
1445 else: # e-contract 1447 else: # e-contract or e-fsm-contract or e-hmh
1446 try: 1448 try:
1447 # pdf下载 处理 图片存储 识别 1449 # pdf下载 处理 图片存储 识别
1448 for times in range(consts.RETRY_TIMES): 1450 for times in range(consts.RETRY_TIMES):
...@@ -1472,8 +1474,10 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1472,8 +1474,10 @@ class Command(BaseCommand, LoggerMixin):
1472 self.online_log.error('{0} [process error (db save)] [error={1}]'.format( 1474 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1473 self.log_base, traceback.format_exc())) 1475 self.log_base, traceback.format_exc()))
1474 1476
1477 # AFC合同
1475 if classify_1_str == str(consts.CONTRACT_CLASSIFY): 1478 if classify_1_str == str(consts.CONTRACT_CLASSIFY):
1476 ocr_result = afc_predict(pdf_handler.pdf_info) 1479 is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3]
1480 ocr_result = afc_predict(pdf_handler.pdf_info, is_fsm=is_fsm)
1477 page_res = {} 1481 page_res = {}
1478 for page_num, page_info in ocr_result.get('page_info', {}).items(): 1482 for page_num, page_info in ocr_result.get('page_info', {}).items():
1479 if isinstance(page_num, str) and page_num.startswith('page_'): 1483 if isinstance(page_num, str) and page_num.startswith('page_'):
...@@ -1483,6 +1487,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1483,6 +1487,7 @@ class Command(BaseCommand, LoggerMixin):
1483 'page_num': page_num, 1487 'page_num': page_num,
1484 'page_info': page_info 1488 'page_info': page_info
1485 } 1489 }
1490 # 送达地址确认书
1486 elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY): 1491 elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY):
1487 ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True) 1492 ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True)
1488 page_num = 'page_1' 1493 page_num = 'page_1'
...@@ -1493,9 +1498,11 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1493,9 +1498,11 @@ class Command(BaseCommand, LoggerMixin):
1493 'page_info': ocr_result.pop(page_num, {}) 1498 'page_info': ocr_result.pop(page_num, {})
1494 } 1499 }
1495 } 1500 }
1496 else: 1501 # HIL合同
1502 elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP:
1503 is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3]
1497 file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) 1504 file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str)
1498 ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) 1505 ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1, is_fsm=is_fsm)
1499 rebuild_res_1 = {} 1506 rebuild_res_1 = {}
1500 page_res = {} 1507 page_res = {}
1501 for field_name, field_info in ocr_result_1.items(): 1508 for field_name, field_info in ocr_result_1.items():
...@@ -1508,28 +1515,55 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1508,28 +1515,55 @@ class Command(BaseCommand, LoggerMixin):
1508 'page_num': page_num, 1515 'page_num': page_num,
1509 'page_info': page_info 1516 'page_info': page_info
1510 } 1517 }
1518 # FSM合同 WEP MSI SC
1519 elif classify_1_str in consts.FSM_CONTRACT_TYPE_MAP:
1520 file_type = consts.FSM_CONTRACT_TYPE_MAP.get(classify_1_str)
1521 ocr_result = fsm_predict(pdf_handler.pdf_info, file_type)
1522 page_res = {}
1523 for page_num, page_info in ocr_result.items():
1524 if isinstance(page_num, str) and page_num.startswith('page_'):
1525 page_res[page_num] = {
1526 'classify': int(classify_1_str),
1527 'page_num': page_num,
1528 'page_info': page_info
1529 }
1530 # hmh
1531 # else:
1532 # pass
1533
1511 1534
1512 contract_res = {} 1535 contract_res = {}
1513 for img_path_tmp, page_key in pdf_handler.img_path_pno_list: 1536 for img_path_tmp, page_key in pdf_handler.img_path_pno_list:
1514 if page_key in page_res: 1537 if classify_1_str == str(consts.HMH_CLASSIFY):
1515 img_contract_res = { 1538 img_contract_res = {
1516 'code': 1, 1539 'code': 1,
1517 'data': [ 1540 'data': [
1518 { 1541 {
1519 'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY), 1542 'classify': consts.HMH_CLASSIFY,
1520 'data': page_res[page_key] 1543 'data': hmh_predict(pdf_handler.pdf_info)
1521 } 1544 }
1522 ] 1545 ]
1523 } 1546 }
1524 else: 1547 else:
1525 img_contract_res = { 1548 if page_key in page_res:
1526 'code': 1, 1549 img_contract_res = {
1527 'data': [ 1550 'code': 1,
1528 { 1551 'data': [
1529 'classify': int(classify_1_str), 1552 {
1530 } 1553 'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY),
1531 ] 1554 'data': page_res[page_key]
1532 } 1555 }
1556 ]
1557 }
1558 else:
1559 img_contract_res = {
1560 'code': 1,
1561 'data': [
1562 {
1563 'classify': int(classify_1_str),
1564 }
1565 ]
1566 }
1533 contract_res[img_path_tmp] = img_contract_res 1567 contract_res[img_path_tmp] = img_contract_res
1534 1568
1535 with lock: 1569 with lock:
......
...@@ -36,6 +36,7 @@ class RequestTrigger(NamedEnum): ...@@ -36,6 +36,7 @@ class RequestTrigger(NamedEnum):
36 DOCUPLOAD = (3, 'Document Upload') 36 DOCUPLOAD = (3, 'Document Upload')
37 SUBMITING = (4, 'Submiting') 37 SUBMITING = (4, 'Submiting')
38 UPLOADING = (5, 'Uploading') 38 UPLOADING = (5, 'Uploading')
39 OVP = (6, 'OVP')
39 40
40 41
41 class FailureReason(NamedEnum): 42 class FailureReason(NamedEnum):
......
...@@ -780,10 +780,12 @@ class BSWorkbook(Workbook): ...@@ -780,10 +780,12 @@ class BSWorkbook(Workbook):
780 if field_str is not None: 780 if field_str is not None:
781 count_list.append((field_str, count)) 781 count_list.append((field_str, count))
782 782
783 def contract_rebuild(self, contract_result_dict): 783 def contract_rebuild(self, contract_result_dict, is_ca=False):
784 for classify, contract_result in contract_result_dict.items(): 784 for classify, contract_result in contract_result_dict.items():
785 if len(contract_result) == 0: 785 if len(contract_result) == 0:
786 continue 786 continue
787 if is_ca and classify not in consts.FSM_CONTRACT_CLASSIFY_SET:
788 continue
787 ws = self.create_sheet(consts.CONTRACT_MAP.get(classify)) 789 ws = self.create_sheet(consts.CONTRACT_MAP.get(classify))
788 for i in range(30): 790 for i in range(30):
789 if str(i) in contract_result: 791 if str(i) in contract_result:
...@@ -906,6 +908,7 @@ class BSWorkbook(Workbook): ...@@ -906,6 +908,7 @@ class BSWorkbook(Workbook):
906 else: 908 else:
907 self.bs_rebuild(bs_summary, res_count_tuple, metadata) 909 self.bs_rebuild(bs_summary, res_count_tuple, metadata)
908 self.license_rebuild(license_summary, document_scheme, count_list) 910 self.license_rebuild(license_summary, document_scheme, count_list)
911 self.contract_rebuild(contract_result, True)
909 self.move_res_sheet() 912 self.move_res_sheet()
910 self.remove_base_sheet() 913 self.remove_base_sheet()
911 return count_list 914 return count_list
......
...@@ -602,13 +602,22 @@ class UploadDocView(GenericView, DocHandler): ...@@ -602,13 +602,22 @@ class UploadDocView(GenericView, DocHandler):
602 is_zip = False 602 is_zip = False
603 603
604 classify_1 = 0 604 classify_1 = 0
605 # 电子合同 605 # 电子合同 Econtract or OVP(FSM)
606 if data_source == consts.DATA_SOURCE_LIST[-1] and document_scheme == consts.DOC_SCHEME_LIST[1]: 606 if data_source == consts.DATA_SOURCE_LIST[2] or data_source == consts.DATA_SOURCE_LIST[3]:
607 for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix): 607 if document_scheme == consts.DOC_SCHEME_LIST[1]:
608 for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix):
609 if keyword in document_name:
610 classify_1 = classify_1_tmp
611 break
612 # FSM合同:WEP/MSI/SC
613 elif data_source == consts.DATA_SOURCE_LIST[0] and document_scheme == consts.DOC_SCHEME_LIST[0]:
614 for keyword, classify_1_tmp in consts.FSM_ECONTRACT_KEYWORDS_MAP.get(prefix):
608 if keyword in document_name: 615 if keyword in document_name:
609 classify_1 = classify_1_tmp 616 classify_1 = classify_1_tmp
610 break 617 break
611 elif document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ 618
619
620 if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
612 or document_name.endswith('.RAR'): 621 or document_name.endswith('.RAR'):
613 is_zip = True 622 is_zip = True
614 623
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
6 # @Description : 6 # @Description :
7 7
8 from .get_char import Finder 8 from .get_char import Finder
9 from .get_char_fsm import Finder as FSMFinder
9 import numpy as np 10 import numpy as np
10 11
11 12
...@@ -23,7 +24,7 @@ def extract_info(ocr_results): ...@@ -23,7 +24,7 @@ def extract_info(ocr_results):
23 return {'page_1': {'合同编号': contract_no}} 24 return {'page_1': {'合同编号': contract_no}}
24 25
25 26
26 def predict(pdf_info, is_qrs=False): 27 def predict(pdf_info, is_qrs=False, is_fsm=False):
27 ocr_results = {} 28 ocr_results = {}
28 for pno in pdf_info: 29 for pno in pdf_info:
29 ocr_results[pno] = {} 30 ocr_results[pno] = {}
...@@ -50,7 +51,10 @@ def predict(pdf_info, is_qrs=False): ...@@ -50,7 +51,10 @@ def predict(pdf_info, is_qrs=False):
50 results = extract_info(ocr_results) 51 results = extract_info(ocr_results)
51 else: 52 else:
52 # 输入是整个 PDF 中的信息 53 # 输入是整个 PDF 中的信息
53 f = Finder(pdf_info, ocr_results=ocr_results) 54 if is_fsm:
55 f = FSMFinder(pdf_info, ocr_results=ocr_results)
56 else:
57 f = Finder(pdf_info, ocr_results=ocr_results)
54 results = f.get_info() 58 results = f.get_info()
55 return results 59 return results
56 60
......
...@@ -6,9 +6,10 @@ ...@@ -6,9 +6,10 @@
6 # @Description : 6 # @Description :
7 7
8 from .get_char import Finder 8 from .get_char import Finder
9 from .get_char_fsm import Finder as FSMFinder
9 10
10 11
11 def predict(pdf_info, file_cls): 12 def predict(pdf_info, file_cls, is_fsm=False):
12 """Summary 13 """Summary
13 14
14 Args: 15 Args:
...@@ -58,7 +59,11 @@ def predict(pdf_info, file_cls): ...@@ -58,7 +59,11 @@ def predict(pdf_info, file_cls):
58 pdf_info = dict() 59 pdf_info = dict()
59 for pno, page_info in enumerate(pdf_info_1): 60 for pno, page_info in enumerate(pdf_info_1):
60 pdf_info[str(pno)] = page_info 61 pdf_info[str(pno)] = page_info
61 f = Finder(pdf_info) 62
63 if is_fsm:
64 f = FSMFinder(pdf_info)
65 else:
66 f = Finder(pdf_info)
62 if file_cls == 0: 67 if file_cls == 0:
63 results = f.get_info() 68 results = f.get_info()
64 if file_cls == 1: 69 if file_cls == 1:
......
# Field-extraction template handed to ``Retriever`` for the first FSM contract
# type (``retriever_list[0]`` in fsm_contract_ocr).
# The outer key is a page key; this template only describes page "0".
# NOTE(review): each 'keys' entry looks like
#   (anchor label, regex alternatives, selection mode, extra options)
# and each 'value' entry like
#   (content kind, direction from the anchor, search options, default value)
# -- confirm the exact meaning against Retriever before relying on it.
WEP_FIELD = {
    '0': {
        'keys': {
            '客户姓名': [
                ('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {}),
            ],
            '证件类型': [
                ('证件类型', (r'^证件类型.?$',), 'top1', {}),
            ],
            '证件号码': [
                ('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {}),
            ],
            '合同价格(小写)': [
                ('人民币', (r'^人民币¥.?$',), 'top1', {}),
            ],
            '客户签名': [
                ('客户签名/盖章', (r'^客户签名/盖章.*$',), 'top1', {}),
            ],
            '签单日期': [
                ('签单日期', (r'^签单日期.*签单日期.?$',), 'top1', {}),
            ],
        },
        'value': {
            # Text fields default to an empty string.
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.1, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
            # Image fields (signature / date) default to '无'.
            '客户签名': (
                'img',
                'under',
                {'offset_tuple': (0, 0, 0, 4), 'rigorous': True},
                '无',
            ),
            '签单日期': (
                'img',
                'right',
                {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True},
                '无',
            ),
        },
    },
}
22
# Field-extraction template handed to ``Retriever`` for the second FSM contract
# type (``retriever_list[1]`` in fsm_contract_ocr).
# Customer/price fields are described under page key "0"; signature and signing
# date are described under page key "1".
# NOTE(review): 'keys' entries appear to be
#   (anchor label, regex alternatives, selection mode, extra options)
# and 'value' entries
#   (content kind, direction from the anchor, search options, default value)
# -- confirm against Retriever.
MSI_FIELD = {
    '0': {
        'keys': {
            '客户姓名': [
                ('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {}),
            ],
            '证件类型': [
                ('证件类型', (r'^证件类型.?$',), 'top1', {}),
            ],
            '证件号码': [
                ('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {}),
            ],
            '合同价格(小写)': [
                ('人民币', (r'^人民币¥.?$',), 'top1', {}),
            ],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.2, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
        },
    },
    '1': {
        'keys': {
            '客户签名': [
                ('客户签名/盖章', (r'^客户签名/盖章.*$',), 'top1', {}),
            ],
            '签单日期': [
                ('签单日期', (r'^签单日期.*签单日期.?$',), 'top1', {}),
            ],
        },
        'value': {
            # Image fields default to '无'.
            '客户签名': (
                'img',
                'under',
                {'offset_tuple': (0, 0, 0, 4), 'rigorous': True},
                '无',
            ),
            '签单日期': (
                'img',
                'right',
                {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True},
                '无',
            ),
        },
    },
}
49
# Field-extraction template handed to ``Retriever`` for the third FSM contract
# type (``retriever_list[2]`` in fsm_contract_ocr).
# Buyer/price fields are described under page key "0"; signature and signing
# date are described under page key "-1".
# NOTE(review): "-1" presumably addresses the last page -- confirm in Retriever.
# NOTE(review): 'keys' entries appear to be
#   (anchor label, regex alternatives, selection mode, extra options)
# and 'value' entries
#   (content kind, direction from the anchor, search options, default value)
# -- confirm against Retriever.
SC_FIELD = {
    '0': {
        'keys': {
            '姓名': [
                ('姓名', (r'^姓名.?$', r'^企业名称.?$'), 'top1', {}),
            ],
            '证件类型': [
                ('证件类型', (r'^证件类型.?$',), 'top1', {}),
            ],
            '证件号码': [
                ('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {}),
            ],
            '总价': [
                ('总价', (r'^总价.?$',), 'top1', {}),
            ],
        },
        'value': {
            '姓名': ('text', 'right', {'offset_tuple': (-2, 8, 0.5, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '总价': ('text', 'right', {'offset_tuple': (-2, 12, 0.5, 0)}, ''),
        },
    },
    '-1': {
        'keys': {
            # NOTE(review): both regex alternatives below are identical as
            # written; one of them may have been meant to be a variant
            # spelling -- verify with the template author.
            '客户签名': [
                ('客户签名/盖章', (r'^客户签名/盖章.*$', r'^客户签名/盖章.*$'), 'top1', {}),
            ],
            '签单日期': [
                ('签单日期', (r'^签单日期.*签单日期.?$',), 'top1', {}),
            ],
        },
        'value': {
            # Image fields default to '无'.
            '客户签名': (
                'img',
                'under',
                {'offset_tuple': (1.5, 1, 0, 4), 'rigorous': True},
                '无',
            ),
            '签单日期': (
                'img',
                'right',
                {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True},
                '无',
            ),
        },
    },
}
1 from .retriever import Retriever
2 from .const import WEP_FIELD, MSI_FIELD, SC_FIELD
3 from .tools import pdf_info_rebuild
4
# One retriever per FSM contract template, built once at import time.
# The list position is the ``file_type`` accepted by ``predict``:
# 0 -> WEP_FIELD, 1 -> MSI_FIELD, 2 -> SC_FIELD.
retriever_list = [
    Retriever(WEP_FIELD),
    Retriever(MSI_FIELD),
    Retriever(SC_FIELD),
]


def predict(pdf_info, file_type=0):
    """Extract the configured fields from one FSM contract PDF.

    Args:
        pdf_info: Parsed PDF structure, passed unchanged to
            ``pdf_info_rebuild``.
        file_type: Position in ``retriever_list`` of the template to use.

    Returns:
        The result of ``Retriever.get_target_fields`` for the rebuilt text
        and image information.
    """
    # Pick the retriever first so an invalid file_type fails before any
    # PDF processing is done.
    selected = retriever_list[file_type]
    text_info, img_info = pdf_info_rebuild(pdf_info)
    return selected.get_target_fields(text_info, img_info)
11
12
1 from .retriever import HMHRetriever
2 from .tools import pdf_info_rebuild
3
# Single shared retriever instance, created when the module is imported.
hmh_retriever = HMHRetriever()


def predict(pdf_info):
    """Extract the HMH fields from a parsed PDF.

    Only the text information is used; bounding boxes are taken as-is
    (``fix_bbox=False``) and the image information is discarded.

    Args:
        pdf_info: Parsed PDF structure, passed to ``pdf_info_rebuild``.

    Returns:
        The result of ``HMHRetriever.get_target_fields`` for the rebuilt text.
    """
    text_info, _unused_img_info = pdf_info_rebuild(pdf_info, fix_bbox=False)
    return hmh_retriever.get_target_fields(text_info)
9
10
def pdf_info_rebuild(pdf_info, fix_bbox=True):
    """Split a parsed PDF into per-page text spans and image markers.

    Args:
        pdf_info: Mapping of page key -> page dict. Each page dict has a
            'blocks' list. Blocks with ``type == 0`` hold 'lines', each with
            'spans' carrying 'bbox', 'text' and 'size'. Blocks with
            ``type == 1`` carry a 'bbox'.
        fix_bbox: When true, a span whose bbox height (last coordinate minus
            second coordinate) is smaller than its 'size' gets its last
            coordinate increased by 'size'.

    Returns:
        A ``(pdf_text_info, pdf_img_info)`` tuple of dicts keyed by page key.
        Text entries are ``[bbox, stripped_text]`` lists; image entries are
        ``(bbox, '有')`` tuples. Pages without text (or without images) have
        no key in the respective dict.

    The input is never modified: an adjusted bbox is returned as a new list,
    so tuple bboxes are accepted and repeated calls on the same ``pdf_info``
    see the same data.
    """
    pdf_text_info = dict()
    pdf_img_info = dict()
    for pno_str, page_info in pdf_info.items():
        for block in page_info['blocks']:
            block_type = block['type']
            if block_type == 0:
                # The same text can appear more than once inside a block;
                # keep only the first occurrence per block.
                seen_texts = set()
                for line in block['lines']:
                    for span in line['spans']:
                        bbox, text = span['bbox'], span['text'].strip()
                        if not text or text in seen_texts:
                            continue
                        seen_texts.add(text)
                        # The reported bbox height can be too small; extend
                        # it on a copy instead of writing into the caller's
                        # span data.
                        if fix_bbox and bbox[-1] - bbox[1] < span['size']:
                            bbox = list(bbox[:-1]) + [bbox[-1] + span['size']]
                        pdf_text_info.setdefault(pno_str, list()).append([bbox, text])
            elif block_type == 1:
                pdf_img_info.setdefault(pno_str, list()).append((block['bbox'], '有'))

    return pdf_text_info, pdf_img_info
...\ No newline at end of file ...\ No newline at end of file
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!