Merge branch 'fix/report_ca' into feature/uat-tmp
Showing
14 changed files
with
272 additions
and
47 deletions
... | @@ -11,7 +11,7 @@ PAGE_SIZE_DEFAULT = 10 | ... | @@ -11,7 +11,7 @@ PAGE_SIZE_DEFAULT = 10 |
11 | FIXED_APPLICATION_ID_PREFIX = 'CH-S' | 11 | FIXED_APPLICATION_ID_PREFIX = 'CH-S' |
12 | 12 | ||
13 | DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT'] | 13 | DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT'] |
14 | DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT'] | 14 | DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT', 'OVP'] |
15 | COMPARE_DOC_SCHEME_LIST = ['CA', 'SE'] | 15 | COMPARE_DOC_SCHEME_LIST = ['CA', 'SE'] |
16 | 16 | ||
17 | HIL_PREFIX = 'HIL' | 17 | HIL_PREFIX = 'HIL' |
... | @@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44 | ... | @@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44 |
1057 | HIL_CONTRACT_3_CN_NAME = '车辆处置协议' | 1057 | HIL_CONTRACT_3_CN_NAME = '车辆处置协议' |
1058 | HIL_CONTRACT_3_CLASSIFY = 45 | 1058 | HIL_CONTRACT_3_CLASSIFY = 45 |
1059 | 1059 | ||
1060 | CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY} | 1060 | FSM_CONTRACT_WEP_CN_NAME = '延长保修合同' |
1061 | FSM_CONTRACT_WEP_CLASSIFY = 51 | ||
1062 | |||
1063 | FSM_CONTRACT_MSI_CN_NAME = '长悦保养合同' | ||
1064 | FSM_CONTRACT_MSI_CLASSIFY = 52 | ||
1065 | |||
1066 | FSM_CONTRACT_SC_CN_NAME = '汽车销售合同' | ||
1067 | FSM_CONTRACT_SC_CLASSIFY = 53 | ||
1068 | |||
1069 | CONTRACT_SET = { | ||
1070 | CONTRACT_QRS_CLASSIFY, | ||
1071 | CONTRACT_CLASSIFY, | ||
1072 | HIL_CONTRACT_1_CLASSIFY, | ||
1073 | HIL_CONTRACT_2_CLASSIFY, | ||
1074 | HIL_CONTRACT_3_CLASSIFY, | ||
1075 | FSM_CONTRACT_WEP_CLASSIFY, | ||
1076 | FSM_CONTRACT_MSI_CLASSIFY, | ||
1077 | FSM_CONTRACT_SC_CLASSIFY, | ||
1078 | } | ||
1061 | 1079 | ||
1062 | CONTRACT_MAP = { | 1080 | CONTRACT_MAP = { |
1063 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, | 1081 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, |
... | @@ -1065,8 +1083,13 @@ CONTRACT_MAP = { | ... | @@ -1065,8 +1083,13 @@ CONTRACT_MAP = { |
1065 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, | 1083 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, |
1066 | CONTRACT_CLASSIFY: CONTRACT_CN_NAME, | 1084 | CONTRACT_CLASSIFY: CONTRACT_CN_NAME, |
1067 | CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME, | 1085 | CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME, |
1086 | FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_CN_NAME, | ||
1087 | FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_CN_NAME, | ||
1088 | FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_CN_NAME, | ||
1068 | } | 1089 | } |
1069 | 1090 | ||
1091 | FSM_CONTRACT_CLASSIFY_SET = {FSM_CONTRACT_WEP_CLASSIFY, FSM_CONTRACT_MSI_CLASSIFY, FSM_CONTRACT_SC_CLASSIFY} | ||
1092 | |||
1070 | # 保单 | 1093 | # 保单 |
1071 | INSURANCE_CN_NAME = '保单' | 1094 | INSURANCE_CN_NAME = '保单' |
1072 | INSURANCE_CLASSIFY = 42 | 1095 | INSURANCE_CLASSIFY = 42 |
... | @@ -1214,6 +1237,11 @@ BS_FIELD = 'bss_ocr' | ... | @@ -1214,6 +1237,11 @@ BS_FIELD = 'bss_ocr' |
1214 | HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' | 1237 | HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' |
1215 | HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr' | 1238 | HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr' |
1216 | HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr' | 1239 | HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr' |
1240 | FSM_CONTRACT_WEP_FIELD = 'fsm_wep_ocr' | ||
1241 | FSM_CONTRACT_MSI_FIELD = 'fsm_msi_ocr' | ||
1242 | FSM_CONTRACT_SC_FIELD = 'fsm_sc_ocr' | ||
1243 | |||
1244 | |||
1217 | BS_CLASSIFY = 10089 | 1245 | BS_CLASSIFY = 10089 |
1218 | 1246 | ||
1219 | RESULT_MAPPING = { | 1247 | RESULT_MAPPING = { |
... | @@ -1238,6 +1266,9 @@ RESULT_MAPPING = { | ... | @@ -1238,6 +1266,9 @@ RESULT_MAPPING = { |
1238 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, | 1266 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, |
1239 | HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD, | 1267 | HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD, |
1240 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD, | 1268 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD, |
1269 | FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_FIELD, | ||
1270 | FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_FIELD, | ||
1271 | FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_FIELD, | ||
1241 | } | 1272 | } |
1242 | 1273 | ||
1243 | CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD) | 1274 | CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD) |
... | @@ -2313,29 +2344,42 @@ APPLICANT_TYPE_MAP = { | ... | @@ -2313,29 +2344,42 @@ APPLICANT_TYPE_MAP = { |
2313 | 2344 | ||
2314 | APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] | 2345 | APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] |
2315 | 2346 | ||
2316 | FILE_NAME_PREFIX_MAP = { | 2347 | # FILE_NAME_PREFIX_MAP = { |
2317 | AFC_PREFIX: [ | 2348 | # AFC_PREFIX: [ |
2318 | ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'), | 2349 | # ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'), |
2319 | ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), | 2350 | # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), |
2320 | ], | 2351 | # ], |
2321 | HIL_PREFIX: [ | 2352 | # HIL_PREFIX: [ |
2322 | ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'), | 2353 | # ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'), |
2323 | ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'), | 2354 | # ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'), |
2324 | ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), | 2355 | # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), |
2325 | ] | 2356 | # ] |
2326 | } | 2357 | # } |
2327 | 2358 | ||
2328 | ECONTRACT_KEYWORDS_MAP = { | 2359 | ECONTRACT_KEYWORDS_MAP = { |
2329 | AFC_PREFIX: [ | 2360 | AFC_PREFIX: [ |
2330 | ('抵押贷款合同', CONTRACT_CLASSIFY), | 2361 | ('抵押贷款合同', CONTRACT_CLASSIFY), |
2331 | ('送达地址确认书', CONTRACT_QRS_CLASSIFY), | 2362 | ('送达地址确认书', CONTRACT_QRS_CLASSIFY), |
2332 | # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), | 2363 | ('抵押登记豁免函', HMH_CLASSIFY), |
2333 | ], | 2364 | ], |
2334 | HIL_PREFIX: [ | 2365 | HIL_PREFIX: [ |
2335 | ('售后回租合同', HIL_CONTRACT_1_CLASSIFY), | 2366 | ('售后回租合同', HIL_CONTRACT_1_CLASSIFY), |
2336 | ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY), | 2367 | ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY), |
2337 | ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY), | 2368 | ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY), |
2338 | # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), | 2369 | ('抵押登记豁免函', HMH_CLASSIFY), |
2370 | ] | ||
2371 | } | ||
2372 | |||
2373 | FSM_ECONTRACT_KEYWORDS_MAP = { | ||
2374 | AFC_PREFIX: [ | ||
2375 | ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY), | ||
2376 | ('长悦保养套餐服务合约', FSM_CONTRACT_MSI_CLASSIFY), | ||
2377 | ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), | ||
2378 | ], | ||
2379 | HIL_PREFIX: [ | ||
2380 | ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY), | ||
2381 | ('长悦保养套餐服务合同', FSM_CONTRACT_MSI_CLASSIFY), | ||
2382 | ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), | ||
2339 | ] | 2383 | ] |
2340 | } | 2384 | } |
2341 | 2385 | ||
... | @@ -2345,6 +2389,12 @@ HIL_CONTRACT_TYPE_MAP = { | ... | @@ -2345,6 +2389,12 @@ HIL_CONTRACT_TYPE_MAP = { |
2345 | str(HIL_CONTRACT_3_CLASSIFY): 1, | 2389 | str(HIL_CONTRACT_3_CLASSIFY): 1, |
2346 | } | 2390 | } |
2347 | 2391 | ||
2392 | FSM_CONTRACT_TYPE_MAP = { | ||
2393 | str(FSM_CONTRACT_WEP_CLASSIFY): 0, | ||
2394 | str(FSM_CONTRACT_MSI_CLASSIFY): 1, | ||
2395 | str(FSM_CONTRACT_SC_CLASSIFY): 2, | ||
2396 | } | ||
2397 | |||
2348 | RESULT_MAP = { | 2398 | RESULT_MAP = { |
2349 | 0: None, | 2399 | 0: None, |
2350 | 1: True, | 2400 | 1: True, | ... | ... |
... | @@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g | ... | @@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g |
20 | from common.tools.pdf_to_img import PDFHandler | 20 | from common.tools.pdf_to_img import PDFHandler |
21 | from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict | 21 | from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict |
22 | from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict | 22 | from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict |
23 | from common.fsm_econtract.fsm_contract_ocr import predict as fsm_predict | ||
24 | from common.fsm_econtract.hmh_ocr import predict as hmh_predict | ||
23 | from apps.doc import consts | 25 | from apps.doc import consts |
24 | # from apps.doc.ocr.edms import EDMS, rh | 26 | # from apps.doc.ocr.edms import EDMS, rh |
25 | from apps.doc.ocr.ecm import ECM, rh | 27 | from apps.doc.ocr.ecm import ECM, rh |
... | @@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin): |
996 | res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( | 998 | res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( |
997 | consts.ALL_POSITION_KEY, {}).get(key1, []) | 999 | consts.ALL_POSITION_KEY, {}).get(key1, []) |
998 | license_summary[classify] = [res] | 1000 | license_summary[classify] = [res] |
999 | else: | 1001 | elif classify in consts.SE_HIL_CON_MAP: # TODO FSM新合同写入数据库用于比对 |
1000 | res = {} | 1002 | res = {} |
1001 | for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): | 1003 | for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): |
1002 | if pno1 is None: | 1004 | if pno1 is None: |
... | @@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin): |
1442 | self.log_base, traceback.format_exc())) | 1444 | self.log_base, traceback.format_exc())) |
1443 | error_list.append(1) | 1445 | error_list.append(1) |
1444 | return | 1446 | return |
1445 | else: # e-contract | 1447 | else: # e-contract or e-fsm-contract or e-hmh |
1446 | try: | 1448 | try: |
1447 | # pdf下载 处理 图片存储 识别 | 1449 | # pdf下载 处理 图片存储 识别 |
1448 | for times in range(consts.RETRY_TIMES): | 1450 | for times in range(consts.RETRY_TIMES): |
... | @@ -1472,8 +1474,10 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1472,8 +1474,10 @@ class Command(BaseCommand, LoggerMixin): |
1472 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( | 1474 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( |
1473 | self.log_base, traceback.format_exc())) | 1475 | self.log_base, traceback.format_exc())) |
1474 | 1476 | ||
1477 | # AFC合同 | ||
1475 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): | 1478 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): |
1476 | ocr_result = afc_predict(pdf_handler.pdf_info) | 1479 | is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3] |
1480 | ocr_result = afc_predict(pdf_handler.pdf_info, is_fsm=is_fsm) | ||
1477 | page_res = {} | 1481 | page_res = {} |
1478 | for page_num, page_info in ocr_result.get('page_info', {}).items(): | 1482 | for page_num, page_info in ocr_result.get('page_info', {}).items(): |
1479 | if isinstance(page_num, str) and page_num.startswith('page_'): | 1483 | if isinstance(page_num, str) and page_num.startswith('page_'): |
... | @@ -1483,6 +1487,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1483,6 +1487,7 @@ class Command(BaseCommand, LoggerMixin): |
1483 | 'page_num': page_num, | 1487 | 'page_num': page_num, |
1484 | 'page_info': page_info | 1488 | 'page_info': page_info |
1485 | } | 1489 | } |
1490 | # 送达地址确认书 | ||
1486 | elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY): | 1491 | elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY): |
1487 | ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True) | 1492 | ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True) |
1488 | page_num = 'page_1' | 1493 | page_num = 'page_1' |
... | @@ -1493,9 +1498,11 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1493,9 +1498,11 @@ class Command(BaseCommand, LoggerMixin): |
1493 | 'page_info': ocr_result.pop(page_num, {}) | 1498 | 'page_info': ocr_result.pop(page_num, {}) |
1494 | } | 1499 | } |
1495 | } | 1500 | } |
1496 | else: | 1501 | # HIL合同 |
1502 | elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP: | ||
1503 | is_fsm = doc.data_source == consts.DATA_SOURCE_LIST[3] | ||
1497 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) | 1504 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) |
1498 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) | 1505 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1, is_fsm=is_fsm) |
1499 | rebuild_res_1 = {} | 1506 | rebuild_res_1 = {} |
1500 | page_res = {} | 1507 | page_res = {} |
1501 | for field_name, field_info in ocr_result_1.items(): | 1508 | for field_name, field_info in ocr_result_1.items(): |
... | @@ -1508,28 +1515,55 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1508,28 +1515,55 @@ class Command(BaseCommand, LoggerMixin): |
1508 | 'page_num': page_num, | 1515 | 'page_num': page_num, |
1509 | 'page_info': page_info | 1516 | 'page_info': page_info |
1510 | } | 1517 | } |
1518 | # FSM合同 WEP MSI SC | ||
1519 | elif classify_1_str in consts.FSM_CONTRACT_TYPE_MAP: | ||
1520 | file_type = consts.FSM_CONTRACT_TYPE_MAP.get(classify_1_str) | ||
1521 | ocr_result = fsm_predict(pdf_handler.pdf_info, file_type) | ||
1522 | page_res = {} | ||
1523 | for page_num, page_info in ocr_result.items(): | ||
1524 | if isinstance(page_num, str) and page_num.startswith('page_'): | ||
1525 | page_res[page_num] = { | ||
1526 | 'classify': int(classify_1_str), | ||
1527 | 'page_num': page_num, | ||
1528 | 'page_info': page_info | ||
1529 | } | ||
1530 | # hmh | ||
1531 | # else: | ||
1532 | # pass | ||
1533 | |||
1511 | 1534 | ||
1512 | contract_res = {} | 1535 | contract_res = {} |
1513 | for img_path_tmp, page_key in pdf_handler.img_path_pno_list: | 1536 | for img_path_tmp, page_key in pdf_handler.img_path_pno_list: |
1514 | if page_key in page_res: | 1537 | if classify_1_str == str(consts.HMH_CLASSIFY): |
1515 | img_contract_res = { | 1538 | img_contract_res = { |
1516 | 'code': 1, | 1539 | 'code': 1, |
1517 | 'data': [ | 1540 | 'data': [ |
1518 | { | 1541 | { |
1519 | 'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY), | 1542 | 'classify': consts.HMH_CLASSIFY, |
1520 | 'data': page_res[page_key] | 1543 | 'data': hmh_predict(pdf_handler.pdf_info) |
1521 | } | 1544 | } |
1522 | ] | 1545 | ] |
1523 | } | 1546 | } |
1524 | else: | 1547 | else: |
1525 | img_contract_res = { | 1548 | if page_key in page_res: |
1526 | 'code': 1, | 1549 | img_contract_res = { |
1527 | 'data': [ | 1550 | 'code': 1, |
1528 | { | 1551 | 'data': [ |
1529 | 'classify': int(classify_1_str), | 1552 | { |
1530 | } | 1553 | 'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY), |
1531 | ] | 1554 | 'data': page_res[page_key] |
1532 | } | 1555 | } |
1556 | ] | ||
1557 | } | ||
1558 | else: | ||
1559 | img_contract_res = { | ||
1560 | 'code': 1, | ||
1561 | 'data': [ | ||
1562 | { | ||
1563 | 'classify': int(classify_1_str), | ||
1564 | } | ||
1565 | ] | ||
1566 | } | ||
1533 | contract_res[img_path_tmp] = img_contract_res | 1567 | contract_res[img_path_tmp] = img_contract_res |
1534 | 1568 | ||
1535 | with lock: | 1569 | with lock: | ... | ... |
... | @@ -36,6 +36,7 @@ class RequestTrigger(NamedEnum): | ... | @@ -36,6 +36,7 @@ class RequestTrigger(NamedEnum): |
36 | DOCUPLOAD = (3, 'Document Upload') | 36 | DOCUPLOAD = (3, 'Document Upload') |
37 | SUBMITING = (4, 'Submiting') | 37 | SUBMITING = (4, 'Submiting') |
38 | UPLOADING = (5, 'Uploading') | 38 | UPLOADING = (5, 'Uploading') |
39 | OVP = (6, 'OVP') | ||
39 | 40 | ||
40 | 41 | ||
41 | class FailureReason(NamedEnum): | 42 | class FailureReason(NamedEnum): | ... | ... |
... | @@ -780,10 +780,12 @@ class BSWorkbook(Workbook): | ... | @@ -780,10 +780,12 @@ class BSWorkbook(Workbook): |
780 | if field_str is not None: | 780 | if field_str is not None: |
781 | count_list.append((field_str, count)) | 781 | count_list.append((field_str, count)) |
782 | 782 | ||
783 | def contract_rebuild(self, contract_result_dict): | 783 | def contract_rebuild(self, contract_result_dict, is_ca=False): |
784 | for classify, contract_result in contract_result_dict.items(): | 784 | for classify, contract_result in contract_result_dict.items(): |
785 | if len(contract_result) == 0: | 785 | if len(contract_result) == 0: |
786 | continue | 786 | continue |
787 | if is_ca and classify not in consts.FSM_CONTRACT_CLASSIFY_SET: | ||
788 | continue | ||
787 | ws = self.create_sheet(consts.CONTRACT_MAP.get(classify)) | 789 | ws = self.create_sheet(consts.CONTRACT_MAP.get(classify)) |
788 | for i in range(30): | 790 | for i in range(30): |
789 | if str(i) in contract_result: | 791 | if str(i) in contract_result: |
... | @@ -906,6 +908,7 @@ class BSWorkbook(Workbook): | ... | @@ -906,6 +908,7 @@ class BSWorkbook(Workbook): |
906 | else: | 908 | else: |
907 | self.bs_rebuild(bs_summary, res_count_tuple, metadata) | 909 | self.bs_rebuild(bs_summary, res_count_tuple, metadata) |
908 | self.license_rebuild(license_summary, document_scheme, count_list) | 910 | self.license_rebuild(license_summary, document_scheme, count_list) |
911 | self.contract_rebuild(contract_result, True) | ||
909 | self.move_res_sheet() | 912 | self.move_res_sheet() |
910 | self.remove_base_sheet() | 913 | self.remove_base_sheet() |
911 | return count_list | 914 | return count_list | ... | ... |
... | @@ -602,13 +602,22 @@ class UploadDocView(GenericView, DocHandler): | ... | @@ -602,13 +602,22 @@ class UploadDocView(GenericView, DocHandler): |
602 | is_zip = False | 602 | is_zip = False |
603 | 603 | ||
604 | classify_1 = 0 | 604 | classify_1 = 0 |
605 | # 电子合同 | 605 | # 电子合同 Econtract or OVP(FSM) |
606 | if data_source == consts.DATA_SOURCE_LIST[-1] and document_scheme == consts.DOC_SCHEME_LIST[1]: | 606 | if data_source == consts.DATA_SOURCE_LIST[2] or data_source == consts.DATA_SOURCE_LIST[3]: |
607 | for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix): | 607 | if document_scheme == consts.DOC_SCHEME_LIST[1]: |
608 | for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix): | ||
609 | if keyword in document_name: | ||
610 | classify_1 = classify_1_tmp | ||
611 | break | ||
612 | # FSM合同:WEP/MSI/SC | ||
613 | elif data_source == consts.DATA_SOURCE_LIST[0] and document_scheme == consts.DOC_SCHEME_LIST[0]: | ||
614 | for keyword, classify_1_tmp in consts.FSM_ECONTRACT_KEYWORDS_MAP.get(prefix): | ||
608 | if keyword in document_name: | 615 | if keyword in document_name: |
609 | classify_1 = classify_1_tmp | 616 | classify_1 = classify_1_tmp |
610 | break | 617 | break |
611 | elif document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ | 618 | |
619 | |||
620 | if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ | ||
612 | or document_name.endswith('.RAR'): | 621 | or document_name.endswith('.RAR'): |
613 | is_zip = True | 622 | is_zip = True |
614 | 623 | ... | ... |
... | @@ -6,6 +6,7 @@ | ... | @@ -6,6 +6,7 @@ |
6 | # @Description : | 6 | # @Description : |
7 | 7 | ||
8 | from .get_char import Finder | 8 | from .get_char import Finder |
9 | from .get_char_fsm import Finder as FSMFinder | ||
9 | import numpy as np | 10 | import numpy as np |
10 | 11 | ||
11 | 12 | ||
... | @@ -23,7 +24,7 @@ def extract_info(ocr_results): | ... | @@ -23,7 +24,7 @@ def extract_info(ocr_results): |
23 | return {'page_1': {'合同编号': contract_no}} | 24 | return {'page_1': {'合同编号': contract_no}} |
24 | 25 | ||
25 | 26 | ||
26 | def predict(pdf_info, is_qrs=False): | 27 | def predict(pdf_info, is_qrs=False, is_fsm=False): |
27 | ocr_results = {} | 28 | ocr_results = {} |
28 | for pno in pdf_info: | 29 | for pno in pdf_info: |
29 | ocr_results[pno] = {} | 30 | ocr_results[pno] = {} |
... | @@ -50,7 +51,10 @@ def predict(pdf_info, is_qrs=False): | ... | @@ -50,7 +51,10 @@ def predict(pdf_info, is_qrs=False): |
50 | results = extract_info(ocr_results) | 51 | results = extract_info(ocr_results) |
51 | else: | 52 | else: |
52 | # 输入是整个 PDF 中的信息 | 53 | # 输入是整个 PDF 中的信息 |
53 | f = Finder(pdf_info, ocr_results=ocr_results) | 54 | if is_fsm: |
55 | f = FSMFinder(pdf_info, ocr_results=ocr_results) | ||
56 | else: | ||
57 | f = Finder(pdf_info, ocr_results=ocr_results) | ||
54 | results = f.get_info() | 58 | results = f.get_info() |
55 | return results | 59 | return results |
56 | 60 | ... | ... |
This diff is collapsed.
Click to expand it.
This diff is collapsed.
Click to expand it.
... | @@ -6,9 +6,10 @@ | ... | @@ -6,9 +6,10 @@ |
6 | # @Description : | 6 | # @Description : |
7 | 7 | ||
8 | from .get_char import Finder | 8 | from .get_char import Finder |
9 | from .get_char_fsm import Finder as FSMFinder | ||
9 | 10 | ||
10 | 11 | ||
11 | def predict(pdf_info, file_cls): | 12 | def predict(pdf_info, file_cls, is_fsm=False): |
12 | """Summary | 13 | """Summary |
13 | 14 | ||
14 | Args: | 15 | Args: |
... | @@ -58,7 +59,11 @@ def predict(pdf_info, file_cls): | ... | @@ -58,7 +59,11 @@ def predict(pdf_info, file_cls): |
58 | pdf_info = dict() | 59 | pdf_info = dict() |
59 | for pno, page_info in enumerate(pdf_info_1): | 60 | for pno, page_info in enumerate(pdf_info_1): |
60 | pdf_info[str(pno)] = page_info | 61 | pdf_info[str(pno)] = page_info |
61 | f = Finder(pdf_info) | 62 | |
63 | if is_fsm: | ||
64 | f = FSMFinder(pdf_info) | ||
65 | else: | ||
66 | f = Finder(pdf_info) | ||
62 | if file_cls == 0: | 67 | if file_cls == 0: |
63 | results = f.get_info() | 68 | results = f.get_info() |
64 | if file_cls == 1: | 69 | if file_cls == 1: | ... | ... |
src/common/fsm_econtract/const.py
0 → 100644
# Field-lookup configuration for the WEP contract; fsm_contract_ocr wraps it as Retriever(WEP_FIELD).
# NOTE(review): the tuples below are interpreted by Retriever, whose code is not visible here —
# the layout described in these comments is read off the literal only; confirm against retriever.py.
#   outer key ("0")  : presumably a page index (string) — TODO confirm.
#   'keys'  entries  : field name -> list of (anchor text, tuple of regex alternatives, 'top1', options dict).
#   'value' entries  : field name -> ('text' | 'img', direction, options dict with 'offset_tuple', default string).
WEP_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '合同价格(小写)': [('人民币', (r'^人民币¥.?$', ), 'top1', {})],
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.1, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
            # The two 'img' fields use '无' as their default string and set 'rigorous': True.
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }

}
22 | |||
# Field-lookup configuration for the MSI contract; fsm_contract_ocr wraps it as Retriever(MSI_FIELD).
# Same literal layout as the other *_FIELD configs in this module: a 'keys' dict of anchor/regex
# tuples and a 'value' dict of extraction tuples per outer key.
# NOTE(review): the outer keys ("0", "1") presumably select pages, with the text fields under "0"
# and the signature/date image fields under "1" — confirm against retriever.py.
MSI_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '合同价格(小写)': [('人民币', (r'^人民币¥.?$', ), 'top1', {})],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.2, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
        },
    },
    "1": {
        'keys': {
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            # Image-type fields; '无' is the default string in the last tuple slot.
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }
}
49 | |||
# Field-lookup configuration for the SC contract; fsm_contract_ocr wraps it as Retriever(SC_FIELD).
# Same literal layout as the other *_FIELD configs in this module: a 'keys' dict of anchor/regex
# tuples and a 'value' dict of extraction tuples per outer key.
# NOTE(review): the outer key "-1" presumably addresses the last page (the signature/date fields
# live there) — this depends on how Retriever resolves the key; confirm against retriever.py.
SC_FIELD = {
    "0": {
        'keys': {
            '姓名': [('姓名', (r'^姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '总价': [('总价', (r'^总价.?$', ), 'top1', {})],
        },
        'value': {
            '姓名': ('text', 'right', {'offset_tuple': (-2, 8, 0.5, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '总价': ('text', 'right', {'offset_tuple': (-2, 12, 0.5, 0)}, ''),
        },
    },
    "-1": {
        'keys': {
            # NOTE(review): the two regex alternatives below appear to be identical, so the second
            # one looks redundant — confirm whether a different pattern was intended.
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', r'^客户签名/盖章.*$'), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            # Image-type fields; '无' is the default string in the last tuple slot.
            '客户签名': ('img', 'under', {'offset_tuple': (1.5, 1, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }
}
src/common/fsm_econtract/fsm_contract_ocr.py
0 → 100644
1 | from .retriever import Retriever | ||
2 | from .const import WEP_FIELD, MSI_FIELD, SC_FIELD | ||
3 | from .tools import pdf_info_rebuild | ||
4 | |||
# One Retriever per FSM contract config, in index order: 0 -> WEP_FIELD, 1 -> MSI_FIELD, 2 -> SC_FIELD.
retriever_list = [Retriever(field_config) for field_config in (WEP_FIELD, MSI_FIELD, SC_FIELD)]


def predict(pdf_info, file_type=0):
    """Run the retriever selected by ``file_type`` over the rebuilt PDF info.

    Args:
        pdf_info: parsed PDF structure; handed unchanged to ``pdf_info_rebuild``.
        file_type: index into ``retriever_list`` (defaults to 0).

    Returns:
        The result of ``get_target_fields`` on the selected retriever, given the
        text and image info produced by ``pdf_info_rebuild``.
    """
    # Select the retriever first so an invalid index fails before any rebuilding happens.
    selected = retriever_list[file_type]
    rebuilt = pdf_info_rebuild(pdf_info)
    return selected.get_target_fields(*rebuilt)
11 | |||
12 |
src/common/fsm_econtract/hmh_ocr.py
0 → 100644
src/common/fsm_econtract/retriever.py
0 → 100644
This diff is collapsed.
Click to expand it.
src/common/fsm_econtract/tools.py
0 → 100644
def pdf_info_rebuild(pdf_info, fix_bbox=True):
    """Split parsed PDF pages into per-page text spans and image markers.

    Args:
        pdf_info: mapping of page key -> page dict. Each page dict has a
            'blocks' list; a block with 'type' == 0 carries 'lines' -> 'spans'
            (each span with 'bbox', 'text' and 'size'), a block with
            'type' == 1 carries a 'bbox'.
            NOTE(review): this layout resembles PyMuPDF's ``get_text("dict")``
            output — confirm against the producer of ``pdf_info``.
        fix_bbox: when True, a span whose bbox height (``bbox[-1] - bbox[1]``)
            is smaller than the span's 'size' gets its last coordinate
            increased by 'size'.

    Returns:
        A ``(pdf_text_info, pdf_img_info)`` tuple of dicts keyed by page key.
        Text entries are ``[bbox, text]`` lists where ``bbox`` is a fresh list;
        image entries are ``(bbox, '有')`` tuples. Pages without text or
        images have no entry in the respective dict.

    The input is never modified: the bbox is copied before the height fix, so
    callers can safely reuse ``pdf_info`` and tuple-typed bboxes are accepted.
    """
    pdf_text_info = {}
    pdf_img_info = {}
    for pno_str, page_info in pdf_info.items():
        for block in page_info['blocks']:
            block_type = block['type']
            if block_type == 0:
                # Identical text can repeat inside one block; keep only the
                # first occurrence per block.
                seen_texts = set()
                for line in block['lines']:
                    for span in line['spans']:
                        text = span['text'].strip()
                        if not text or text in seen_texts:
                            continue
                        seen_texts.add(text)
                        # Work on a copy so the caller's span is left untouched
                        # (and so an immutable tuple bbox does not raise).
                        bbox = list(span['bbox'])
                        # The reported bbox height is unreliable; extend it
                        # when it is smaller than the span's size.
                        if fix_bbox and bbox[-1] - bbox[1] < span['size']:
                            bbox[-1] = bbox[-1] + span['size']
                        pdf_text_info.setdefault(pno_str, []).append([bbox, text])
            elif block_type == 1:
                pdf_img_info.setdefault(pno_str, []).append((block['bbox'], '有'))

    return pdf_text_info, pdf_img_info
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or sign in to post a comment