add FSM e-contract
Showing
7 changed files
with
482 additions
and
17 deletions
| ... | @@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44 | ... | @@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44 |
| 1057 | HIL_CONTRACT_3_CN_NAME = '车辆处置协议' | 1057 | HIL_CONTRACT_3_CN_NAME = '车辆处置协议' |
| 1058 | HIL_CONTRACT_3_CLASSIFY = 45 | 1058 | HIL_CONTRACT_3_CLASSIFY = 45 |
| 1059 | 1059 | ||
| 1060 | CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY} | 1060 | FSM_CONTRACT_WEP_CN_NAME = '延长保修合同' |
| 1061 | FSM_CONTRACT_WEP_CLASSIFY = 51 | ||
| 1062 | |||
| 1063 | FSM_CONTRACT_MSI_CN_NAME = '长悦保养合同' | ||
| 1064 | FSM_CONTRACT_MSI_CLASSIFY = 52 | ||
| 1065 | |||
| 1066 | FSM_CONTRACT_SC_CN_NAME = '汽车销售合同' | ||
| 1067 | FSM_CONTRACT_SC_CLASSIFY = 53 | ||
| 1068 | |||
| 1069 | CONTRACT_SET = { | ||
| 1070 | CONTRACT_QRS_CLASSIFY, | ||
| 1071 | CONTRACT_CLASSIFY, | ||
| 1072 | HIL_CONTRACT_1_CLASSIFY, | ||
| 1073 | HIL_CONTRACT_2_CLASSIFY, | ||
| 1074 | HIL_CONTRACT_3_CLASSIFY, | ||
| 1075 | FSM_CONTRACT_WEP_CLASSIFY, | ||
| 1076 | FSM_CONTRACT_MSI_CLASSIFY, | ||
| 1077 | FSM_CONTRACT_SC_CLASSIFY, | ||
| 1078 | } | ||
| 1061 | 1079 | ||
| 1062 | CONTRACT_MAP = { | 1080 | CONTRACT_MAP = { |
| 1063 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, | 1081 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, |
| ... | @@ -1065,6 +1083,10 @@ CONTRACT_MAP = { | ... | @@ -1065,6 +1083,10 @@ CONTRACT_MAP = { |
| 1065 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, | 1083 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, |
| 1066 | CONTRACT_CLASSIFY: CONTRACT_CN_NAME, | 1084 | CONTRACT_CLASSIFY: CONTRACT_CN_NAME, |
| 1067 | CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME, | 1085 | CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME, |
| 1086 | FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_CN_NAME, | ||
| 1087 | FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_CN_NAME, | ||
| 1088 | FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_CN_NAME, | ||
| 1089 | |||
| 1068 | } | 1090 | } |
| 1069 | 1091 | ||
| 1070 | # 保单 | 1092 | # 保单 |
| ... | @@ -1214,6 +1236,11 @@ BS_FIELD = 'bss_ocr' | ... | @@ -1214,6 +1236,11 @@ BS_FIELD = 'bss_ocr' |
| 1214 | HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' | 1236 | HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' |
| 1215 | HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr' | 1237 | HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr' |
| 1216 | HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr' | 1238 | HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr' |
# OCR result field names for the FSM e-contracts, used as values of
# RESULT_MAPPING. They must be plain strings like the HIL_CONTRACT_*_FIELD
# constants: a trailing comma after the literal would silently turn each
# constant into a one-element tuple.
FSM_CONTRACT_WEP_FIELD = 'fsm_wep_ocr'
FSM_CONTRACT_MSI_FIELD = 'fsm_msi_ocr'
FSM_CONTRACT_SC_FIELD = 'fsm_sc_ocr'
| 1242 | |||
| 1243 | |||
| 1217 | BS_CLASSIFY = 10089 | 1244 | BS_CLASSIFY = 10089 |
| 1218 | 1245 | ||
| 1219 | RESULT_MAPPING = { | 1246 | RESULT_MAPPING = { |
| ... | @@ -1238,6 +1265,9 @@ RESULT_MAPPING = { | ... | @@ -1238,6 +1265,9 @@ RESULT_MAPPING = { |
| 1238 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, | 1265 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, |
| 1239 | HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD, | 1266 | HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD, |
| 1240 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD, | 1267 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD, |
| 1268 | FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_FIELD, | ||
| 1269 | FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_FIELD, | ||
| 1270 | FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_FIELD, | ||
| 1241 | } | 1271 | } |
| 1242 | 1272 | ||
| 1243 | CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD) | 1273 | CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD) |
| ... | @@ -2313,29 +2343,35 @@ APPLICANT_TYPE_MAP = { | ... | @@ -2313,29 +2343,35 @@ APPLICANT_TYPE_MAP = { |
| 2313 | 2343 | ||
| 2314 | APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] | 2344 | APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] |
| 2315 | 2345 | ||
| 2316 | FILE_NAME_PREFIX_MAP = { | 2346 | # FILE_NAME_PREFIX_MAP = { |
| 2317 | AFC_PREFIX: [ | 2347 | # AFC_PREFIX: [ |
| 2318 | ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'), | 2348 | # ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'), |
| 2319 | ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), | 2349 | # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), |
| 2320 | ], | 2350 | # ], |
| 2321 | HIL_PREFIX: [ | 2351 | # HIL_PREFIX: [ |
| 2322 | ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'), | 2352 | # ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'), |
| 2323 | ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'), | 2353 | # ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'), |
| 2324 | ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), | 2354 | # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), |
| 2325 | ] | 2355 | # ] |
| 2326 | } | 2356 | # } |
| 2327 | 2357 | ||
| 2328 | ECONTRACT_KEYWORDS_MAP = { | 2358 | ECONTRACT_KEYWORDS_MAP = { |
| 2329 | AFC_PREFIX: [ | 2359 | AFC_PREFIX: [ |
| 2330 | ('抵押贷款合同', CONTRACT_CLASSIFY), | 2360 | ('抵押贷款合同', CONTRACT_CLASSIFY), |
| 2331 | ('送达地址确认书', CONTRACT_QRS_CLASSIFY), | 2361 | ('送达地址确认书', CONTRACT_QRS_CLASSIFY), |
| 2332 | # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), | 2362 | ('抵押登记豁免函', HMH_CLASSIFY), |
| 2363 | ('延长保修', FSM_CONTRACT_WEP_CLASSIFY), | ||
| 2364 | ('长悦保养', FSM_CONTRACT_MSI_CLASSIFY), | ||
| 2365 | ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), | ||
| 2333 | ], | 2366 | ], |
| 2334 | HIL_PREFIX: [ | 2367 | HIL_PREFIX: [ |
| 2335 | ('售后回租合同', HIL_CONTRACT_1_CLASSIFY), | 2368 | ('售后回租合同', HIL_CONTRACT_1_CLASSIFY), |
| 2336 | ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY), | 2369 | ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY), |
| 2337 | ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY), | 2370 | ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY), |
| 2338 | # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), | 2371 | ('抵押登记豁免函', HMH_CLASSIFY), |
| 2372 | ('延长保修', FSM_CONTRACT_WEP_CLASSIFY), | ||
| 2373 | ('长悦保养', FSM_CONTRACT_MSI_CLASSIFY), | ||
| 2374 | ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), | ||
| 2339 | ] | 2375 | ] |
| 2340 | } | 2376 | } |
| 2341 | 2377 | ||
| ... | @@ -2345,6 +2381,12 @@ HIL_CONTRACT_TYPE_MAP = { | ... | @@ -2345,6 +2381,12 @@ HIL_CONTRACT_TYPE_MAP = { |
| 2345 | str(HIL_CONTRACT_3_CLASSIFY): 1, | 2381 | str(HIL_CONTRACT_3_CLASSIFY): 1, |
| 2346 | } | 2382 | } |
| 2347 | 2383 | ||
| 2384 | FSM_CONTRACT_TYPE_MAP = { | ||
| 2385 | str(FSM_CONTRACT_WEP_CLASSIFY): 0, | ||
| 2386 | str(FSM_CONTRACT_MSI_CLASSIFY): 1, | ||
| 2387 | str(FSM_CONTRACT_SC_CLASSIFY): 2, | ||
| 2388 | } | ||
| 2389 | |||
| 2348 | RESULT_MAP = { | 2390 | RESULT_MAP = { |
| 2349 | 0: None, | 2391 | 0: None, |
| 2350 | 1: True, | 2392 | 1: True, | ... | ... |
| ... | @@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g | ... | @@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g |
| 20 | from common.tools.pdf_to_img import PDFHandler | 20 | from common.tools.pdf_to_img import PDFHandler |
| 21 | from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict | 21 | from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict |
| 22 | from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict | 22 | from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict |
| 23 | from common.fsm_econtract.fsm_contract_ocr import predict as fsm_predict | ||
| 24 | from common.fsm_econtract.hmh_ocr import predict as hmh_predict | ||
| 23 | from apps.doc import consts | 25 | from apps.doc import consts |
| 24 | # from apps.doc.ocr.edms import EDMS, rh | 26 | # from apps.doc.ocr.edms import EDMS, rh |
| 25 | from apps.doc.ocr.ecm import ECM, rh | 27 | from apps.doc.ocr.ecm import ECM, rh |
| ... | @@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin): |
| 996 | res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( | 998 | res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( |
| 997 | consts.ALL_POSITION_KEY, {}).get(key1, []) | 999 | consts.ALL_POSITION_KEY, {}).get(key1, []) |
| 998 | license_summary[classify] = [res] | 1000 | license_summary[classify] = [res] |
| 999 | else: | 1001 | elif classify in consts.SE_HIL_CON_MAP: # TODO FSM新合同写入数据库用于比对 |
| 1000 | res = {} | 1002 | res = {} |
| 1001 | for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): | 1003 | for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): |
| 1002 | if pno1 is None: | 1004 | if pno1 is None: |
| ... | @@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin): |
| 1442 | self.log_base, traceback.format_exc())) | 1444 | self.log_base, traceback.format_exc())) |
| 1443 | error_list.append(1) | 1445 | error_list.append(1) |
| 1444 | return | 1446 | return |
| 1445 | else: # e-contract | 1447 | else: # e-contract or e-fsm-contract or e-hmh |
| 1446 | try: | 1448 | try: |
| 1447 | # pdf下载 处理 图片存储 识别 | 1449 | # pdf下载 处理 图片存储 识别 |
| 1448 | for times in range(consts.RETRY_TIMES): | 1450 | for times in range(consts.RETRY_TIMES): |
| ... | @@ -1472,6 +1474,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1472,6 +1474,7 @@ class Command(BaseCommand, LoggerMixin): |
| 1472 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( | 1474 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( |
| 1473 | self.log_base, traceback.format_exc())) | 1475 | self.log_base, traceback.format_exc())) |
| 1474 | 1476 | ||
| 1477 | # AFC合同 | ||
| 1475 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): | 1478 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): |
| 1476 | ocr_result = afc_predict(pdf_handler.pdf_info) | 1479 | ocr_result = afc_predict(pdf_handler.pdf_info) |
| 1477 | page_res = {} | 1480 | page_res = {} |
| ... | @@ -1483,6 +1486,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1483,6 +1486,7 @@ class Command(BaseCommand, LoggerMixin): |
| 1483 | 'page_num': page_num, | 1486 | 'page_num': page_num, |
| 1484 | 'page_info': page_info | 1487 | 'page_info': page_info |
| 1485 | } | 1488 | } |
| 1489 | # 送达地址确认书 | ||
| 1486 | elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY): | 1490 | elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY): |
| 1487 | ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True) | 1491 | ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True) |
| 1488 | page_num = 'page_1' | 1492 | page_num = 'page_1' |
| ... | @@ -1493,7 +1497,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1493,7 +1497,8 @@ class Command(BaseCommand, LoggerMixin): |
| 1493 | 'page_info': ocr_result.pop(page_num, {}) | 1497 | 'page_info': ocr_result.pop(page_num, {}) |
| 1494 | } | 1498 | } |
| 1495 | } | 1499 | } |
| 1496 | else: | 1500 | # HIL合同 |
| 1501 | elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP: | ||
| 1497 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) | 1502 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) |
| 1498 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) | 1503 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) |
| 1499 | rebuild_res_1 = {} | 1504 | rebuild_res_1 = {} |
| ... | @@ -1508,9 +1513,35 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1508,9 +1513,35 @@ class Command(BaseCommand, LoggerMixin): |
| 1508 | 'page_num': page_num, | 1513 | 'page_num': page_num, |
| 1509 | 'page_info': page_info | 1514 | 'page_info': page_info |
| 1510 | } | 1515 | } |
| 1516 | # FSM合同 WEP MSI SC | ||
| 1517 | elif classify_1_str in consts.FSM_CONTRACT_TYPE_MAP: | ||
| 1518 | file_type = consts.FSM_CONTRACT_TYPE_MAP.get(classify_1_str) | ||
| 1519 | ocr_result = fsm_predict(pdf_handler.pdf_info, file_type) | ||
| 1520 | for page_num, page_info in ocr_result.items(): | ||
| 1521 | if isinstance(page_num, str) and page_num.startswith('page_'): | ||
| 1522 | page_res[page_num] = { | ||
| 1523 | 'classify': int(classify_1_str), | ||
| 1524 | 'page_num': page_num, | ||
| 1525 | 'page_info': page_info | ||
| 1526 | } | ||
| 1527 | # hmh | ||
| 1528 | else: | ||
| 1529 | pass | ||
| 1530 | |||
| 1511 | 1531 | ||
| 1512 | contract_res = {} | 1532 | contract_res = {} |
| 1513 | for img_path_tmp, page_key in pdf_handler.img_path_pno_list: | 1533 | for img_path_tmp, page_key in pdf_handler.img_path_pno_list: |
| 1534 | if classify_1_str == str(consts.HMH_CLASSIFY): | ||
| 1535 | img_contract_res = { | ||
| 1536 | 'code': 1, | ||
| 1537 | 'data': [ | ||
| 1538 | { | ||
| 1539 | 'classify': consts.HMH_CLASSIFY, | ||
| 1540 | 'data': hmh_predict(pdf_handler.pdf_info) | ||
| 1541 | } | ||
| 1542 | ] | ||
| 1543 | } | ||
| 1544 | else: | ||
| 1514 | if page_key in page_res: | 1545 | if page_key in page_res: |
| 1515 | img_contract_res = { | 1546 | img_contract_res = { |
| 1516 | 'code': 1, | 1547 | 'code': 1, | ... | ... |
src/common/fsm_econtract/const.py
0 → 100644
# Field-extraction rules for the WEP contract, consumed by
# Retriever.get_target_fields.
#
# Layout: {page index as str: {'keys': {...}, 'value': {...}}}
#   'keys'[field]  -> list of (key_text, regex_tuple, direction, kwargs);
#                     a text span matching one of the regexes is a candidate
#                     label, and 'key_<direction>' picks one candidate.
#   'value'[field] -> (source, direction, kwargs, default_value);
#                     source 'text' searches the page's text spans, any other
#                     source searches the page's image blocks, and
#                     'value_<direction>' looks around the chosen label.
#   'offset_tuple' -> (offset_xmin, offset_xmax, offset_ymin, offset_ymax),
#                     multiples of the label box width / height that grow the
#                     label box into the search region.
WEP_FIELD = {
    # Every field is looked up on page "0".
    "0": {
        'keys': {
            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '合同价格(小写)': [('人民币', (r'^人民币¥.?$', ), 'top1', {})],
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.1, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
            # Signature and date are searched among image blocks; '无' is
            # reported when no image is found in the region.
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }

}
| 22 | |||
# Field-extraction rules for the MSI contract, consumed by
# Retriever.get_target_fields.
#
# Layout: {page index as str: {'keys': {...}, 'value': {...}}}
#   'keys'[field]  -> list of (key_text, regex_tuple, direction, kwargs)
#   'value'[field] -> (source, direction, kwargs, default_value)
#   'offset_tuple' -> (offset_xmin, offset_xmax, offset_ymin, offset_ymax),
#                     multiples of the label box width / height that grow the
#                     label box into the search region.
MSI_FIELD = {
    # Page "0": customer identity and price, read from text spans.
    "0": {
        'keys': {
            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '合同价格(小写)': [('人民币', (r'^人民币¥.?$', ), 'top1', {})],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.2, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
        },
    },
    # Page "1": signature and signing date, searched among image blocks;
    # '无' is reported when no image is found in the region.
    "1": {
        'keys': {
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }
}
| 49 | |||
# Field-extraction rules for the SC contract, consumed by
# Retriever.get_target_fields.
#
# Layout: {page index as str: {'keys': {...}, 'value': {...}}}
#   'keys'[field]  -> list of (key_text, regex_tuple, direction, kwargs)
#   'value'[field] -> (source, direction, kwargs, default_value)
#   'offset_tuple' -> (offset_xmin, offset_xmax, offset_ymin, offset_ymax),
#                     multiples of the label box width / height that grow the
#                     label box into the search region.
SC_FIELD = {
    # Page "0": buyer identity and total price, read from text spans.
    "0": {
        'keys': {
            '姓名': [('姓名', (r'^姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '总价': [('总价', (r'^总价.?$', ), 'top1', {})],
        },
        'value': {
            '姓名': ('text', 'right', {'offset_tuple': (-2, 8, 0.5, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '总价': ('text', 'right', {'offset_tuple': (-2, 12, 0.5, 0)}, ''),
        },
    },
    # Page "-1": the Retriever resolves this to the highest page index present
    # in the extracted text. Signature and date are searched among image
    # blocks; '无' is reported when no image is found in the region.
    "-1": {
        'keys': {
            # The pattern used to be listed twice, which only made the
            # Retriever record every matching span twice.
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户签名': ('img', 'under', {'offset_tuple': (1.5, 1, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }
}
src/common/fsm_econtract/fsm_contract_ocr.py
0 → 100644
| 1 | from .retriever import Retriever | ||
| 2 | from .const import WEP_FIELD, MSI_FIELD, SC_FIELD | ||
| 3 | from .tools import pdf_info_rebuild | ||
| 4 | |||
# One Retriever per FSM contract rule table; the position in this list is the
# ``file_type`` accepted by ``predict`` (0: WEP, 1: MSI, 2: SC).
retriever_list = [Retriever(rules) for rules in (WEP_FIELD, MSI_FIELD, SC_FIELD)]


def predict(pdf_info, file_type=0):
    """Extract the configured fields of an FSM contract from parsed PDF data.

    :param pdf_info: per-page PDF structure accepted by ``pdf_info_rebuild``.
    :param file_type: index into ``retriever_list`` selecting the rule table.
    :return: the ``{'page_N': {field: {...}}}`` mapping built by
        ``Retriever.get_target_fields``.
    """
    # Resolve the retriever first so a bad file_type fails before any parsing.
    selected = retriever_list[file_type]
    text_by_page, img_by_page = pdf_info_rebuild(pdf_info)
    return selected.get_target_fields(text_by_page, img_by_page)
| 11 | |||
| 12 |
src/common/fsm_econtract/hmh_ocr.py
0 → 100644
src/common/fsm_econtract/retriever.py
0 → 100644
| 1 | import re | ||
| 2 | |||
| 3 | |||
class HMHRetriever:
    """Pull a fixed set of fields out of the page keyed ``'0'`` of an HMH PDF.

    Every field named in ``search_fields_list`` is always present in the
    result; a field that cannot be found carries its default text and a copy
    of ``default_position``.
    """

    # Sentence naming the signing party. For each span the lessee (承租人)
    # wording is tried before the borrower (借款人) wording.
    _PARTY_PATTERNS = (
        r'承租人\(姓名(.*)证件号码(.*)与(.*公司)',
        r'借款人\(姓名(.*)证件号码(.*)与(.*公司)',
    )
    _APPLICATION_NO_PATTERN = r'合同编号.*(CH-B\d*-\d*).*'
    _NAME_DATE_PATTERN = r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})'

    def __init__(self):
        self.words_str = 'words'
        self.position_str = 'location'
        self.default_position = [0, 0, 0, 0]
        # (field name, text reported when the field is not found)
        self.search_fields_list = [
            ('借款/承租人姓名', ''),
            ('证件号码', ''),
            ('渠道', ''),
            ('合同编号', ''),
            ('借款人签字/盖章', '无'),
        ]

    @staticmethod
    def _clean(text):
        """Drop ideographic spaces and surrounding whitespace."""
        return text.replace('\u3000', '').strip()

    def _entry(self, words, position):
        """Build one result record."""
        return {self.words_str: words, self.position_str: position}

    def _find_party(self, text):
        """Return ``(name, id_number, company)`` from *text*, or ``None``."""
        for pattern in self._PARTY_PATTERNS:
            match = re.search(pattern, text)
            if match:
                return match.groups()
        return None

    def get_target_fields(self, pdf_text_list):
        """Scan the spans of page ``'0'`` and collect the HMH fields.

        :param pdf_text_list: mapping of page key to a list of
            ``(bbox, text)`` pairs. It is only read, never modified, so the
            same mapping can be passed again.
        :return: ``{"words_result": {field: {'words': ..., 'location': ...}}}``
        """
        name_key, id_key, company_key, contract_key, sign_key = (
            field for field, _ in self.search_fields_list)

        result = dict()
        found_party = found_application_no = found_name_date = False
        # .get instead of .pop: popping consumed the caller's page, so a
        # second call on the same data silently returned only defaults.
        for bbox, text in pdf_text_list.get(str(0), []):
            if not found_party:
                party = self._find_party(text)
                if party is not None:
                    name, id_number, company = party
                    result[name_key] = self._entry(self._clean(name), bbox)
                    result[id_key] = self._entry(self._clean(id_number), bbox)
                    # The company name is reported exactly as matched.
                    result[company_key] = self._entry(company, bbox)
                    found_party = True
            if not found_application_no:
                application_no_list = re.findall(self._APPLICATION_NO_PATTERN, text)
                # Only an unambiguous (single) match is accepted.
                if len(application_no_list) == 1:
                    result[contract_key] = self._entry(application_no_list[0], bbox)
                    found_application_no = True
            if not found_name_date:
                match = re.search(self._NAME_DATE_PATTERN, text)
                if match:
                    signer, sign_date = match.groups()
                    result[sign_key] = self._entry(
                        '{0} {1}'.format(self._clean(signer), sign_date), bbox)
                    found_name_date = True

        # Guarantee the full field set.
        for find_key, default_value in self.search_fields_list:
            if find_key not in result:
                result[find_key] = self._entry(default_value, list(self.default_position))

        return {"words_result": result}
| 92 | |||
class Retriever:
    """Rule-driven field extractor working on per-page text spans and images.

    ``target_fields`` maps a page index (as ``str``; ``'-1'`` means the last
    page) to ``{'keys': ..., 'value': ...}``:

    * ``'keys'[field]`` is a list of ``(key_text, regex_tuple, direction,
      kwargs)``; spans matching one of the regexes are candidate labels and
      ``key_<direction>`` picks one of them.
    * ``'value'[field]`` is ``(source, direction, kwargs, default_value)``;
      ``value_<direction>`` searches around the chosen label in the text
      spans (``source == 'text'``) or in the image blocks (any other source).
    """

    def __init__(self, target_fields):
        self.keys_str = 'keys'
        self.value_str = 'value'
        self.text_str = 'text'
        self.words_str = 'words'
        self.position_str = 'position'
        self.default_position = [-1, -1, -1, -1]
        self.target_fields = target_fields
        # Per value_type character substitutions applied to found values.
        self.replace_map = {
            'int': {
                '(': '0'
            }
        }

    @staticmethod
    def _in_region(box, region, rigorous):
        """Tell whether *box* lies in *region*.

        With ``rigorous`` the whole box must be strictly inside the region;
        otherwise only the centre of the box has to be.
        """
        x0, y0, x1, y1 = box
        x_min, y_min, x_max, y_max = region
        if rigorous:
            return x_min < x0 and x1 < x_max and y_min < y0 and y1 < y_max
        cent_x = x0 + ((x1 - x0) / 2)
        cent_y = y0 + ((y1 - y0) / 2)
        return x_min < cent_x < x_max and y_min < cent_y < y_max

    def _apply_replace_map(self, value, value_type):
        """Apply the ``replace_map`` substitutions registered for *value_type*."""
        if isinstance(value_type, str) and value_type in self.replace_map and isinstance(value, str):
            return value.translate(str.maketrans(self.replace_map[value_type]))
        return value

    @staticmethod
    def _last_page_key(pdf_text_list, pdf_img_list):
        """Resolve the ``'-1'`` page rule to a concrete page key.

        The highest page index among the text pages is used. When the PDF
        yielded no text at all the image pages are consulted, and page ``'0'``
        is the last resort, so an empty document produces default values
        instead of a ``ValueError`` from ``max([])``.
        """
        for pages in (pdf_text_list, pdf_img_list):
            if pages:
                return str(max(int(pno) for pno in pages))
        return '0'

    @staticmethod
    def key_top1(coordinates_list, key_coordinates):
        """Pick the topmost candidate (smallest ``y0``; first one on ties).

        ``key_coordinates`` is unused; it keeps the signature uniform with
        the other ``key_*`` selectors. The candidate list is left untouched.
        """
        return min(coordinates_list, key=lambda box: box[1])

    def key_right(self, coordinates_list, key_coordinates, offset_tuple, rigorous=False):
        """Pick the leftmost candidate inside the region right of the previous key.

        Falls back to the topmost candidate when there is no previous key or
        no candidate lies in the region.
        """
        if len(coordinates_list) == 1:
            return coordinates_list[0]

        # Without the coordinates of the previous key, return the topmost one.
        if key_coordinates is None:
            return self.key_top1(coordinates_list, key_coordinates)

        region = self.get_target_bbox(key_coordinates, offset_tuple)

        x_min_find, find_key_coordinates = None, None
        for x0, y0, x1, y1 in coordinates_list:
            if self._in_region((x0, y0, x1, y1), region, rigorous):
                if x_min_find is None or x0 < x_min_find:
                    x_min_find = x0
                    find_key_coordinates = (x0, y0, x1, y1)

        if find_key_coordinates is None:
            return self.key_top1(coordinates_list, key_coordinates)
        return find_key_coordinates

    def value_right(self, search_list, key_coordinates, offset_tuple, value_type=None, rigorous=False):
        """Return ``(text, bbox)`` of the leftmost non-blank item in the region.

        ``(None, None)`` is returned when nothing qualifies.
        """
        region = self.get_target_bbox(key_coordinates, offset_tuple)

        x_min_find, value, coordinates = None, None, None
        for (x0, y0, x1, y1), text in search_list:
            if not self._in_region((x0, y0, x1, y1), region, rigorous):
                continue
            if x_min_find is None or x0 < x_min_find:
                if len(text.strip()) > 0:
                    x_min_find = x0
                    value = text
                    coordinates = (x0, y0, x1, y1)

        return self._apply_replace_map(value, value_type), coordinates

    def value_under(self, search_list, key_coordinates, offset_tuple, value_type=None, append=False, rigorous=False):
        """Return ``(text, bbox)`` of the first non-blank item in the region.

        Items are ordered top-to-bottom, then left-to-right. With ``append``
        the texts of all items are concatenated in that order; the bbox is
        always that of the first item. ``(None, None)`` when nothing qualifies.
        """
        region = self.get_target_bbox(key_coordinates, offset_tuple)

        find_list = [
            (x0, y0, x1, y1, text)
            for (x0, y0, x1, y1), text in search_list
            if self._in_region((x0, y0, x1, y1), region, rigorous) and len(text.strip()) > 0
        ]
        if len(find_list) == 0:
            return None, None

        find_list.sort(key=lambda x: (x[1], x[0]))
        coordinates = find_list[0][:-1]
        if append:
            value = ''.join([text for _, _, _, _, text in find_list])
        else:
            value = find_list[0][-1]

        return self._apply_replace_map(value, value_type), coordinates

    @staticmethod
    def get_target_bbox(key_coordinates, offset_tuple):
        """Grow the key box into a search region.

        ``offset_tuple`` is ``(offset_xmin, offset_xmax, offset_ymin,
        offset_ymax)``; each value is a multiple of the key box width (x) or
        height (y). The min offsets are subtracted, so a negative value moves
        that edge right / down.
        """
        offset_xmin, offset_xmax, offset_ymin, offset_ymax = offset_tuple

        width = key_coordinates[2] - key_coordinates[0]
        height = key_coordinates[-1] - key_coordinates[1]

        x_min = key_coordinates[0] - (width * offset_xmin)
        x_max = key_coordinates[2] + (width * offset_xmax)
        y_min = key_coordinates[1] - (height * offset_ymin)
        y_max = key_coordinates[-1] + (height * offset_ymax)
        return x_min, y_min, x_max, y_max

    def get_target_fields(self, pdf_text_list, pdf_img_list):
        """Extract every configured field.

        :param pdf_text_list: ``{page key: [(bbox, text), ...]}`` text spans.
        :param pdf_img_list: ``{page key: [(bbox, text), ...]}`` image blocks.
        :return: ``{'page_<n>': {field: {'words': ..., 'position': [...]}}}``
            where ``n`` is the page index plus one. Fields that cannot be
            found carry their default text and ``default_position``.
        """
        pdf_result = dict()

        for pno_str, fields_dict in self.target_fields.items():
            if pno_str == '-1':
                pno_str = self._last_page_key(pdf_text_list, pdf_img_list)
            page_text = pdf_text_list.get(pno_str, [])

            # Collect the candidate boxes of every key text.
            key_text_info = dict()
            for key_text_list in fields_dict[self.keys_str].values():
                for key_text, key_re_tuple, _, _ in key_text_list:
                    for (x0, y0, x1, y1), text in page_text:
                        # any(): a span matching several patterns is one
                        # candidate, not one candidate per pattern.
                        if any(re.match(key_re, text) for key_re in key_re_tuple):
                            key_text_info.setdefault(key_text, list()).append((x0, y0, x1, y1))

            # Resolve each field's key chain to a single label box.
            key_coordinates_info = dict()
            for field, key_text_list in fields_dict[self.keys_str].items():
                last_key_coordinates = None
                for key_text, _, direction, kwargs in key_text_list:
                    if key_text not in key_text_info:
                        last_key_coordinates = None
                        continue
                    last_key_coordinates = getattr(self, 'key_{0}'.format(direction))(
                        key_text_info[key_text],
                        last_key_coordinates,
                        **kwargs)

                key_coordinates_info[field] = last_key_coordinates

            # Look for each field's value around its label box.
            page_result = dict()
            for field, (source, direction, kwargs, default_value) in fields_dict[self.value_str].items():
                value, coordinates = None, None
                if isinstance(key_coordinates_info.get(field), tuple):
                    search_list = page_text if source == self.text_str else pdf_img_list.get(pno_str, [])
                    value, coordinates = getattr(self, 'value_{0}'.format(direction))(
                        search_list,
                        key_coordinates_info[field],
                        **kwargs
                    )
                if isinstance(value, str):
                    page_result[field] = {
                        self.words_str: value,
                        self.position_str: list(coordinates),
                    }
                else:
                    page_result[field] = {
                        self.words_str: default_value,
                        self.position_str: list(self.default_position),
                    }

            pdf_result['page_{0}'.format(int(pno_str) + 1)] = page_result

        return pdf_result
src/common/fsm_econtract/tools.py
0 → 100644
def pdf_info_rebuild(pdf_info, fix_bbox=True):
    """Flatten parsed PDF pages into per-page text-span and image lists.

    :param pdf_info: ``{page key: {'blocks': [...]}}``. Blocks of type 0 carry
        ``lines`` -> ``spans`` (each span with ``bbox``, ``text``, ``size``);
        blocks of type 1 carry a ``bbox`` and are treated as images.
        NOTE(review): this looks like PyMuPDF's ``get_text('dict')`` layout -
        confirm against the producer.
    :param fix_bbox: when true, a span whose box is lower than its font size
        gets its bottom edge pushed down by the font size.
    :return: ``(pdf_text_info, pdf_img_info)``; the first maps a page key to
        ``[[bbox, text], ...]``, the second to ``[(bbox, '有'), ...]``. Pages
        without text / images have no entry.

    ``pdf_info`` itself is never modified: every span bbox is copied before
    the height fix, which also makes tuple bboxes acceptable.
    """
    pdf_text_info = dict()
    pdf_img_info = dict()
    for pno_str, page_info in pdf_info.items():
        for block in page_info['blocks']:
            if block['type'] == 0:
                # The same text may be repeated inside a block; keep only the
                # first occurrence per block.
                seen_texts = set()
                for line in block['lines']:
                    for span in line['spans']:
                        text = span['text'].strip()
                        if len(text) == 0 or text in seen_texts:
                            continue
                        seen_texts.add(text)
                        # Work on a copy so the caller's span is left intact.
                        bbox = list(span['bbox'])
                        # The reported bbox height is unreliable.
                        if fix_bbox and bbox[-1] - bbox[1] < span['size']:
                            bbox[-1] = bbox[-1] + span['size']
                        pdf_text_info.setdefault(pno_str, list()).append([bbox, text])
            elif block['type'] == 1:
                pdf_img_info.setdefault(pno_str, list()).append((block['bbox'], '有'))

    return pdf_text_info, pdf_img_info
| ... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or sign in to post a comment