Merge branch 'feature/fsm-contract' into fix/report_ca
Showing
7 changed files
with
482 additions
and
17 deletions
... | @@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44 | ... | @@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44 |
1057 | HIL_CONTRACT_3_CN_NAME = '车辆处置协议' | 1057 | HIL_CONTRACT_3_CN_NAME = '车辆处置协议' |
1058 | HIL_CONTRACT_3_CLASSIFY = 45 | 1058 | HIL_CONTRACT_3_CLASSIFY = 45 |
1059 | 1059 | ||
1060 | CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY} | 1060 | FSM_CONTRACT_WEP_CN_NAME = '延长保修合同' |
1061 | FSM_CONTRACT_WEP_CLASSIFY = 51 | ||
1062 | |||
1063 | FSM_CONTRACT_MSI_CN_NAME = '长悦保养合同' | ||
1064 | FSM_CONTRACT_MSI_CLASSIFY = 52 | ||
1065 | |||
1066 | FSM_CONTRACT_SC_CN_NAME = '汽车销售合同' | ||
1067 | FSM_CONTRACT_SC_CLASSIFY = 53 | ||
1068 | |||
1069 | CONTRACT_SET = { | ||
1070 | CONTRACT_QRS_CLASSIFY, | ||
1071 | CONTRACT_CLASSIFY, | ||
1072 | HIL_CONTRACT_1_CLASSIFY, | ||
1073 | HIL_CONTRACT_2_CLASSIFY, | ||
1074 | HIL_CONTRACT_3_CLASSIFY, | ||
1075 | FSM_CONTRACT_WEP_CLASSIFY, | ||
1076 | FSM_CONTRACT_MSI_CLASSIFY, | ||
1077 | FSM_CONTRACT_SC_CLASSIFY, | ||
1078 | } | ||
1061 | 1079 | ||
1062 | CONTRACT_MAP = { | 1080 | CONTRACT_MAP = { |
1063 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, | 1081 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, |
... | @@ -1065,6 +1083,10 @@ CONTRACT_MAP = { | ... | @@ -1065,6 +1083,10 @@ CONTRACT_MAP = { |
1065 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, | 1083 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, |
1066 | CONTRACT_CLASSIFY: CONTRACT_CN_NAME, | 1084 | CONTRACT_CLASSIFY: CONTRACT_CN_NAME, |
1067 | CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME, | 1085 | CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME, |
1086 | FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_CN_NAME, | ||
1087 | FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_CN_NAME, | ||
1088 | FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_CN_NAME, | ||
1089 | |||
1068 | } | 1090 | } |
1069 | 1091 | ||
1070 | # 保单 | 1092 | # 保单 |
... | @@ -1214,6 +1236,11 @@ BS_FIELD = 'bss_ocr' | ... | @@ -1214,6 +1236,11 @@ BS_FIELD = 'bss_ocr' |
1214 | HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' | 1236 | HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' |
1215 | HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr' | 1237 | HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr' |
1216 | HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr' | 1238 | HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr' |
# OCR result field names for the FSM contracts. These must be plain strings,
# like the HIL_CONTRACT_*_FIELD names above: a trailing comma after the literal
# turns the value into a 1-tuple, and RESULT_MAPPING would then map the FSM
# classify codes to tuples instead of field names.
FSM_CONTRACT_WEP_FIELD = 'fsm_wep_ocr'
FSM_CONTRACT_MSI_FIELD = 'fsm_msi_ocr'
FSM_CONTRACT_SC_FIELD = 'fsm_sc_ocr'
1242 | |||
1243 | |||
1217 | BS_CLASSIFY = 10089 | 1244 | BS_CLASSIFY = 10089 |
1218 | 1245 | ||
1219 | RESULT_MAPPING = { | 1246 | RESULT_MAPPING = { |
... | @@ -1238,6 +1265,9 @@ RESULT_MAPPING = { | ... | @@ -1238,6 +1265,9 @@ RESULT_MAPPING = { |
1238 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, | 1265 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, |
1239 | HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD, | 1266 | HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD, |
1240 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD, | 1267 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD, |
1268 | FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_FIELD, | ||
1269 | FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_FIELD, | ||
1270 | FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_FIELD, | ||
1241 | } | 1271 | } |
1242 | 1272 | ||
1243 | CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD) | 1273 | CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD) |
... | @@ -2313,29 +2343,35 @@ APPLICANT_TYPE_MAP = { | ... | @@ -2313,29 +2343,35 @@ APPLICANT_TYPE_MAP = { |
2313 | 2343 | ||
2314 | APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] | 2344 | APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] |
2315 | 2345 | ||
2316 | FILE_NAME_PREFIX_MAP = { | 2346 | # FILE_NAME_PREFIX_MAP = { |
2317 | AFC_PREFIX: [ | 2347 | # AFC_PREFIX: [ |
2318 | ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'), | 2348 | # ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'), |
2319 | ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), | 2349 | # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), |
2320 | ], | 2350 | # ], |
2321 | HIL_PREFIX: [ | 2351 | # HIL_PREFIX: [ |
2322 | ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'), | 2352 | # ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'), |
2323 | ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'), | 2353 | # ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'), |
2324 | ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), | 2354 | # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), |
2325 | ] | 2355 | # ] |
2326 | } | 2356 | # } |
2327 | 2357 | ||
2328 | ECONTRACT_KEYWORDS_MAP = { | 2358 | ECONTRACT_KEYWORDS_MAP = { |
2329 | AFC_PREFIX: [ | 2359 | AFC_PREFIX: [ |
2330 | ('抵押贷款合同', CONTRACT_CLASSIFY), | 2360 | ('抵押贷款合同', CONTRACT_CLASSIFY), |
2331 | ('送达地址确认书', CONTRACT_QRS_CLASSIFY), | 2361 | ('送达地址确认书', CONTRACT_QRS_CLASSIFY), |
2332 | # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), | 2362 | ('抵押登记豁免函', HMH_CLASSIFY), |
2363 | ('延长保修', FSM_CONTRACT_WEP_CLASSIFY), | ||
2364 | ('长悦保养', FSM_CONTRACT_MSI_CLASSIFY), | ||
2365 | ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), | ||
2333 | ], | 2366 | ], |
2334 | HIL_PREFIX: [ | 2367 | HIL_PREFIX: [ |
2335 | ('售后回租合同', HIL_CONTRACT_1_CLASSIFY), | 2368 | ('售后回租合同', HIL_CONTRACT_1_CLASSIFY), |
2336 | ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY), | 2369 | ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY), |
2337 | ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY), | 2370 | ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY), |
2338 | # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), | 2371 | ('抵押登记豁免函', HMH_CLASSIFY), |
2372 | ('延长保修', FSM_CONTRACT_WEP_CLASSIFY), | ||
2373 | ('长悦保养', FSM_CONTRACT_MSI_CLASSIFY), | ||
2374 | ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), | ||
2339 | ] | 2375 | ] |
2340 | } | 2376 | } |
2341 | 2377 | ||
... | @@ -2345,6 +2381,12 @@ HIL_CONTRACT_TYPE_MAP = { | ... | @@ -2345,6 +2381,12 @@ HIL_CONTRACT_TYPE_MAP = { |
2345 | str(HIL_CONTRACT_3_CLASSIFY): 1, | 2381 | str(HIL_CONTRACT_3_CLASSIFY): 1, |
2346 | } | 2382 | } |
2347 | 2383 | ||
# Maps the stringified FSM classify code to the ``file_type`` index that is
# handed to fsm_predict: 0 = WEP, 1 = MSI, 2 = SC. The order must stay aligned
# with retriever_list in common/fsm_econtract/fsm_contract_ocr.py.
FSM_CONTRACT_TYPE_MAP = {
    str(classify): file_type
    for file_type, classify in enumerate((
        FSM_CONTRACT_WEP_CLASSIFY,
        FSM_CONTRACT_MSI_CLASSIFY,
        FSM_CONTRACT_SC_CLASSIFY,
    ))
}
2389 | |||
2348 | RESULT_MAP = { | 2390 | RESULT_MAP = { |
2349 | 0: None, | 2391 | 0: None, |
2350 | 1: True, | 2392 | 1: True, | ... | ... |
... | @@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g | ... | @@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g |
20 | from common.tools.pdf_to_img import PDFHandler | 20 | from common.tools.pdf_to_img import PDFHandler |
21 | from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict | 21 | from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict |
22 | from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict | 22 | from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict |
23 | from common.fsm_econtract.fsm_contract_ocr import predict as fsm_predict | ||
24 | from common.fsm_econtract.hmh_ocr import predict as hmh_predict | ||
23 | from apps.doc import consts | 25 | from apps.doc import consts |
24 | # from apps.doc.ocr.edms import EDMS, rh | 26 | # from apps.doc.ocr.edms import EDMS, rh |
25 | from apps.doc.ocr.ecm import ECM, rh | 27 | from apps.doc.ocr.ecm import ECM, rh |
... | @@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin): |
996 | res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( | 998 | res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( |
997 | consts.ALL_POSITION_KEY, {}).get(key1, []) | 999 | consts.ALL_POSITION_KEY, {}).get(key1, []) |
998 | license_summary[classify] = [res] | 1000 | license_summary[classify] = [res] |
999 | else: | 1001 | elif classify in consts.SE_HIL_CON_MAP: # TODO FSM新合同写入数据库用于比对 |
1000 | res = {} | 1002 | res = {} |
1001 | for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): | 1003 | for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): |
1002 | if pno1 is None: | 1004 | if pno1 is None: |
... | @@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin): |
1442 | self.log_base, traceback.format_exc())) | 1444 | self.log_base, traceback.format_exc())) |
1443 | error_list.append(1) | 1445 | error_list.append(1) |
1444 | return | 1446 | return |
1445 | else: # e-contract | 1447 | else: # e-contract or e-fsm-contract or e-hmh |
1446 | try: | 1448 | try: |
1447 | # pdf下载 处理 图片存储 识别 | 1449 | # pdf下载 处理 图片存储 识别 |
1448 | for times in range(consts.RETRY_TIMES): | 1450 | for times in range(consts.RETRY_TIMES): |
... | @@ -1472,6 +1474,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1472,6 +1474,7 @@ class Command(BaseCommand, LoggerMixin): |
1472 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( | 1474 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( |
1473 | self.log_base, traceback.format_exc())) | 1475 | self.log_base, traceback.format_exc())) |
1474 | 1476 | ||
1477 | # AFC合同 | ||
1475 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): | 1478 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): |
1476 | ocr_result = afc_predict(pdf_handler.pdf_info) | 1479 | ocr_result = afc_predict(pdf_handler.pdf_info) |
1477 | page_res = {} | 1480 | page_res = {} |
... | @@ -1483,6 +1486,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1483,6 +1486,7 @@ class Command(BaseCommand, LoggerMixin): |
1483 | 'page_num': page_num, | 1486 | 'page_num': page_num, |
1484 | 'page_info': page_info | 1487 | 'page_info': page_info |
1485 | } | 1488 | } |
1489 | # 送达地址确认书 | ||
1486 | elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY): | 1490 | elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY): |
1487 | ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True) | 1491 | ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True) |
1488 | page_num = 'page_1' | 1492 | page_num = 'page_1' |
... | @@ -1493,7 +1497,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1493,7 +1497,8 @@ class Command(BaseCommand, LoggerMixin): |
1493 | 'page_info': ocr_result.pop(page_num, {}) | 1497 | 'page_info': ocr_result.pop(page_num, {}) |
1494 | } | 1498 | } |
1495 | } | 1499 | } |
1496 | else: | 1500 | # HIL合同 |
1501 | elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP: | ||
1497 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) | 1502 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) |
1498 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) | 1503 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) |
1499 | rebuild_res_1 = {} | 1504 | rebuild_res_1 = {} |
... | @@ -1508,9 +1513,35 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1508,9 +1513,35 @@ class Command(BaseCommand, LoggerMixin): |
1508 | 'page_num': page_num, | 1513 | 'page_num': page_num, |
1509 | 'page_info': page_info | 1514 | 'page_info': page_info |
1510 | } | 1515 | } |
1516 | # FSM合同 WEP MSI SC | ||
1517 | elif classify_1_str in consts.FSM_CONTRACT_TYPE_MAP: | ||
1518 | file_type = consts.FSM_CONTRACT_TYPE_MAP.get(classify_1_str) | ||
1519 | ocr_result = fsm_predict(pdf_handler.pdf_info, file_type) | ||
1520 | for page_num, page_info in ocr_result.items(): | ||
1521 | if isinstance(page_num, str) and page_num.startswith('page_'): | ||
1522 | page_res[page_num] = { | ||
1523 | 'classify': int(classify_1_str), | ||
1524 | 'page_num': page_num, | ||
1525 | 'page_info': page_info | ||
1526 | } | ||
1527 | # hmh | ||
1528 | else: | ||
1529 | pass | ||
1530 | |||
1511 | 1531 | ||
1512 | contract_res = {} | 1532 | contract_res = {} |
1513 | for img_path_tmp, page_key in pdf_handler.img_path_pno_list: | 1533 | for img_path_tmp, page_key in pdf_handler.img_path_pno_list: |
1534 | if classify_1_str == str(consts.HMH_CLASSIFY): | ||
1535 | img_contract_res = { | ||
1536 | 'code': 1, | ||
1537 | 'data': [ | ||
1538 | { | ||
1539 | 'classify': consts.HMH_CLASSIFY, | ||
1540 | 'data': hmh_predict(pdf_handler.pdf_info) | ||
1541 | } | ||
1542 | ] | ||
1543 | } | ||
1544 | else: | ||
1514 | if page_key in page_res: | 1545 | if page_key in page_res: |
1515 | img_contract_res = { | 1546 | img_contract_res = { |
1516 | 'code': 1, | 1547 | 'code': 1, | ... | ... |
src/common/fsm_econtract/const.py
0 → 100644
# Field-extraction rules for the WEP contract, in the layout consumed by
# Retriever.get_target_fields:
#   * top-level key -- zero-based page number as a string;
#   * 'keys'  -- field -> list of (key_text, regex_tuple, direction, kwargs):
#                a text span matching any of the regexes is a candidate label,
#                and ``direction`` selects the Retriever.key_<direction> picker;
#   * 'value' -- field -> (source, direction, kwargs, default_value):
#                ``source`` is 'text' to search the page's text spans, anything
#                else searches its image blocks; ``direction`` selects
#                Retriever.value_<direction>; ``default_value`` is reported when
#                either the label or the value cannot be found.
# 'offset_tuple' is (xmin, xmax, ymin, ymax), expressed in multiples of the
# label's width / height, and grows the label bbox into the search region.
WEP_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [
                ('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {}),
            ],
            '证件类型': [
                ('证件类型', (r'^证件类型.?$', ), 'top1', {}),
            ],
            '证件号码': [
                ('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {}),
            ],
            '合同价格(小写)': [
                ('人民币', (r'^人民币¥.?$', ), 'top1', {}),
            ],
            '客户签名': [
                ('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {}),
            ],
            '签单日期': [
                ('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {}),
            ],
        },
        'value': {
            # Text values are read to the right of their label.
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.1, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
            # Signature and date are looked up among the image blocks; '无' is
            # reported when no image lies strictly inside the search region.
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    },
}
22 | |||
# Field-extraction rules for the MSI contract, consumed by
# Retriever.get_target_fields. Top-level keys are zero-based page numbers as
# strings: the customer data is read from page "0", the signature and signing
# date from page "1".
#   'keys'  -- field -> list of (key_text, regex_tuple, direction, kwargs)
#   'value' -- field -> (source, direction, kwargs, default_value)
# 'offset_tuple' is (xmin, xmax, ymin, ymax) in multiples of the label's
# width / height.
MSI_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [
                ('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {}),
            ],
            '证件类型': [
                ('证件类型', (r'^证件类型.?$', ), 'top1', {}),
            ],
            '证件号码': [
                ('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {}),
            ],
            '合同价格(小写)': [
                ('人民币', (r'^人民币¥.?$', ), 'top1', {}),
            ],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.2, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
        },
    },
    "1": {
        'keys': {
            '客户签名': [
                ('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {}),
            ],
            '签单日期': [
                ('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {}),
            ],
        },
        'value': {
            # Looked up among the page's image blocks; '无' when none is found.
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    },
}
49 | |||
# Field-extraction rules for the SC contract, consumed by
# Retriever.get_target_fields. Top-level keys are page numbers as strings:
# "0" is the first page and "-1" is resolved by the Retriever to the highest
# page number that has text.
#   'keys'  -- field -> list of (key_text, regex_tuple, direction, kwargs)
#   'value' -- field -> (source, direction, kwargs, default_value)
# 'offset_tuple' is (xmin, xmax, ymin, ymax) in multiples of the label's
# width / height.
SC_FIELD = {
    "0": {
        'keys': {
            '姓名': [
                ('姓名', (r'^姓名.?$', r'^企业名称.?$'), 'top1', {}),
            ],
            '证件类型': [
                ('证件类型', (r'^证件类型.?$', ), 'top1', {}),
            ],
            '证件号码': [
                ('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {}),
            ],
            '总价': [
                ('总价', (r'^总价.?$', ), 'top1', {}),
            ],
        },
        'value': {
            '姓名': ('text', 'right', {'offset_tuple': (-2, 8, 0.5, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '总价': ('text', 'right', {'offset_tuple': (-2, 12, 0.5, 0)}, ''),
        },
    },
    "-1": {
        'keys': {
            # The pattern used to be listed twice, which only made every
            # matching span count twice as a label candidate.
            '客户签名': [
                ('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {}),
            ],
            '签单日期': [
                ('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {}),
            ],
        },
        'value': {
            # Looked up among the page's image blocks; '无' when none is found.
            '客户签名': ('img', 'under', {'offset_tuple': (1.5, 1, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    },
}
src/common/fsm_econtract/fsm_contract_ocr.py
0 → 100644
1 | from .retriever import Retriever | ||
2 | from .const import WEP_FIELD, MSI_FIELD, SC_FIELD | ||
3 | from .tools import pdf_info_rebuild | ||
4 | |||
# One Retriever per FSM contract type. The list index is the ``file_type``
# accepted by predict(): 0 = WEP, 1 = MSI, 2 = SC.
retriever_list = [
    Retriever(field_spec)
    for field_spec in (WEP_FIELD, MSI_FIELD, SC_FIELD)
]


def predict(pdf_info, file_type=0):
    """Extract the configured fields of an FSM e-contract.

    :param pdf_info: per-page PDF data in the form accepted by pdf_info_rebuild
    :param file_type: index into ``retriever_list`` selecting the contract type
    :return: the result of Retriever.get_target_fields (a dict keyed 'page_<n>')
    """
    # Pick the retriever first so an invalid file_type fails before any work.
    selected = retriever_list[file_type]
    text_by_page, img_by_page = pdf_info_rebuild(pdf_info)
    return selected.get_target_fields(text_by_page, img_by_page)
11 | |||
12 |
src/common/fsm_econtract/hmh_ocr.py
0 → 100644
src/common/fsm_econtract/retriever.py
0 → 100644
1 | import re | ||
2 | |||
3 | |||
class HMHRetriever:
    """Regex-based extractor for the fields of an HMH document.

    Only the text spans of the first page (key ``'0'``) are inspected. Every
    field listed in ``search_fields_list`` is always present in the result;
    fields that cannot be found carry their default text and
    ``default_position``.
    """

    # Sentence carrying name, ID number and company. The lessee (承租人) wording
    # is tried before the borrower (借款人) wording.
    _PARTY_PATTERNS = (
        re.compile(r'承租人\(姓名(.*)证件号码(.*)与(.*公司)'),
        re.compile(r'借款人\(姓名(.*)证件号码(.*)与(.*公司)'),
    )
    _CONTRACT_NO_PATTERN = re.compile(r'合同编号.*(CH-B\d*-\d*).*')
    _SIGN_DATE_PATTERN = re.compile(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})')

    def __init__(self):
        self.words_str = 'words'
        self.position_str = 'location'
        self.default_position = [0, 0, 0, 0]
        # (field name, default text); the positions in this list are relied on
        # by get_target_fields.
        self.search_fields_list = [
            ('借款/承租人姓名', ''),
            ('证件号码', ''),
            ('渠道', ''),
            ('合同编号', ''),
            ('借款人签字/盖章', '无'),
        ]

    @staticmethod
    def _clean(text):
        """Drop ideographic spaces and surrounding whitespace from *text*."""
        return text.replace('\u3000', '').strip()

    def _entry(self, words, position):
        """Build one result record from its text and bounding box."""
        return {self.words_str: words, self.position_str: position}

    def get_target_fields(self, pdf_text_list):
        """Extract the HMH fields from the first page's text spans.

        :param pdf_text_list: dict mapping page-number strings to lists of
            ``(bbox, text)`` pairs; it is only read, never modified
        :return: ``{"words_result": {field: {'words': ..., 'location': ...}}}``
        """
        name_field, id_field, company_field, contract_no_field, signature_field = (
            self.search_fields_list[idx][0] for idx in range(5)
        )

        result = dict()
        found_party, found_contract_no, found_signature = False, False, False
        # .get rather than .pop: the caller's dict must survive repeated calls.
        for bbox, text in pdf_text_list.get(str(0), []):
            if not found_party:
                for pattern in self._PARTY_PATTERNS:
                    match = pattern.search(text)
                    if match is None:
                        continue
                    name, id_number, company = match.groups()
                    result[name_field] = self._entry(self._clean(name), bbox)
                    result[id_field] = self._entry(self._clean(id_number), bbox)
                    # The company text is kept exactly as matched.
                    result[company_field] = self._entry(company, bbox)
                    found_party = True
                    break

            if not found_contract_no:
                contract_numbers = self._CONTRACT_NO_PATTERN.findall(text)
                # Only an unambiguous (single) hit is accepted.
                if len(contract_numbers) == 1:
                    result[contract_no_field] = self._entry(contract_numbers[0], bbox)
                    found_contract_no = True

            if not found_signature:
                match = self._SIGN_DATE_PATTERN.search(text)
                if match is not None:
                    signer, sign_date = match.groups()
                    result[signature_field] = self._entry(
                        '{0} {1}'.format(self._clean(signer), sign_date), bbox)
                    found_signature = True

            if found_party and found_contract_no and found_signature:
                break

        # Anything still missing is reported with its default value.
        for find_key, default_value in self.search_fields_list:
            if find_key not in result:
                result[find_key] = self._entry(default_value, self.default_position)

        return {"words_result": result}
92 | |||
class Retriever:
    """Rule-driven field extractor working on positioned PDF text and images.

    ``target_fields`` maps a page-number string to ``{'keys': ..., 'value': ...}``:

    * ``'keys'``: field -> list of ``(key_text, regex_tuple, direction, kwargs)``.
      Spans whose text matches any regex are label candidates;
      ``key_<direction>`` picks one of them.
    * ``'value'``: field -> ``(source, direction, kwargs, default_value)``.
      ``value_<direction>`` searches the page's text spans (``source == 'text'``)
      or image entries (any other source) around the chosen label.

    The page number ``'-1'`` stands for the highest page number that has text.
    """

    def __init__(self, target_fields):
        self.keys_str = 'keys'
        self.value_str = 'value'
        self.text_str = 'text'
        self.words_str = 'words'
        self.position_str = 'position'
        self.default_position = [-1, -1, -1, -1]
        self.target_fields = target_fields
        # Per value_type character substitutions applied to found values.
        self.replace_map = {
            'int': {
                '(': '0'
            }
        }

    @staticmethod
    def _in_region(box, region, rigorous):
        """Tell whether *box* lies in *region*.

        With ``rigorous`` the whole box must be strictly inside the region;
        otherwise only the centre of the box has to be.
        """
        x0, y0, x1, y1 = box
        x_min, y_min, x_max, y_max = region
        if rigorous:
            return x_min < x0 and x1 < x_max and y_min < y0 and y1 < y_max
        cent_x = x0 + ((x1 - x0) / 2)
        cent_y = y0 + ((y1 - y0) / 2)
        return x_min < cent_x < x_max and y_min < cent_y < y_max

    def _apply_replace_map(self, value, value_type):
        """Translate *value* through ``replace_map[value_type]`` when one is configured."""
        if isinstance(value_type, str) and value_type in self.replace_map and isinstance(value, str):
            return value.translate(str.maketrans(self.replace_map.get(value_type, {})))
        return value

    @staticmethod
    def key_top1(coordinates_list, key_coordinates):
        """Label picker: return the candidate with the smallest y0.

        ``key_coordinates`` is unused; it is part of the common picker
        signature. The candidate list is left untouched.
        """
        return min(coordinates_list, key=lambda box: box[1])

    def key_right(self, coordinates_list, key_coordinates, offset_tuple, rigorous=False):
        """Label picker: leftmost candidate inside the region next to the previous label.

        Falls back to :meth:`key_top1` when there is no previous label or when
        no candidate lies in the region.
        """
        if len(coordinates_list) == 1:
            return coordinates_list[0]

        # Without the previous label's coordinates, take the topmost candidate.
        if key_coordinates is None:
            return self.key_top1(coordinates_list, key_coordinates)

        region = self.get_target_bbox(key_coordinates, offset_tuple)

        x_min_find, find_key_coordinates = None, None
        for x0, y0, x1, y1 in coordinates_list:
            if not self._in_region((x0, y0, x1, y1), region, rigorous):
                continue
            if x_min_find is None or x0 < x_min_find:
                x_min_find = x0
                find_key_coordinates = (x0, y0, x1, y1)

        if find_key_coordinates is None:
            return self.key_top1(coordinates_list, key_coordinates)
        return find_key_coordinates

    def value_right(self, search_list, key_coordinates, offset_tuple, value_type=None, rigorous=False):
        """Value finder: leftmost non-blank entry inside the region around the label.

        :return: ``(value, coordinates)``, or ``(None, None)`` when nothing fits
        """
        region = self.get_target_bbox(key_coordinates, offset_tuple)

        x_min_find, value, coordinates = None, None, None
        for (x0, y0, x1, y1), text in search_list:
            if not self._in_region((x0, y0, x1, y1), region, rigorous):
                continue
            if (x_min_find is None or x0 < x_min_find) and len(text.strip()) > 0:
                x_min_find = x0
                value = text
                coordinates = (x0, y0, x1, y1)

        return self._apply_replace_map(value, value_type), coordinates

    def value_under(self, search_list, key_coordinates, offset_tuple, value_type=None, append=False, rigorous=False):
        """Value finder: topmost (then leftmost) non-blank entry inside the region.

        With ``append`` the texts of all entries in the region are concatenated
        in reading order; the returned coordinates are those of the first one.

        :return: ``(value, coordinates)``, or ``(None, None)`` when nothing fits
        """
        region = self.get_target_bbox(key_coordinates, offset_tuple)

        find_list = [
            (x0, y0, x1, y1, text)
            for (x0, y0, x1, y1), text in search_list
            if self._in_region((x0, y0, x1, y1), region, rigorous) and len(text.strip()) > 0
        ]
        if len(find_list) == 0:
            return None, None

        find_list.sort(key=lambda x: (x[1], x[0]))
        coordinates = find_list[0][:-1]
        if append:
            value = ''.join([text for _, _, _, _, text in find_list])
        else:
            value = find_list[0][-1]

        return self._apply_replace_map(value, value_type), coordinates

    @staticmethod
    def get_target_bbox(key_coordinates, offset_tuple):
        """Grow the label bbox into a search region.

        ``offset_tuple`` is ``(xmin, xmax, ymin, ymax)`` in multiples of the
        label's width / height. Positive values push the edge outwards; a
        negative ``xmin`` / ``ymin`` moves that edge right / down instead.
        """
        offset_xmin, offset_xmax, offset_ymin, offset_ymax = offset_tuple

        width = key_coordinates[2] - key_coordinates[0]
        height = key_coordinates[-1] - key_coordinates[1]

        x_min = key_coordinates[0] - (width * offset_xmin)
        x_max = key_coordinates[2] + (width * offset_xmax)
        y_min = key_coordinates[1] - (height * offset_ymin)
        y_max = key_coordinates[-1] + (height * offset_ymax)
        return x_min, y_min, x_max, y_max

    @staticmethod
    def _resolve_page(pno_str, pdf_text_list):
        """Turn the '-1' alias into the highest page number that has text.

        Falls back to page '0' when no page has any text, so the fields of
        that rule set are still reported (with their defaults).
        """
        if pno_str != '-1':
            return pno_str
        return str(max((int(page) for page in pdf_text_list), default=0))

    def get_target_fields(self, pdf_text_list, pdf_img_list):
        """Extract every configured field.

        :param pdf_text_list: page-number string -> list of ``(bbox, text)``
        :param pdf_img_list: page-number string -> list of ``(bbox, text)``
        :return: ``{'page_<n>': {field: {'words': ..., 'position': [...]}}}``
            with ``n`` one-based. Rule sets that resolve to the same page are
            merged into one entry instead of overwriting each other.
        """
        pdf_result = dict()

        for pno_str, fields_dict in self.target_fields.items():
            pno_str = self._resolve_page(pno_str, pdf_text_list)
            page_texts = pdf_text_list.get(pno_str, [])
            page_imgs = pdf_img_list.get(pno_str, [])

            # Collect the candidate coordinates of every label text. A span
            # is recorded once even when several patterns match it.
            key_text_info = dict()
            for key_text_list in fields_dict[self.keys_str].values():
                for key_text, key_re_tuple, _, _ in key_text_list:
                    for (x0, y0, x1, y1), text in page_texts:
                        if any(re.match(key_re, text) for key_re in key_re_tuple):
                            key_text_info.setdefault(key_text, list()).append((x0, y0, x1, y1))

            # Walk each field's label chain; every step may use the previous
            # label's coordinates, and a missing label resets the chain.
            key_coordinates_info = dict()
            for field, key_text_list in fields_dict[self.keys_str].items():
                last_key_coordinates = None
                for key_text, _, direction, kwargs in key_text_list:
                    if key_text not in key_text_info:
                        last_key_coordinates = None
                        continue
                    last_key_coordinates = getattr(self, 'key_{0}'.format(direction))(
                        key_text_info[key_text],
                        last_key_coordinates,
                        **kwargs)

                key_coordinates_info[field] = last_key_coordinates

            # Look the value up around each label.
            page_result = dict()
            for field, (source, direction, kwargs, default_value) in fields_dict[self.value_str].items():
                value, coordinates = None, None
                if isinstance(key_coordinates_info.get(field), tuple):
                    value, coordinates = getattr(self, 'value_{0}'.format(direction))(
                        page_texts if source == self.text_str else page_imgs,
                        key_coordinates_info[field],
                        **kwargs
                    )
                if isinstance(value, str):
                    page_result[field] = {
                        self.words_str: value,
                        self.position_str: list(coordinates),
                    }
                else:
                    page_result[field] = {
                        self.words_str: default_value,
                        self.position_str: self.default_position,
                    }

            # Merge rather than assign: '-1' may resolve to a page that an
            # explicit rule set has already filled (e.g. a one-page PDF).
            pdf_result.setdefault('page_{0}'.format(int(pno_str) + 1), dict()).update(page_result)

        return pdf_result
src/common/fsm_econtract/tools.py
0 → 100644
def pdf_info_rebuild(pdf_info, fix_bbox=True):
    """Flatten per-page block data into positioned text and image lists.

    :param pdf_info: dict mapping a page key to ``{'blocks': [...]}``. Blocks
        of ``type`` 0 carry ``lines`` -> ``spans`` (each span having ``bbox``,
        ``text`` and ``size``); blocks of ``type`` 1 only contribute their
        ``bbox``. NOTE(review): this looks like PyMuPDF's "dict" text
        extraction layout, where type 1 is an image block -- confirm.
    :param fix_bbox: when true, a span whose bbox is lower than its font size
        gets its bottom edge pushed down by the font size
    :return: ``(pdf_text_info, pdf_img_info)``; the first maps page key ->
        list of ``[bbox, text]``, the second maps page key -> list of
        ``(bbox, '有')``. Pages without entries are absent from the dicts.

    The input is never modified: text bboxes are copied into new lists, so
    tuple bboxes are accepted as well.
    """
    pdf_text_info = dict()
    pdf_img_info = dict()
    for pno_str, page_info in pdf_info.items():
        for block in page_info['blocks']:
            block_type = block['type']
            if block_type == 0:
                # The same text can show up repeatedly inside one block;
                # keep only its first occurrence.
                seen_texts = set()
                for line in block['lines']:
                    for span in line['spans']:
                        text = span['text'].strip()
                        if len(text) == 0 or text in seen_texts:
                            continue
                        seen_texts.add(text)
                        # Work on a copy so the caller's span is left alone.
                        bbox = list(span['bbox'])
                        # The reported bbox height is not reliable.
                        if fix_bbox and bbox[-1] - bbox[1] < span['size']:
                            bbox[-1] = bbox[-1] + span['size']
                        pdf_text_info.setdefault(pno_str, list()).append([bbox, text])
            elif block_type == 1:
                pdf_img_info.setdefault(pno_str, list()).append((block['bbox'], '有'))

    return pdf_text_info, pdf_img_info
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or sign in to post a comment