add FSM e-contract
Showing
7 changed files
with
209 additions
and
17 deletions
... | @@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44 | ... | @@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44 |
1057 | HIL_CONTRACT_3_CN_NAME = '车辆处置协议' | 1057 | HIL_CONTRACT_3_CN_NAME = '车辆处置协议' |
1058 | HIL_CONTRACT_3_CLASSIFY = 45 | 1058 | HIL_CONTRACT_3_CLASSIFY = 45 |
1059 | 1059 | ||
1060 | CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY} | 1060 | FSM_CONTRACT_WEP_CN_NAME = '延长保修合同' |
1061 | FSM_CONTRACT_WEP_CLASSIFY = 51 | ||
1062 | |||
1063 | FSM_CONTRACT_MSI_CN_NAME = '长悦保养合同' | ||
1064 | FSM_CONTRACT_MSI_CLASSIFY = 52 | ||
1065 | |||
1066 | FSM_CONTRACT_SC_CN_NAME = '汽车销售合同' | ||
1067 | FSM_CONTRACT_SC_CLASSIFY = 53 | ||
1068 | |||
1069 | CONTRACT_SET = { | ||
1070 | CONTRACT_QRS_CLASSIFY, | ||
1071 | CONTRACT_CLASSIFY, | ||
1072 | HIL_CONTRACT_1_CLASSIFY, | ||
1073 | HIL_CONTRACT_2_CLASSIFY, | ||
1074 | HIL_CONTRACT_3_CLASSIFY, | ||
1075 | FSM_CONTRACT_WEP_CLASSIFY, | ||
1076 | FSM_CONTRACT_MSI_CLASSIFY, | ||
1077 | FSM_CONTRACT_SC_CLASSIFY, | ||
1078 | } | ||
1061 | 1079 | ||
1062 | CONTRACT_MAP = { | 1080 | CONTRACT_MAP = { |
1063 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, | 1081 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, |
... | @@ -1065,6 +1083,10 @@ CONTRACT_MAP = { | ... | @@ -1065,6 +1083,10 @@ CONTRACT_MAP = { |
1065 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, | 1083 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, |
1066 | CONTRACT_CLASSIFY: CONTRACT_CN_NAME, | 1084 | CONTRACT_CLASSIFY: CONTRACT_CN_NAME, |
1067 | CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME, | 1085 | CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME, |
1086 | FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_CN_NAME, | ||
1087 | FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_CN_NAME, | ||
1088 | FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_CN_NAME, | ||
1089 | |||
1068 | } | 1090 | } |
1069 | 1091 | ||
1070 | # 保单 | 1092 | # 保单 |
... | @@ -1214,6 +1236,11 @@ BS_FIELD = 'bss_ocr' | ... | @@ -1214,6 +1236,11 @@ BS_FIELD = 'bss_ocr' |
1214 | HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' | 1236 | HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' |
1215 | HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr' | 1237 | HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr' |
1216 | HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr' | 1238 | HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr' |
# OCR result field names for the FSM e-contracts (WEP / MSI / SC).
# These are used as RESULT_MAPPING values next to the HIL_CONTRACT_*_FIELD strings,
# so they must be plain strings: a trailing comma after the literal would turn
# each constant into a 1-tuple such as ('fsm_wep_ocr',).
FSM_CONTRACT_WEP_FIELD = 'fsm_wep_ocr'
FSM_CONTRACT_MSI_FIELD = 'fsm_msi_ocr'
FSM_CONTRACT_SC_FIELD = 'fsm_sc_ocr'
1242 | |||
1243 | |||
1217 | BS_CLASSIFY = 10089 | 1244 | BS_CLASSIFY = 10089 |
1218 | 1245 | ||
1219 | RESULT_MAPPING = { | 1246 | RESULT_MAPPING = { |
... | @@ -1238,6 +1265,9 @@ RESULT_MAPPING = { | ... | @@ -1238,6 +1265,9 @@ RESULT_MAPPING = { |
1238 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, | 1265 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, |
1239 | HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD, | 1266 | HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD, |
1240 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD, | 1267 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD, |
1268 | FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_FIELD, | ||
1269 | FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_FIELD, | ||
1270 | FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_FIELD, | ||
1241 | } | 1271 | } |
1242 | 1272 | ||
1243 | CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD) | 1273 | CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD) |
... | @@ -2313,29 +2343,35 @@ APPLICANT_TYPE_MAP = { | ... | @@ -2313,29 +2343,35 @@ APPLICANT_TYPE_MAP = { |
2313 | 2343 | ||
2314 | APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] | 2344 | APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] |
2315 | 2345 | ||
2316 | FILE_NAME_PREFIX_MAP = { | 2346 | # FILE_NAME_PREFIX_MAP = { |
2317 | AFC_PREFIX: [ | 2347 | # AFC_PREFIX: [ |
2318 | ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'), | 2348 | # ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'), |
2319 | ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), | 2349 | # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), |
2320 | ], | 2350 | # ], |
2321 | HIL_PREFIX: [ | 2351 | # HIL_PREFIX: [ |
2322 | ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'), | 2352 | # ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'), |
2323 | ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'), | 2353 | # ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'), |
2324 | ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), | 2354 | # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), |
2325 | ] | 2355 | # ] |
2326 | } | 2356 | # } |
2327 | 2357 | ||
2328 | ECONTRACT_KEYWORDS_MAP = { | 2358 | ECONTRACT_KEYWORDS_MAP = { |
2329 | AFC_PREFIX: [ | 2359 | AFC_PREFIX: [ |
2330 | ('抵押贷款合同', CONTRACT_CLASSIFY), | 2360 | ('抵押贷款合同', CONTRACT_CLASSIFY), |
2331 | ('送达地址确认书', CONTRACT_QRS_CLASSIFY), | 2361 | ('送达地址确认书', CONTRACT_QRS_CLASSIFY), |
2332 | # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), | 2362 | ('抵押登记豁免函', HMH_CLASSIFY), |
2363 | ('延长保修', FSM_CONTRACT_WEP_CLASSIFY), | ||
2364 | ('长悦保养', FSM_CONTRACT_MSI_CLASSIFY), | ||
2365 | ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), | ||
2333 | ], | 2366 | ], |
2334 | HIL_PREFIX: [ | 2367 | HIL_PREFIX: [ |
2335 | ('售后回租合同', HIL_CONTRACT_1_CLASSIFY), | 2368 | ('售后回租合同', HIL_CONTRACT_1_CLASSIFY), |
2336 | ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY), | 2369 | ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY), |
2337 | ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY), | 2370 | ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY), |
2338 | # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), | 2371 | ('抵押登记豁免函', HMH_CLASSIFY), |
2372 | ('延长保修', FSM_CONTRACT_WEP_CLASSIFY), | ||
2373 | ('长悦保养', FSM_CONTRACT_MSI_CLASSIFY), | ||
2374 | ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), | ||
2339 | ] | 2375 | ] |
2340 | } | 2376 | } |
2341 | 2377 | ||
... | @@ -2345,6 +2381,12 @@ HIL_CONTRACT_TYPE_MAP = { | ... | @@ -2345,6 +2381,12 @@ HIL_CONTRACT_TYPE_MAP = { |
2345 | str(HIL_CONTRACT_3_CLASSIFY): 1, | 2381 | str(HIL_CONTRACT_3_CLASSIFY): 1, |
2346 | } | 2382 | } |
2347 | 2383 | ||
# Maps an FSM contract classify code (as a string) to the file_type index that is
# passed to fsm_predict. The order WEP, MSI, SC must stay in step with
# retriever_list in common/fsm_econtract/fsm_contract_ocr.py, which is indexed by it.
FSM_CONTRACT_TYPE_MAP = {
    str(classify): file_type
    for file_type, classify in enumerate((
        FSM_CONTRACT_WEP_CLASSIFY,
        FSM_CONTRACT_MSI_CLASSIFY,
        FSM_CONTRACT_SC_CLASSIFY,
    ))
}
2389 | |||
2348 | RESULT_MAP = { | 2390 | RESULT_MAP = { |
2349 | 0: None, | 2391 | 0: None, |
2350 | 1: True, | 2392 | 1: True, | ... | ... |
... | @@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g | ... | @@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g |
20 | from common.tools.pdf_to_img import PDFHandler | 20 | from common.tools.pdf_to_img import PDFHandler |
21 | from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict | 21 | from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict |
22 | from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict | 22 | from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict |
23 | from common.fsm_econtract.fsm_contract_ocr import predict as fsm_predict | ||
24 | from common.fsm_econtract.hmh_ocr import predict as hmh_predict | ||
23 | from apps.doc import consts | 25 | from apps.doc import consts |
24 | # from apps.doc.ocr.edms import EDMS, rh | 26 | # from apps.doc.ocr.edms import EDMS, rh |
25 | from apps.doc.ocr.ecm import ECM, rh | 27 | from apps.doc.ocr.ecm import ECM, rh |
... | @@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin): |
996 | res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( | 998 | res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( |
997 | consts.ALL_POSITION_KEY, {}).get(key1, []) | 999 | consts.ALL_POSITION_KEY, {}).get(key1, []) |
998 | license_summary[classify] = [res] | 1000 | license_summary[classify] = [res] |
999 | else: | 1001 | elif classify in consts.SE_HIL_CON_MAP: # TODO FSM新合同写入数据库用于比对 |
1000 | res = {} | 1002 | res = {} |
1001 | for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): | 1003 | for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): |
1002 | if pno1 is None: | 1004 | if pno1 is None: |
... | @@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin): |
1442 | self.log_base, traceback.format_exc())) | 1444 | self.log_base, traceback.format_exc())) |
1443 | error_list.append(1) | 1445 | error_list.append(1) |
1444 | return | 1446 | return |
1445 | else: # e-contract | 1447 | else: # e-contract or e-fsm-contract or e-hmh |
1446 | try: | 1448 | try: |
1447 | # pdf下载 处理 图片存储 识别 | 1449 | # pdf下载 处理 图片存储 识别 |
1448 | for times in range(consts.RETRY_TIMES): | 1450 | for times in range(consts.RETRY_TIMES): |
... | @@ -1472,6 +1474,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1472,6 +1474,7 @@ class Command(BaseCommand, LoggerMixin): |
1472 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( | 1474 | self.online_log.error('{0} [process error (db save)] [error={1}]'.format( |
1473 | self.log_base, traceback.format_exc())) | 1475 | self.log_base, traceback.format_exc())) |
1474 | 1476 | ||
1477 | # AFC合同 | ||
1475 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): | 1478 | if classify_1_str == str(consts.CONTRACT_CLASSIFY): |
1476 | ocr_result = afc_predict(pdf_handler.pdf_info) | 1479 | ocr_result = afc_predict(pdf_handler.pdf_info) |
1477 | page_res = {} | 1480 | page_res = {} |
... | @@ -1483,6 +1486,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1483,6 +1486,7 @@ class Command(BaseCommand, LoggerMixin): |
1483 | 'page_num': page_num, | 1486 | 'page_num': page_num, |
1484 | 'page_info': page_info | 1487 | 'page_info': page_info |
1485 | } | 1488 | } |
1489 | # 送达地址确认书 | ||
1486 | elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY): | 1490 | elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY): |
1487 | ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True) | 1491 | ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True) |
1488 | page_num = 'page_1' | 1492 | page_num = 'page_1' |
... | @@ -1493,7 +1497,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1493,7 +1497,8 @@ class Command(BaseCommand, LoggerMixin): |
1493 | 'page_info': ocr_result.pop(page_num, {}) | 1497 | 'page_info': ocr_result.pop(page_num, {}) |
1494 | } | 1498 | } |
1495 | } | 1499 | } |
1496 | else: | 1500 | # HIL合同 |
1501 | elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP: | ||
1497 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) | 1502 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) |
1498 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) | 1503 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) |
1499 | rebuild_res_1 = {} | 1504 | rebuild_res_1 = {} |
... | @@ -1508,9 +1513,35 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1508,9 +1513,35 @@ class Command(BaseCommand, LoggerMixin): |
1508 | 'page_num': page_num, | 1513 | 'page_num': page_num, |
1509 | 'page_info': page_info | 1514 | 'page_info': page_info |
1510 | } | 1515 | } |
1516 | # FSM合同 WEP MSI SC | ||
1517 | elif classify_1_str in consts.FSM_CONTRACT_TYPE_MAP: | ||
1518 | file_type = consts.FSM_CONTRACT_TYPE_MAP.get(classify_1_str) | ||
1519 | ocr_result = fsm_predict(pdf_handler.pdf_info, file_type) | ||
1520 | for page_num, page_info in ocr_result.items(): | ||
1521 | if isinstance(page_num, str) and page_num.startswith('page_'): | ||
1522 | page_res[page_num] = { | ||
1523 | 'classify': int(classify_1_str), | ||
1524 | 'page_num': page_num, | ||
1525 | 'page_info': page_info | ||
1526 | } | ||
1527 | # hmh | ||
1528 | else: | ||
1529 | pass | ||
1530 | |||
1511 | 1531 | ||
1512 | contract_res = {} | 1532 | contract_res = {} |
1513 | for img_path_tmp, page_key in pdf_handler.img_path_pno_list: | 1533 | for img_path_tmp, page_key in pdf_handler.img_path_pno_list: |
1534 | if classify_1_str == str(consts.HMH_CLASSIFY): | ||
1535 | img_contract_res = { | ||
1536 | 'code': 1, | ||
1537 | 'data': [ | ||
1538 | { | ||
1539 | 'classify': consts.HMH_CLASSIFY, | ||
1540 | 'data': hmh_predict(pdf_handler.pdf_info) | ||
1541 | } | ||
1542 | ] | ||
1543 | } | ||
1544 | else: | ||
1514 | if page_key in page_res: | 1545 | if page_key in page_res: |
1515 | img_contract_res = { | 1546 | img_contract_res = { |
1516 | 'code': 1, | 1547 | 'code': 1, | ... | ... |
src/common/fsm_econtract/const.py
0 → 100644
# Field-extraction layout for the WEP contract; wrapped as Retriever(WEP_FIELD)
# in fsm_contract_ocr.py.
# Top-level keys are strings ("0" only) - presumably page indices; confirm against Retriever.
# Each entry has two dicts keyed by the output field name:
#   'keys'  : list of (anchor label, tuple of regex patterns, 'top1', options dict)
#   'value' : (kind 'text' | 'img', direction 'right' | 'under', options dict, default)
# NOTE(review): the slot meanings above are read off the literals only; the
# consuming Retriever code is not visible here - verify before relying on them.
WEP_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '合同价格(小写)': [('人民币', (r'^人民币¥.?$', ), 'top1', {})],
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.1, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }

}
22 | |||
# Field-extraction layout for the MSI contract; wrapped as Retriever(MSI_FIELD)
# in fsm_contract_ocr.py.
# Entry "0" lists the customer / ID / price text fields, entry "1" the signature
# and signing-date image fields. The string keys are presumably page indices -
# confirm against Retriever.
# 'keys' entries are (anchor label, tuple of regex patterns, 'top1', options dict);
# 'value' entries are (kind 'text' | 'img', direction, options dict, default).
# NOTE(review): slot meanings are read off the literals only - verify in Retriever.
MSI_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '合同价格(小写)': [('人民币', (r'^人民币¥.?$', ), 'top1', {})],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.2, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
        },
    },
    "1": {
        'keys': {
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }
}
49 | |||
# Field-extraction layout for the SC contract; wrapped as Retriever(SC_FIELD)
# in fsm_contract_ocr.py.
# Entry "0" lists the name / ID / total-price text fields, entry "-1" the
# signature and signing-date image fields. "-1" presumably addresses the last
# page (negative index) - confirm against Retriever.
# 'keys' entries are (anchor label, tuple of regex patterns, 'top1', options dict);
# 'value' entries are (kind 'text' | 'img', direction, options dict, default).
# NOTE(review): slot meanings are read off the literals only - verify in Retriever.
SC_FIELD = {
    "0": {
        'keys': {
            '姓名': [('姓名', (r'^姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '总价': [('总价', (r'^总价.?$', ), 'top1', {})],
        },
        'value': {
            '姓名': ('text', 'right', {'offset_tuple': (-2, 8, 0.5, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '总价': ('text', 'right', {'offset_tuple': (-2, 12, 0.5, 0)}, ''),
        },
    },
    "-1": {
        'keys': {
            # NOTE(review): the same pattern is listed twice in this tuple - looks redundant; confirm.
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', r'^客户签名/盖章.*$'), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户签名': ('img', 'under', {'offset_tuple': (1.5, 1, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }
}
src/common/fsm_econtract/fsm_contract_ocr.py
0 → 100644
1 | from .retriever import Retriever | ||
2 | from .const import WEP_FIELD, MSI_FIELD, SC_FIELD | ||
3 | from .tools import pdf_info_rebuild | ||
4 | |||
# One Retriever per FSM contract layout; the position in this list is the
# file_type accepted by predict (0 = WEP, 1 = MSI, 2 = SC).
retriever_list = [Retriever(layout) for layout in (WEP_FIELD, MSI_FIELD, SC_FIELD)]


def predict(pdf_info, file_type=0):
    """Extract the configured target fields from one FSM e-contract.

    Args:
        pdf_info: parsed PDF data, passed unchanged to ``pdf_info_rebuild``.
        file_type: index into ``retriever_list`` selecting the contract layout.

    Returns:
        Whatever ``Retriever.get_target_fields`` returns for the rebuilt
        text and image entries.
    """
    # Select the retriever first so an invalid file_type fails before any rebuild work.
    selected = retriever_list[file_type]
    rebuilt = pdf_info_rebuild(pdf_info)  # (text entries, image entries)
    return selected.get_target_fields(*rebuilt)
11 | |||
12 |
src/common/fsm_econtract/hmh_ocr.py
0 → 100644
src/common/fsm_econtract/retriever.py
0 → 100644
This diff is collapsed.
Click to expand it.
src/common/fsm_econtract/tools.py
0 → 100644
def pdf_info_rebuild(pdf_info, fix_bbox=True):
    """Flatten parsed PDF page data into per-page text and image entries.

    Args:
        pdf_info: mapping of page key -> page dict holding a ``'blocks'`` list.
            Blocks with ``type == 0`` are read as text (``lines`` -> ``spans``,
            each span providing ``bbox``, ``text`` and ``size``); blocks with
            ``type == 1`` are read as images (``bbox``). Other block types
            are ignored.
        fix_bbox: when true, a span whose bbox height is smaller than its
            ``size`` gets its last coordinate increased by ``size``.

    Returns:
        A ``(pdf_text_info, pdf_img_info)`` tuple. ``pdf_text_info`` maps each
        page key to a list of ``[bbox, text]`` entries; ``pdf_img_info`` maps
        each page key to a list of ``(bbox, '有')`` entries. Pages that
        contribute no entries are absent from the respective dict.

    The input is never modified: a corrected bbox is built as a new list, so
    the function also works when ``span['bbox']`` is an immutable tuple.
    """
    pdf_text_info = {}
    pdf_img_info = {}
    for pno_str, page_info in pdf_info.items():
        for block in page_info['blocks']:
            block_type = block['type']
            if block_type == 0:
                # The same text can occur more than once inside a block; keep
                # only its first occurrence (de-duplication is per block).
                seen_texts = set()
                for line in block['lines']:
                    for span in line['spans']:
                        text = span['text'].strip()
                        if not text or text in seen_texts:
                            continue
                        seen_texts.add(text)
                        bbox = span['bbox']
                        # The reported bbox height is unreliable; when it is
                        # smaller than the font size, extend the last coordinate.
                        if fix_bbox and bbox[-1] - bbox[1] < span['size']:
                            # Build a copy instead of assigning into span['bbox'].
                            bbox = list(bbox[:-1]) + [bbox[-1] + span['size']]
                        pdf_text_info.setdefault(pno_str, []).append([bbox, text])
            elif block_type == 1:
                pdf_img_info.setdefault(pno_str, []).append((block['bbox'], '有'))

    return pdf_text_info, pdf_img_info
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or sign in to post a comment