d7847808 by 周伟奇

Merge branch 'feature/fsm-contract' into fix/report_ca

2 parents 30509ad4 784ff18a
...@@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44 ...@@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44
1057 HIL_CONTRACT_3_CN_NAME = '车辆处置协议' 1057 HIL_CONTRACT_3_CN_NAME = '车辆处置协议'
1058 HIL_CONTRACT_3_CLASSIFY = 45 1058 HIL_CONTRACT_3_CLASSIFY = 45
1059 1059
1060 CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY} 1060 FSM_CONTRACT_WEP_CN_NAME = '延长保修合同'
1061 FSM_CONTRACT_WEP_CLASSIFY = 51
1062
1063 FSM_CONTRACT_MSI_CN_NAME = '长悦保养合同'
1064 FSM_CONTRACT_MSI_CLASSIFY = 52
1065
1066 FSM_CONTRACT_SC_CN_NAME = '汽车销售合同'
1067 FSM_CONTRACT_SC_CLASSIFY = 53
1068
1069 CONTRACT_SET = {
1070 CONTRACT_QRS_CLASSIFY,
1071 CONTRACT_CLASSIFY,
1072 HIL_CONTRACT_1_CLASSIFY,
1073 HIL_CONTRACT_2_CLASSIFY,
1074 HIL_CONTRACT_3_CLASSIFY,
1075 FSM_CONTRACT_WEP_CLASSIFY,
1076 FSM_CONTRACT_MSI_CLASSIFY,
1077 FSM_CONTRACT_SC_CLASSIFY,
1078 }
1061 1079
1062 CONTRACT_MAP = { 1080 CONTRACT_MAP = {
1063 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, 1081 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME,
...@@ -1065,6 +1083,10 @@ CONTRACT_MAP = { ...@@ -1065,6 +1083,10 @@ CONTRACT_MAP = {
1065 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, 1083 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME,
1066 CONTRACT_CLASSIFY: CONTRACT_CN_NAME, 1084 CONTRACT_CLASSIFY: CONTRACT_CN_NAME,
1067 CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME, 1085 CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME,
1086 FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_CN_NAME,
1087 FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_CN_NAME,
1088 FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_CN_NAME,
1089
1068 } 1090 }
1069 1091
1070 # 保单 1092 # 保单
...@@ -1214,6 +1236,11 @@ BS_FIELD = 'bss_ocr' ...@@ -1214,6 +1236,11 @@ BS_FIELD = 'bss_ocr'
1214 HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' 1236 HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr'
1215 HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr' 1237 HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr'
1216 HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr' 1238 HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr'
# OCR result field names for the FSM contracts (WEP / MSI / SC). They are the
# values RESULT_MAPPING associates with the FSM classify codes, next to the
# string-valued HIL_CONTRACT_*_FIELD names, so they must be plain strings:
# a trailing comma after the literal would silently turn each into a 1-tuple.
FSM_CONTRACT_WEP_FIELD = 'fsm_wep_ocr'
FSM_CONTRACT_MSI_FIELD = 'fsm_msi_ocr'
FSM_CONTRACT_SC_FIELD = 'fsm_sc_ocr'
1242
1243
1217 BS_CLASSIFY = 10089 1244 BS_CLASSIFY = 10089
1218 1245
1219 RESULT_MAPPING = { 1246 RESULT_MAPPING = {
...@@ -1238,6 +1265,9 @@ RESULT_MAPPING = { ...@@ -1238,6 +1265,9 @@ RESULT_MAPPING = {
1238 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, 1265 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD,
1239 HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD, 1266 HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD,
1240 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD, 1267 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD,
1268 FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_FIELD,
1269 FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_FIELD,
1270 FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_FIELD,
1241 } 1271 }
1242 1272
1243 CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD) 1273 CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD)
...@@ -2313,29 +2343,35 @@ APPLICANT_TYPE_MAP = { ...@@ -2313,29 +2343,35 @@ APPLICANT_TYPE_MAP = {
2313 2343
2314 APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager'] 2344 APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager']
2315 2345
2316 FILE_NAME_PREFIX_MAP = { 2346 # FILE_NAME_PREFIX_MAP = {
2317 AFC_PREFIX: [ 2347 # AFC_PREFIX: [
2318 ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'), 2348 # ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'),
2319 ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), 2349 # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
2320 ], 2350 # ],
2321 HIL_PREFIX: [ 2351 # HIL_PREFIX: [
2322 ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'), 2352 # ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'),
2323 ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'), 2353 # ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'),
2324 ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'), 2354 # ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
2325 ] 2355 # ]
2326 } 2356 # }
2327 2357
2328 ECONTRACT_KEYWORDS_MAP = { 2358 ECONTRACT_KEYWORDS_MAP = {
2329 AFC_PREFIX: [ 2359 AFC_PREFIX: [
2330 ('抵押贷款合同', CONTRACT_CLASSIFY), 2360 ('抵押贷款合同', CONTRACT_CLASSIFY),
2331 ('送达地址确认书', CONTRACT_QRS_CLASSIFY), 2361 ('送达地址确认书', CONTRACT_QRS_CLASSIFY),
2332 # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), 2362 ('抵押登记豁免函', HMH_CLASSIFY),
2363 ('延长保修', FSM_CONTRACT_WEP_CLASSIFY),
2364 ('长悦保养', FSM_CONTRACT_MSI_CLASSIFY),
2365 ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
2333 ], 2366 ],
2334 HIL_PREFIX: [ 2367 HIL_PREFIX: [
2335 ('售后回租合同', HIL_CONTRACT_1_CLASSIFY), 2368 ('售后回租合同', HIL_CONTRACT_1_CLASSIFY),
2336 ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY), 2369 ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY),
2337 ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY), 2370 ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY),
2338 # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), 2371 ('抵押登记豁免函', HMH_CLASSIFY),
2372 ('延长保修', FSM_CONTRACT_WEP_CLASSIFY),
2373 ('长悦保养', FSM_CONTRACT_MSI_CLASSIFY),
2374 ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
2339 ] 2375 ]
2340 } 2376 }
2341 2377
...@@ -2345,6 +2381,12 @@ HIL_CONTRACT_TYPE_MAP = { ...@@ -2345,6 +2381,12 @@ HIL_CONTRACT_TYPE_MAP = {
2345 str(HIL_CONTRACT_3_CLASSIFY): 1, 2381 str(HIL_CONTRACT_3_CLASSIFY): 1,
2346 } 2382 }
2347 2383
2384 FSM_CONTRACT_TYPE_MAP = {
2385 str(FSM_CONTRACT_WEP_CLASSIFY): 0,
2386 str(FSM_CONTRACT_MSI_CLASSIFY): 1,
2387 str(FSM_CONTRACT_SC_CLASSIFY): 2,
2388 }
2389
2348 RESULT_MAP = { 2390 RESULT_MAP = {
2349 0: None, 2391 0: None,
2350 1: True, 2392 1: True,
......
...@@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g ...@@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g
20 from common.tools.pdf_to_img import PDFHandler 20 from common.tools.pdf_to_img import PDFHandler
21 from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict 21 from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict
22 from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict 22 from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict
23 from common.fsm_econtract.fsm_contract_ocr import predict as fsm_predict
24 from common.fsm_econtract.hmh_ocr import predict as hmh_predict
23 from apps.doc import consts 25 from apps.doc import consts
24 # from apps.doc.ocr.edms import EDMS, rh 26 # from apps.doc.ocr.edms import EDMS, rh
25 from apps.doc.ocr.ecm import ECM, rh 27 from apps.doc.ocr.ecm import ECM, rh
...@@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin):
996 res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( 998 res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get(
997 consts.ALL_POSITION_KEY, {}).get(key1, []) 999 consts.ALL_POSITION_KEY, {}).get(key1, [])
998 license_summary[classify] = [res] 1000 license_summary[classify] = [res]
999 else: 1001 elif classify in consts.SE_HIL_CON_MAP: # TODO FSM新合同写入数据库用于比对
1000 res = {} 1002 res = {}
1001 for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): 1003 for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items():
1002 if pno1 is None: 1004 if pno1 is None:
...@@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin):
1442 self.log_base, traceback.format_exc())) 1444 self.log_base, traceback.format_exc()))
1443 error_list.append(1) 1445 error_list.append(1)
1444 return 1446 return
1445 else: # e-contract 1447 else: # e-contract or e-fsm-contract or e-hmh
1446 try: 1448 try:
1447 # pdf下载 处理 图片存储 识别 1449 # pdf下载 处理 图片存储 识别
1448 for times in range(consts.RETRY_TIMES): 1450 for times in range(consts.RETRY_TIMES):
...@@ -1472,6 +1474,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1472,6 +1474,7 @@ class Command(BaseCommand, LoggerMixin):
1472 self.online_log.error('{0} [process error (db save)] [error={1}]'.format( 1474 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1473 self.log_base, traceback.format_exc())) 1475 self.log_base, traceback.format_exc()))
1474 1476
1477 # AFC合同
1475 if classify_1_str == str(consts.CONTRACT_CLASSIFY): 1478 if classify_1_str == str(consts.CONTRACT_CLASSIFY):
1476 ocr_result = afc_predict(pdf_handler.pdf_info) 1479 ocr_result = afc_predict(pdf_handler.pdf_info)
1477 page_res = {} 1480 page_res = {}
...@@ -1483,6 +1486,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1483,6 +1486,7 @@ class Command(BaseCommand, LoggerMixin):
1483 'page_num': page_num, 1486 'page_num': page_num,
1484 'page_info': page_info 1487 'page_info': page_info
1485 } 1488 }
1489 # 送达地址确认书
1486 elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY): 1490 elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY):
1487 ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True) 1491 ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True)
1488 page_num = 'page_1' 1492 page_num = 'page_1'
...@@ -1493,7 +1497,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1493,7 +1497,8 @@ class Command(BaseCommand, LoggerMixin):
1493 'page_info': ocr_result.pop(page_num, {}) 1497 'page_info': ocr_result.pop(page_num, {})
1494 } 1498 }
1495 } 1499 }
1496 else: 1500 # HIL合同
1501 elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP:
1497 file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) 1502 file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str)
1498 ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) 1503 ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1)
1499 rebuild_res_1 = {} 1504 rebuild_res_1 = {}
...@@ -1508,28 +1513,54 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1508,28 +1513,54 @@ class Command(BaseCommand, LoggerMixin):
1508 'page_num': page_num, 1513 'page_num': page_num,
1509 'page_info': page_info 1514 'page_info': page_info
1510 } 1515 }
1516 # FSM合同 WEP MSI SC
1517 elif classify_1_str in consts.FSM_CONTRACT_TYPE_MAP:
1518 file_type = consts.FSM_CONTRACT_TYPE_MAP.get(classify_1_str)
1519 ocr_result = fsm_predict(pdf_handler.pdf_info, file_type)
1520 for page_num, page_info in ocr_result.items():
1521 if isinstance(page_num, str) and page_num.startswith('page_'):
1522 page_res[page_num] = {
1523 'classify': int(classify_1_str),
1524 'page_num': page_num,
1525 'page_info': page_info
1526 }
1527 # hmh
1528 else:
1529 pass
1530
1511 1531
1512 contract_res = {} 1532 contract_res = {}
1513 for img_path_tmp, page_key in pdf_handler.img_path_pno_list: 1533 for img_path_tmp, page_key in pdf_handler.img_path_pno_list:
1514 if page_key in page_res: 1534 if classify_1_str == str(consts.HMH_CLASSIFY):
1515 img_contract_res = { 1535 img_contract_res = {
1516 'code': 1, 1536 'code': 1,
1517 'data': [ 1537 'data': [
1518 { 1538 {
1519 'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY), 1539 'classify': consts.HMH_CLASSIFY,
1520 'data': page_res[page_key] 1540 'data': hmh_predict(pdf_handler.pdf_info)
1521 } 1541 }
1522 ] 1542 ]
1523 } 1543 }
1524 else: 1544 else:
1525 img_contract_res = { 1545 if page_key in page_res:
1526 'code': 1, 1546 img_contract_res = {
1527 'data': [ 1547 'code': 1,
1528 { 1548 'data': [
1529 'classify': int(classify_1_str), 1549 {
1530 } 1550 'classify': page_res[page_key].pop('classify', consts.OTHER_CLASSIFY),
1531 ] 1551 'data': page_res[page_key]
1532 } 1552 }
1553 ]
1554 }
1555 else:
1556 img_contract_res = {
1557 'code': 1,
1558 'data': [
1559 {
1560 'classify': int(classify_1_str),
1561 }
1562 ]
1563 }
1533 contract_res[img_path_tmp] = img_contract_res 1564 contract_res[img_path_tmp] = img_contract_res
1534 1565
1535 with lock: 1566 with lock:
......
# Field-extraction layout for the WEP contract, consumed by Retriever.
#
# Top-level keys are zero-based page indexes written as strings. Each page has:
#   'keys':  field -> list of (key_text, regex_tuple, key_direction, kwargs).
#            A text span matching any regex in regex_tuple becomes a candidate
#            anchor; key_direction selects the Retriever.key_<direction> method
#            that picks one anchor among the candidates.
#   'value': field -> (source, value_direction, kwargs, default_words).
#            source 'text' searches the page's text spans, any other source
#            searches the page's image blocks; value_direction selects the
#            Retriever.value_<direction> method; default_words is reported when
#            no anchor or no value is found.
# 'offset_tuple' is (xmin, xmax, ymin, ymax): multipliers of the anchor's
# width/height that grow (or, when negative, shrink) the search window.
WEP_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '合同价格(小写)': [('人民币', (r'^人民币¥.?$', ), 'top1', {})],
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.1, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
            # Signature and date are detected as image blocks, not text.
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }

}
22
# Field-extraction layout for the MSI contract, consumed by Retriever.
#
# Top-level keys are zero-based page indexes written as strings: page "0"
# carries the customer/price text fields, page "1" the signature and date.
# Per page:
#   'keys':  field -> list of (key_text, regex_tuple, key_direction, kwargs);
#            spans matching any regex are candidate anchors for the field.
#   'value': field -> (source, value_direction, kwargs, default_words);
#            source 'text' searches text spans, any other source searches
#            image blocks; default_words is reported when nothing is found.
# 'offset_tuple' is (xmin, xmax, ymin, ymax): multipliers of the anchor's
# width/height that size the search window around the anchor.
MSI_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '合同价格(小写)': [('人民币', (r'^人民币¥.?$', ), 'top1', {})],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.2, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
        },
    },
    "1": {
        'keys': {
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            # Signature and date are detected as image blocks, not text.
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }
}
49
# Field-extraction layout for the SC contract, consumed by Retriever.
#
# Top-level keys are zero-based page indexes written as strings; the special
# key "-1" is resolved by Retriever to the last page of the document.
# Per page:
#   'keys':  field -> list of (key_text, regex_tuple, key_direction, kwargs);
#            spans matching any regex are candidate anchors for the field.
#   'value': field -> (source, value_direction, kwargs, default_words);
#            source 'text' searches text spans, any other source searches
#            image blocks; default_words is reported when nothing is found.
# 'offset_tuple' is (xmin, xmax, ymin, ymax): multipliers of the anchor's
# width/height that size the search window around the anchor.
SC_FIELD = {
    "0": {
        'keys': {
            '姓名': [('姓名', (r'^姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '总价': [('总价', (r'^总价.?$', ), 'top1', {})],
        },
        'value': {
            '姓名': ('text', 'right', {'offset_tuple': (-2, 8, 0.5, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '总价': ('text', 'right', {'offset_tuple': (-2, 12, 0.5, 0)}, ''),
        },
    },
    "-1": {
        'keys': {
            # The pattern is listed once: repeating the identical regex only
            # registered every matching span twice as an anchor candidate.
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            # Signature and date are detected as image blocks, not text.
            '客户签名': ('img', 'under', {'offset_tuple': (1.5, 1, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }
}
1 from .retriever import Retriever
2 from .const import WEP_FIELD, MSI_FIELD, SC_FIELD
3 from .tools import pdf_info_rebuild
4
# One Retriever per FSM contract layout; the list index is the file_type
# accepted by predict() (0 = WEP, 1 = MSI, 2 = SC).
retriever_list = [Retriever(layout) for layout in (WEP_FIELD, MSI_FIELD, SC_FIELD)]


def predict(pdf_info, file_type=0):
    """Extract the configured fields of an FSM contract.

    :param pdf_info: per-page PDF structure accepted by pdf_info_rebuild.
    :param file_type: index into retriever_list selecting the contract layout.
    :return: whatever Retriever.get_target_fields returns for that layout.
    """
    # Resolve the retriever first so an invalid file_type fails before any
    # work is done on pdf_info.
    selected = retriever_list[file_type]
    text_by_page, images_by_page = pdf_info_rebuild(pdf_info)
    return selected.get_target_fields(text_by_page, images_by_page)
11
12
1 from .retriever import HMHRetriever
2 from .tools import pdf_info_rebuild
3
# Module-level retriever shared by every predict() call.
hmh_retriever = HMHRetriever()


def predict(pdf_info):
    """Extract the HMH fields from a PDF.

    :param pdf_info: per-page PDF structure accepted by pdf_info_rebuild.
    :return: the result of HMHRetriever.get_target_fields.
    """
    # Only the text spans are needed; bounding boxes are kept unadjusted.
    text_by_page = pdf_info_rebuild(pdf_info, fix_bbox=False)[0]
    return hmh_retriever.get_target_fields(text_by_page)
9
10
1 import re
2
3
class HMHRetriever:
    """Regex-based extractor for the HMH document.

    Only the spans of page "0" are inspected. Every field listed in
    ``search_fields_list`` is always present in the output; fields that were
    not found carry their default words and ``default_position``.
    """

    def __init__(self):
        # Names of the two keys of every per-field result entry.
        self.words_str = 'words'
        self.position_str = 'location'
        # Position reported for fields that could not be located.
        self.default_position = [0, 0, 0, 0]
        # (field name, default words), in the order the code indexes them.
        self.search_fields_list = [
            ('借款/承租人姓名', ''),
            ('证件号码', ''),
            ('渠道', ''),
            ('合同编号', ''),
            ('借款人签字/盖章', '无'),
        ]

    def _entry(self, words, location):
        """Build one per-field result entry."""
        return {self.words_str: words, self.position_str: location}

    @staticmethod
    def _clean(raw):
        """Drop ideographic spaces and surrounding whitespace."""
        return raw.replace('\u3000', '').strip()

    def get_target_fields(self, pdf_text_list):
        """Scan the first page's spans and collect the HMH fields.

        :param pdf_text_list: dict mapping page-index strings to lists of
            (bbox, text) pairs. The entry for page "0" is popped, i.e. removed
            from the caller's dict.
        :return: ``{"words_result": {field: {words, location}}}``.
        """
        fields = self.search_fields_list
        # Tried in this order; the first pattern that matches wins.
        party_patterns = (
            r'承租人\(姓名(.*)证件号码(.*)与(.*公司)',
            r'借款人\(姓名(.*)证件号码(.*)与(.*公司)',
        )
        found = dict()
        party_done = contract_done = signature_done = False

        for bbox, text in pdf_text_list.pop(str(0), []):
            # Name / ID number / company all come from a single sentence.
            for pattern in party_patterns:
                if party_done:
                    break
                hit = next(
                    (groups for groups in re.findall(pattern, text) if len(groups) == 3),
                    None,
                )
                if hit is None:
                    continue
                name, id_number, company = hit
                found[fields[0][0]] = self._entry(self._clean(name), bbox)
                found[fields[1][0]] = self._entry(self._clean(id_number), bbox)
                found[fields[2][0]] = self._entry(company, bbox)
                party_done = True

            # Contract number: accepted only when the span yields exactly one.
            if not contract_done:
                numbers = re.findall(r'合同编号.*(CH-B\d*-\d*).*', text)
                if len(numbers) == 1:
                    found[fields[3][0]] = self._entry(numbers[0], bbox)
                    contract_done = True

            # Signature: the text before the signing date plus the date itself.
            if not signature_done:
                pairs = re.findall(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})', text)
                pair = next((item for item in pairs if len(item) == 2), None)
                if pair is not None:
                    words = '{0} {1}'.format(self._clean(pair[0]), pair[1])
                    found[fields[4][0]] = self._entry(words, bbox)
                    signature_done = True

        # Back-fill everything that was not located.
        for key, default_words in fields:
            if key in found:
                continue
            found[key] = self._entry(default_words, self.default_position)

        return {"words_result": found}
92
class Retriever:
    """Layout-driven field extractor for text-based contract PDFs.

    ``target_fields`` maps a page index (string; ``'-1'`` means the last page)
    to a dict with two entries:

    * ``'keys'``: field -> list of ``(key_text, regex_tuple, direction, kwargs)``
      used to locate the anchor span of the field;
    * ``'value'``: field -> ``(source, direction, kwargs, default_words)`` used
      to locate the value relative to the anchor.

    ``direction`` is dispatched to the ``key_<direction>`` /
    ``value_<direction>`` methods of this class.
    """

    def __init__(self, target_fields):
        self.keys_str = 'keys'
        self.value_str = 'value'
        self.text_str = 'text'
        self.words_str = 'words'
        self.position_str = 'position'
        # Position reported for fields that could not be located.
        self.default_position = [-1, -1, -1, -1]
        self.target_fields = target_fields
        # Per value_type character substitutions applied to found values.
        self.replace_map = {
            'int': {
                '(': '0'
            }
        }

    @staticmethod
    def key_top1(coordinates_list, key_coordinates):
        """Return the top-most candidate (smallest y0); first one wins on ties.

        ``key_coordinates`` is unused; it is accepted so every ``key_*`` method
        shares one call signature. The candidate list is not reordered.
        """
        return min(coordinates_list, key=lambda box: box[1])

    @staticmethod
    def _is_inside(box, target_bbox, rigorous):
        """Tell whether ``box`` lies in ``target_bbox``.

        With ``rigorous`` the whole box must be strictly inside the window;
        otherwise only the centre of the box has to be.
        """
        x0, y0, x1, y1 = box
        x_min, y_min, x_max, y_max = target_bbox
        if rigorous:
            return x_min < x0 and x1 < x_max and y_min < y0 and y1 < y_max
        cent_x = x0 + ((x1 - x0) / 2)
        cent_y = y0 + ((y1 - y0) / 2)
        return x_min < cent_x < x_max and y_min < cent_y < y_max

    def _apply_replace_map(self, value, value_type):
        """Apply the character substitutions registered for ``value_type``."""
        if isinstance(value_type, str) and value_type in self.replace_map and isinstance(value, str):
            return value.translate(str.maketrans(self.replace_map.get(value_type, {})))
        return value

    def key_right(self, coordinates_list, key_coordinates, offset_tuple, rigorous=False):
        """Pick the left-most candidate inside the window right of the previous key.

        Falls back to :meth:`key_top1` when there is no previous key or no
        candidate lies inside the window. A single candidate is returned as is.
        """
        if len(coordinates_list) == 1:
            return coordinates_list[0]

        # Without the coordinates of the previous key there is no window.
        if key_coordinates is None:
            return self.key_top1(coordinates_list, key_coordinates)

        target_bbox = self.get_target_bbox(key_coordinates, offset_tuple)

        find_key_coordinates = None
        for x0, y0, x1, y1 in coordinates_list:
            if not self._is_inside((x0, y0, x1, y1), target_bbox, rigorous):
                continue
            if find_key_coordinates is None or x0 < find_key_coordinates[0]:
                find_key_coordinates = (x0, y0, x1, y1)

        if find_key_coordinates is None:
            return self.key_top1(coordinates_list, key_coordinates)
        return find_key_coordinates

    def value_right(self, search_list, key_coordinates, offset_tuple, value_type=None, rigorous=False):
        """Return ``(value, coordinates)`` of the left-most non-blank entry in the window.

        ``(None, None)`` is returned when nothing qualifies.
        """
        target_bbox = self.get_target_bbox(key_coordinates, offset_tuple)

        x_min_find, value, coordinates = None, None, None
        for (x0, y0, x1, y1), text in search_list:
            if not self._is_inside((x0, y0, x1, y1), target_bbox, rigorous):
                continue
            if x_min_find is None or x0 < x_min_find:
                # Blank entries never replace the current best candidate.
                if len(text.strip()) > 0:
                    x_min_find = x0
                    value = text
                    coordinates = (x0, y0, x1, y1)

        return self._apply_replace_map(value, value_type), coordinates

    def value_under(self, search_list, key_coordinates, offset_tuple, value_type=None, append=False, rigorous=False):
        """Return ``(value, coordinates)`` of the entries found in the window below the key.

        Entries are ordered top-to-bottom, then left-to-right. The coordinates
        are those of the first entry; the value is the first entry's text, or
        the concatenation of all texts when ``append`` is true.
        ``(None, None)`` is returned when nothing qualifies.
        """
        target_bbox = self.get_target_bbox(key_coordinates, offset_tuple)

        find_list = []
        for (x0, y0, x1, y1), text in search_list:
            if self._is_inside((x0, y0, x1, y1), target_bbox, rigorous) and len(text.strip()) > 0:
                find_list.append((x0, y0, x1, y1, text))

        if len(find_list) == 0:
            return None, None

        find_list.sort(key=lambda x: (x[1], x[0]))
        coordinates = find_list[0][:-1]
        if append:
            value = ''.join([text for _, _, _, _, text in find_list])
        else:
            value = find_list[0][-1]

        return self._apply_replace_map(value, value_type), coordinates

    @staticmethod
    def get_target_bbox(key_coordinates, offset_tuple):
        """Compute the search window around an anchor box.

        ``offset_tuple`` is ``(xmin, xmax, ymin, ymax)``; each entry is a
        multiple of the anchor's width (x) or height (y). The min offsets are
        subtracted from the anchor's top-left corner and the max offsets are
        added to its bottom-right corner, so a negative min offset moves that
        edge inwards.
        """
        offset_xmin, offset_xmax, offset_ymin, offset_ymax = offset_tuple

        width = key_coordinates[2] - key_coordinates[0]
        height = key_coordinates[-1] - key_coordinates[1]

        x_min = key_coordinates[0] - (width * offset_xmin)
        x_max = key_coordinates[2] + (width * offset_xmax)
        y_min = key_coordinates[1] - (height * offset_ymin)
        y_max = key_coordinates[-1] + (height * offset_ymax)
        return x_min, y_min, x_max, y_max

    @staticmethod
    def _last_page(pdf_text_list, pdf_img_list):
        """Resolve the ``'-1'`` page key to a concrete page-index string.

        The last page that has text is used. When the document has no text at
        all (e.g. only image blocks) the last page with images is used, and
        page ``'0'`` when there are no pages, so that default values are still
        produced instead of raising on an empty ``max()``.
        """
        page_numbers = [int(pno) for pno in pdf_text_list] or [int(pno) for pno in pdf_img_list]
        return str(max(page_numbers)) if page_numbers else '0'

    def get_target_fields(self, pdf_text_list, pdf_img_list):
        """Extract every configured field.

        :param pdf_text_list: page-index string -> list of ``(bbox, text)``.
        :param pdf_img_list: page-index string -> list of ``(bbox, marker)``.
        :return: ``{'page_<n>': {field: {words, position}}}`` with 1-based
            ``n``. Fields that cannot be located carry their default words and
            ``default_position``. When several layout entries resolve to the
            same page their fields are merged into one page dict (a later
            entry wins only for identically named fields).
        """
        pdf_result = dict()

        for pno_str, fields_dict in self.target_fields.items():
            if pno_str == '-1':
                pno_str = self._last_page(pdf_text_list, pdf_img_list)
            page_text_list = pdf_text_list.get(pno_str, [])

            # Step 1: collect every span matching one of the key patterns.
            key_text_info = dict()
            for key_text_list in fields_dict[self.keys_str].values():
                for key_text, key_re_tuple, _, _ in key_text_list:
                    for (x0, y0, x1, y1), text in page_text_list:
                        for key_re in key_re_tuple:
                            if re.match(key_re, text):
                                key_text_info.setdefault(key_text, list()).append((x0, y0, x1, y1))

            # Step 2: walk each field's key chain to settle on one anchor box.
            key_coordinates_info = dict()
            for field, key_text_list in fields_dict[self.keys_str].items():
                last_key_coordinates = None
                for key_text, _, direction, kwargs in key_text_list:
                    if key_text not in key_text_info:
                        last_key_coordinates = None
                        continue
                    last_key_coordinates = getattr(self, 'key_{0}'.format(direction))(
                        key_text_info[key_text],
                        last_key_coordinates,
                        **kwargs)

                key_coordinates_info[field] = last_key_coordinates

            # Step 3: look for each field's value relative to its anchor.
            page_result = dict()
            for field, (source, direction, kwargs, default_value) in fields_dict[self.value_str].items():
                value, coordinates = None, None
                if isinstance(key_coordinates_info.get(field), tuple):
                    value, coordinates = getattr(self, 'value_{0}'.format(direction))(
                        page_text_list if source == self.text_str else pdf_img_list.get(pno_str, []),
                        key_coordinates_info[field],
                        **kwargs
                    )
                if isinstance(value, str):
                    page_result[field] = {
                        self.words_str: value,
                        self.position_str: list(coordinates),
                    }
                else:
                    page_result[field] = {
                        self.words_str: default_value,
                        self.position_str: self.default_position,
                    }

            # Merge rather than assign: '-1' may resolve to a page that another
            # entry already filled (single-page documents).
            pdf_result.setdefault('page_{0}'.format(int(pno_str) + 1), dict()).update(page_result)

        return pdf_result
def pdf_info_rebuild(pdf_info, fix_bbox=True):
    """Flatten a per-page PDF block structure into text spans and image boxes.

    :param pdf_info: dict mapping a page key to a dict with a ``'blocks'``
        list. Blocks of ``type`` 0 carry ``lines`` -> ``spans`` with ``bbox``,
        ``text`` and ``size``; blocks of ``type`` 1 carry a ``bbox``.
    :param fix_bbox: when true, a span whose bbox height is smaller than its
        ``size`` gets its bottom edge extended by ``size``.
    :return: ``(pdf_text_info, pdf_img_info)``. ``pdf_text_info`` maps a page
        key to a list of ``[bbox, text]``; ``pdf_img_info`` maps a page key to
        a list of ``(bbox, '有')``. Pages without entries are absent.

    The input is never modified: an adjusted bbox is returned as a new list,
    which also makes the adjustment work when ``bbox`` is an immutable tuple.
    """
    pdf_text_info = dict()
    pdf_img_info = dict()
    for pno_str, page_info in pdf_info.items():
        for block in page_info['blocks']:
            if block['type'] == 0:
                # The same text can be repeated within a block; keep only the
                # first occurrence per block.
                seen_texts = set()
                for line in block['lines']:
                    for span in line['spans']:
                        bbox, text = span['bbox'], span['text'].strip()
                        if not text or text in seen_texts:
                            continue
                        seen_texts.add(text)
                        # The reported bbox height is unreliable: extend the
                        # bottom edge on a copy, leaving the caller's data
                        # untouched.
                        if fix_bbox and bbox[-1] - bbox[1] < span['size']:
                            bbox = [*bbox[:-1], bbox[-1] + span['size']]
                        pdf_text_info.setdefault(pno_str, list()).append([bbox, text])
            elif block['type'] == 1:
                pdf_img_info.setdefault(pno_str, list()).append((block['bbox'], '有'))

    return pdf_text_info, pdf_img_info
...\ No newline at end of file ...\ No newline at end of file
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!