573f28d7 by 周伟奇

add qrs ocr

1 parent 3690e26d
...@@ -1042,6 +1042,10 @@ JYPZ_FIELD_ORDER = (("type", "标题"), ...@@ -1042,6 +1042,10 @@ JYPZ_FIELD_ORDER = (("type", "标题"),
1042 CONTRACT_CN_NAME = '合同' 1042 CONTRACT_CN_NAME = '合同'
1043 CONTRACT_CLASSIFY = 41 1043 CONTRACT_CLASSIFY = 41
1044 1044
1045 # 合同-送达地址确认书
1046 CONTRACT_QRS_CN_NAME = '送达地址确认书'
1047 CONTRACT_QRS_CLASSIFY = 49
1048
1045 # 合同编号: 每页 1049 # 合同编号: 每页
1046 1050
1047 HIL_CONTRACT_1_CN_NAME = '售后回租合同' 1051 HIL_CONTRACT_1_CN_NAME = '售后回租合同'
...@@ -1053,13 +1057,14 @@ HIL_CONTRACT_2_CLASSIFY = 44 ...@@ -1053,13 +1057,14 @@ HIL_CONTRACT_2_CLASSIFY = 44
1053 HIL_CONTRACT_3_CN_NAME = '车辆处置协议' 1057 HIL_CONTRACT_3_CN_NAME = '车辆处置协议'
1054 HIL_CONTRACT_3_CLASSIFY = 45 1058 HIL_CONTRACT_3_CLASSIFY = 45
1055 1059
1056 CONTRACT_SET = {CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY} 1060 CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY}
1057 1061
1058 CONTRACT_MAP = { 1062 CONTRACT_MAP = {
1059 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, 1063 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME,
1060 HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_CN_NAME, 1064 HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_CN_NAME,
1061 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, 1065 HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME,
1062 CONTRACT_CLASSIFY: CONTRACT_CN_NAME, 1066 CONTRACT_CLASSIFY: CONTRACT_CN_NAME,
1067 CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME,
1063 } 1068 }
1064 1069
1065 # 保单 1070 # 保单
...@@ -1203,6 +1208,7 @@ DDA_OCR_FIELD = 'bs_ocr' ...@@ -1203,6 +1208,7 @@ DDA_OCR_FIELD = 'bs_ocr'
1203 HMH_OCR_FIELD = 'hmh_ocr' 1208 HMH_OCR_FIELD = 'hmh_ocr'
1204 JYPZ_OCR_FIELD = 'jypz_ocr' 1209 JYPZ_OCR_FIELD = 'jypz_ocr'
1205 HT_FIELD = 'ht_ocr' 1210 HT_FIELD = 'ht_ocr'
1211 QRS_FIELD = 'qrs_ocr'
1206 BD_FIELD = 'bd_ocr' 1212 BD_FIELD = 'bd_ocr'
1207 BS_FIELD = 'bss_ocr' 1213 BS_FIELD = 'bss_ocr'
1208 HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' 1214 HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr'
...@@ -1226,6 +1232,7 @@ RESULT_MAPPING = { ...@@ -1226,6 +1232,7 @@ RESULT_MAPPING = {
1226 HMH_CLASSIFY: HMH_OCR_FIELD, 1232 HMH_CLASSIFY: HMH_OCR_FIELD,
1227 JYPZ_CLASSIFY: JYPZ_OCR_FIELD, 1233 JYPZ_CLASSIFY: JYPZ_OCR_FIELD,
1228 CONTRACT_CLASSIFY: HT_FIELD, 1234 CONTRACT_CLASSIFY: HT_FIELD,
1235 CONTRACT_QRS_CLASSIFY: QRS_FIELD,
1229 INSURANCE_CLASSIFY: BD_FIELD, 1236 INSURANCE_CLASSIFY: BD_FIELD,
1230 BS_CLASSIFY: BS_FIELD, 1237 BS_CLASSIFY: BS_FIELD,
1231 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, 1238 HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD,
...@@ -1479,6 +1486,10 @@ AFC_CON_FIELD_ORDER_LTGT = ( ...@@ -1479,6 +1486,10 @@ AFC_CON_FIELD_ORDER_LTGT = (
1479 ('标准利率', '标准利率'), 1486 ('标准利率', '标准利率'),
1480 ) 1487 )
1481 1488
1489 SE_AFC_CON_QRS_MAP = {
1490 '合同编号': (1, '合同编号'),
1491 }
1492
1482 SE_AFC_CON_MAP = { 1493 SE_AFC_CON_MAP = {
1483 '合同编号-每页': (None, None, '合同编号', None), 1494 '合同编号-每页': (None, None, '合同编号', None),
1484 '所购车辆价格-小写-重要条款': (1, 1, '所购车辆价格', None), 1495 '所购车辆价格-小写-重要条款': (1, 1, '所购车辆价格', None),
...@@ -2308,6 +2319,7 @@ FILE_NAME_PREFIX_MAP = { ...@@ -2308,6 +2319,7 @@ FILE_NAME_PREFIX_MAP = {
2308 ECONTRACT_KEYWORDS_MAP = { 2319 ECONTRACT_KEYWORDS_MAP = {
2309 AFC_PREFIX: [ 2320 AFC_PREFIX: [
2310 ('抵押贷款合同', CONTRACT_CLASSIFY), 2321 ('抵押贷款合同', CONTRACT_CLASSIFY),
2322 ('送达地址确认书', CONTRACT_QRS_CLASSIFY),
2311 # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), 2323 # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0),
2312 ], 2324 ],
2313 HIL_PREFIX: [ 2325 HIL_PREFIX: [
......
...@@ -987,6 +987,15 @@ class Command(BaseCommand, LoggerMixin): ...@@ -987,6 +987,15 @@ class Command(BaseCommand, LoggerMixin):
987 # } 987 # }
988 # } 988 # }
989 license_summary[classify] = [res] 989 license_summary[classify] = [res]
990 elif classify == consts.CONTRACT_QRS_CLASSIFY:
991 res = {}
992 for key, (pno, key1) in consts.SE_AFC_CON_QRS_MAP.items():
993 res[key] = page_info_dict.get(str(pno), {}).get(key1, '')
994 res.setdefault(consts.IMG_PATH_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get(
995 consts.IMG_PATH_KEY, '')
996 res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get(
997 consts.ALL_POSITION_KEY, {}).get(key1, [])
998 license_summary[classify] = [res]
990 else: 999 else:
991 res = {} 1000 res = {}
992 for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): 1001 for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items():
...@@ -1474,6 +1483,16 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1474,6 +1483,16 @@ class Command(BaseCommand, LoggerMixin):
1474 'page_num': page_num, 1483 'page_num': page_num,
1475 'page_info': page_info 1484 'page_info': page_info
1476 } 1485 }
1486 elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY):
1487 ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True)
1488 page_num = 'page_1'
1489 page_res = {
1490 page_num: {
1491 'classify': int(classify_1_str),
1492 'page_num': page_num,
1493 'page_info': ocr_result.pop(page_num, {})
1494 }
1495 }
1477 else: 1496 else:
1478 file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) 1497 file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str)
1479 ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) 1498 ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1)
......
...@@ -328,6 +328,7 @@ class AFCOCRResult(models.Model): ...@@ -328,6 +328,7 @@ class AFCOCRResult(models.Model):
328 hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") 328 hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1")
329 hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") 329 hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2")
330 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") 330 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3")
331 qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书")
331 332
332 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') 333 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
333 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') 334 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
...@@ -363,6 +364,7 @@ class HILOCRResult(models.Model): ...@@ -363,6 +364,7 @@ class HILOCRResult(models.Model):
363 hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") 364 hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1")
364 hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") 365 hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2")
365 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") 366 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3")
367 qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书")
366 368
367 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') 369 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
368 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') 370 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
...@@ -397,6 +399,7 @@ class AFCSEOCRResult(models.Model): ...@@ -397,6 +399,7 @@ class AFCSEOCRResult(models.Model):
397 hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") 399 hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1")
398 hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") 400 hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2")
399 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") 401 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3")
402 qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书")
400 403
401 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') 404 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
402 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') 405 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
...@@ -432,6 +435,7 @@ class HILSEOCRResult(models.Model): ...@@ -432,6 +435,7 @@ class HILSEOCRResult(models.Model):
432 hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") 435 hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1")
433 hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") 436 hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2")
434 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") 437 hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3")
438 qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书")
435 439
436 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') 440 update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
437 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') 441 create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
......
...@@ -9,7 +9,21 @@ from .get_char import Finder ...@@ -9,7 +9,21 @@ from .get_char import Finder
9 import numpy as np 9 import numpy as np
10 10
11 11
def extract_info(ocr_results, *, page_key='0', prefix='CH-B'):
    """Find the contract number among the OCR entries of a single page.

    The entries stored under ``page_key`` are scanned in iteration order and
    the first one whose text starts with ``prefix`` is taken as the contract
    number.

    Args:
        ocr_results: mapping of page key -> mapping of entry index ->
            ``(bbox, text)`` pair, as built by ``predict``.
        page_key: key of the page to scan. Defaults to ``'0'``, the value
            that was previously hard-coded.
        prefix: leading characters that identify a contract number.
            Defaults to ``'CH-B'``, the value that was previously hard-coded.

    Returns:
        ``{'page_1': {'合同编号': {'words': ..., 'position': ...}}}``.
        Both ``words`` and ``position`` are ``None`` when no entry matches
        or the page is absent.
    """
    contract_no = {
        "words": None,
        "position": None
    }
    for bbox, text in ocr_results.get(page_key, {}).values():
        # OCR entries with missing / non-string text are skipped rather than
        # letting ``startswith`` raise AttributeError on them.
        if isinstance(text, str) and text.startswith(prefix):
            contract_no['words'] = text
            # Keeps elements 0, 1, 2 and the last element of the box.
            # NOTE(review): the bbox layout is not visible here — confirm
            # these four values are the ones downstream consumers expect.
            contract_no['position'] = [bbox[0], bbox[1], bbox[2], bbox[-1]]
            break

    # The result is always reported under 'page_1', whichever page was scanned.
    return {'page_1': {'合同编号': contract_no}}
24
25
26 def predict(pdf_info, is_qrs=False):
13 ocr_results = {} 27 ocr_results = {}
14 for pno in pdf_info: 28 for pno in pdf_info:
15 ocr_results[pno] = {} 29 ocr_results[pno] = {}
...@@ -32,9 +46,12 @@ def predict(pdf_info): ...@@ -32,9 +46,12 @@ def predict(pdf_info):
32 keys = list(range(len(ocr_result))) 46 keys = list(range(len(ocr_result)))
33 ocr_result = dict(zip(keys, ocr_result)) 47 ocr_result = dict(zip(keys, ocr_result))
34 ocr_results[pno] = ocr_result 48 ocr_results[pno] = ocr_result
35 # 输入是整个 PDF 中的信息 49 if is_qrs:
36 f = Finder(pdf_info, ocr_results=ocr_results) 50 results = extract_info(ocr_results)
37 results = f.get_info() 51 else:
52 # 输入是整个 PDF 中的信息
53 f = Finder(pdf_info, ocr_results=ocr_results)
54 results = f.get_info()
38 return results 55 return results
39 56
40 57
......
import pyodbc

# One-off schema migration: adds the ``qrs_ocr`` column to the HIL and AFC
# OCR result tables (regular and "se" variants).
# The ALTER statements are not guarded, so running the script a second time
# against the same database will fail once the column already exists.

hil_sql = """
ALTER TABLE hil_ocr_result ADD qrs_ocr nvarchar(max);
ALTER TABLE hil_se_ocr_result ADD qrs_ocr nvarchar(max);
"""

afc_sql = """
ALTER TABLE afc_ocr_result ADD qrs_ocr nvarchar(max);
ALTER TABLE afc_se_ocr_result ADD qrs_ocr nvarchar(max);
"""


def _run_batch(connection_string, sql):
    """Open an autocommit connection, execute ``sql`` once, and always clean up.

    The cursor and the connection are closed in ``finally`` blocks so that a
    failing ``execute`` (or ``cursor()``) no longer leaves the connection open.
    Any pyodbc error still propagates to the caller.
    """
    cnxn = pyodbc.connect(connection_string, autocommit=True)
    try:
        cursor = cnxn.cursor()
        try:
            cursor.execute(sql)
        finally:
            cursor.close()
    finally:
        cnxn.close()


# NOTE(review): both connection strings name only the ODBC driver; server,
# database and credentials are not present here — presumably they must be
# filled in per environment before this script can connect. Confirm.
# HIL is migrated first; if it raises, the AFC batch is not attempted.
_run_batch('DRIVER={ODBC Driver 17 for SQL Server};', hil_sql)
_run_batch('DRIVER={ODBC Driver 17 for SQL Server};', afc_sql)
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!