add qrs ocr
Showing 5 changed files with 84 additions and 5 deletions
| ... | @@ -1042,6 +1042,10 @@ JYPZ_FIELD_ORDER = (("type", "标题"), | ... | @@ -1042,6 +1042,10 @@ JYPZ_FIELD_ORDER = (("type", "标题"), | 
| 1042 | CONTRACT_CN_NAME = '合同' | 1042 | CONTRACT_CN_NAME = '合同' | 
| 1043 | CONTRACT_CLASSIFY = 41 | 1043 | CONTRACT_CLASSIFY = 41 | 
| 1044 | 1044 | ||
| 1045 | # 合同-送达地址确认书 | ||
| 1046 | CONTRACT_QRS_CN_NAME = '送达地址确认书' | ||
| 1047 | CONTRACT_QRS_CLASSIFY = 49 | ||
| 1048 | |||
| 1045 | # 合同编号: 每页 | 1049 | # 合同编号: 每页 | 
| 1046 | 1050 | ||
| 1047 | HIL_CONTRACT_1_CN_NAME = '售后回租合同' | 1051 | HIL_CONTRACT_1_CN_NAME = '售后回租合同' | 
| ... | @@ -1053,13 +1057,14 @@ HIL_CONTRACT_2_CLASSIFY = 44 | ... | @@ -1053,13 +1057,14 @@ HIL_CONTRACT_2_CLASSIFY = 44 | 
| 1053 | HIL_CONTRACT_3_CN_NAME = '车辆处置协议' | 1057 | HIL_CONTRACT_3_CN_NAME = '车辆处置协议' | 
| 1054 | HIL_CONTRACT_3_CLASSIFY = 45 | 1058 | HIL_CONTRACT_3_CLASSIFY = 45 | 
| 1055 | 1059 | ||
| 1056 | CONTRACT_SET = {CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY} | 1060 | CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY} | 
| 1057 | 1061 | ||
| 1058 | CONTRACT_MAP = { | 1062 | CONTRACT_MAP = { | 
| 1059 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, | 1063 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, | 
| 1060 | HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_CN_NAME, | 1064 | HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_CN_NAME, | 
| 1061 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, | 1065 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, | 
| 1062 | CONTRACT_CLASSIFY: CONTRACT_CN_NAME, | 1066 | CONTRACT_CLASSIFY: CONTRACT_CN_NAME, | 
| 1067 | CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME, | ||
| 1063 | } | 1068 | } | 
| 1064 | 1069 | ||
| 1065 | # 保单 | 1070 | # 保单 | 
| ... | @@ -1203,6 +1208,7 @@ DDA_OCR_FIELD = 'bs_ocr' | ... | @@ -1203,6 +1208,7 @@ DDA_OCR_FIELD = 'bs_ocr' | 
| 1203 | HMH_OCR_FIELD = 'hmh_ocr' | 1208 | HMH_OCR_FIELD = 'hmh_ocr' | 
| 1204 | JYPZ_OCR_FIELD = 'jypz_ocr' | 1209 | JYPZ_OCR_FIELD = 'jypz_ocr' | 
| 1205 | HT_FIELD = 'ht_ocr' | 1210 | HT_FIELD = 'ht_ocr' | 
| 1211 | QRS_FIELD = 'qrs_ocr' | ||
| 1206 | BD_FIELD = 'bd_ocr' | 1212 | BD_FIELD = 'bd_ocr' | 
| 1207 | BS_FIELD = 'bss_ocr' | 1213 | BS_FIELD = 'bss_ocr' | 
| 1208 | HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' | 1214 | HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' | 
| ... | @@ -1226,6 +1232,7 @@ RESULT_MAPPING = { | ... | @@ -1226,6 +1232,7 @@ RESULT_MAPPING = { | 
| 1226 | HMH_CLASSIFY: HMH_OCR_FIELD, | 1232 | HMH_CLASSIFY: HMH_OCR_FIELD, | 
| 1227 | JYPZ_CLASSIFY: JYPZ_OCR_FIELD, | 1233 | JYPZ_CLASSIFY: JYPZ_OCR_FIELD, | 
| 1228 | CONTRACT_CLASSIFY: HT_FIELD, | 1234 | CONTRACT_CLASSIFY: HT_FIELD, | 
| 1235 | CONTRACT_QRS_CLASSIFY: QRS_FIELD, | ||
| 1229 | INSURANCE_CLASSIFY: BD_FIELD, | 1236 | INSURANCE_CLASSIFY: BD_FIELD, | 
| 1230 | BS_CLASSIFY: BS_FIELD, | 1237 | BS_CLASSIFY: BS_FIELD, | 
| 1231 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, | 1238 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, | 
| ... | @@ -1479,6 +1486,10 @@ AFC_CON_FIELD_ORDER_LTGT = ( | ... | @@ -1479,6 +1486,10 @@ AFC_CON_FIELD_ORDER_LTGT = ( | 
| 1479 | ('标准利率', '标准利率'), | 1486 | ('标准利率', '标准利率'), | 
| 1480 | ) | 1487 | ) | 
| 1481 | 1488 | ||
| 1489 | SE_AFC_CON_QRS_MAP = { | ||
| 1490 | '合同编号': (1, '合同编号'), | ||
| 1491 | } | ||
| 1492 | |||
| 1482 | SE_AFC_CON_MAP = { | 1493 | SE_AFC_CON_MAP = { | 
| 1483 | '合同编号-每页': (None, None, '合同编号', None), | 1494 | '合同编号-每页': (None, None, '合同编号', None), | 
| 1484 | '所购车辆价格-小写-重要条款': (1, 1, '所购车辆价格', None), | 1495 | '所购车辆价格-小写-重要条款': (1, 1, '所购车辆价格', None), | 
| ... | @@ -2308,6 +2319,7 @@ FILE_NAME_PREFIX_MAP = { | ... | @@ -2308,6 +2319,7 @@ FILE_NAME_PREFIX_MAP = { | 
| 2308 | ECONTRACT_KEYWORDS_MAP = { | 2319 | ECONTRACT_KEYWORDS_MAP = { | 
| 2309 | AFC_PREFIX: [ | 2320 | AFC_PREFIX: [ | 
| 2310 | ('抵押贷款合同', CONTRACT_CLASSIFY), | 2321 | ('抵押贷款合同', CONTRACT_CLASSIFY), | 
| 2322 | ('送达地址确认书', CONTRACT_QRS_CLASSIFY), | ||
| 2311 | # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), | 2323 | # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), | 
| 2312 | ], | 2324 | ], | 
| 2313 | HIL_PREFIX: [ | 2325 | HIL_PREFIX: [ | ... | ... | 
| ... | @@ -987,6 +987,15 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -987,6 +987,15 @@ class Command(BaseCommand, LoggerMixin): | 
| 987 | # } | 987 | # } | 
| 988 | # } | 988 | # } | 
| 989 | license_summary[classify] = [res] | 989 | license_summary[classify] = [res] | 
| 990 | elif classify == consts.CONTRACT_QRS_CLASSIFY: | ||
| 991 | res = {} | ||
| 992 | for key, (pno, key1) in consts.SE_AFC_CON_QRS_MAP.items(): | ||
| 993 | res[key] = page_info_dict.get(str(pno), {}).get(key1, '') | ||
| 994 | res.setdefault(consts.IMG_PATH_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( | ||
| 995 | consts.IMG_PATH_KEY, '') | ||
| 996 | res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( | ||
| 997 | consts.ALL_POSITION_KEY, {}).get(key1, []) | ||
| 998 | license_summary[classify] = [res] | ||
| 990 | else: | 999 | else: | 
| 991 | res = {} | 1000 | res = {} | 
| 992 | for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): | 1001 | for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): | 
| ... | @@ -1474,6 +1483,16 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1474,6 +1483,16 @@ class Command(BaseCommand, LoggerMixin): | 
| 1474 | 'page_num': page_num, | 1483 | 'page_num': page_num, | 
| 1475 | 'page_info': page_info | 1484 | 'page_info': page_info | 
| 1476 | } | 1485 | } | 
| 1486 | elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY): | ||
| 1487 | ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True) | ||
| 1488 | page_num = 'page_1' | ||
| 1489 | page_res = { | ||
| 1490 | page_num: { | ||
| 1491 | 'classify': int(classify_1_str), | ||
| 1492 | 'page_num': page_num, | ||
| 1493 | 'page_info': ocr_result.pop(page_num, {}) | ||
| 1494 | } | ||
| 1495 | } | ||
| 1477 | else: | 1496 | else: | 
| 1478 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) | 1497 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) | 
| 1479 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) | 1498 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) | ... | ... | 
| ... | @@ -328,6 +328,7 @@ class AFCOCRResult(models.Model): | ... | @@ -328,6 +328,7 @@ class AFCOCRResult(models.Model): | 
| 328 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") | 328 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") | 
| 329 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") | 329 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") | 
| 330 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") | 330 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") | 
| 331 | qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书") | ||
| 331 | 332 | ||
| 332 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') | 333 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') | 
| 333 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | 334 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | 
| ... | @@ -363,6 +364,7 @@ class HILOCRResult(models.Model): | ... | @@ -363,6 +364,7 @@ class HILOCRResult(models.Model): | 
| 363 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") | 364 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") | 
| 364 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") | 365 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") | 
| 365 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") | 366 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") | 
| 367 | qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书") | ||
| 366 | 368 | ||
| 367 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') | 369 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') | 
| 368 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | 370 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | 
| ... | @@ -397,6 +399,7 @@ class AFCSEOCRResult(models.Model): | ... | @@ -397,6 +399,7 @@ class AFCSEOCRResult(models.Model): | 
| 397 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") | 399 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") | 
| 398 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") | 400 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") | 
| 399 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") | 401 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") | 
| 402 | qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书") | ||
| 400 | 403 | ||
| 401 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') | 404 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') | 
| 402 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | 405 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | 
| ... | @@ -432,6 +435,7 @@ class HILSEOCRResult(models.Model): | ... | @@ -432,6 +435,7 @@ class HILSEOCRResult(models.Model): | 
| 432 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") | 435 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") | 
| 433 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") | 436 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") | 
| 434 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") | 437 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") | 
| 438 | qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书") | ||
| 435 | 439 | ||
| 436 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') | 440 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') | 
| 437 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | 441 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | ... | ... | 
| ... | @@ -9,7 +9,21 @@ from .get_char import Finder | ... | @@ -9,7 +9,21 @@ from .get_char import Finder | 
| 9 | import numpy as np | 9 | import numpy as np | 
| 10 | 10 | ||
| 11 | 11 | ||
| 12 | def predict(pdf_info): | 12 | def extract_info(ocr_results): | 
| 13 | contract_no = { | ||
| 14 | "words": None, | ||
| 15 | "position": None | ||
| 16 | } | ||
| 17 | for bbox, text in ocr_results.get('0', {}).values(): | ||
| 18 | if text.startswith('CH-B'): | ||
| 19 | contract_no['words'] = text | ||
| 20 | contract_no['position'] = [bbox[0], bbox[1], bbox[2], bbox[-1]] | ||
| 21 | break | ||
| 22 | |||
| 23 | return {'page_1': {'合同编号': contract_no}} | ||
| 24 | |||
| 25 | |||
| 26 | def predict(pdf_info, is_qrs=False): | ||
| 13 | ocr_results = {} | 27 | ocr_results = {} | 
| 14 | for pno in pdf_info: | 28 | for pno in pdf_info: | 
| 15 | ocr_results[pno] = {} | 29 | ocr_results[pno] = {} | 
| ... | @@ -32,9 +46,12 @@ def predict(pdf_info): | ... | @@ -32,9 +46,12 @@ def predict(pdf_info): | 
| 32 | keys = list(range(len(ocr_result))) | 46 | keys = list(range(len(ocr_result))) | 
| 33 | ocr_result = dict(zip(keys, ocr_result)) | 47 | ocr_result = dict(zip(keys, ocr_result)) | 
| 34 | ocr_results[pno] = ocr_result | 48 | ocr_results[pno] = ocr_result | 
| 35 | # 输入是整个 PDF 中的信息 | 49 | if is_qrs: | 
| 36 | f = Finder(pdf_info, ocr_results=ocr_results) | 50 | results = extract_info(ocr_results) | 
| 37 | results = f.get_info() | 51 | else: | 
| 52 | # 输入是整个 PDF 中的信息 | ||
| 53 | f = Finder(pdf_info, ocr_results=ocr_results) | ||
| 54 | results = f.get_info() | ||
| 38 | return results | 55 | return results | 
| 39 | 56 | ||
| 40 | 57 | ... | ... | 
src/common/tools/mssql_script23.py
0 → 100644
| 1 | import pyodbc | ||
| 2 | |||
| 3 | hil_sql = """ | ||
| 4 | ALTER TABLE hil_ocr_result ADD qrs_ocr nvarchar(max); | ||
| 5 | ALTER TABLE hil_se_ocr_result ADD qrs_ocr nvarchar(max); | ||
| 6 | """ | ||
| 7 | |||
| 8 | afc_sql = """ | ||
| 9 | ALTER TABLE afc_ocr_result ADD qrs_ocr nvarchar(max); | ||
| 10 | ALTER TABLE afc_se_ocr_result ADD qrs_ocr nvarchar(max); | ||
| 11 | """ | ||
| 12 | |||
| 13 | hil_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True) | ||
| 14 | |||
| 15 | hil_cursor = hil_cnxn.cursor() | ||
| 16 | hil_cursor.execute(hil_sql) | ||
| 17 | |||
| 18 | hil_cursor.close() | ||
| 19 | hil_cnxn.close() | ||
| 20 | |||
| 21 | afc_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True) | ||
| 22 | |||
| 23 | afc_cursor = afc_cnxn.cursor() | ||
| 24 | afc_cursor.execute(afc_sql) | ||
| 25 | |||
| 26 | afc_cursor.close() | ||
| 27 | afc_cnxn.close() | 
- 
Please register or sign in to post a comment