Add OCR support for the QRS document type (送达地址确认书, service-address confirmation letter)
Showing
5 changed files
with
81 additions
and
2 deletions
... | @@ -1042,6 +1042,10 @@ JYPZ_FIELD_ORDER = (("type", "标题"), | ... | @@ -1042,6 +1042,10 @@ JYPZ_FIELD_ORDER = (("type", "标题"), |
1042 | CONTRACT_CN_NAME = '合同' | 1042 | CONTRACT_CN_NAME = '合同' |
1043 | CONTRACT_CLASSIFY = 41 | 1043 | CONTRACT_CLASSIFY = 41 |
1044 | 1044 | ||
1045 | # 合同-送达地址确认书 | ||
1046 | CONTRACT_QRS_CN_NAME = '送达地址确认书' | ||
1047 | CONTRACT_QRS_CLASSIFY = 49 | ||
1048 | |||
1045 | # 合同编号: 每页 | 1049 | # 合同编号: 每页 |
1046 | 1050 | ||
1047 | HIL_CONTRACT_1_CN_NAME = '售后回租合同' | 1051 | HIL_CONTRACT_1_CN_NAME = '售后回租合同' |
... | @@ -1053,13 +1057,14 @@ HIL_CONTRACT_2_CLASSIFY = 44 | ... | @@ -1053,13 +1057,14 @@ HIL_CONTRACT_2_CLASSIFY = 44 |
1053 | HIL_CONTRACT_3_CN_NAME = '车辆处置协议' | 1057 | HIL_CONTRACT_3_CN_NAME = '车辆处置协议' |
1054 | HIL_CONTRACT_3_CLASSIFY = 45 | 1058 | HIL_CONTRACT_3_CLASSIFY = 45 |
1055 | 1059 | ||
1056 | CONTRACT_SET = {CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY} | 1060 | CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY} |
1057 | 1061 | ||
1058 | CONTRACT_MAP = { | 1062 | CONTRACT_MAP = { |
1059 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, | 1063 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME, |
1060 | HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_CN_NAME, | 1064 | HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_CN_NAME, |
1061 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, | 1065 | HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME, |
1062 | CONTRACT_CLASSIFY: CONTRACT_CN_NAME, | 1066 | CONTRACT_CLASSIFY: CONTRACT_CN_NAME, |
1067 | CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME, | ||
1063 | } | 1068 | } |
1064 | 1069 | ||
1065 | # 保单 | 1070 | # 保单 |
... | @@ -1203,6 +1208,7 @@ DDA_OCR_FIELD = 'bs_ocr' | ... | @@ -1203,6 +1208,7 @@ DDA_OCR_FIELD = 'bs_ocr' |
1203 | HMH_OCR_FIELD = 'hmh_ocr' | 1208 | HMH_OCR_FIELD = 'hmh_ocr' |
1204 | JYPZ_OCR_FIELD = 'jypz_ocr' | 1209 | JYPZ_OCR_FIELD = 'jypz_ocr' |
1205 | HT_FIELD = 'ht_ocr' | 1210 | HT_FIELD = 'ht_ocr' |
1211 | QRS_FIELD = 'qrs_ocr' | ||
1206 | BD_FIELD = 'bd_ocr' | 1212 | BD_FIELD = 'bd_ocr' |
1207 | BS_FIELD = 'bss_ocr' | 1213 | BS_FIELD = 'bss_ocr' |
1208 | HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' | 1214 | HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr' |
... | @@ -1226,6 +1232,7 @@ RESULT_MAPPING = { | ... | @@ -1226,6 +1232,7 @@ RESULT_MAPPING = { |
1226 | HMH_CLASSIFY: HMH_OCR_FIELD, | 1232 | HMH_CLASSIFY: HMH_OCR_FIELD, |
1227 | JYPZ_CLASSIFY: JYPZ_OCR_FIELD, | 1233 | JYPZ_CLASSIFY: JYPZ_OCR_FIELD, |
1228 | CONTRACT_CLASSIFY: HT_FIELD, | 1234 | CONTRACT_CLASSIFY: HT_FIELD, |
1235 | CONTRACT_QRS_CLASSIFY: QRS_FIELD, | ||
1229 | INSURANCE_CLASSIFY: BD_FIELD, | 1236 | INSURANCE_CLASSIFY: BD_FIELD, |
1230 | BS_CLASSIFY: BS_FIELD, | 1237 | BS_CLASSIFY: BS_FIELD, |
1231 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, | 1238 | HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD, |
... | @@ -1479,6 +1486,10 @@ AFC_CON_FIELD_ORDER_LTGT = ( | ... | @@ -1479,6 +1486,10 @@ AFC_CON_FIELD_ORDER_LTGT = ( |
1479 | ('标准利率', '标准利率'), | 1486 | ('标准利率', '标准利率'), |
1480 | ) | 1487 | ) |
1481 | 1488 | ||
1489 | SE_AFC_CON_QRS_MAP = { | ||
1490 | '合同编号': (1, '合同编号'), | ||
1491 | } | ||
1492 | |||
1482 | SE_AFC_CON_MAP = { | 1493 | SE_AFC_CON_MAP = { |
1483 | '合同编号-每页': (None, None, '合同编号', None), | 1494 | '合同编号-每页': (None, None, '合同编号', None), |
1484 | '所购车辆价格-小写-重要条款': (1, 1, '所购车辆价格', None), | 1495 | '所购车辆价格-小写-重要条款': (1, 1, '所购车辆价格', None), |
... | @@ -2308,6 +2319,7 @@ FILE_NAME_PREFIX_MAP = { | ... | @@ -2308,6 +2319,7 @@ FILE_NAME_PREFIX_MAP = { |
2308 | ECONTRACT_KEYWORDS_MAP = { | 2319 | ECONTRACT_KEYWORDS_MAP = { |
2309 | AFC_PREFIX: [ | 2320 | AFC_PREFIX: [ |
2310 | ('抵押贷款合同', CONTRACT_CLASSIFY), | 2321 | ('抵押贷款合同', CONTRACT_CLASSIFY), |
2322 | ('送达地址确认书', CONTRACT_QRS_CLASSIFY), | ||
2311 | # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), | 2323 | # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0), |
2312 | ], | 2324 | ], |
2313 | HIL_PREFIX: [ | 2325 | HIL_PREFIX: [ | ... | ... |
... | @@ -987,6 +987,15 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -987,6 +987,15 @@ class Command(BaseCommand, LoggerMixin): |
987 | # } | 987 | # } |
988 | # } | 988 | # } |
989 | license_summary[classify] = [res] | 989 | license_summary[classify] = [res] |
990 | elif classify == consts.CONTRACT_QRS_CLASSIFY: | ||
991 | res = {} | ||
992 | for key, (pno, key1) in consts.SE_AFC_CON_QRS_MAP.items(): | ||
993 | res[key] = page_info_dict.get(str(pno), {}).get(key1, '') | ||
994 | res.setdefault(consts.IMG_PATH_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( | ||
995 | consts.IMG_PATH_KEY, '') | ||
996 | res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get( | ||
997 | consts.ALL_POSITION_KEY, {}).get(key1, []) | ||
998 | license_summary[classify] = [res] | ||
990 | else: | 999 | else: |
991 | res = {} | 1000 | res = {} |
992 | for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): | 1001 | for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items(): |
... | @@ -1474,6 +1483,16 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1474,6 +1483,16 @@ class Command(BaseCommand, LoggerMixin): |
1474 | 'page_num': page_num, | 1483 | 'page_num': page_num, |
1475 | 'page_info': page_info | 1484 | 'page_info': page_info |
1476 | } | 1485 | } |
1486 | elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY): | ||
1487 | ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True) | ||
1488 | page_num = 'page_1' | ||
1489 | page_res = { | ||
1490 | page_num: { | ||
1491 | 'classify': int(classify_1_str), | ||
1492 | 'page_num': page_num, | ||
1493 | 'page_info': ocr_result.pop(page_num, {}) | ||
1494 | } | ||
1495 | } | ||
1477 | else: | 1496 | else: |
1478 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) | 1497 | file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str) |
1479 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) | 1498 | ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1) | ... | ... |
... | @@ -328,6 +328,7 @@ class AFCOCRResult(models.Model): | ... | @@ -328,6 +328,7 @@ class AFCOCRResult(models.Model): |
328 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") | 328 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") |
329 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") | 329 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") |
330 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") | 330 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") |
331 | qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书") | ||
331 | 332 | ||
332 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') | 333 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') |
333 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | 334 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') |
... | @@ -363,6 +364,7 @@ class HILOCRResult(models.Model): | ... | @@ -363,6 +364,7 @@ class HILOCRResult(models.Model): |
363 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") | 364 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") |
364 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") | 365 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") |
365 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") | 366 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") |
367 | qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书") | ||
366 | 368 | ||
367 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') | 369 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') |
368 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | 370 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') |
... | @@ -397,6 +399,7 @@ class AFCSEOCRResult(models.Model): | ... | @@ -397,6 +399,7 @@ class AFCSEOCRResult(models.Model): |
397 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") | 399 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") |
398 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") | 400 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") |
399 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") | 401 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") |
402 | qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书") | ||
400 | 403 | ||
401 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') | 404 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') |
402 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | 405 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') |
... | @@ -432,6 +435,7 @@ class HILSEOCRResult(models.Model): | ... | @@ -432,6 +435,7 @@ class HILSEOCRResult(models.Model): |
432 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") | 435 | hil_contract_1_ocr = models.TextField(null=True, verbose_name="HIL合同1") |
433 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") | 436 | hil_contract_2_ocr = models.TextField(null=True, verbose_name="HIL合同2") |
434 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") | 437 | hil_contract_3_ocr = models.TextField(null=True, verbose_name="HIL合同3") |
438 | qrs_ocr = models.TextField(null=True, verbose_name="AFC合同确认书") | ||
435 | 439 | ||
436 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') | 440 | update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间') |
437 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | 441 | create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') | ... | ... |
... | @@ -9,7 +9,21 @@ from .get_char import Finder | ... | @@ -9,7 +9,21 @@ from .get_char import Finder |
9 | import numpy as np | 9 | import numpy as np |
10 | 10 | ||
11 | 11 | ||
def extract_info(ocr_results, page_key='0', prefix='CH-B'):
    """Find the contract number among the OCR entries of a single page.

    The entries of ``ocr_results[page_key]`` are scanned in iteration order
    and the first text starting with ``prefix`` is taken as the contract
    number.

    Args:
        ocr_results: Mapping of page key -> mapping of entry index ->
            ``(bbox, text)`` pair.
        page_key: Key of the page to scan. Defaults to ``'0'``.
        prefix: Leading characters that identify a contract number.
            Defaults to ``'CH-B'``.

    Returns:
        ``{'page_1': {'合同编号': {'words': ..., 'position': ...}}}``.
        ``words`` and ``position`` are both ``None`` when the page is
        missing or no entry matches.
    """
    contract_no = {
        "words": None,
        "position": None,
    }
    for bbox, text in ocr_results.get(page_key, {}).values():
        # Skip entries whose text is not a string (e.g. None) instead of
        # failing on ``.startswith``.
        if not isinstance(text, str) or not text.startswith(prefix):
            continue
        contract_no['words'] = text
        # Keeps the first three coordinates and the last one.
        # NOTE(review): presumably left/top/right/bottom of a flattened
        # quadrilateral -- confirm against the OCR output format.
        contract_no['position'] = [bbox[0], bbox[1], bbox[2], bbox[-1]]
        break

    return {'page_1': {'合同编号': contract_no}}
24 | |||
25 | |||
26 | def predict(pdf_info, is_qrs=False): | ||
13 | ocr_results = {} | 27 | ocr_results = {} |
14 | for pno in pdf_info: | 28 | for pno in pdf_info: |
15 | ocr_results[pno] = {} | 29 | ocr_results[pno] = {} |
... | @@ -32,6 +46,9 @@ def predict(pdf_info): | ... | @@ -32,6 +46,9 @@ def predict(pdf_info): |
32 | keys = list(range(len(ocr_result))) | 46 | keys = list(range(len(ocr_result))) |
33 | ocr_result = dict(zip(keys, ocr_result)) | 47 | ocr_result = dict(zip(keys, ocr_result)) |
34 | ocr_results[pno] = ocr_result | 48 | ocr_results[pno] = ocr_result |
49 | if is_qrs: | ||
50 | results = extract_info(ocr_results) | ||
51 | else: | ||
35 | # 输入是整个 PDF 中的信息 | 52 | # 输入是整个 PDF 中的信息 |
36 | f = Finder(pdf_info, ocr_results=ocr_results) | 53 | f = Finder(pdf_info, ocr_results=ocr_results) |
37 | results = f.get_info() | 54 | results = f.get_info() | ... | ... |
src/common/tools/mssql_script23.py
0 → 100644
"""One-off schema migration: add the ``qrs_ocr`` column to the OCR result tables."""
from contextlib import closing

import pyodbc

# Adds the column to the HIL result tables.
hil_sql = """
ALTER TABLE hil_ocr_result ADD qrs_ocr nvarchar(max);
ALTER TABLE hil_se_ocr_result ADD qrs_ocr nvarchar(max);
"""

# Adds the column to the AFC result tables.
afc_sql = """
ALTER TABLE afc_ocr_result ADD qrs_ocr nvarchar(max);
ALTER TABLE afc_se_ocr_result ADD qrs_ocr nvarchar(max);
"""


def _run_batch(connection_string, sql):
    """Execute ``sql`` on a fresh autocommit connection.

    The cursor and the connection are closed even when ``execute`` raises,
    so a failed migration does not leave a connection open.
    """
    with closing(pyodbc.connect(connection_string, autocommit=True)) as cnxn:
        with closing(cnxn.cursor()) as cursor:
            cursor.execute(sql)


# NOTE(review): the connection strings below carry only the DRIVER part;
# server, database and credentials must be supplied before running.
# HIL runs first; an exception there stops the script before the AFC batch.
_run_batch('DRIVER={ODBC Driver 17 for SQL Server};', hil_sql)
_run_batch('DRIVER={ODBC Driver 17 for SQL Server};', afc_sql)
-
Please register or sign in to post a comment