7965c565 by 周伟奇

add 0318

1 parent ea79bc59
...@@ -1096,11 +1096,23 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1096,11 +1096,23 @@ class Command(BaseCommand, LoggerMixin):
1096 1096
1097 try: 1097 try:
1098 doc.status = DocStatus.PROCESS_FAILED.value 1098 doc.status = DocStatus.PROCESS_FAILED.value
1099 doc.page_count = pdf_handler.page_count
1099 doc.save() 1100 doc.save()
1100 except Exception as e: 1101 except Exception as e:
1101 self.online_log.error('{0} [process error (db save)] [error={1}]'.format( 1102 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1102 self.log_base, traceback.format_exc())) 1103 self.log_base, traceback.format_exc()))
1103 else: 1104 else:
1105
1106 try:
1107 if pdf_handler.is_e_pdf:
1108 doc.metadata = pdf_handler.metadata if pdf_handler.metadata is None else \
1109 json.dumps(pdf_handler.metadata)
1110 doc.page_count = pdf_handler.page_count
1111 doc.save()
1112 except Exception as e:
1113 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1114 self.log_base, traceback.format_exc()))
1115
1104 with lock: 1116 with lock:
1105 todo_count_dict[task_str] = pdf_handler.img_count 1117 todo_count_dict[task_str] = pdf_handler.img_count
1106 for img_idx, img_path in enumerate(pdf_handler.img_path_list): 1118 for img_idx, img_path in enumerate(pdf_handler.img_path_list):
...@@ -1147,6 +1159,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1147,6 +1159,7 @@ class Command(BaseCommand, LoggerMixin):
1147 1159
1148 try: 1160 try:
1149 doc.status = DocStatus.PROCESS_FAILED.value 1161 doc.status = DocStatus.PROCESS_FAILED.value
1162 doc.page_count = pdf_handler.page_count
1150 doc.save() 1163 doc.save()
1151 self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] ' 1164 self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
1152 '[error={2}]'.format(self.log_base, task_str, traceback.format_exc())) 1165 '[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
...@@ -1178,6 +1191,13 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1178,6 +1191,13 @@ class Command(BaseCommand, LoggerMixin):
1178 else: 1191 else:
1179 raise Exception('download or pdf to img failed') 1192 raise Exception('download or pdf to img failed')
1180 1193
1194 try:
1195 doc.page_count = pdf_handler.page_count
1196 doc.save()
1197 except Exception as e:
1198 self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
1199 self.log_base, traceback.format_exc()))
1200
1181 if classify_1_str == str(consts.CONTRACT_CLASSIFY): 1201 if classify_1_str == str(consts.CONTRACT_CLASSIFY):
1182 ocr_result = afc_predict(pdf_handler.pdf_info) 1202 ocr_result = afc_predict(pdf_handler.pdf_info)
1183 page_res = {} 1203 page_res = {}
...@@ -1234,6 +1254,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1234,6 +1254,7 @@ class Command(BaseCommand, LoggerMixin):
1234 except Exception as e: 1254 except Exception as e:
1235 try: 1255 try:
1236 doc.status = DocStatus.PROCESS_FAILED.value 1256 doc.status = DocStatus.PROCESS_FAILED.value
1257 doc.page_count = pdf_handler.page_count
1237 doc.save() 1258 doc.save()
1238 self.online_log.warn('{0} [process failed (e-contract)] [task={1}] ' 1259 self.online_log.warn('{0} [process failed (e-contract)] [task={1}] '
1239 '[error={2}]'.format(self.e_log_base, task_str, traceback.format_exc())) 1260 '[error={2}]'.format(self.e_log_base, task_str, traceback.format_exc()))
...@@ -1560,7 +1581,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1560,7 +1581,7 @@ class Command(BaseCommand, LoggerMixin):
1560 # 重构Excel文件 1581 # 重构Excel文件
1561 # src_excel_path = os.path.join(doc_data_path, 'src.xlsx') 1582 # src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
1562 # wb.save(src_excel_path) 1583 # wb.save(src_excel_path)
1563 count_list = wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme, contract_result) 1584 count_list = wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme, contract_result, doc.metadata)
1564 wb.save(excel_path) 1585 wb.save(excel_path)
1565 1586
1566 except Exception as e: 1587 except Exception as e:
......
...@@ -61,6 +61,9 @@ class HILDoc(models.Model): ...@@ -61,6 +61,9 @@ class HILDoc(models.Model):
61 mvc_count = models.IntegerField(default=0, verbose_name='机动车登记证书处理数目') 61 mvc_count = models.IntegerField(default=0, verbose_name='机动车登记证书处理数目')
62 vat_count = models.IntegerField(default=0, verbose_name='增值税发票处理数目') 62 vat_count = models.IntegerField(default=0, verbose_name='增值税发票处理数目')
63 63
64 page_count = models.IntegerField(null=True, verbose_name='文件page数目')
65 metadata = models.TextField(null=True, verbose_name="电子PDF专属,PDF信息")
66
64 class Meta: 67 class Meta:
65 managed = False 68 managed = False
66 db_table = 'hil_doc' 69 db_table = 'hil_doc'
...@@ -100,6 +103,9 @@ class AFCDoc(models.Model): ...@@ -100,6 +103,9 @@ class AFCDoc(models.Model):
100 mvc_count = models.IntegerField(default=0, verbose_name='机动车登记证书处理数目') 103 mvc_count = models.IntegerField(default=0, verbose_name='机动车登记证书处理数目')
101 vat_count = models.IntegerField(default=0, verbose_name='增值税发票处理数目') 104 vat_count = models.IntegerField(default=0, verbose_name='增值税发票处理数目')
102 105
106 page_count = models.IntegerField(null=True, verbose_name='文件page数目')
107 metadata = models.TextField(null=True, verbose_name="电子PDF专属,PDF信息")
108
103 class Meta: 109 class Meta:
104 managed = False 110 managed = False
105 situ_db_label = 'afc' 111 situ_db_label = 'afc'
......
1 import re 1 import re
2 import json
2 import random 3 import random
3 import locale 4 import locale
4 import numpy as np 5 import numpy as np
...@@ -311,7 +312,8 @@ class BSWorkbook(Workbook): ...@@ -311,7 +312,8 @@ class BSWorkbook(Workbook):
311 month_mapping.setdefault(item, []).append( 312 month_mapping.setdefault(item, []).append(
312 (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, day_mean)) 313 (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, day_mean))
313 314
314 def build_metadata_rows(self, confidence, code, verify_list, print_time, start_date, end_date, res_count_tuple, is_verify_classify): 315 def build_metadata_rows(self, confidence, code, verify_list, print_time, start_date, end_date,
316 res_count_tuple, is_verify_classify, metadata):
315 metadata_rows = [('流水识别置信度', confidence)] 317 metadata_rows = [('流水识别置信度', confidence)]
316 if is_verify_classify: 318 if is_verify_classify:
317 verify_res = '疑似伪造' if len(verify_list) > 0 else '正常' 319 verify_res = '疑似伪造' if len(verify_list) > 0 else '正常'
...@@ -322,11 +324,26 @@ class BSWorkbook(Workbook): ...@@ -322,11 +324,26 @@ class BSWorkbook(Workbook):
322 metadata_rows.append(('识别成功', res_count_tuple[1])) 324 metadata_rows.append(('识别成功', res_count_tuple[1]))
323 metadata_rows.append(self.blank_row) 325 metadata_rows.append(self.blank_row)
324 326
327 # PDF info
328 metadata_highlight_row = []
329 if isinstance(metadata, str):
330 metadata_dict = json.loads(metadata)
331 author = metadata_dict.pop('author', '')
332 producer = metadata_dict.pop('producer', '')
333 metadata_rows.append(('Author', author))
334 metadata_rows.append(('Producer', producer))
335 if len(author) > 0:
336 metadata_highlight_row.append(6)
337 if 'iText' not in producer and 'Qt' not in producer and 'Haru Free' not in producer:
338 metadata_highlight_row.append(7)
339 metadata_rows.append(self.blank_row)
340
325 verify_highlight_row = [] 341 verify_highlight_row = []
326 if is_verify_classify and len(verify_list) > 0: 342 if is_verify_classify and len(verify_list) > 0:
327 metadata_rows.append(self.verify_header) 343 metadata_rows.append(self.verify_header)
344 verify_start = len(metadata_rows)
328 metadata_rows.extend(verify_list) 345 metadata_rows.extend(verify_list)
329 for r in range(6, len(metadata_rows)+1): 346 for r in range(verify_start, len(metadata_rows)+1):
330 verify_highlight_row.append(r) 347 verify_highlight_row.append(r)
331 348
332 metadata_rows.append(self.blank_row) 349 metadata_rows.append(self.blank_row)
...@@ -344,18 +361,23 @@ class BSWorkbook(Workbook): ...@@ -344,18 +361,23 @@ class BSWorkbook(Workbook):
344 self.blank_row, 361 self.blank_row,
345 self.interest_keyword_header] 362 self.interest_keyword_header]
346 ) 363 )
347 return metadata_rows, verify_highlight_row, timedelta 364 return metadata_rows, verify_highlight_row, timedelta, metadata_highlight_row
348 365
349 def build_meta_sheet(self, role_name, card, confidence, code, verify_list, print_time, start_date, end_date, 366 def build_meta_sheet(self, role_name, card, confidence, code, verify_list, print_time, start_date, end_date,
350 res_count_tuple, is_verify_classify): 367 res_count_tuple, is_verify_classify, metadata):
351 metadata_rows, verify_highlight_row, timedelta = self.build_metadata_rows( 368 metadata_rows, verify_highlight_row, timedelta, metadata_highlight_row = \
352 confidence, code, verify_list, print_time, start_date, end_date, res_count_tuple, is_verify_classify) 369 self.build_metadata_rows(confidence, code, verify_list, print_time, start_date, end_date, res_count_tuple,
370 is_verify_classify, metadata)
353 if not isinstance(role_name, str): 371 if not isinstance(role_name, str):
354 role_name = consts.UNKNOWN_ROLE 372 role_name = consts.UNKNOWN_ROLE
355 ms = self.create_sheet('{0}{1}({2})'.format(self.meta_sheet_title, role_name, card)) 373 ms = self.create_sheet('{0}{1}({2})'.format(self.meta_sheet_title, role_name, card))
356 for row in metadata_rows: 374 for row in metadata_rows:
357 ms.append(row) 375 ms.append(row)
358 376
377 for row in metadata_highlight_row:
378 for cell in ms[row]:
379 cell.fill = self.amount_fill
380
359 if len(verify_highlight_row) > 0: 381 if len(verify_highlight_row) > 0:
360 for cell in ms[2]: 382 for cell in ms[2]:
361 cell.fill = self.amount_fill 383 cell.fill = self.amount_fill
...@@ -625,7 +647,7 @@ class BSWorkbook(Workbook): ...@@ -625,7 +647,7 @@ class BSWorkbook(Workbook):
625 ms.append(row) 647 ms.append(row)
626 self.remove(tmp2_ws) 648 self.remove(tmp2_ws)
627 649
628 def bs_rebuild(self, bs_summary, res_count_tuple): 650 def bs_rebuild(self, bs_summary, res_count_tuple, metadata=None):
629 # bs_summary = { 651 # bs_summary = {
630 # '卡号': { 652 # '卡号': {
631 # 'classify': 0, 653 # 'classify': 0,
...@@ -691,7 +713,8 @@ class BSWorkbook(Workbook): ...@@ -691,7 +713,8 @@ class BSWorkbook(Workbook):
691 start_date, 713 start_date,
692 end_date, 714 end_date,
693 res_count_tuple, 715 res_count_tuple,
694 is_verify_classify) 716 is_verify_classify,
717 metadata)
695 718
696 summary['timedelta'] = timedelta 719 summary['timedelta'] = timedelta
697 720
...@@ -846,16 +869,16 @@ class BSWorkbook(Workbook): ...@@ -846,16 +869,16 @@ class BSWorkbook(Workbook):
846 if len(self.sheetnames) > 1: 869 if len(self.sheetnames) > 1:
847 self.remove(self.get_sheet_by_name('Sheet')) 870 self.remove(self.get_sheet_by_name('Sheet'))
848 871
849 def rebuild(self, bs_summary, license_summary, res_list, document_scheme, contract_result): 872 def rebuild(self, bs_summary, license_summary, res_list, document_scheme, contract_result, metadata):
850 res_count_tuple = self.res_sheet(res_list) 873 res_count_tuple = self.res_sheet(res_list)
851 874
852 count_list = [(consts.MODEL_FIELD_BS, len(bs_summary))] 875 count_list = [(consts.MODEL_FIELD_BS, len(bs_summary))]
853 if document_scheme == consts.DOC_SCHEME_LIST[1]: 876 if document_scheme == consts.DOC_SCHEME_LIST[1]:
854 self.license_rebuild(license_summary, document_scheme, count_list) 877 self.license_rebuild(license_summary, document_scheme, count_list)
855 self.contract_rebuild(contract_result) 878 self.contract_rebuild(contract_result)
856 self.bs_rebuild(bs_summary, res_count_tuple) 879 self.bs_rebuild(bs_summary, res_count_tuple, metadata)
857 else: 880 else:
858 self.bs_rebuild(bs_summary, res_count_tuple) 881 self.bs_rebuild(bs_summary, res_count_tuple, metadata)
859 self.license_rebuild(license_summary, document_scheme, count_list) 882 self.license_rebuild(license_summary, document_scheme, count_list)
860 self.move_res_sheet() 883 self.move_res_sheet()
861 self.remove_base_sheet() 884 self.remove_base_sheet()
......
...@@ -889,7 +889,7 @@ class DocView(GenericView, DocHandler): ...@@ -889,7 +889,7 @@ class DocView(GenericView, DocHandler):
889 create_time__lt=create_time_end + datetime.timedelta(days=1))\ 889 create_time__lt=create_time_end + datetime.timedelta(days=1))\
890 if create_time_start is not None and create_time_end is not None else Q() 890 if create_time_start is not None and create_time_end is not None else Q()
891 query = application_id_query & status_query & data_source_query & upload_finish_time_query & create_time_query 891 query = application_id_query & status_query & data_source_query & upload_finish_time_query & create_time_query
892 val_tuple = ('id', 'application_id', 'upload_finish_time', 'create_time', 'data_source', 'status') 892 val_tuple = ('id', 'application_id', 'upload_finish_time', 'create_time', 'document_scheme', 'data_source', 'status', 'page_count')
893 doc_class, prefix = self.get_doc_class(business_type) 893 doc_class, prefix = self.get_doc_class(business_type)
894 total = doc_class.objects.filter(query).count() 894 total = doc_class.objects.filter(query).count()
895 start_index = page_size * (page - 1) 895 start_index = page_size * (page - 1)
...@@ -898,14 +898,22 @@ class DocView(GenericView, DocHandler): ...@@ -898,14 +898,22 @@ class DocView(GenericView, DocHandler):
898 raise self.invalid_params('页数不存在') 898 raise self.invalid_params('页数不存在')
899 899
900 doc_queryset = doc_class.objects.filter(query).values(*val_tuple).order_by('-create_time')[start_index: end_index] 900 doc_queryset = doc_class.objects.filter(query).values(*val_tuple).order_by('-create_time')[start_index: end_index]
901 doc_list = self.get_doc_list(doc_queryset, prefix) 901 # doc_list = self.get_doc_list(doc_queryset, prefix)
902 for doc_dict in doc_queryset:
903 tmp_scheme = consts.COMPARE_DOC_SCHEME_LIST[0] if doc_dict['document_scheme'] == consts.DOC_SCHEME_LIST[0]\
904 else consts.COMPARE_DOC_SCHEME_LIST[1]
905 application_link = '{0}/showList/showList?entity={1}&scheme={2}&case_id={3}'.format(
906 conf.BASE_URL, prefix, tmp_scheme, doc_dict['application_id'])
907 doc_dict['target_url'] = application_link
902 908
903 # total = len(doc_list) 909 # total = len(doc_list)
904 pagination = {'current': page, 'total': total, 'page_size': page_size} 910 pagination = {'current': page, 'total': total, 'page_size': page_size}
905 res = { 911 res = {
906 'pagination': pagination, 912 'pagination': pagination,
907 'doc_list': doc_list 913 'doc_list': list(doc_queryset)
908 } 914 }
915 # 新增scheme、处理时长、文件页数,删除下载切图
916 # 新增链接跳转比对结果
909 self.running_log.info('[get doc list] [args={0}] [res={1}]'.format(args, res)) 917 self.running_log.info('[get doc list] [args={0}] [res={1}]'.format(args, res))
910 return response.ok(data=res) 918 return response.ok(data=res)
911 919
......
...@@ -2082,7 +2082,7 @@ def se_bs_compare(license_en, ocr_res_dict, strip_list, is_auto): ...@@ -2082,7 +2082,7 @@ def se_bs_compare(license_en, ocr_res_dict, strip_list, is_auto):
2082 dbr1_tmp_res_part = {} 2082 dbr1_tmp_res_part = {}
2083 for idx, (name, value) in enumerate(dbr1_field_list): 2083 for idx, (name, value) in enumerate(dbr1_field_list):
2084 ocr_str_or_list = ocr_res.get(compare_logic[name][0]) 2084 ocr_str_or_list = ocr_res.get(compare_logic[name][0])
2085 if isinstance(ocr_str_or_list, str) or isinstance(ocr_str_or_list, list): 2085 if isinstance(ocr_str_or_list, str) or isinstance(ocr_str_or_list, list) or isinstance(ocr_str_or_list, int):
2086 result = getattr(cp, compare_logic[name][1])(value, ocr_str_or_list, **compare_logic[name][2]) 2086 result = getattr(cp, compare_logic[name][1])(value, ocr_str_or_list, **compare_logic[name][2])
2087 if isinstance(ocr_str_or_list, list): 2087 if isinstance(ocr_str_or_list, list):
2088 ocr_str = json.dumps(ocr_str_or_list, ensure_ascii=False) 2088 ocr_str = json.dumps(ocr_str_or_list, ensure_ascii=False)
...@@ -2114,7 +2114,7 @@ def se_bs_compare(license_en, ocr_res_dict, strip_list, is_auto): ...@@ -2114,7 +2114,7 @@ def se_bs_compare(license_en, ocr_res_dict, strip_list, is_auto):
2114 dbr2_tmp_res_part = {} 2114 dbr2_tmp_res_part = {}
2115 for idx, (name, value) in enumerate(dbr2_field_list): 2115 for idx, (name, value) in enumerate(dbr2_field_list):
2116 ocr_str_or_list = ocr_res.get(compare_logic[name][0]) 2116 ocr_str_or_list = ocr_res.get(compare_logic[name][0])
2117 if isinstance(ocr_str_or_list, str) or isinstance(ocr_str_or_list, list): 2117 if isinstance(ocr_str_or_list, str) or isinstance(ocr_str_or_list, list) or isinstance(ocr_str_or_list, int):
2118 result = getattr(cp, compare_logic[name][1])(value, ocr_str_or_list, **compare_logic[name][2]) 2118 result = getattr(cp, compare_logic[name][1])(value, ocr_str_or_list, **compare_logic[name][2])
2119 if isinstance(ocr_str_or_list, list): 2119 if isinstance(ocr_str_or_list, list):
2120 ocr_str = json.dumps(ocr_str_or_list, ensure_ascii=False) 2120 ocr_str = json.dumps(ocr_str_or_list, ensure_ascii=False)
......
from contextlib import closing

import pyodbc

# One-off schema migration: adds the ``page_count`` and ``metadata`` columns
# to the ``hil_doc`` and ``afc_doc`` tables.
# NOTE(review): the ALTER statements carry no existence guard, so this looks
# intended to be run exactly once per database -- confirm before re-running.
# NOTE(review): ``page_count`` is created as ``smallint`` here; verify this is
# the intended width for the column.

hil_sql = """
ALTER TABLE hil_doc ADD page_count smallint;
ALTER TABLE hil_doc ADD metadata nvarchar(max);
"""

afc_sql = """
ALTER TABLE afc_doc ADD page_count smallint;
ALTER TABLE afc_doc ADD metadata nvarchar(max);
"""


def _run_migration(connection_string, sql):
    """Execute *sql* on a fresh autocommit connection, then release it.

    Args:
        connection_string: ODBC connection string passed to ``pyodbc.connect``.
        sql: SQL batch to execute.

    Raises:
        pyodbc.Error: propagated unchanged from ``connect`` or ``execute``.
    """
    # closing() guarantees close() is called even when execute() raises, so a
    # failed migration does not leave the cursor or connection open.
    # (pyodbc's own ``with cnxn:`` commits on exit but does not close.)
    with closing(pyodbc.connect(connection_string, autocommit=True)) as cnxn:
        with closing(cnxn.cursor()) as cursor:
            cursor.execute(sql)


# HIL database first, then AFC -- same order as before.
_run_migration('DRIVER={ODBC Driver 17 for SQL Server};', hil_sql)
_run_migration('DRIVER={ODBC Driver 17 for SQL Server};', afc_sql)
...@@ -47,6 +47,8 @@ class PDFHandler: ...@@ -47,6 +47,8 @@ class PDFHandler:
47 '中国建设银行个人活期账户全部交易明细', 47 '中国建设银行个人活期账户全部交易明细',
48 '平安银行个人账户交易明细清单', 48 '平安银行个人账户交易明细清单',
49 ] 49 ]
50 self.page_count = None
51 self.metadata = None
50 52
51 def get_suffix(self, file_name): 53 def get_suffix(self, file_name):
52 if file_name is None: 54 if file_name is None:
...@@ -321,6 +323,7 @@ class PDFHandler: ...@@ -321,6 +323,7 @@ class PDFHandler:
321 def e_contract_process(self): 323 def e_contract_process(self):
322 os.makedirs(self.img_dir_path, exist_ok=True) 324 os.makedirs(self.img_dir_path, exist_ok=True)
323 with fitz.Document(self.path) as pdf: 325 with fitz.Document(self.path) as pdf:
326 self.page_count = pdf.pageCount
324 for pno in range(pdf.pageCount): 327 for pno in range(pdf.pageCount):
325 page = pdf.loadPage(pno) 328 page = pdf.loadPage(pno)
326 self.pdf_info[str(pno)] = json.loads(page.getText('json')) 329 self.pdf_info[str(pno)] = json.loads(page.getText('json'))
...@@ -341,6 +344,8 @@ class PDFHandler: ...@@ -341,6 +344,8 @@ class PDFHandler:
341 self.img_path_list.append(img_save_path) 344 self.img_path_list.append(img_save_path)
342 else: 345 else:
343 with fitz.Document(self.path) as pdf: 346 with fitz.Document(self.path) as pdf:
347 self.metadata = pdf.metadata
348 self.page_count = pdf.pageCount
344 if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count: 349 if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
345 self.img_count = pdf.pageCount 350 self.img_count = pdf.pageCount
346 return 351 return
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!