7965c565 by 周伟奇

Add PDF page_count/metadata persistence and expose scheme, page count and compare links in the doc list (0318)

1 parent ea79bc59
......@@ -1096,11 +1096,23 @@ class Command(BaseCommand, LoggerMixin):
try:
doc.status = DocStatus.PROCESS_FAILED.value
doc.page_count = pdf_handler.page_count
doc.save()
except Exception as e:
self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
else:
try:
if pdf_handler.is_e_pdf:
doc.metadata = pdf_handler.metadata if pdf_handler.metadata is None else \
json.dumps(pdf_handler.metadata)
doc.page_count = pdf_handler.page_count
doc.save()
except Exception as e:
self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
with lock:
todo_count_dict[task_str] = pdf_handler.img_count
for img_idx, img_path in enumerate(pdf_handler.img_path_list):
......@@ -1147,6 +1159,7 @@ class Command(BaseCommand, LoggerMixin):
try:
doc.status = DocStatus.PROCESS_FAILED.value
doc.page_count = pdf_handler.page_count
doc.save()
self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
'[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
......@@ -1178,6 +1191,13 @@ class Command(BaseCommand, LoggerMixin):
else:
raise Exception('download or pdf to img failed')
try:
doc.page_count = pdf_handler.page_count
doc.save()
except Exception as e:
self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
if classify_1_str == str(consts.CONTRACT_CLASSIFY):
ocr_result = afc_predict(pdf_handler.pdf_info)
page_res = {}
......@@ -1234,6 +1254,7 @@ class Command(BaseCommand, LoggerMixin):
except Exception as e:
try:
doc.status = DocStatus.PROCESS_FAILED.value
doc.page_count = pdf_handler.page_count
doc.save()
self.online_log.warn('{0} [process failed (e-contract)] [task={1}] '
'[error={2}]'.format(self.e_log_base, task_str, traceback.format_exc()))
......@@ -1560,7 +1581,7 @@ class Command(BaseCommand, LoggerMixin):
# 重构Excel文件
# src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
# wb.save(src_excel_path)
count_list = wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme, contract_result)
count_list = wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme, contract_result, doc.metadata)
wb.save(excel_path)
except Exception as e:
......
......@@ -61,6 +61,9 @@ class HILDoc(models.Model):
mvc_count = models.IntegerField(default=0, verbose_name='机动车登记证书处理数目')
vat_count = models.IntegerField(default=0, verbose_name='增值税发票处理数目')
page_count = models.IntegerField(null=True, verbose_name='文件page数目')
metadata = models.TextField(null=True, verbose_name="电子PDF专属,PDF信息")
class Meta:
managed = False
db_table = 'hil_doc'
......@@ -100,6 +103,9 @@ class AFCDoc(models.Model):
mvc_count = models.IntegerField(default=0, verbose_name='机动车登记证书处理数目')
vat_count = models.IntegerField(default=0, verbose_name='增值税发票处理数目')
page_count = models.IntegerField(null=True, verbose_name='文件page数目')
metadata = models.TextField(null=True, verbose_name="电子PDF专属,PDF信息")
class Meta:
managed = False
situ_db_label = 'afc'
......
import re
import json
import random
import locale
import numpy as np
......@@ -311,7 +312,8 @@ class BSWorkbook(Workbook):
month_mapping.setdefault(item, []).append(
(ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, day_mean))
def build_metadata_rows(self, confidence, code, verify_list, print_time, start_date, end_date, res_count_tuple, is_verify_classify):
def build_metadata_rows(self, confidence, code, verify_list, print_time, start_date, end_date,
res_count_tuple, is_verify_classify, metadata):
metadata_rows = [('流水识别置信度', confidence)]
if is_verify_classify:
verify_res = '疑似伪造' if len(verify_list) > 0 else '正常'
......@@ -322,11 +324,26 @@ class BSWorkbook(Workbook):
metadata_rows.append(('识别成功', res_count_tuple[1]))
metadata_rows.append(self.blank_row)
# PDF info
metadata_highlight_row = []
if isinstance(metadata, str):
metadata_dict = json.loads(metadata)
author = metadata_dict.pop('author', '')
producer = metadata_dict.pop('producer', '')
metadata_rows.append(('Author', author))
metadata_rows.append(('Producer', producer))
if len(author) > 0:
metadata_highlight_row.append(6)
if 'iText' not in producer and 'Qt' not in producer and 'Haru Free' not in producer:
metadata_highlight_row.append(7)
metadata_rows.append(self.blank_row)
verify_highlight_row = []
if is_verify_classify and len(verify_list) > 0:
metadata_rows.append(self.verify_header)
verify_start = len(metadata_rows)
metadata_rows.extend(verify_list)
for r in range(6, len(metadata_rows)+1):
for r in range(verify_start, len(metadata_rows)+1):
verify_highlight_row.append(r)
metadata_rows.append(self.blank_row)
......@@ -344,18 +361,23 @@ class BSWorkbook(Workbook):
self.blank_row,
self.interest_keyword_header]
)
return metadata_rows, verify_highlight_row, timedelta
return metadata_rows, verify_highlight_row, timedelta, metadata_highlight_row
def build_meta_sheet(self, role_name, card, confidence, code, verify_list, print_time, start_date, end_date,
res_count_tuple, is_verify_classify):
metadata_rows, verify_highlight_row, timedelta = self.build_metadata_rows(
confidence, code, verify_list, print_time, start_date, end_date, res_count_tuple, is_verify_classify)
res_count_tuple, is_verify_classify, metadata):
metadata_rows, verify_highlight_row, timedelta, metadata_highlight_row = \
self.build_metadata_rows(confidence, code, verify_list, print_time, start_date, end_date, res_count_tuple,
is_verify_classify, metadata)
if not isinstance(role_name, str):
role_name = consts.UNKNOWN_ROLE
ms = self.create_sheet('{0}{1}({2})'.format(self.meta_sheet_title, role_name, card))
for row in metadata_rows:
ms.append(row)
for row in metadata_highlight_row:
for cell in ms[row]:
cell.fill = self.amount_fill
if len(verify_highlight_row) > 0:
for cell in ms[2]:
cell.fill = self.amount_fill
......@@ -625,7 +647,7 @@ class BSWorkbook(Workbook):
ms.append(row)
self.remove(tmp2_ws)
def bs_rebuild(self, bs_summary, res_count_tuple):
def bs_rebuild(self, bs_summary, res_count_tuple, metadata=None):
# bs_summary = {
# '卡号': {
# 'classify': 0,
......@@ -691,7 +713,8 @@ class BSWorkbook(Workbook):
start_date,
end_date,
res_count_tuple,
is_verify_classify)
is_verify_classify,
metadata)
summary['timedelta'] = timedelta
......@@ -846,16 +869,16 @@ class BSWorkbook(Workbook):
if len(self.sheetnames) > 1:
self.remove(self.get_sheet_by_name('Sheet'))
def rebuild(self, bs_summary, license_summary, res_list, document_scheme, contract_result):
def rebuild(self, bs_summary, license_summary, res_list, document_scheme, contract_result, metadata):
res_count_tuple = self.res_sheet(res_list)
count_list = [(consts.MODEL_FIELD_BS, len(bs_summary))]
if document_scheme == consts.DOC_SCHEME_LIST[1]:
self.license_rebuild(license_summary, document_scheme, count_list)
self.contract_rebuild(contract_result)
self.bs_rebuild(bs_summary, res_count_tuple)
self.bs_rebuild(bs_summary, res_count_tuple, metadata)
else:
self.bs_rebuild(bs_summary, res_count_tuple)
self.bs_rebuild(bs_summary, res_count_tuple, metadata)
self.license_rebuild(license_summary, document_scheme, count_list)
self.move_res_sheet()
self.remove_base_sheet()
......
......@@ -889,7 +889,7 @@ class DocView(GenericView, DocHandler):
create_time__lt=create_time_end + datetime.timedelta(days=1))\
if create_time_start is not None and create_time_end is not None else Q()
query = application_id_query & status_query & data_source_query & upload_finish_time_query & create_time_query
val_tuple = ('id', 'application_id', 'upload_finish_time', 'create_time', 'data_source', 'status')
val_tuple = ('id', 'application_id', 'upload_finish_time', 'create_time', 'document_scheme', 'data_source', 'status', 'page_count')
doc_class, prefix = self.get_doc_class(business_type)
total = doc_class.objects.filter(query).count()
start_index = page_size * (page - 1)
......@@ -898,14 +898,22 @@ class DocView(GenericView, DocHandler):
raise self.invalid_params('页数不存在')
doc_queryset = doc_class.objects.filter(query).values(*val_tuple).order_by('-create_time')[start_index: end_index]
doc_list = self.get_doc_list(doc_queryset, prefix)
# doc_list = self.get_doc_list(doc_queryset, prefix)
for doc_dict in doc_queryset:
tmp_scheme = consts.COMPARE_DOC_SCHEME_LIST[0] if doc_dict['document_scheme'] == consts.DOC_SCHEME_LIST[0]\
else consts.COMPARE_DOC_SCHEME_LIST[1]
application_link = '{0}/showList/showList?entity={1}&scheme={2}&case_id={3}'.format(
conf.BASE_URL, prefix, tmp_scheme, doc_dict['application_id'])
doc_dict['target_url'] = application_link
# total = len(doc_list)
pagination = {'current': page, 'total': total, 'page_size': page_size}
res = {
'pagination': pagination,
'doc_list': doc_list
'doc_list': list(doc_queryset)
}
# 新增scheme、处理时长、文件页数,删除下载切图
# 新增链接跳转比对结果
self.running_log.info('[get doc list] [args={0}] [res={1}]'.format(args, res))
return response.ok(data=res)
......
......@@ -2082,7 +2082,7 @@ def se_bs_compare(license_en, ocr_res_dict, strip_list, is_auto):
dbr1_tmp_res_part = {}
for idx, (name, value) in enumerate(dbr1_field_list):
ocr_str_or_list = ocr_res.get(compare_logic[name][0])
if isinstance(ocr_str_or_list, str) or isinstance(ocr_str_or_list, list):
if isinstance(ocr_str_or_list, str) or isinstance(ocr_str_or_list, list) or isinstance(ocr_str_or_list, int):
result = getattr(cp, compare_logic[name][1])(value, ocr_str_or_list, **compare_logic[name][2])
if isinstance(ocr_str_or_list, list):
ocr_str = json.dumps(ocr_str_or_list, ensure_ascii=False)
......@@ -2114,7 +2114,7 @@ def se_bs_compare(license_en, ocr_res_dict, strip_list, is_auto):
dbr2_tmp_res_part = {}
for idx, (name, value) in enumerate(dbr2_field_list):
ocr_str_or_list = ocr_res.get(compare_logic[name][0])
if isinstance(ocr_str_or_list, str) or isinstance(ocr_str_or_list, list):
if isinstance(ocr_str_or_list, str) or isinstance(ocr_str_or_list, list) or isinstance(ocr_str_or_list, int):
result = getattr(cp, compare_logic[name][1])(value, ocr_str_or_list, **compare_logic[name][2])
if isinstance(ocr_str_or_list, list):
ocr_str = json.dumps(ocr_str_or_list, ensure_ascii=False)
......
"""One-off schema migration: add ``page_count`` and ``metadata`` columns to the
``hil_doc`` and ``afc_doc`` tables (SQL Server, via ODBC).

Runs the DDL immediately on import, HIL first then AFC, matching the original
script's module-level side effects.
"""
import pyodbc
from contextlib import closing

hil_sql = """
ALTER TABLE hil_doc ADD page_count smallint;
ALTER TABLE hil_doc ADD metadata nvarchar(max);
"""

afc_sql = """
ALTER TABLE afc_doc ADD page_count smallint;
ALTER TABLE afc_doc ADD metadata nvarchar(max);
"""

# NOTE(review): the connection string names only the driver; server, database and
# credentials presumably come from ODBC/DSN defaults — confirm before running.
_CONN_STR = 'DRIVER={ODBC Driver 17 for SQL Server};'


def _run_ddl(sql):
    """Execute *sql* on a fresh autocommit connection.

    ``closing`` guarantees the cursor and connection are released even when
    ``execute`` raises (the original leaked both on failure). ``autocommit=True``
    is required because DDL cannot run inside an implicit transaction here.
    """
    with closing(pyodbc.connect(_CONN_STR, autocommit=True)) as cnxn:
        with closing(cnxn.cursor()) as cursor:
            cursor.execute(sql)


_run_ddl(hil_sql)
_run_ddl(afc_sql)
......@@ -47,6 +47,8 @@ class PDFHandler:
'中国建设银行个人活期账户全部交易明细',
'平安银行个人账户交易明细清单',
]
self.page_count = None
self.metadata = None
def get_suffix(self, file_name):
if file_name is None:
......@@ -321,6 +323,7 @@ class PDFHandler:
def e_contract_process(self):
os.makedirs(self.img_dir_path, exist_ok=True)
with fitz.Document(self.path) as pdf:
self.page_count = pdf.pageCount
for pno in range(pdf.pageCount):
page = pdf.loadPage(pno)
self.pdf_info[str(pno)] = json.loads(page.getText('json'))
......@@ -341,6 +344,8 @@ class PDFHandler:
self.img_path_list.append(img_save_path)
else:
with fitz.Document(self.path) as pdf:
self.metadata = pdf.metadata
self.page_count = pdf.pageCount
if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
self.img_count = pdf.pageCount
return
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!