13e30ac5 by 周伟奇

add wb rebuild

1 parent 1526125c
......@@ -5,14 +5,16 @@ import signal
import base64
import asyncio
import aiohttp
from openpyxl import Workbook
# from openpyxl import Workbook
from apps.doc.ocr.wb import BSWorkbook
from django.core.management import BaseCommand
from settings import conf
from common.mixins import LoggerMixin
from common.tools.file_tools import write_zip_file
from common.tools.pdf_to_img import PDFHandler
from apps.doc.models import DocStatus, HILDoc, AFCDoc
from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
from apps.doc.named_enum import KeywordsType
from apps.doc import consts
from apps.doc.ocr.edms import EDMS, rh
......@@ -86,9 +88,11 @@ class Command(BaseCommand, LoggerMixin):
return doc_data_path, excel_path, pdf_path
@staticmethod
def append_sheet(wb, sheets_list, img_name):
def append_sheet(wb, sheets_list, img_name, role_summary):
for i, sheet in enumerate(sheets_list):
ws = wb.create_sheet('{0}_{1}'.format(img_name, i))
sheet_name = '{0}_{1}'.format(img_name, i)
role_summary['银行-户名'].append((sheet_name, 1, None, None, None, None, None))
ws = wb.create_sheet(sheet_name)
cells = sheet.get('cells')
for cell in cells:
c1 = cell.get('start_column')
......@@ -112,12 +116,12 @@ class Command(BaseCommand, LoggerMixin):
async with session.post(self.ocr_url, json=json_data) as response:
return await response.json()
async def img_ocr_excel(self, wb, img_path):
async def img_ocr_excel(self, wb, img_path, role_summary):
res = await self.fetch_ocr_result(img_path)
self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
sheets_list = res.get('result').get('res')
img_name = os.path.basename(img_path)
self.append_sheet(wb, sheets_list, img_name)
self.append_sheet(wb, sheets_list, img_name, role_summary)
# TODO 细化文件状态,不同异常状态采取不同的处理
# TODO 调用接口重试
......@@ -148,13 +152,22 @@ class Command(BaseCommand, LoggerMixin):
write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
# 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件
wb = Workbook()
role_summary = {
'银行-户名': []
}
interest_keyword = Keywords.objects.filter(
type=KeywordsType.INTEREST.value).values_list('keyword', flat=True)
salary_keyword = Keywords.objects.filter(
type=KeywordsType.SALARY.value).values_list('keyword', flat=True)
loan_keyword = Keywords.objects.filter(type=KeywordsType.LOAN.value).values_list('keyword', flat=True)
wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)
loop = asyncio.get_event_loop()
tasks = [self.img_ocr_excel(wb, img_path) for img_path in pdf_handler.img_path_list]
tasks = [self.img_ocr_excel(wb, img_path, role_summary) for img_path in pdf_handler.img_path_list]
loop.run_until_complete(asyncio.wait(tasks))
# loop.close()
# 整合excel文件
wb.rebuild(role_summary)
wb.save(excel_path)
except Exception as e:
doc.status = DocStatus.PROCESS_FAILED.value
......@@ -164,7 +177,8 @@ class Command(BaseCommand, LoggerMixin):
else:
try:
# 5.上传至EDMS
self.edms.upload(excel_path, doc, business_type)
# self.edms.upload(excel_path, doc, business_type)
print('upload pass')
except Exception as e:
doc.status = DocStatus.UPLOAD_FAILED.value
doc.save()
......
import numpy as np
import locale
import numpy as np
from pandas._libs import tslib
from pandas._libs.tslibs.nattype import NaTType
from pandas.core.indexes.datetimes import DatetimeIndex
from openpyxl import Workbook
from openpyxl.styles import Border, Side, PatternFill, numbers
......@@ -43,6 +44,7 @@ class BSWorkbook(Workbook):
self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
self.bd = Side(style='thin', color="000000")
self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd)
self.MAX_MEAN = 31
def sheet_prune(self, ws):
ws.insert_cols(1, amount=self.fixed_col_amount)
......@@ -56,7 +58,29 @@ class BSWorkbook(Workbook):
ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col)
ws.delete_cols(self.fixed_col_amount + 1, amount=ws.max_column)
def sheet_split(self, ws, month_mapping):
@staticmethod
def month_split(dti, date_list):
    """Find where the month changes along a column of dates.

    Args:
        dti: A pandas ``DatetimeIndex``; entries may be ``NaT``.
        date_list: A list that is extended in place. The date of the first
            non-NaT entry is appended, then the date of the last non-NaT
            entry. Nothing is appended when every entry is NaT.

    Returns:
        A ``(month_list, idx_list)`` tuple of equal length. ``month_list``
        holds the ``'%Y-%m'`` string of every run of equal months, in order
        of appearance. ``idx_list`` holds the 0-based position in ``dti``
        where each run starts; the first run always starts at 0, so leading
        NaT entries are counted as part of the first month.
    """
    month_list = []
    idx_list = []
    month_pre = None
    for idx, month_str in enumerate(dti.strftime('%Y-%m')):
        # NaT entries do not format to a string (they come back as a
        # missing-value placeholder), so anything that is not a str is skipped.
        if not isinstance(month_str, str):
            continue
        if month_str == month_pre:
            continue
        month_list.append(month_str)
        if month_pre is None:
            # First valid entry: record the start date and anchor the first
            # month at offset 0 regardless of any leading NaT entries.
            date_list.append(dti[idx].date())
            idx_list.append(0)
        else:
            idx_list.append(idx)
        month_pre = month_str

    # Walk backwards to find the last valid entry, which is the end date.
    for idx in reversed(range(len(dti))):
        if isinstance(dti[idx], NaTType):
            continue
        date_list.append(dti[idx].date())
        break
    return month_list, idx_list
def sheet_split(self, ws, month_mapping, date_list):
for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True):
dt_array, tz_parsed = tslib.array_to_datetime(
np.array(date_tuple, copy=False, dtype=np.object_),
......@@ -68,6 +92,31 @@ class BSWorkbook(Workbook):
)
dti = DatetimeIndex(dt_array, tz=None, name=None)
month_list, idx_list = self.month_split(dti, date_list)
if len(month_list) == 0:
month_info = month_mapping.setdefault('xxxx-xx', [])
month_info.append((ws.title, 2, ws.max_row, 0))
elif len(month_list) == 1:
month_info = month_mapping.setdefault(month_list[0], [])
day_mean = np.mean(dti.day.dropna())
if len(month_info) == 0:
month_info.append((ws.title, 2, ws.max_row, day_mean))
else:
for i, item in enumerate(month_info):
# TODO 倒序处理
if day_mean <= item[-1]:
month_info.insert(i, (ws.title, 2, ws.max_row, day_mean))
break
else:
month_info.append((ws.title, 2, ws.max_row, day_mean))
else:
for i, item in enumerate(month_list[:-1]):
month_mapping.setdefault(item, []).append(
(ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN))
month_mapping.setdefault(month_list[-1], []).insert(
0, (ws.title, idx_list[-1] + 2, ws.max_row, 0))
def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval):
metadata_rows = [('流水识别置信度', confidence_max), self.blank_row, self.code_header]
metadata_rows.extend(code_list)
......@@ -169,19 +218,24 @@ class BSWorkbook(Workbook):
confidence_max = 0
code_list = []
month_mapping = {}
print_time = start_date = end_date = date_interval = None
date_list = []
start_date = end_date = date_interval = print_time = None
for summary in summary_list:
sheet_name, confidence, page, code, print_time, start_date, end_date = summary
sheet_name, confidence, page, code, print_time_local, start_date_local, end_date_local = summary
ws = self.get_sheet_by_name(sheet_name)
# 1.1.删除多余列、排列
self.sheet_prune(ws)
# 1.2.TODO 按月份分割
self.sheet_split(ws, month_mapping)
# 1.2.按月份分割
self.sheet_split(ws, month_mapping, date_list)
# 1.3.元数据处理 TODO 时间与日期处理
# confidence_max = max(confidence, confidence_max)
# if code is not None:
# code_list.append((page, code))
confidence_max = max(confidence, confidence_max)
if code is not None:
code_list.append((page, code))
if len(date_list) > 1:
start_date = min(date_list)
end_date = max(date_list)
date_interval = (end_date - start_date).days
# 2.元信息提取表
ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!