f3d6e429 by 周伟奇

update wb build

1 parent a220590e
......@@ -34,3 +34,4 @@ data/*
src/*.sh
test.py
ocr_test.py
\ No newline at end of file
......
......@@ -67,7 +67,6 @@ class Command(BaseCommand, LoggerMixin):
def pdf_download(self, doc, business_type):
if doc is None:
return None, None, None, None
# TODO EDMS下载pdf
doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
os.makedirs(doc_data_path, exist_ok=True)
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
......@@ -128,8 +127,10 @@ class Command(BaseCommand, LoggerMixin):
img_name = os.path.basename(img_path)
self.append_sheet(wb, sheets_list, img_name, role_summary)
# TODO 细化文件状态,不同异常状态采取不同的处理
# TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理
# TODO 调用接口重试
# TODO 异常邮件通知
# TODO 数据库断联问题
def handle(self, *args, **kwargs):
sleep_second = int(conf.SLEEP_SECOND)
max_sleep_second = int(conf.MAX_SLEEP_SECOND)
......
......@@ -35,6 +35,7 @@ class BSWorkbook(Workbook):
header_value = ws.cell(1, col).value
header_idx = consts.HEADERS_MAPPING.get(header_value)
# TODO 关键字段再次查找
# TODO 支付宝、微信流水第一行非表头,怎么处理
if header_idx is None:
continue
letter = get_column_letter(col)
......@@ -63,8 +64,31 @@ class BSWorkbook(Workbook):
break
return month_list, idx_list
def sheet_split(self, ws, month_mapping, date_list):
for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True):
@staticmethod
def get_reverse_trend(day_idx, idx_list):
    """Report the dominant direction of a sequence of day-of-month values.

    Each non-NaN value is compared with the previous non-NaN value. A
    decrease counts as one vote for "reversed", an increase as one vote
    against, and equal values cast no vote.

    Args:
        day_idx: iterable of numeric day values; NaN entries are skipped.
        idx_list: positions in ``day_idx`` at which the comparison baseline
            is reset instead of compared (the caller passes the index list
            returned by ``month_split`` -- presumably month boundaries).

    Returns:
        1 if decreases outnumber increases, -1 if increases outnumber
        decreases, 0 when they tie or nothing could be compared.
    """
    score = 0
    previous = None
    for position, current in enumerate(day_idx):
        if np.isnan(current):
            continue
        # The first usable value, or a value sitting on a reset position,
        # only establishes a new baseline; it is never compared.
        if previous is None or position in idx_list:
            previous = current
            continue
        if current == previous:
            continue
        score += 1 if current < previous else -1
        previous = current
    # Collapse the vote count to its sign.
    return (score > 0) - (score < 0)
def sheet_split(self, ws, month_mapping, date_list, reverse_trend_list):
for date_tuple_src in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True):
date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src]
dt_array, tz_parsed = tslib.array_to_datetime(
np.array(date_tuple, copy=False, dtype=np.object_),
errors="coerce",
......@@ -78,22 +102,30 @@ class BSWorkbook(Workbook):
month_list, idx_list = self.month_split(dti, date_list)
if len(month_list) == 0:
# month_info process
month_info = month_mapping.setdefault('xxxx-xx', [])
month_info.append((ws.title, 2, ws.max_row, 0))
elif len(month_list) == 1:
# reverse_trend_list process
reverse_trend = self.get_reverse_trend(dti.day, idx_list)
reverse_trend_list.append(reverse_trend)
# month_info process
month_info = month_mapping.setdefault(month_list[0], [])
day_mean = np.mean(dti.day.dropna())
if len(month_info) == 0:
month_info.append((ws.title, 2, ws.max_row, day_mean))
else:
for i, item in enumerate(month_info):
# TODO 倒序处理
if day_mean <= item[-1]:
month_info.insert(i, (ws.title, 2, ws.max_row, day_mean))
break
else:
month_info.append((ws.title, 2, ws.max_row, day_mean))
else:
# reverse_trend_list process
reverse_trend = self.get_reverse_trend(dti.day, idx_list)
reverse_trend_list.append(reverse_trend)
# month_info process
for i, item in enumerate(month_list[:-1]):
month_mapping.setdefault(item, []).append(
(ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN))
......@@ -128,7 +160,7 @@ class BSWorkbook(Workbook):
ms.append(row)
return ms
def build_month_sheet(self, role, month_mapping, ms):
def build_month_sheet(self, role, month_mapping, ms, is_reverse):
tmp_ws = self.create_sheet('tmp_ws')
for month in sorted(month_mapping.keys()):
# 3.1.拷贝数据
......@@ -143,7 +175,6 @@ class BSWorkbook(Workbook):
amount_mapping = {}
amount_fill_row = set()
for rows in new_ws.iter_rows():
is_fill = False
summary_cell = rows[5]
date_cell = rows[0]
# 关键词1提取
......@@ -154,11 +185,9 @@ class BSWorkbook(Workbook):
tmp_ws.append((summary_cell.value, date_cell.value, rows[2].value))
# 贷款关键词高亮
elif summary_cell.value in self.loan_keyword:
is_fill = True
summary_cell.fill = self.loan_fill
for i, cell in enumerate(rows):
cell.border = self.border
if is_fill:
cell.fill = self.loan_fill
if (i == 2 or i == 3) and cell.row > 1:
try:
# 3.3.金额、余额转数值
......@@ -177,16 +206,18 @@ class BSWorkbook(Workbook):
cell.value, []).append(cell.row)
# 3.4.核对结果
# TODO 借贷、开支类型银行流水,需要手动添加+-号
# TODO 倒序流水需要改变公式
if i == 9 and cell.row > 2:
cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(cell.row, cell.row - 1,
*self.proof_res)
if is_reverse:
cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(
cell.row - 1, cell.row, *self.proof_res)
else:
cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(
cell.row, cell.row - 1, *self.proof_res)
# 3.5.同一天相同进出账高亮
del amount_mapping
for row in amount_fill_row:
for cell in new_ws[row]:
cell.fill = self.amount_fill
new_ws[row][2].fill = self.amount_fill
# 关键词2信息提取
ms.append(self.blank_row)
......@@ -196,9 +227,10 @@ class BSWorkbook(Workbook):
self.remove(tmp_ws)
def rebuild(self, role_summary):
# (sheet_name, confidence, page, code, print_time, start_date, end_date)
# (sheet_name, confidence, page, code, print_time, start_date, end_date) # TODO 表名简化,+卡号
for role, summary_list in role_summary.items():
# 1.原表修剪、排列、按照月份分割
reverse_trend_list = []
confidence_max = 0
code_list = []
month_mapping = {}
......@@ -210,7 +242,7 @@ class BSWorkbook(Workbook):
# 1.1.删除多余列、排列
self.sheet_prune(ws)
# 1.2.按月份分割
self.sheet_split(ws, month_mapping, date_list)
self.sheet_split(ws, month_mapping, date_list, reverse_trend_list)
# 1.3.元数据处理 TODO 时间与日期处理
confidence_max = max(confidence, confidence_max)
if code is not None:
......@@ -224,7 +256,12 @@ class BSWorkbook(Workbook):
ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval)
# 3.创建月份表、提取/高亮关键行
self.build_month_sheet(role, month_mapping, ms)
is_reverse = False
if sum(reverse_trend_list) > 0: # 倒序处理
is_reverse = True
for month_list in month_mapping.values():
month_list.sort(key=lambda x: x[-1], reverse=True)
self.build_month_sheet(role, month_mapping, ms, is_reverse)
# 删除原表
for summary in summary_list:
......
......@@ -4,8 +4,10 @@ from PIL import Image
from io import BytesIO
# 页面保存为png图片参数
ZOOM_X = ZOOM_Y = 2.0
trans = fitz.Matrix(ZOOM_X, ZOOM_X).preRotate(0) # zoom factor 2 in each dimension
ZOOM_X_1 = ZOOM_Y_1 = 1.0
ZOOM_X_2 = ZOOM_Y_2 = 2.0
trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension
trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension
# 特殊filter处理
ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}
......@@ -30,7 +32,10 @@ class PDFHandler:
return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext))
def page_to_png(self, page):
pm = page.getPixmap(matrix=trans, alpha=False)
if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500:
pm = page.getPixmap(matrix=trans_1, alpha=False)
else:
pm = page.getPixmap(matrix=trans_2, alpha=False)
img_save_path = self.get_img_save_path(page.number)
pm.writePNG(img_save_path)
self.img_path_list.append(img_save_path)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!