add wb rebuild
Showing 2 changed files with 86 additions and 18 deletions
| ... | @@ -5,14 +5,16 @@ import signal | ... | @@ -5,14 +5,16 @@ import signal |
| 5 | import base64 | 5 | import base64 |
| 6 | import asyncio | 6 | import asyncio |
| 7 | import aiohttp | 7 | import aiohttp |
| 8 | from openpyxl import Workbook | 8 | # from openpyxl import Workbook |
| 9 | from apps.doc.ocr.wb import BSWorkbook | ||
| 9 | from django.core.management import BaseCommand | 10 | from django.core.management import BaseCommand |
| 10 | 11 | ||
| 11 | from settings import conf | 12 | from settings import conf |
| 12 | from common.mixins import LoggerMixin | 13 | from common.mixins import LoggerMixin |
| 13 | from common.tools.file_tools import write_zip_file | 14 | from common.tools.file_tools import write_zip_file |
| 14 | from common.tools.pdf_to_img import PDFHandler | 15 | from common.tools.pdf_to_img import PDFHandler |
| 15 | from apps.doc.models import DocStatus, HILDoc, AFCDoc | 16 | from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords |
| 17 | from apps.doc.named_enum import KeywordsType | ||
| 16 | from apps.doc import consts | 18 | from apps.doc import consts |
| 17 | from apps.doc.ocr.edms import EDMS, rh | 19 | from apps.doc.ocr.edms import EDMS, rh |
| 18 | 20 | ||
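The new imports pull keyword lists from a `Keywords` model filtered by a `KeywordsType` enum. Those definitions are not part of this diff; the sketch below is only a hypothetical reconstruction inferred from the query pattern used later in the command (the real classes live in `apps.doc.models` and `apps.doc.named_enum` and may differ, e.g. the project may use a custom named-enum base class and different enum values).

```python
# Hypothetical sketch only: shapes inferred from
#   Keywords.objects.filter(type=KeywordsType.INTEREST.value)
#           .values_list('keyword', flat=True)
# as used later in this commit. Field names, enum values and table layout
# are assumptions, not the project's actual definitions.
from enum import Enum

from django.db import models


class KeywordsType(Enum):
    INTEREST = 0   # interest-related keywords (values are illustrative)
    SALARY = 1     # salary-related keywords
    LOAN = 2       # loan-related keywords


class Keywords(models.Model):
    type = models.SmallIntegerField()           # stores a KeywordsType value
    keyword = models.CharField(max_length=64)   # the keyword text itself
```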
| ... | @@ -86,9 +88,11 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -86,9 +88,11 @@ class Command(BaseCommand, LoggerMixin): |
| 86 | return doc_data_path, excel_path, pdf_path | 88 | return doc_data_path, excel_path, pdf_path |
| 87 | 89 | ||
| 88 | @staticmethod | 90 | @staticmethod |
| 89 | def append_sheet(wb, sheets_list, img_name): | 91 | def append_sheet(wb, sheets_list, img_name, role_summary): |
| 90 | for i, sheet in enumerate(sheets_list): | 92 | for i, sheet in enumerate(sheets_list): |
| 91 | ws = wb.create_sheet('{0}_{1}'.format(img_name, i)) | 93 | sheet_name = '{0}_{1}'.format(img_name, i) |
| 94 | role_summary['银行-户名'].append((sheet_name, 1, None, None, None, None, None)) | ||
| 95 | ws = wb.create_sheet(sheet_name) | ||
| 92 | cells = sheet.get('cells') | 96 | cells = sheet.get('cells') |
| 93 | for cell in cells: | 97 | for cell in cells: |
| 94 | c1 = cell.get('start_column') | 98 | c1 = cell.get('start_column') |
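`append_sheet` now registers every sheet it creates under the '银行-户名' role. Judging by the unpacking `sheet_name, confidence, page, code, print_time_local, start_date_local, end_date_local = summary` in `BSWorkbook.rebuild` further down, the placeholder tuple appended here lines up with those fields; a hedged illustration of the resulting structure:

```python
# Illustration of the role_summary structure implied by this diff. The field
# names in the comment are inferred from how rebuild() unpacks each tuple and
# are not authoritative; the sheet names are made up.
role_summary = {
    '银行-户名': [
        # (sheet_name, confidence, page, code, print_time, start_date, end_date)
        ('page_1.jpg_0', 1, None, None, None, None, None),
        ('page_1.jpg_1', 1, None, None, None, None, None),
    ]
}
```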
| ... | @@ -112,12 +116,12 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -112,12 +116,12 @@ class Command(BaseCommand, LoggerMixin): |
| 112 | async with session.post(self.ocr_url, json=json_data) as response: | 116 | async with session.post(self.ocr_url, json=json_data) as response: |
| 113 | return await response.json() | 117 | return await response.json() |
| 114 | 118 | ||
| 115 | async def img_ocr_excel(self, wb, img_path): | 119 | async def img_ocr_excel(self, wb, img_path, role_summary): |
| 116 | res = await self.fetch_ocr_result(img_path) | 120 | res = await self.fetch_ocr_result(img_path) |
| 117 | self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res)) | 121 | self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res)) |
| 118 | sheets_list = res.get('result').get('res') | 122 | sheets_list = res.get('result').get('res') |
| 119 | img_name = os.path.basename(img_path) | 123 | img_name = os.path.basename(img_path) |
| 120 | self.append_sheet(wb, sheets_list, img_name) | 124 | self.append_sheet(wb, sheets_list, img_name, role_summary) |
| 121 | 125 | ||
| 122 | # TODO refine the document status handling; apply different handling for different exception states | 126 | # TODO refine the document status handling; apply different handling for different exception states |
| 123 | # TODO retry failed API calls | 127 | # TODO retry failed API calls |
| ... | @@ -148,13 +152,22 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -148,13 +152,22 @@ class Command(BaseCommand, LoggerMixin): |
| 148 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | 152 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) |
| 149 | 153 | ||
| 150 | # 4. Run the algorithm on the images to decide whether they are bank statements, then OCR them into an Excel file | 154 | # 4. Run the algorithm on the images to decide whether they are bank statements, then OCR them into an Excel file |
| 151 | wb = Workbook() | 155 | role_summary = { |
| 156 | '银行-户名': [] | ||
| 157 | } | ||
| 158 | interest_keyword = Keywords.objects.filter( | ||
| 159 | type=KeywordsType.INTEREST.value).values_list('keyword', flat=True) | ||
| 160 | salary_keyword = Keywords.objects.filter( | ||
| 161 | type=KeywordsType.SALARY.value).values_list('keyword', flat=True) | ||
| 162 | loan_keyword = Keywords.objects.filter(type=KeywordsType.LOAN.value).values_list('keyword', flat=True) | ||
| 163 | wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword) | ||
| 152 | loop = asyncio.get_event_loop() | 164 | loop = asyncio.get_event_loop() |
| 153 | tasks = [self.img_ocr_excel(wb, img_path) for img_path in pdf_handler.img_path_list] | 165 | tasks = [self.img_ocr_excel(wb, img_path, role_summary) for img_path in pdf_handler.img_path_list] |
| 154 | loop.run_until_complete(asyncio.wait(tasks)) | 166 | loop.run_until_complete(asyncio.wait(tasks)) |
| 155 | # loop.close() | 167 | # loop.close() |
| 156 | 168 | ||
| 157 | # Consolidate the Excel file | 169 | # Consolidate the Excel file |
| 170 | wb.rebuild(role_summary) | ||
| 158 | wb.save(excel_path) | 171 | wb.save(excel_path) |
| 159 | except Exception as e: | 172 | except Exception as e: |
| 160 | doc.status = DocStatus.PROCESS_FAILED.value | 173 | doc.status = DocStatus.PROCESS_FAILED.value |
| ... | @@ -164,7 +177,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -164,7 +177,8 @@ class Command(BaseCommand, LoggerMixin): |
| 164 | else: | 177 | else: |
| 165 | try: | 178 | try: |
| 166 | # 5. Upload to EDMS | 179 | # 5. Upload to EDMS |
| 167 | self.edms.upload(excel_path, doc, business_type) | 180 | # self.edms.upload(excel_path, doc, business_type) |
| 181 | print('upload pass') | ||
| 168 | except Exception as e: | 182 | except Exception as e: |
| 169 | doc.status = DocStatus.UPLOAD_FAILED.value | 183 | doc.status = DocStatus.UPLOAD_FAILED.value |
| 170 | doc.save() | 184 | doc.save() | ... | ... |
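Taken together, the changes to this management command amount to roughly the flow below. This is a condensed paraphrase of the hunks above, written as a hypothetical free-standing helper for readability; in the actual command the logic sits inside the processing method, surrounded by logging, error handling, and the (temporarily disabled) EDMS upload.

```python
# Condensed, hypothetical sketch of the new per-document flow. `cmd` stands in
# for the Command instance that provides img_ocr_excel(); everything else
# mirrors the diff above.
import asyncio

from apps.doc.models import Keywords
from apps.doc.named_enum import KeywordsType
from apps.doc.ocr.wb import BSWorkbook


def ocr_images_to_workbook(cmd, img_path_list, excel_path):
    # Keyword lists pulled from the Keywords table and handed to BSWorkbook.
    interest_keyword = Keywords.objects.filter(
        type=KeywordsType.INTEREST.value).values_list('keyword', flat=True)
    salary_keyword = Keywords.objects.filter(
        type=KeywordsType.SALARY.value).values_list('keyword', flat=True)
    loan_keyword = Keywords.objects.filter(
        type=KeywordsType.LOAN.value).values_list('keyword', flat=True)

    wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)
    role_summary = {'银行-户名': []}

    # One OCR task per page image; each task appends sheets to wb and records
    # per-sheet metadata into role_summary.
    loop = asyncio.get_event_loop()
    tasks = [cmd.img_ocr_excel(wb, img_path, role_summary)
             for img_path in img_path_list]
    loop.run_until_complete(asyncio.wait(tasks))

    # Consolidate the raw per-image sheets into the final statement workbook.
    wb.rebuild(role_summary)
    wb.save(excel_path)
```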
| 1 | import numpy as np | ||
| 2 | import locale | 1 | import locale |
| 2 | import numpy as np | ||
| 3 | from pandas._libs import tslib | 3 | from pandas._libs import tslib |
| 4 | from pandas._libs.tslibs.nattype import NaTType | ||
| 4 | from pandas.core.indexes.datetimes import DatetimeIndex | 5 | from pandas.core.indexes.datetimes import DatetimeIndex |
| 5 | from openpyxl import Workbook | 6 | from openpyxl import Workbook |
| 6 | from openpyxl.styles import Border, Side, PatternFill, numbers | 7 | from openpyxl.styles import Border, Side, PatternFill, numbers |
| ... | @@ -43,6 +44,7 @@ class BSWorkbook(Workbook): | ... | @@ -43,6 +44,7 @@ class BSWorkbook(Workbook): |
| 43 | self.amount_fill = PatternFill("solid", fgColor="00FFFF00") | 44 | self.amount_fill = PatternFill("solid", fgColor="00FFFF00") |
| 44 | self.bd = Side(style='thin', color="000000") | 45 | self.bd = Side(style='thin', color="000000") |
| 45 | self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd) | 46 | self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd) |
| 47 | self.MAX_MEAN = 31 | ||
| 46 | 48 | ||
| 47 | def sheet_prune(self, ws): | 49 | def sheet_prune(self, ws): |
| 48 | ws.insert_cols(1, amount=self.fixed_col_amount) | 50 | ws.insert_cols(1, amount=self.fixed_col_amount) |
| ... | @@ -56,7 +58,29 @@ class BSWorkbook(Workbook): | ... | @@ -56,7 +58,29 @@ class BSWorkbook(Workbook): |
| 56 | ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col) | 58 | ws.move_range("{0}1:{0}{1}".format(letter, ws.max_row), cols=header_idx - col) |
| 57 | ws.delete_cols(self.fixed_col_amount + 1, amount=ws.max_column) | 59 | ws.delete_cols(self.fixed_col_amount + 1, amount=ws.max_column) |
| 58 | 60 | ||
| 59 | def sheet_split(self, ws, month_mapping): | 61 | @staticmethod |
| 62 | def month_split(dti, date_list): | ||
| 63 | month_list = [] | ||
| 64 | idx_list = [] | ||
| 65 | month_pre = None | ||
| 66 | for idx, month_str in enumerate(dti.strftime('%Y-%m')): | ||
| 67 | if isinstance(month_str, float): | ||
| 68 | continue | ||
| 69 | if month_str != month_pre: | ||
| 70 | month_list.append(month_str) | ||
| 71 | if month_pre is None: | ||
| 72 | date_list.append(dti[idx].date()) | ||
| 73 | idx = 0 | ||
| 74 | idx_list.append(idx) | ||
| 75 | month_pre = month_str | ||
| 76 | for idx in range(len(dti)-1, -1, -1): | ||
| 77 | if isinstance(dti[idx], NaTType): | ||
| 78 | continue | ||
| 79 | date_list.append(dti[idx].date()) | ||
| 80 | break | ||
| 81 | return month_list, idx_list | ||
| 82 | |||
| 83 | def sheet_split(self, ws, month_mapping, date_list): | ||
| 60 | for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True): | 84 | for date_tuple in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True): |
| 61 | dt_array, tz_parsed = tslib.array_to_datetime( | 85 | dt_array, tz_parsed = tslib.array_to_datetime( |
| 62 | np.array(date_tuple, copy=False, dtype=np.object_), | 86 | np.array(date_tuple, copy=False, dtype=np.object_), |
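The new `month_split` helper walks the formatted `DatetimeIndex`, recording a new entry in `month_list` whenever the month changes and the row offset where each month starts in `idx_list`, while `date_list` receives the first and last parseable dates. A minimal, self-contained illustration of that behaviour (assuming pandas is installed and the import path shown at the top of this diff):

```python
# Worked example of month_split as defined in this diff; the expected outputs
# in the comments follow from tracing the code above.
import pandas as pd

from apps.doc.ocr.wb import BSWorkbook

dti = pd.DatetimeIndex(['2020-01-05', '2020-01-20', '2020-02-03', '2020-02-28'])
date_list = []

month_list, idx_list = BSWorkbook.month_split(dti, date_list)
# month_list -> ['2020-01', '2020-02']
# idx_list   -> [0, 2]          (row offsets where each month starts)
# date_list  -> [datetime.date(2020, 1, 5), datetime.date(2020, 2, 28)]
```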
| ... | @@ -68,6 +92,31 @@ class BSWorkbook(Workbook): | ... | @@ -68,6 +92,31 @@ class BSWorkbook(Workbook): |
| 68 | ) | 92 | ) |
| 69 | dti = DatetimeIndex(dt_array, tz=None, name=None) | 93 | dti = DatetimeIndex(dt_array, tz=None, name=None) |
| 70 | 94 | ||
| 95 | month_list, idx_list = self.month_split(dti, date_list) | ||
| 96 | |||
| 97 | if len(month_list) == 0: | ||
| 98 | month_info = month_mapping.setdefault('xxxx-xx', []) | ||
| 99 | month_info.append((ws.title, 2, ws.max_row, 0)) | ||
| 100 | elif len(month_list) == 1: | ||
| 101 | month_info = month_mapping.setdefault(month_list[0], []) | ||
| 102 | day_mean = np.mean(dti.day.dropna()) | ||
| 103 | if len(month_info) == 0: | ||
| 104 | month_info.append((ws.title, 2, ws.max_row, day_mean)) | ||
| 105 | else: | ||
| 106 | for i, item in enumerate(month_info): | ||
| 107 | # TODO process in reverse order | ||
| 108 | if day_mean <= item[-1]: | ||
| 109 | month_info.insert(i, (ws.title, 2, ws.max_row, day_mean)) | ||
| 110 | break | ||
| 111 | else: | ||
| 112 | month_info.append((ws.title, 2, ws.max_row, day_mean)) | ||
| 113 | else: | ||
| 114 | for i, item in enumerate(month_list[:-1]): | ||
| 115 | month_mapping.setdefault(item, []).append( | ||
| 116 | (ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN)) | ||
| 117 | month_mapping.setdefault(month_list[-1], []).insert( | ||
| 118 | 0, (ws.title, idx_list[-1] + 2, ws.max_row, 0)) | ||
| 119 | |||
| 71 | def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval): | 120 | def build_metadata_rows(self, confidence_max, code_list, print_time, start_date, end_date, date_interval): |
| 72 | metadata_rows = [('流水识别置信度', confidence_max), self.blank_row, self.code_header] | 121 | metadata_rows = [('流水识别置信度', confidence_max), self.blank_row, self.code_header] |
| 73 | metadata_rows.extend(code_list) | 122 | metadata_rows.extend(code_list) |
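`sheet_split` now fills `month_mapping`: keys are 'YYYY-MM' strings (or the literal 'xxxx-xx' when no date could be parsed) and each value is a list of (sheet title, first data row, last row, day mean) tuples, where the mean day of month keeps fragments of the same month in order and 0 / `MAX_MEAN` act as sort sentinels for sheets that span several months. A hedged illustration of the resulting structure:

```python
# Illustrative shape of month_mapping after sheet_split has processed a few
# per-image sheets; sheet titles and numbers are made up.
month_mapping = {
    '2020-01': [
        ('page_1.jpg_0', 2, 35, 10.4),   # (sheet title, first row, last row, day mean)
        ('page_2.jpg_0', 2, 28, 24.1),
    ],
    'xxxx-xx': [
        ('page_3.jpg_0', 2, 12, 0),      # sheet whose dates could not be parsed
    ],
}
```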
| ... | @@ -169,19 +218,24 @@ class BSWorkbook(Workbook): | ... | @@ -169,19 +218,24 @@ class BSWorkbook(Workbook): |
| 169 | confidence_max = 0 | 218 | confidence_max = 0 |
| 170 | code_list = [] | 219 | code_list = [] |
| 171 | month_mapping = {} | 220 | month_mapping = {} |
| 172 | print_time = start_date = end_date = date_interval = None | 221 | date_list = [] |
| 222 | start_date = end_date = date_interval = print_time = None | ||
| 173 | for summary in summary_list: | 223 | for summary in summary_list: |
| 174 | sheet_name, confidence, page, code, print_time, start_date, end_date = summary | 224 | sheet_name, confidence, page, code, print_time_local, start_date_local, end_date_local = summary |
| 175 | ws = self.get_sheet_by_name(sheet_name) | 225 | ws = self.get_sheet_by_name(sheet_name) |
| 176 | # 1.1. Remove extra columns and rearrange | 226 | # 1.1. Remove extra columns and rearrange |
| 177 | self.sheet_prune(ws) | 227 | self.sheet_prune(ws) |
| 178 | # 1.2. TODO split by month | 228 | # 1.2. Split by month |
| 179 | self.sheet_split(ws, month_mapping) | 229 | self.sheet_split(ws, month_mapping, date_list) |
| 180 | # 1.3. Metadata processing; TODO handle times and dates | 230 | # 1.3. Metadata processing; TODO handle times and dates |
| 181 | # confidence_max = max(confidence, confidence_max) | 231 | confidence_max = max(confidence, confidence_max) |
| 182 | # if code is not None: | 232 | if code is not None: |
| 183 | # code_list.append((page, code)) | 233 | code_list.append((page, code)) |
| 184 | 234 | ||
| 235 | if len(date_list) > 1: | ||
| 236 | start_date = min(date_list) | ||
| 237 | end_date = max(date_list) | ||
| 238 | date_interval = (end_date - start_date).days | ||
| 185 | # 2. Meta-information extraction sheet | 239 | # 2. Meta-information extraction sheet |
| 186 | ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval) | 240 | ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval) |
| 187 | 241 | ... | ... |
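With the per-sheet date metadata now collected into `date_list`, the overall statement period falls out of a simple min/max; a minimal illustration with plain `datetime.date` values:

```python
# Tiny sketch of the date-range computation the rebuild hunk performs.
import datetime

date_list = [datetime.date(2020, 1, 5), datetime.date(2020, 2, 28),
             datetime.date(2020, 2, 3)]

if len(date_list) > 1:
    start_date = min(date_list)                    # 2020-01-05
    end_date = max(date_list)                      # 2020-02-28
    date_interval = (end_date - start_date).days   # 54
```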