update wb build
Showing
4 changed files
with
66 additions
and
22 deletions
... | @@ -67,7 +67,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -67,7 +67,6 @@ class Command(BaseCommand, LoggerMixin): |
67 | def pdf_download(self, doc, business_type): | 67 | def pdf_download(self, doc, business_type): |
68 | if doc is None: | 68 | if doc is None: |
69 | return None, None, None, None | 69 | return None, None, None, None |
70 | # TODO EDMS下载pdf | ||
71 | doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id)) | 70 | doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id)) |
72 | os.makedirs(doc_data_path, exist_ok=True) | 71 | os.makedirs(doc_data_path, exist_ok=True) |
73 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 72 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
... | @@ -128,8 +127,10 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -128,8 +127,10 @@ class Command(BaseCommand, LoggerMixin): |
128 | img_name = os.path.basename(img_path) | 127 | img_name = os.path.basename(img_path) |
129 | self.append_sheet(wb, sheets_list, img_name, role_summary) | 128 | self.append_sheet(wb, sheets_list, img_name, role_summary) |
130 | 129 | ||
131 | # TODO 细化文件状态,不同异常状态采取不同的处理 | 130 | # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理 |
132 | # TODO 调用接口重试 | 131 | # TODO 调用接口重试 |
132 | # TODO 异常邮件通知 | ||
133 | # TODO 数据库断联问题 | ||
133 | def handle(self, *args, **kwargs): | 134 | def handle(self, *args, **kwargs): |
134 | sleep_second = int(conf.SLEEP_SECOND) | 135 | sleep_second = int(conf.SLEEP_SECOND) |
135 | max_sleep_second = int(conf.MAX_SLEEP_SECOND) | 136 | max_sleep_second = int(conf.MAX_SLEEP_SECOND) | ... | ... |
... | @@ -35,6 +35,7 @@ class BSWorkbook(Workbook): | ... | @@ -35,6 +35,7 @@ class BSWorkbook(Workbook): |
35 | header_value = ws.cell(1, col).value | 35 | header_value = ws.cell(1, col).value |
36 | header_idx = consts.HEADERS_MAPPING.get(header_value) | 36 | header_idx = consts.HEADERS_MAPPING.get(header_value) |
37 | # TODO 关键字段再次查找 | 37 | # TODO 关键字段再次查找 |
38 | # TODO 支付宝、微信流水第一行非表头,怎么处理 | ||
38 | if header_idx is None: | 39 | if header_idx is None: |
39 | continue | 40 | continue |
40 | letter = get_column_letter(col) | 41 | letter = get_column_letter(col) |
... | @@ -63,8 +64,31 @@ class BSWorkbook(Workbook): | ... | @@ -63,8 +64,31 @@ class BSWorkbook(Workbook): |
63 | break | 64 | break |
64 | return month_list, idx_list | 65 | return month_list, idx_list |
65 | 66 | ||
@staticmethod
def get_reverse_trend(day_idx, idx_list):
    """Classify whether a sequence of day numbers runs backwards.

    Walks ``day_idx`` in order and compares every non-NaN entry with the
    previous non-NaN entry. A smaller value counts +1 (descending step),
    a larger value counts -1 (ascending step), and equal values count 0.

    Args:
        day_idx: Iterable of numeric day values; NaN entries are skipped.
        idx_list: Positions in ``day_idx`` at which no comparison is made
            and the reference value is simply reset to the current entry
            (presumably month-boundary offsets from ``month_split`` --
            confirm against the caller).

    Returns:
        1 if descending steps outnumber ascending ones, -1 for the
        opposite, 0 when they balance or there is nothing to compare.
    """
    # Build the lookup once; the original tested membership against the
    # list on every iteration.
    reset_positions = set(idx_list)
    score = 0
    pre_day = None
    for idx, day in enumerate(day_idx):
        if np.isnan(day):
            continue
        # Only compare when there is a previous value and this position
        # is not a reset point.
        if pre_day is not None and idx not in reset_positions:
            if day < pre_day:
                score += 1
            elif day > pre_day:
                score -= 1
        pre_day = day
    # Clamp the tally to its sign: 1, 0 or -1.
    return (score > 0) - (score < 0)
88 | |||
89 | def sheet_split(self, ws, month_mapping, date_list, reverse_trend_list): | ||
90 | for date_tuple_src in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True): | ||
91 | date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src] | ||
68 | dt_array, tz_parsed = tslib.array_to_datetime( | 92 | dt_array, tz_parsed = tslib.array_to_datetime( |
69 | np.array(date_tuple, copy=False, dtype=np.object_), | 93 | np.array(date_tuple, copy=False, dtype=np.object_), |
70 | errors="coerce", | 94 | errors="coerce", |
... | @@ -78,22 +102,30 @@ class BSWorkbook(Workbook): | ... | @@ -78,22 +102,30 @@ class BSWorkbook(Workbook): |
78 | month_list, idx_list = self.month_split(dti, date_list) | 102 | month_list, idx_list = self.month_split(dti, date_list) |
79 | 103 | ||
80 | if len(month_list) == 0: | 104 | if len(month_list) == 0: |
105 | # month_info process | ||
81 | month_info = month_mapping.setdefault('xxxx-xx', []) | 106 | month_info = month_mapping.setdefault('xxxx-xx', []) |
82 | month_info.append((ws.title, 2, ws.max_row, 0)) | 107 | month_info.append((ws.title, 2, ws.max_row, 0)) |
83 | elif len(month_list) == 1: | 108 | elif len(month_list) == 1: |
109 | # reverse_trend_list process | ||
110 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) | ||
111 | reverse_trend_list.append(reverse_trend) | ||
112 | # month_info process | ||
84 | month_info = month_mapping.setdefault(month_list[0], []) | 113 | month_info = month_mapping.setdefault(month_list[0], []) |
85 | day_mean = np.mean(dti.day.dropna()) | 114 | day_mean = np.mean(dti.day.dropna()) |
86 | if len(month_info) == 0: | 115 | if len(month_info) == 0: |
87 | month_info.append((ws.title, 2, ws.max_row, day_mean)) | 116 | month_info.append((ws.title, 2, ws.max_row, day_mean)) |
88 | else: | 117 | else: |
89 | for i, item in enumerate(month_info): | 118 | for i, item in enumerate(month_info): |
90 | # TODO 倒序处理 | ||
91 | if day_mean <= item[-1]: | 119 | if day_mean <= item[-1]: |
92 | month_info.insert(i, (ws.title, 2, ws.max_row, day_mean)) | 120 | month_info.insert(i, (ws.title, 2, ws.max_row, day_mean)) |
93 | break | 121 | break |
94 | else: | 122 | else: |
95 | month_info.append((ws.title, 2, ws.max_row, day_mean)) | 123 | month_info.append((ws.title, 2, ws.max_row, day_mean)) |
96 | else: | 124 | else: |
125 | # reverse_trend_list process | ||
126 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) | ||
127 | reverse_trend_list.append(reverse_trend) | ||
128 | # month_info process | ||
97 | for i, item in enumerate(month_list[:-1]): | 129 | for i, item in enumerate(month_list[:-1]): |
98 | month_mapping.setdefault(item, []).append( | 130 | month_mapping.setdefault(item, []).append( |
99 | (ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN)) | 131 | (ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN)) |
... | @@ -128,7 +160,7 @@ class BSWorkbook(Workbook): | ... | @@ -128,7 +160,7 @@ class BSWorkbook(Workbook): |
128 | ms.append(row) | 160 | ms.append(row) |
129 | return ms | 161 | return ms |
130 | 162 | ||
131 | def build_month_sheet(self, role, month_mapping, ms): | 163 | def build_month_sheet(self, role, month_mapping, ms, is_reverse): |
132 | tmp_ws = self.create_sheet('tmp_ws') | 164 | tmp_ws = self.create_sheet('tmp_ws') |
133 | for month in sorted(month_mapping.keys()): | 165 | for month in sorted(month_mapping.keys()): |
134 | # 3.1.拷贝数据 | 166 | # 3.1.拷贝数据 |
... | @@ -143,7 +175,6 @@ class BSWorkbook(Workbook): | ... | @@ -143,7 +175,6 @@ class BSWorkbook(Workbook): |
143 | amount_mapping = {} | 175 | amount_mapping = {} |
144 | amount_fill_row = set() | 176 | amount_fill_row = set() |
145 | for rows in new_ws.iter_rows(): | 177 | for rows in new_ws.iter_rows(): |
146 | is_fill = False | ||
147 | summary_cell = rows[5] | 178 | summary_cell = rows[5] |
148 | date_cell = rows[0] | 179 | date_cell = rows[0] |
149 | # 关键词1提取 | 180 | # 关键词1提取 |
... | @@ -154,11 +185,9 @@ class BSWorkbook(Workbook): | ... | @@ -154,11 +185,9 @@ class BSWorkbook(Workbook): |
154 | tmp_ws.append((summary_cell.value, date_cell.value, rows[2].value)) | 185 | tmp_ws.append((summary_cell.value, date_cell.value, rows[2].value)) |
155 | # 贷款关键词高亮 | 186 | # 贷款关键词高亮 |
156 | elif summary_cell.value in self.loan_keyword: | 187 | elif summary_cell.value in self.loan_keyword: |
157 | is_fill = True | 188 | summary_cell.fill = self.loan_fill |
158 | for i, cell in enumerate(rows): | 189 | for i, cell in enumerate(rows): |
159 | cell.border = self.border | 190 | cell.border = self.border |
160 | if is_fill: | ||
161 | cell.fill = self.loan_fill | ||
162 | if (i == 2 or i == 3) and cell.row > 1: | 191 | if (i == 2 or i == 3) and cell.row > 1: |
163 | try: | 192 | try: |
164 | # 3.3.金额、余额转数值 | 193 | # 3.3.金额、余额转数值 |
... | @@ -177,16 +206,18 @@ class BSWorkbook(Workbook): | ... | @@ -177,16 +206,18 @@ class BSWorkbook(Workbook): |
177 | cell.value, []).append(cell.row) | 206 | cell.value, []).append(cell.row) |
178 | # 3.4.核对结果 | 207 | # 3.4.核对结果 |
179 | # TODO 借贷、开支类型银行流水,需要手动添加+-号 | 208 | # TODO 借贷、开支类型银行流水,需要手动添加+-号 |
180 | # TODO 倒序流水需要改变公式 | ||
181 | if i == 9 and cell.row > 2: | 209 | if i == 9 and cell.row > 2: |
182 | cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format(cell.row, cell.row - 1, | 210 | if is_reverse: |
183 | *self.proof_res) | 211 | cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format( |
212 | cell.row - 1, cell.row, *self.proof_res) | ||
213 | else: | ||
214 | cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format( | ||
215 | cell.row, cell.row - 1, *self.proof_res) | ||
184 | 216 | ||
185 | # 3.5.同一天相同进出账高亮 | 217 | # 3.5.同一天相同进出账高亮 |
186 | del amount_mapping | 218 | del amount_mapping |
187 | for row in amount_fill_row: | 219 | for row in amount_fill_row: |
188 | for cell in new_ws[row]: | 220 | new_ws[row][2].fill = self.amount_fill |
189 | cell.fill = self.amount_fill | ||
190 | 221 | ||
191 | # 关键词2信息提取 | 222 | # 关键词2信息提取 |
192 | ms.append(self.blank_row) | 223 | ms.append(self.blank_row) |
... | @@ -196,9 +227,10 @@ class BSWorkbook(Workbook): | ... | @@ -196,9 +227,10 @@ class BSWorkbook(Workbook): |
196 | self.remove(tmp_ws) | 227 | self.remove(tmp_ws) |
197 | 228 | ||
198 | def rebuild(self, role_summary): | 229 | def rebuild(self, role_summary): |
199 | # (sheet_name, confidence, page, code, print_time, start_date, end_date) | 230 | # (sheet_name, confidence, page, code, print_time, start_date, end_date) # TODO 表名简化,+卡号 |
200 | for role, summary_list in role_summary.items(): | 231 | for role, summary_list in role_summary.items(): |
201 | # 1.原表修剪、排列、按照月份分割 | 232 | # 1.原表修剪、排列、按照月份分割 |
233 | reverse_trend_list = [] | ||
202 | confidence_max = 0 | 234 | confidence_max = 0 |
203 | code_list = [] | 235 | code_list = [] |
204 | month_mapping = {} | 236 | month_mapping = {} |
... | @@ -210,7 +242,7 @@ class BSWorkbook(Workbook): | ... | @@ -210,7 +242,7 @@ class BSWorkbook(Workbook): |
210 | # 1.1.删除多余列、排列 | 242 | # 1.1.删除多余列、排列 |
211 | self.sheet_prune(ws) | 243 | self.sheet_prune(ws) |
212 | # 1.2.按月份分割 | 244 | # 1.2.按月份分割 |
213 | self.sheet_split(ws, month_mapping, date_list) | 245 | self.sheet_split(ws, month_mapping, date_list, reverse_trend_list) |
214 | # 1.3.元数据处理 TODO 时间与日期处理 | 246 | # 1.3.元数据处理 TODO 时间与日期处理 |
215 | confidence_max = max(confidence, confidence_max) | 247 | confidence_max = max(confidence, confidence_max) |
216 | if code is not None: | 248 | if code is not None: |
... | @@ -224,7 +256,12 @@ class BSWorkbook(Workbook): | ... | @@ -224,7 +256,12 @@ class BSWorkbook(Workbook): |
224 | ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval) | 256 | ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval) |
225 | 257 | ||
226 | # 3.创建月份表、提取/高亮关键行 | 258 | # 3.创建月份表、提取/高亮关键行 |
227 | self.build_month_sheet(role, month_mapping, ms) | 259 | is_reverse = False |
260 | if sum(reverse_trend_list) > 0: # 倒序处理 | ||
261 | is_reverse = True | ||
262 | for month_list in month_mapping.values(): | ||
263 | month_list.sort(key=lambda x: x[-1], reverse=True) | ||
264 | self.build_month_sheet(role, month_mapping, ms, is_reverse) | ||
228 | 265 | ||
229 | # 删除原表 | 266 | # 删除原表 |
230 | for summary in summary_list: | 267 | for summary in summary_list: | ... | ... |
... | @@ -4,8 +4,10 @@ from PIL import Image | ... | @@ -4,8 +4,10 @@ from PIL import Image |
4 | from io import BytesIO | 4 | from io import BytesIO |
5 | 5 | ||
6 | # 页面保存为png图片参数 | 6 | # 页面保存为png图片参数 |
7 | ZOOM_X = ZOOM_Y = 2.0 | 7 | ZOOM_X_1 = ZOOM_Y_1 = 1.0 |
8 | trans = fitz.Matrix(ZOOM_X, ZOOM_X).preRotate(0) # zoom factor 2 in each dimension | 8 | ZOOM_X_2 = ZOOM_Y_2 = 2.0 |
9 | trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension | ||
10 | trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension | ||
9 | 11 | ||
10 | # 特殊filter处理 | 12 | # 特殊filter处理 |
11 | ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'} | 13 | ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'} |
... | @@ -30,7 +32,10 @@ class PDFHandler: | ... | @@ -30,7 +32,10 @@ class PDFHandler: |
30 | return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)) | 32 | return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)) |
31 | 33 | ||
32 | def page_to_png(self, page): | 34 | def page_to_png(self, page): |
33 | pm = page.getPixmap(matrix=trans, alpha=False) | 35 | if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: |
36 | pm = page.getPixmap(matrix=trans_1, alpha=False) | ||
37 | else: | ||
38 | pm = page.getPixmap(matrix=trans_2, alpha=False) | ||
34 | img_save_path = self.get_img_save_path(page.number) | 39 | img_save_path = self.get_img_save_path(page.number) |
35 | pm.writePNG(img_save_path) | 40 | pm.writePNG(img_save_path) |
36 | self.img_path_list.append(img_save_path) | 41 | self.img_path_list.append(img_save_path) | ... | ... |
-
Please register or sign in to post a comment