add wsc
Showing
4 changed files
with
621 additions
and
8 deletions
... | @@ -206,8 +206,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -206,8 +206,8 @@ class Command(BaseCommand, LoggerMixin): |
206 | ocr_res = ocr_response.json() | 206 | ocr_res = ocr_response.json() |
207 | end_time = time.time() | 207 | end_time = time.time() |
208 | speed_time = int(end_time - start_time) | 208 | speed_time = int(end_time - start_time) |
209 | self.folder_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format( | 209 | self.folder_log.info('{0} [ocr success] [img={1}] [speed_time={2}]'.format( |
210 | self.log_base, img_path, ocr_res, speed_time)) | 210 | self.log_base, img_path, speed_time)) |
211 | 211 | ||
212 | if isinstance(ocr_res, dict): | 212 | if isinstance(ocr_res, dict): |
213 | if ocr_res.get('code') == 1: | 213 | if ocr_res.get('code') == 1: | ... | ... |
... | @@ -224,8 +224,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -224,8 +224,8 @@ class Command(BaseCommand, LoggerMixin): |
224 | ocr_res = ocr_response.json() | 224 | ocr_res = ocr_response.json() |
225 | end_time = time.time() | 225 | end_time = time.time() |
226 | speed_time = int(end_time - start_time) | 226 | speed_time = int(end_time - start_time) |
227 | self.folder_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format( | 227 | self.folder_log.info('{0} [ocr success] [img={1}] [speed_time={2}]'.format( |
228 | self.log_base, img_path, ocr_res, speed_time)) | 228 | self.log_base, img_path, speed_time)) |
229 | 229 | ||
230 | if isinstance(ocr_res, dict): | 230 | if isinstance(ocr_res, dict): |
231 | if ocr_res.get('code') == 1: | 231 | if ocr_res.get('code') == 1: |
... | @@ -275,8 +275,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -275,8 +275,8 @@ class Command(BaseCommand, LoggerMixin): |
275 | ocr_res = ocr_response.json() | 275 | ocr_res = ocr_response.json() |
276 | end_time = time.time() | 276 | end_time = time.time() |
277 | speed_time = int(end_time - start_time) | 277 | speed_time = int(end_time - start_time) |
278 | self.folder_log.info('{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format( | 278 | self.folder_log.info('{0} [ltgt ocr success] [path={1}] [speed_time={2}]'.format( |
279 | self.log_base, path, ocr_res, speed_time)) | 279 | self.log_base, path, speed_time)) |
280 | return ocr_res | 280 | return ocr_res |
281 | else: | 281 | else: |
282 | self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path)) | 282 | self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path)) | ... | ... |
... | @@ -201,8 +201,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -201,8 +201,8 @@ class Command(BaseCommand, LoggerMixin): |
201 | ocr_res = ocr_response.json() | 201 | ocr_res = ocr_response.json() |
202 | end_time = time.time() | 202 | end_time = time.time() |
203 | speed_time = int(end_time - start_time) | 203 | speed_time = int(end_time - start_time) |
204 | self.folder_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format( | 204 | self.folder_log.info('{0} [ocr success] [img={1}] [speed_time={2}]'.format( |
205 | self.log_base, img_path, ocr_res, speed_time)) | 205 | self.log_base, img_path, speed_time)) |
206 | return ocr_res | 206 | return ocr_res |
207 | else: | 207 | else: |
208 | self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path)) | 208 | self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path)) | ... | ... |
1 | import os | ||
2 | import re | ||
3 | import time | ||
4 | import shutil | ||
5 | import base64 | ||
6 | import signal | ||
7 | import requests | ||
8 | import traceback | ||
9 | from PIL import Image | ||
10 | from datetime import datetime | ||
11 | from django.core.management import BaseCommand | ||
12 | from multiprocessing import Process | ||
13 | import numpy as np | ||
14 | from fuzzywuzzy import fuzz | ||
15 | from shapely.geometry import Polygon | ||
16 | |||
17 | from settings import conf | ||
18 | from common.mixins import LoggerMixin | ||
19 | from common.tools.pdf_to_img import PDFHandler | ||
20 | from apps.doc import consts | ||
21 | from apps.doc.exceptions import OCR1Exception, OCR4Exception | ||
22 | from apps.doc.ocr.wb import BSWorkbook | ||
23 | |||
24 | |||
class Finder:
    """Pull the wholesale-contract fields out of per-page OCR output.

    Attributes:
        ocr_results: Mapping of page number to that page's OCR result. Each
            page result maps an opaque key to a ``[bbox, text]`` pair, where
            ``bbox`` is a flat sequence of eight numbers
            ``x0, y0, x1, y1, x2, y2, x3, y3``. May be ``None`` until the
            caller assigns it; a page value of ``None`` (failed OCR) is
            treated as an empty page.
        init_result: Dict of output field name -> extracted value. It is
            filled in place and returned by :meth:`get_info`.
    """

    # Every character outside the CJK range U+4E00..U+9FA5.
    _NON_CHINESE = re.compile("[^\u4e00-\u9fa5]")
    # Every character inside the CJK range U+4E00..U+9FA5.
    _CHINESE = re.compile("[\u4e00-\u9fa5]")
    # Date shapes searched for on the credit-line page.
    _DATE_ENG = re.compile(r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}')
    _DATE_CHN = re.compile(r'[0-9]{4}-[0-9]+-[0-9]+')

    # Full-width punctuation folded to ASCII in the guarantor paragraph.
    # NOTE(review): the reviewed text showed ASCII-to-ASCII replacements
    # (no-ops); the full-width source forms are assumed to be the intent.
    _GUARANTOR_PUNCT = str.maketrans({
        '【': '[',
        '】': ']',
        '，': ',',
        '（': '(',
        '）': ')',
    })

    # Characters the OCR engine confuses with upper-case Chinese numerals.
    _AMOUNT_FIXES = str.maketrans({
        '仔': '仟',
        '任': '仟',
        '值': '佰',
        '拐': '捌',
        '查': '壹',
        '政': '玖',
        '垒': '叁',
    })

    # Chinese-only reference text of the pages to locate (fuzzy matched).
    _PAGE3_TEMPLATE = (
        "合同编号宝马汽车金融中国有限公司甲方宝马汽车金融中国有限公司地址中国北京市朝阳区东三环北路霞光里号佳程"
        "广场座层乙方统一社会信用代码或公司注册号地址鉴于甲方是一家依照中国法律合法组建和存续的汽车金融公司愿意"
        "为宝马中国汽车贸易有限公司以下简称宝马中国及华晨宝马汽车有限公司以下简称华晨宝马在中国大陆的宝马集团经"
        "销商提供汽车批售融资服务乙方是一家依据中国法律合法组建和存续与宝马中国和或华晨宝马签署了授权销售合同具"
        "有专营进口和或国产宝马集团产品合法资格的企业本着自愿平等互惠互利的原则甲乙双方经充分协商签署本综合授信"
        "额度合同本合同达成如下条款综合授信额度合同版本"
    )
    _PAGE38_TEMPLATE = (
        "宝马汽车金融中国有限公司合同编号签署页甲方宝马汽车金融中国有限公司盖章姓名姓名职务职务日期乙方汽车销售服务"
        "有限公司盖章姓名姓名职务职务日期综合授信额度合同版本"
    )
    _PAGE39_TEMPLATE = (
        "合同编号中国有限公司宝马汽车金融综合授信额度合同附件确认函综合授信额度金额本合同项下的综合授信额度为人民币"
        "大写综合授信额度下面各个业务的授信额度将由甲方以授信额度通知函的方式时不时的通知乙方本合同项下的综合授信额"
        "度可以由甲方根据乙方的信用和财务状况自行决定随时调整本合同项下的综合授信额度应为在本确认函第条的期间内双方"
        "在确认函中确认的授信额度以及甲方向乙方时不时通过书面授信额度通知函以及临时授信额度通知函中沟通的额度的总和"
        "综合授信额度期限从至或者由甲方向乙方通过书面形式在授信额度通知函中沟通的更短期间保证金甲方对乙方的最低保证"
        "金要求为综合授信额度的实际执行的保证金比例以甲方不时另行书面通知根据最新的经销商融资或保证金相关政策或活动"
        "为准综合授信额度合同版本"
    )

    def __init__(self, ocr_results=None):
        self.ocr_results = ocr_results

        self.init_result = {
            "合同编号列表": [],
            "经销商名称_Page3": "",
            "经销商名称_Page38": "",
            "经销商统一社会信用代码或公司注册号": "",
            "保证人": "",
            "综合授信额度金额英文": "",
            "综合授信额度金额中文": "",
            "综合授信额度期限开始日期英文": '',
            "综合授信额度期限截止日期英文": '',
            "综合授信额度期限开始日期中文": '',
            "综合授信额度期限截止日期中文": '',
            "保证金比例英文": "",
            "保证金比例中文": "",
            "其他约定与条件英文": "",
            "其他约定与条件中文": "",
        }

    def _pages(self):
        """Return the page mapping, treating an unset ``ocr_results`` as empty."""
        return self.ocr_results or {}

    def _page(self, pno):
        """Return one page's OCR result; a missing or ``None`` page is empty."""
        return self._pages().get(pno) or {}

    def _all_text(self):
        """Concatenate the text of every box on every page, in mapping order."""
        return ''.join(
            text
            for page in self._pages().values()
            for _bbox, text in (page or {}).values()
        )

    def get_line(self, ocr_results, key_string):
        """Return the text of every box on the same row as ``key_string``.

        The row is defined by the vertical extent of the first box whose text
        contains ``key_string``; a box belongs to the row when the mean of its
        y coordinates lies strictly inside that extent. Matching boxes are
        joined left to right (by their first x coordinate).

        Returns:
            The joined row text, or ``''`` when the keyword is not found.
        """
        top, bottom = -1, -1
        # Locate the box that holds the keyword.
        for bbox, text in ocr_results.values():
            if key_string in text:
                ys = bbox[1::2]
                top, bottom = min(ys), max(ys)
                break

        # Collect every box whose vertical centre falls inside that band.
        row = [
            (bbox, text)
            for bbox, text in ocr_results.values()
            if top < np.mean(bbox[1::2]) < bottom
        ]
        row.sort(key=lambda item: item[0][0])
        return ''.join(text for _bbox, text in row)

    def page_predict(self, ocr_results, page_template):
        """Find the page whose Chinese text is most similar to ``page_template``.

        Args:
            ocr_results: Mapping of page number -> page OCR result.
            page_template: Chinese-only reference text of the wanted page.

        Returns:
            ``[page_number, score]`` with ``score`` in ``0..1`` (fuzzywuzzy
            ratio divided by 100). ``[None, 0.0]`` when there are no pages, so
            callers' ``score > threshold`` checks simply fail instead of the
            lookup raising ``IndexError``.
        """
        best = [None, 0.0]
        seen_any = False
        for pno, page in (ocr_results or {}).items():
            page_text = ''.join(text for _bbox, text in (page or {}).values())
            page_text = self._NON_CHINESE.sub('', page_text)
            score = fuzz.ratio(page_template, page_text) / 100.
            # Strict '>' keeps the earliest page on ties.
            if not seen_any or score > best[1]:
                best = [pno, score]
                seen_any = True
        return best

    def get_top_key(self, ocr_results, key_string):
        """Return ``[ratio, key]`` of the box whose text best matches ``key_string``.

        Returns ``(-1, -1)`` when ``ocr_results`` is empty.
        """
        if len(ocr_results) == 0:
            return -1, -1
        ratio_list = [
            [fuzz.ratio(key_string, value[1]), key]
            for key, value in ocr_results.items()
        ]
        # Stable sort + last element: the latest box wins on equal ratios.
        return sorted(ratio_list, key=lambda item: item[0])[-1]

    def get_top_iou(self, ocr_results, poly):
        """Return ``[iou, key]`` of the box overlapping ``poly`` the most.

        Invalid polygons and degenerate pairs (zero union area) are skipped.
        Returns ``(-1, -1)`` when no box can be scored.
        """
        target = Polygon(np.array(poly).reshape((-1, 2)))
        if not target.is_valid:
            return -1, -1

        iou_list = []
        for key, (bbox, _text) in ocr_results.items():
            candidate = Polygon(np.array(bbox).reshape((-1, 2)))
            if not candidate.is_valid:
                continue
            inter = candidate.intersection(target).area
            union = candidate.area + target.area - inter
            if union <= 0:
                # Both shapes have no area; an IoU is undefined.
                continue
            iou_list.append([inter / union, key])
        if len(iou_list) == 0:
            return -1, -1
        return sorted(iou_list, key=lambda item: item[0])[-1]

    def get_key_value(self, ocr_results, key_string):
        """Look up the value printed next to the label ``key_string``.

        The label box is found by fuzzy-matching ``key_string`` against the
        Chinese-only text of every box (ratio must exceed 50). If that box
        also carries the value (text after the last ``:``) it is returned.
        Otherwise the box is shifted right by its own width and the box with
        the highest IoU against that anchor supplies the value, provided the
        IoU exceeds 0.3.

        Returns:
            The extracted value, or ``''`` when nothing suitable is found.
        """
        value = ''

        # Compare labels on Chinese characters only.
        cleaned = {
            key: [bbox, self._NON_CHINESE.sub('', text)]
            for key, (bbox, text) in ocr_results.items()
        }

        ratio, key = self.get_top_key(cleaned, key_string)
        if ratio <= 50:
            return value

        bbox, text = ocr_results[key]
        # NOTE: str.strip removes any of the label's characters from both
        # ends (a character set, not a prefix).
        words = text.strip(key_string).split(':')[-1]
        if len(words) > 0:
            return words

        # The label box holds no value: probe one box-width to the right.
        x0, y0, x1, y1, x2, y2, x3, y3 = bbox
        rw = abs(x0 - x1)
        anchor = [x0 + rw, y0, x1 + rw, y1, x2 + rw, y2, x3 + rw, y3]
        iou, neighbour = self.get_top_iou(ocr_results, anchor)
        # The overlap (not the label ratio) decides whether the neighbour is
        # accepted; this also rejects the (-1, -1) "nothing found" result.
        if iou > 0.3:
            _bbox, text = ocr_results[neighbour]
            value = text.strip(key_string).split(':')[-1]
        return value

    def get_contract_No(self):
        """Return the contract number found on each page, in page order.

        Pages with no OCR output contribute ``''``.
        """
        contract_No_list = []
        for pno in self._pages():
            page = self._page(pno)
            if len(page) > 0:
                contract_No = self.get_key_value(page, '合同编号')
            else:
                contract_No = ''
            contract_No_list.append(contract_No)
        return contract_No_list

    def get_info_in_page_3(self):
        """Extract the dealer name and its credit code / registration number.

        Returns:
            ``(dealer_name, dealer_No)``; both ``''`` when the page is not
            recognised (similarity score not above 0.5) or is empty.
        """
        dealer_name = ''
        dealer_No = ''

        pno, score = self.page_predict(self._pages(), self._PAGE3_TEMPLATE)
        page = self._page(pno) if score > 0.5 else {}
        if len(page) > 0:
            # The dealer is "Party B"; the last matching box wins.
            for _bbox, text in page.values():
                if '乙方:' in text:
                    dealer_name = (text.split(':')[-1]
                                   .replace('【', '[')
                                   .replace('{', '[')
                                   .replace('】', ']'))

            words = self.get_key_value(page, '统一社会信用代码或公司注册号')
            # Letter O is replaced by digit zero in the code.
            dealer_No = words.replace('O', '0')
        return dealer_name, dealer_No

    def get_info_in_page_38(self):
        """Extract the dealer name from the signature page.

        Returns:
            The dealer name, or ``''`` when the page is not recognised.
        """
        dealer_name = ''

        pno, score = self.page_predict(self._pages(), self._PAGE38_TEMPLATE)
        page = self._page(pno) if score > 0.5 else {}
        for _bbox, text in page.values():
            if '乙方:' in text:
                words = text.split(':')[-1].replace('【', '[').replace('{', '[')
                # Drop the "(seal)" marker, in ASCII or full-width brackets.
                words = re.sub(r'[()（）盖章《]', "", words)
                dealer_name = words
        return dealer_name

    def get_guarantor(self):
        """Extract the bracketed guarantor list from the whole document.

        Returns:
            ``'[...]'`` with full-width punctuation folded to ASCII, or the
            placeholder ``'[/]'`` when the paragraph is not found.
        """
        guarantor = '[/]'
        found = re.search(r'保证人\[(.*?)\]与甲方', self._all_text())
        if found:
            guarantor = f'[{found.group(1)}]'.translate(self._GUARANTOR_PUNCT)
        return guarantor

    def get_info_in_page_39(self):
        """Extract amount, term and deposit ratio from the credit-line page.

        Returns:
            ``(amount_eng, amount_chn, term_start_eng, term_end_eng,
            term_start_chn, term_end_chn, deposit_eng, deposit_chn)``; any
            field that cannot be found is ``''``.
        """
        amount_eng = ''
        amount_chn = ''
        term_start_eng = ''
        term_end_eng = ''
        term_start_chn = ''
        term_end_chn = ''
        deposit_eng = ''
        deposit_chn = ''

        pno, score = self.page_predict(self._pages(), self._PAGE39_TEMPLATE)
        page = self._page(pno) if score > 0.5 else {}
        if len(page) > 0:
            # English amount: first run of digits/commas/dots on the RMB row.
            found = re.search(r'[0-9,.]+', self.get_line(page, 'RMB'))
            if found:
                amount_eng = found.group()

            # Chinese amount in capital numerals, with OCR look-alikes fixed.
            found = re.search(r'大写(.*?)综合', self.get_line(page, '人民币'))
            if found:
                words = self._NON_CHINESE.sub('', found.group(1))
                amount_chn = words.translate(self._AMOUNT_FIXES)

            # English term: dates before / after the word "ending".
            lines = self.get_line(page, 'ending')
            if len(lines) > 0:
                # partition never raises, even if the word occurs twice.
                start, _sep, end = lines.partition('ending')
                found = self._DATE_ENG.search(start)
                if found:
                    term_start_eng = found.group()
                found = self._DATE_ENG.search(end)
                if found:
                    term_end_eng = found.group()

            # Chinese term: dates before / after the first "至" ("to").
            lines = self.get_line(page, '至')
            if len(lines) > 0:
                start, _sep, end = lines.partition('至')
                found = self._DATE_CHN.search(start)
                if found:
                    term_start_chn = found.group()
                found = self._DATE_CHN.search(end)
                if found:
                    term_end_chn = found.group()

            found = re.search(r'aboveto([0-9]+)', self.get_line(page, 'above'))
            if found:
                deposit_eng = f'{found.group(1)}%'

            found = re.search(r'授信额度的([0-9]+)', self.get_line(page, '授信额度的'))
            if found:
                deposit_chn = f'{found.group(1)}%'

        return amount_eng, amount_chn, term_start_eng, term_end_eng, \
            term_start_chn, term_end_chn, deposit_eng, deposit_chn

    def get_other_arrangements_and_conditions(self):
        """Extract the "other arrangements and conditions" paragraphs.

        Returns:
            ``(english_text, chinese_text)``; each is ``''`` when its
            delimiters are not found in the concatenated document text.
        """
        other_arrangements_and_conditions_eng = ''
        other_arrangements_and_conditions_chn = ''

        all_texts = self._all_text()

        found = re.search(r'Conditions:(.*?)其他约定与条件', all_texts, re.I)
        if found:
            # Keep only the non-Chinese part of the English paragraph.
            other_arrangements_and_conditions_eng = self._CHINESE.sub('', found.group(1))

        found = re.search(r'条件:(.*?)General', all_texts, re.I)
        if found:
            other_arrangements_and_conditions_chn = found.group(1)
        return other_arrangements_and_conditions_eng, other_arrangements_and_conditions_chn

    def get_info(self):
        """Run every extractor and return the filled ``init_result`` dict."""
        # One contract number per page, in page order.
        self.init_result["合同编号列表"] = self.get_contract_No()

        dealer_name, dealer_No = self.get_info_in_page_3()
        self.init_result["经销商名称_Page3"] = dealer_name
        self.init_result["经销商统一社会信用代码或公司注册号"] = dealer_No

        self.init_result["经销商名称_Page38"] = self.get_info_in_page_38()

        self.init_result["保证人"] = self.get_guarantor()

        amount_eng, amount_chn, term_start_eng, term_end_eng, \
            term_start_chn, term_end_chn, deposit_eng, deposit_chn = self.get_info_in_page_39()
        self.init_result["综合授信额度金额英文"] = amount_eng
        self.init_result["综合授信额度金额中文"] = amount_chn
        self.init_result["综合授信额度期限开始日期英文"] = term_start_eng
        self.init_result["综合授信额度期限截止日期英文"] = term_end_eng
        self.init_result["综合授信额度期限开始日期中文"] = term_start_chn
        self.init_result["综合授信额度期限截止日期中文"] = term_end_chn
        self.init_result["保证金比例英文"] = deposit_eng
        self.init_result["保证金比例中文"] = deposit_chn

        words_eng, words_chn = self.get_other_arrangements_and_conditions()
        self.init_result["其他约定与条件英文"] = words_eng
        self.init_result["其他约定与条件中文"] = words_chn
        return self.init_result
373 | |||
374 | |||
class TIFFHandler:
    """Split a (possibly multi-frame) TIFF file into one JPEG per frame.

    Attributes:
        path: Path of the source TIFF file.
        img_save_path: Directory the extracted frames are written to.
        img_path_list: Paths of the frames written so far, in frame order.
    """

    # Modes written as-is; any other mode is converted to RGB first.
    # NOTE(review): list taken from Pillow's JPEG writer; confirm against the
    # installed Pillow version.
    _JPEG_MODES = ('1', 'L', 'RGB', 'CMYK')

    def __init__(self, path, img_save_path):
        self.path = path
        self.img_save_path = img_save_path
        self.img_path_list = []

    def extract_image(self):
        """Write every frame to ``page_<index>.jpeg`` and record its path.

        The source image is opened in a ``with`` block so its file handle is
        released before the caller moves the source file away.
        """
        os.makedirs(self.img_save_path, exist_ok=True)
        with Image.open(self.path) as tiff:
            for i in range(tiff.n_frames):
                try:
                    tiff.seek(i)
                except EOFError:
                    # Fewer frames than advertised: stop at the last good one.
                    break
                save_path = os.path.join(self.img_save_path, 'page_{0}.jpeg'.format(i))
                # Frames with alpha/palette modes cannot be written as JPEG.
                frame = tiff if tiff.mode in self._JPEG_MODES else tiff.convert('RGB')
                frame.save(save_path)
                self.img_path_list.append(save_path)
395 | |||
396 | |||
class Command(BaseCommand, LoggerMixin):
    """Watch an input folder and turn contract scans into Excel workbooks.

    Every PDF/TIFF dropped into the input folder is split into page images,
    each image is sent to the OCR service, the results are handed to
    ``Finder`` and the extracted fields are written to one workbook per file.
    """

    def __init__(self):
        super().__init__()
        self.log_base = '[folder wsc process]'
        # Main-loop switch; cleared by SIGTERM for a graceful stop.
        self.switch = True
        self.sheet_name = 'Wholesales Contract'
        self.finder = Finder()
        # Seconds to sleep between polls of the input folder.
        self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
        # Input folder.
        self.input_dir = conf.WSC_DIR
        # OCR service endpoint.
        self.go_ocr_url = conf.WSC_GO_URL
        # Graceful-exit signal: 15 (SIGTERM).
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, sig, frame):
        """Stop picking up new files; the current one is finished first."""
        self.switch = False

    @staticmethod
    def parse_img_path(img_path):
        """Split an image path into ``(img_name, page_number, image_number)``.

        Names shaped like ``page_<p>_img_<i>`` yield 1-based numbers; any
        other name yields ``(img_name, 1, 1)``.
        """
        img_name, _ = os.path.splitext(os.path.basename(img_path))
        if re.match(r'page_\d+_img_\d+', img_name):
            part_list = img_name.split('_')
            return img_name, int(part_list[1]) + 1, int(part_list[3]) + 1
        return img_name, 1, 1

    @staticmethod
    def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir):
        """Build the timestamp-prefixed output paths for one input file.

        Returns:
            ``(img_save_path, excel_path, pdf_save_path)``.
        """
        time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
        new_name = '{0}_{1}'.format(time_stamp, name)
        img_save_path = os.path.join(img_output_dir, new_name)
        pdf_save_path = os.path.join(pdf_output_dir, new_name)
        excel_name = '{0}.xlsx'.format(os.path.splitext(new_name)[0])
        excel_path = os.path.join(wb_output_dir, excel_name)
        return img_save_path, excel_path, pdf_save_path

    def res_process(self, all_res, excel_path):
        """Extract the fields from ``all_res`` and save them to ``excel_path``.

        Any failure is logged with its traceback and swallowed, so a bad
        document does not stop the folder loop.
        """
        try:
            self.finder.ocr_results = all_res
            results = self.finder.get_info()

            wb = BSWorkbook(set(), set(), set(), set(), set())
            ws = wb.create_sheet(self.sheet_name)
            for write_field, field_value in results.items():
                if isinstance(field_value, list):
                    # List values are spread over the following cells.
                    ws.append((write_field, *field_value))
                else:
                    ws.append((write_field, field_value))
            wb.remove_base_sheet()
            wb.save(excel_path)
        except Exception:
            self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
                self.log_base, excel_path, traceback.format_exc()))

    def ocr_process(self, img_path):
        """Send one image to the OCR service and return the decoded JSON.

        The request is retried up to ``consts.RETRY_TIMES`` times; a non-200
        status or an undecodable body counts as a failed attempt.

        Returns:
            The decoded response, or ``None`` when the image does not exist
            or every attempt failed.
        """
        if not os.path.exists(img_path):
            self.folder_log.warn('{0} [go failed] [img_path={1}] [error=img not exists]'.format(
                self.log_base, img_path))
            return None

        # TODO: validate the image before sending it.
        with open(img_path, 'rb') as f:
            file_data = base64.b64encode(f.read()).decode()
        json_data = {
            "file": file_data,
        }

        # NOTE(review): the request has no timeout, so a stalled service
        # blocks the loop (and the graceful stop) indefinitely.
        for times in range(consts.RETRY_TIMES):
            try:
                ocr_response = requests.post(self.go_ocr_url, json=json_data)
                if ocr_response.status_code != 200:
                    raise OCR1Exception('{0} go status code: {1}'.format(self.log_base, ocr_response.status_code))
                ocr_res = ocr_response.json()
            except Exception:
                self.folder_log.warn('{0} [go failed] [times={1}] [img_path={2}] [error={3}]'.format(
                    self.log_base, times, img_path, traceback.format_exc()))
            else:
                self.folder_log.info('{0} [ocr success] [img={1}]'.format(
                    self.log_base, img_path))
                return ocr_res

        self.folder_log.warn('{0} [go failed] [img_path={1}]'.format(self.log_base, img_path))
        return None

    def get_pno(self, img_path):
        """Return the page index encoded in a ``page_<n>...`` image name."""
        img_name, _ = os.path.splitext(os.path.basename(img_path))
        return int(img_name.split('_')[1])

    def images_process(self, img_path_list, excel_path):
        """OCR every page image and write the resulting workbook."""
        all_res = {}
        for img_path in img_path_list:
            ocr_res = self.ocr_process(img_path)
            all_res[self.get_pno(img_path)] = ocr_res
        self.res_process(all_res, excel_path)

    def _convert_and_process(self, label, handler_cls, name, path,
                             img_output_dir, wb_output_dir, src_output_dir):
        """Split ``path`` into images with ``handler_cls``, OCR them, archive it.

        ``label`` only selects the wording of the log lines. Conversion
        errors are logged and re-raised for the folder loop to handle.
        """
        if not os.path.exists(path):
            return
        try:
            img_save_path, excel_path, src_save_path = self.get_path(
                name, img_output_dir, wb_output_dir, src_output_dir)
            self.folder_log.info('{0} [{1} to img start] [path={2}]'.format(self.log_base, label, path))
            handler = handler_cls(path, img_save_path)
            handler.extract_image()
            self.folder_log.info('{0} [{1} to img end] [path={2}]'.format(self.log_base, label, path))
        except Exception:
            self.folder_log.error('{0} [{1} to img error] [path={2}] [error={3}]'.format(
                self.log_base, label, path, traceback.format_exc()))
            # Bare raise keeps the original traceback.
            raise
        self.images_process(handler.img_path_list, excel_path)
        shutil.move(path, src_save_path)

    def pdf_process(self, name, path, img_output_dir, wb_output_dir, pdf_output_dir):
        """Process one PDF file and move it to ``pdf_output_dir``."""
        self._convert_and_process('pdf', PDFHandler, name, path,
                                  img_output_dir, wb_output_dir, pdf_output_dir)

    def tif_process(self, name, path, img_output_dir, wb_output_dir, tiff_output_dir):
        """Process one TIFF file and move it to ``tiff_output_dir``."""
        self._convert_and_process('tiff', TIFFHandler, name, path,
                                  img_output_dir, wb_output_dir, tiff_output_dir)

    def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
        """Process a single image file and move it to ``img_output_dir``.

        ``classify`` is accepted for signature compatibility only; neither
        ``ocr_process`` nor ``res_process`` takes it.
        """
        try:
            img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
        except Exception:
            self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
                self.log_base, path, traceback.format_exc()))
        else:
            ocr_res = self.ocr_process(path)
            all_res = {path: ocr_res}
            self.res_process(all_res, excel_path)
            shutil.move(path, img_save_path)

    def _move_to_failed(self, path, name, failed_output_dir):
        """Move ``path`` into the failed folder under a time-prefixed name."""
        failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
        shutil.move(path, failed_path)

    def _process_entry(self, name, path, img_output_dir, wb_output_dir,
                       pdf_output_dir, tiff_output_dir, failed_output_dir):
        """Route one directory entry to the matching processor.

        Directories and unsupported file types are moved to the failed folder.
        """
        if not os.path.isfile(path):
            self.folder_log.info('{0} [path is dir] [path={1}]'.format(self.log_base, path))
            self._move_to_failed(path, name, failed_output_dir)
            return

        self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
        if name.endswith('.pdf') or name.endswith('.PDF'):
            self.pdf_process(name, path, img_output_dir, wb_output_dir, pdf_output_dir)
        elif name.endswith('.tif') or name.endswith('.TIF'):
            self.tif_process(name, path, img_output_dir, wb_output_dir, tiff_output_dir)
        else:
            self.folder_log.info('{0} [path is not pdf or tif] [path={1}]'.format(
                self.log_base, path))
            self._move_to_failed(path, name, failed_output_dir)
        self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))

    def folder_process(self, input_dir):
        """Poll ``input_dir`` until the switch is cleared, processing each entry.

        Outputs go to an ``Output`` folder next to ``input_dir``. Entries
        that raise ``OSError`` (or cannot be moved to the failed folder) are
        remembered and retried one at a time, after a sleep, once nothing
        else is pending.
        """
        while not os.path.isdir(input_dir):
            self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
            if not self.switch:
                return
            time.sleep(self.sleep_time)

        output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
        img_output_dir = os.path.join(output_dir, 'image')
        wb_output_dir = os.path.join(output_dir, 'excel')
        pdf_output_dir = os.path.join(output_dir, 'pdf')
        tiff_output_dir = os.path.join(output_dir, 'tiff')
        failed_output_dir = os.path.join(output_dir, 'failed')
        for directory in (output_dir, img_output_dir, wb_output_dir,
                          pdf_output_dir, tiff_output_dir, failed_output_dir):
            os.makedirs(directory, exist_ok=True)

        os_error_filename_set = set()
        while self.switch:
            # 1. Fetch the pending entries from the input folder.
            all_file_set = set(os.listdir(input_dir))
            # Forget failed names that no longer exist; retrying them would
            # fail again immediately and spin this loop forever.
            os_error_filename_set &= all_file_set
            if not all_file_set:
                self.folder_log.info('{0} [input dir empty] [input_dir={1}]'.format(self.log_base, input_dir))
                time.sleep(self.sleep_time)
                continue

            true_file_set = all_file_set - os_error_filename_set
            if not true_file_set:
                # Only previously failing entries remain: back off, then
                # retry a single one.
                time.sleep(self.sleep_time)
                true_file_set.add(os_error_filename_set.pop())

            for name in true_file_set:
                path = os.path.join(input_dir, name)
                try:
                    self._process_entry(name, path, img_output_dir, wb_output_dir,
                                        pdf_output_dir, tiff_output_dir, failed_output_dir)
                except OSError:
                    os_error_filename_set.add(name)
                    self.folder_log.error('{0} [os error] [path={1}] [error={2}]'.format(
                        self.log_base, path, traceback.format_exc()))
                except Exception:
                    try:
                        self.folder_log.error('{0} [file error] [path={1}] [error={2}]'.format(
                            self.log_base, path, traceback.format_exc()))
                        self._move_to_failed(path, name, failed_output_dir)
                    except Exception:
                        os_error_filename_set.add(name)
                        self.folder_log.error('{0} [file move error] [path={1}] [error={2}]'.format(
                            self.log_base, path, traceback.format_exc()))

    def handle(self, *args, **kwargs):
        """Entry point: run the folder loop until stopped."""
        self.folder_process(self.input_dir)
        self.folder_log.info('{0} [stop safely]'.format(self.log_base))
-
Please register or sign in to post a comment