5c08056d by 周伟奇

add wsc

1 parent 5a19df38
...@@ -206,8 +206,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -206,8 +206,8 @@ class Command(BaseCommand, LoggerMixin):
206 ocr_res = ocr_response.json() 206 ocr_res = ocr_response.json()
207 end_time = time.time() 207 end_time = time.time()
208 speed_time = int(end_time - start_time) 208 speed_time = int(end_time - start_time)
209 self.folder_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format( 209 self.folder_log.info('{0} [ocr success] [img={1}] [speed_time={2}]'.format(
210 self.log_base, img_path, ocr_res, speed_time)) 210 self.log_base, img_path, speed_time))
211 211
212 if isinstance(ocr_res, dict): 212 if isinstance(ocr_res, dict):
213 if ocr_res.get('code') == 1: 213 if ocr_res.get('code') == 1:
......
...@@ -224,8 +224,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -224,8 +224,8 @@ class Command(BaseCommand, LoggerMixin):
224 ocr_res = ocr_response.json() 224 ocr_res = ocr_response.json()
225 end_time = time.time() 225 end_time = time.time()
226 speed_time = int(end_time - start_time) 226 speed_time = int(end_time - start_time)
227 self.folder_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format( 227 self.folder_log.info('{0} [ocr success] [img={1}] [speed_time={2}]'.format(
228 self.log_base, img_path, ocr_res, speed_time)) 228 self.log_base, img_path, speed_time))
229 229
230 if isinstance(ocr_res, dict): 230 if isinstance(ocr_res, dict):
231 if ocr_res.get('code') == 1: 231 if ocr_res.get('code') == 1:
...@@ -275,8 +275,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -275,8 +275,8 @@ class Command(BaseCommand, LoggerMixin):
275 ocr_res = ocr_response.json() 275 ocr_res = ocr_response.json()
276 end_time = time.time() 276 end_time = time.time()
277 speed_time = int(end_time - start_time) 277 speed_time = int(end_time - start_time)
278 self.folder_log.info('{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format( 278 self.folder_log.info('{0} [ltgt ocr success] [path={1}] [speed_time={2}]'.format(
279 self.log_base, path, ocr_res, speed_time)) 279 self.log_base, path, speed_time))
280 return ocr_res 280 return ocr_res
281 else: 281 else:
282 self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path)) 282 self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path))
......
...@@ -201,8 +201,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -201,8 +201,8 @@ class Command(BaseCommand, LoggerMixin):
201 ocr_res = ocr_response.json() 201 ocr_res = ocr_response.json()
202 end_time = time.time() 202 end_time = time.time()
203 speed_time = int(end_time - start_time) 203 speed_time = int(end_time - start_time)
204 self.folder_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format( 204 self.folder_log.info('{0} [ocr success] [img={1}] [speed_time={2}]'.format(
205 self.log_base, img_path, ocr_res, speed_time)) 205 self.log_base, img_path, speed_time))
206 return ocr_res 206 return ocr_res
207 else: 207 else:
208 self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path)) 208 self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
......
1 import os
2 import re
3 import time
4 import shutil
5 import base64
6 import signal
7 import requests
8 import traceback
9 from PIL import Image
10 from datetime import datetime
11 from django.core.management import BaseCommand
12 from multiprocessing import Process
13 import numpy as np
14 from fuzzywuzzy import fuzz
15 from shapely.geometry import Polygon
16
17 from settings import conf
18 from common.mixins import LoggerMixin
19 from common.tools.pdf_to_img import PDFHandler
20 from apps.doc import consts
21 from apps.doc.exceptions import OCR1Exception, OCR4Exception
22 from apps.doc.ocr.wb import BSWorkbook
23
24
class Finder:
    """Extract structured fields from the OCR output of a wholesale contract.

    Attributes:
        ocr_results (dict | None): Mapping of page id to that page's OCR
            output. Each page maps a field key to a ``[bbox, text]`` pair,
            where ``bbox`` is a flat sequence of eight numbers
            ``[x0, y0, x1, y1, x2, y2, x3, y3]``. A page value of ``None``
            or an empty mapping is treated as a page without text.
        init_result (dict): Output record that :meth:`get_info` fills in and
            returns (the same dict object is reused across calls).
    """

    # Matches every character that is NOT in the CJK range used by the file.
    _NON_CHINESE = re.compile("[^\u4e00-\u9fa5]")
    # Matches every character that IS in that CJK range.
    _CHINESE = re.compile("[\u4e00-\u9fa5]")
    # Minimum fuzzy-match score for a page to be accepted as a template hit.
    _PAGE_SCORE_THRESHOLD = 0.5

    # Chinese-only text expected on each page of interest. The pieces are
    # concatenated without spaces or newlines, which is what the page text
    # is compared against after non-Chinese characters are stripped.
    _PAGE_3_TEMPLATE = (
        "合同编号宝马汽车金融中国有限公司甲方宝马汽车金融中国有限公司地址中国北京市朝阳区东三环北路霞光里号佳程"
        "广场座层乙方统一社会信用代码或公司注册号地址鉴于甲方是一家依照中国法律合法组建和存续的汽车金融公司愿意"
        "为宝马中国汽车贸易有限公司以下简称宝马中国及华晨宝马汽车有限公司以下简称华晨宝马在中国大陆的宝马集团经"
        "销商提供汽车批售融资服务乙方是一家依据中国法律合法组建和存续与宝马中国和或华晨宝马签署了授权销售合同具"
        "有专营进口和或国产宝马集团产品合法资格的企业本着自愿平等互惠互利的原则甲乙双方经充分协商签署本综合授信"
        "额度合同本合同达成如下条款综合授信额度合同版本"
    )
    _PAGE_38_TEMPLATE = (
        "宝马汽车金融中国有限公司合同编号签署页甲方宝马汽车金融中国有限公司盖章姓名姓名职务职务日期乙方汽车销售服务"
        "有限公司盖章姓名姓名职务职务日期综合授信额度合同版本"
    )
    _PAGE_39_TEMPLATE = (
        "合同编号中国有限公司宝马汽车金融综合授信额度合同附件确认函综合授信额度金额本合同项下的综合授信额度为人民币"
        "大写综合授信额度下面各个业务的授信额度将由甲方以授信额度通知函的方式时不时的通知乙方本合同项下的综合授信额"
        "度可以由甲方根据乙方的信用和财务状况自行决定随时调整本合同项下的综合授信额度应为在本确认函第条的期间内双方"
        "在确认函中确认的授信额度以及甲方向乙方时不时通过书面授信额度通知函以及临时授信额度通知函中沟通的额度的总和"
        "综合授信额度期限从至或者由甲方向乙方通过书面形式在授信额度通知函中沟通的更短期间保证金甲方对乙方的最低保证"
        "金要求为综合授信额度的实际执行的保证金比例以甲方不时另行书面通知根据最新的经销商融资或保证金相关政策或活动"
        "为准综合授信额度合同版本"
    )

    def __init__(self, ocr_results=None):
        self.ocr_results = ocr_results

        self.init_result = {
            "合同编号列表": [],
            "经销商名称_Page3": "",
            "经销商名称_Page38": "",
            "经销商统一社会信用代码或公司注册号": "",
            "保证人": "",
            "综合授信额度金额英文": "",
            "综合授信额度金额中文": "",
            "综合授信额度期限开始日期英文": '',
            "综合授信额度期限截止日期英文": '',
            "综合授信额度期限开始日期中文": '',
            "综合授信额度期限截止日期中文": '',
            "保证金比例英文": "",
            "保证金比例中文": "",
            "其他约定与条件英文": "",
            "其他约定与条件中文": "",
        }

    def _pages(self):
        """Return ``self.ocr_results`` as a mapping, treating ``None`` as empty."""
        return self.ocr_results or {}

    def _all_text(self):
        """Concatenate the text of every field on every page, in iteration order."""
        parts = []
        for page in self._pages().values():
            # A page whose OCR failed may be None; it contributes no text.
            for _bbox, text in (page or {}).values():
                parts.append(text)
        return ''.join(parts)

    def get_line(self, ocr_results, key_string):
        """Return the text that shares a line with the field containing ``key_string``.

        The first field whose text contains ``key_string`` defines a vertical
        band (min/max of its y coordinates). Every field whose mean y lies
        strictly inside that band is collected, ordered left to right by its
        first x coordinate, and concatenated.

        Returns:
            str: The joined line text, or ``''`` when nothing matches.
        """
        ocr_results = ocr_results or {}

        top, bottom = -1, -1
        # Locate the bbox of the field that contains the keyword.
        for bbox, text in ocr_results.values():
            if key_string in text:
                ys = bbox[1::2]
                top, bottom = min(ys), max(ys)
                break

        # Collect every field whose vertical centre falls inside that band.
        line_text = [
            [bbox, text]
            for bbox, text in ocr_results.values()
            if top < np.mean(bbox[1::2]) < bottom
        ]
        # Left-to-right order (stable for equal x).
        line_text.sort(key=lambda item: item[0][0])
        return ''.join(text for _bbox, text in line_text)

    def page_predict(self, ocr_results, page_template):
        """Find the page whose Chinese text best matches ``page_template``.

        Returns:
            list: ``[page_id, score]`` with ``score`` in ``[0, 1]``. When
            there are no pages at all, ``[-1, 0.0]`` is returned so callers
            comparing the score to a threshold simply find no match.
        """
        ocr_results = ocr_results or {}
        classes = []
        for pno, page in ocr_results.items():
            # Pages whose OCR failed (None) are scored as empty text.
            ocr_texts = ''.join(text for _bbox, text in (page or {}).values())
            ocr_texts = self._NON_CHINESE.sub('', ocr_texts)
            score = fuzz.ratio(page_template, ocr_texts) / 100.
            classes.append([pno, score])
        if not classes:
            return [-1, 0.0]
        # max() keeps the first page among equal scores, like a stable
        # descending sort followed by [0].
        return max(classes, key=lambda item: item[1])

    def get_top_key(self, ocr_results, key_string):
        """Return ``[ratio, key]`` of the field whose text best matches ``key_string``.

        Returns ``(-1, -1)`` when ``ocr_results`` is empty.
        """
        if len(ocr_results) == 0:
            return -1, -1
        ratio_list = [[fuzz.ratio(key_string, ocr_results[key][1]), key] for key in ocr_results]
        # Sorted ascending, so the last of several equal ratios wins.
        return sorted(ratio_list, key=lambda item: item[0])[-1]

    def get_top_iou(self, ocr_results, poly):
        """Return ``[iou, key]`` of the field whose bbox overlaps ``poly`` the most.

        Returns ``(-1, -1)`` when no field yields a usable IoU (no fields,
        invalid polygons, or zero-area unions).
        """
        target = Polygon(np.array(poly).reshape((-1, 2)))
        if not target.is_valid:
            return -1, -1

        iou_list = []
        for key in ocr_results:
            bbox, _text = ocr_results[key]
            candidate = Polygon(np.array(bbox).reshape((-1, 2)))
            if not candidate.is_valid:
                continue
            inter = candidate.intersection(target).area
            union = candidate.area + target.area - inter
            if union <= 0:
                # Two degenerate (zero-area) boxes: IoU is undefined, skip.
                continue
            iou_list.append([inter / union, key])
        if len(iou_list) == 0:
            return -1, -1
        return sorted(iou_list, key=lambda item: item[0])[-1]

    def get_key_value(self, ocr_results, key_string):
        """Look up the value that belongs to the label ``key_string`` on one page.

        The label is located by fuzzy-matching ``key_string`` against the
        Chinese-only text of every field. If the matched field also carries
        the value (text after the last ``:``) it is returned; otherwise the
        field one box-width to the right is used, provided it overlaps the
        shifted box with IoU above 0.3.

        Returns:
            str: The value text, or ``''`` when it cannot be found.
        """
        value = ''

        # Fuzzy matching is done on Chinese characters only.
        cleaned = {
            key: [bbox, self._NON_CHINESE.sub('', text)]
            for key, (bbox, text) in ocr_results.items()
        }

        ratio, key = self.get_top_key(cleaned, key_string)
        if ratio > 50:
            bbox, text = ocr_results[key]
            # NOTE: str.strip removes any of key_string's characters from
            # both ends, not the literal prefix.
            words = text.strip(key_string).split(':')[-1]
            if len(words) == 0:
                # Shift the label bbox one width to the right and look there.
                x0, y0, x1, y1, x2, y2, x3, y3 = bbox
                rw = abs(x0 - x1)
                anchor = [x0 + rw, y0, x1 + rw, y1, x2 + rw, y2, x3 + rw, y3]
                iou, key = self.get_top_iou(ocr_results, anchor)
                # Threshold the IoU (not the fuzzy ratio). This also skips
                # the (-1, -1) "nothing found" result.
                if iou > 0.3:
                    bbox, text = ocr_results[key]
                    value = text.strip(key_string).split(':')[-1]
            else:
                value = words
        return value

    def get_contract_No(self):
        """Return the contract number found on each page, in page iteration order.

        Pages without OCR output contribute an empty string.
        """
        contract_No_list = []
        for page in self._pages().values():
            if page:
                contract_No = self.get_key_value(page, '合同编号')
            else:
                contract_No = ''
            contract_No_list.append(contract_No)
        return contract_No_list

    def get_info_in_page_3(self):
        """Extract dealer name and dealer registration number from the page matching the page-3 template.

        Returns:
            tuple: ``(dealer_name, dealer_No)``; both ``''`` when the page is
            not found.
        """
        dealer_name = ''
        dealer_No = ''

        pages = self._pages()
        pno, score = self.page_predict(pages, self._PAGE_3_TEMPLATE)
        if score > self._PAGE_SCORE_THRESHOLD and pages[pno]:
            page = pages[pno]
            # The last field containing the party-B label wins.
            for _bbox, text in page.values():
                if '乙方:' in text:
                    dealer_name = text.split(':')[-1].replace('【', '[').replace('{', '[').replace('】', ']')

            words = self.get_key_value(page, '统一社会信用代码或公司注册号')
            # The letter O is replaced by the digit 0 in the number.
            dealer_No = words.replace('O', '0')
        return dealer_name, dealer_No

    def get_info_in_page_38(self):
        """Extract the dealer name from the page matching the signature-page template.

        Returns:
            str: The dealer name, or ``''`` when the page is not found.
        """
        dealer_name = ''

        pages = self._pages()
        pno, score = self.page_predict(pages, self._PAGE_38_TEMPLATE)
        if score > self._PAGE_SCORE_THRESHOLD and pages[pno]:
            for _bbox, text in pages[pno].values():
                if '乙方:' in text:
                    words = text.split(':')[-1].replace('【', '[').replace('{', '[')
                    # Drop stamp remnants; both ASCII and full-width
                    # parentheses (U+FF08 / U+FF09) are removed.
                    dealer_name = re.sub('[()\uff08\uff09盖章《]', "", words)
        return dealer_name

    def get_guarantor(self):
        """Extract the bracketed guarantor paragraph from the whole document.

        Returns:
            str: ``'[<guarantor text>]'`` with brackets, commas and
            parentheses normalised to ASCII, or ``'[/]'`` when absent.
        """
        guarantor = '[/]'
        searchObj = re.search(r'保证人\[(.*?)\]与甲方', self._all_text())
        if searchObj:
            words = f'[{searchObj.group(1)}]'
            # Full-width comma / parentheses are written as escapes so the
            # replacements cannot silently degrade into no-ops.
            guarantor = (
                words.replace('【', '[').replace('】', ']')
                .replace('\uff0c', ',').replace('\uff08', '(').replace('\uff09', ')')
            )
        return guarantor

    def get_info_in_page_39(self):
        """Extract credit-line fields from the page matching the confirmation-letter template.

        Returns:
            tuple: ``(amount_eng, amount_chn, term_start_eng, term_end_eng,
            term_start_chn, term_end_chn, deposit_eng, deposit_chn)``; each
            entry is ``''`` when it cannot be found.
        """
        amount_eng = ''
        amount_chn = ''
        term_start_eng = ''
        term_end_eng = ''
        term_start_chn = ''
        term_end_chn = ''
        deposit_eng = ''
        deposit_chn = ''

        pages = self._pages()
        pno, score = self.page_predict(pages, self._PAGE_39_TEMPLATE)
        if score > self._PAGE_SCORE_THRESHOLD and pages[pno]:
            page = pages[pno]

            # Amount in digits: first run of digits/commas/dots on the RMB line.
            searchObj = re.search(r'[0-9,.]+', self.get_line(page, 'RMB'))
            if searchObj:
                amount_eng = searchObj.group()

            # Amount in Chinese capitals, with fixed character substitutions
            # applied to the extracted text.
            searchObj = re.search(r'大写(.*?)综合', self.get_line(page, '人民币'))
            if searchObj:
                words = self._NON_CHINESE.sub('', searchObj.group(1))
                words = words.replace("仔", "仟").replace("任", "仟")
                words = words.replace("值", "佰")
                words = words.replace("拐", "捌")
                words = words.replace("查", "壹")
                words = words.replace("政", "玖")
                words = words.replace("垒", "叁")
                amount_chn = words

            # English term: dates before/after the first 'ending'.
            lines = self.get_line(page, 'ending')
            if len(lines) > 0:
                # partition() never fails, even if the separator repeats.
                start, _sep, end = lines.partition('ending')
                searchStart = re.search(r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', start)
                if searchStart:
                    term_start_eng = searchStart.group()
                searchEnd = re.search(r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', end)
                if searchEnd:
                    term_end_eng = searchEnd.group()

            # Chinese term: dates before/after the first '至'.
            lines = self.get_line(page, '至')
            if len(lines) > 0:
                start, _sep, end = lines.partition('至')
                searchStart = re.search(r'[0-9]{4}-[0-9]+-[0-9]+', start)
                if searchStart:
                    term_start_chn = searchStart.group()
                searchEnd = re.search(r'[0-9]{4}-[0-9]+-[0-9]+', end)
                if searchEnd:
                    term_end_chn = searchEnd.group()

            # Deposit ratio, English and Chinese wording.
            searchObj = re.search(r'aboveto([0-9]+)', self.get_line(page, 'above'))
            if searchObj:
                deposit_eng = f'{searchObj.group(1)}%'

            searchObj = re.search(r'授信额度的([0-9]+)', self.get_line(page, '授信额度的'))
            if searchObj:
                deposit_chn = f'{searchObj.group(1)}%'

        return amount_eng, amount_chn, term_start_eng, term_end_eng, \
            term_start_chn, term_end_chn, deposit_eng, deposit_chn

    def get_other_arrangements_and_conditions(self):
        """Extract the "other arrangements and conditions" paragraphs.

        Returns:
            tuple: ``(english_text, chinese_text)``; each ``''`` when absent.
        """
        other_arrangements_and_conditions_eng = ''
        other_arrangements_and_conditions_chn = ''

        all_texts = self._all_text()

        searchObj = re.search(r'Conditions:(.*?)其他约定与条件', all_texts, re.I)
        if searchObj:
            # Remove any Chinese characters from the English paragraph.
            other_arrangements_and_conditions_eng = self._CHINESE.sub('', searchObj.group(1))

        searchObj = re.search(r'条件:(.*?)General', all_texts, re.I)
        if searchObj:
            other_arrangements_and_conditions_chn = searchObj.group(1)
        return other_arrangements_and_conditions_eng, other_arrangements_and_conditions_chn

    def get_info(self):
        """Run every extractor and return the filled ``init_result`` dict."""
        # One contract number per page, in page iteration order.
        self.init_result["合同编号列表"] = self.get_contract_No()

        dealer_name, dealer_No = self.get_info_in_page_3()
        self.init_result["经销商名称_Page3"] = dealer_name
        self.init_result["经销商统一社会信用代码或公司注册号"] = dealer_No

        self.init_result["经销商名称_Page38"] = self.get_info_in_page_38()

        self.init_result["保证人"] = self.get_guarantor()

        amount_eng, amount_chn, term_start_eng, term_end_eng, \
            term_start_chn, term_end_chn, deposit_eng, deposit_chn = self.get_info_in_page_39()
        self.init_result["综合授信额度金额英文"] = amount_eng
        self.init_result["综合授信额度金额中文"] = amount_chn
        self.init_result["综合授信额度期限开始日期英文"] = term_start_eng
        self.init_result["综合授信额度期限截止日期英文"] = term_end_eng
        self.init_result["综合授信额度期限开始日期中文"] = term_start_chn
        self.init_result["综合授信额度期限截止日期中文"] = term_end_chn
        self.init_result["保证金比例英文"] = deposit_eng
        self.init_result["保证金比例中文"] = deposit_chn

        words_eng, words_chn = self.get_other_arrangements_and_conditions()
        self.init_result["其他约定与条件英文"] = words_eng
        self.init_result["其他约定与条件中文"] = words_chn
        return self.init_result
373
374
class TIFFHandler:
    """Split a (multi-frame) TIFF file into one JPEG image per frame.

    Attributes:
        path: Path of the TIFF file to read.
        img_save_path: Directory the per-frame images are written to.
        img_path_list: Paths of the images written so far, in frame order.
    """

    # Image modes the JPEG writer accepts directly; other modes are converted.
    _JPEG_MODES = ('1', 'L', 'RGB', 'CMYK')

    def __init__(self, path, img_save_path):
        self.path = path
        self.img_save_path = img_save_path
        self.img_path_list = []

    def extract_image(self):
        """Write every frame of the TIFF as ``page_<index>.jpeg`` and record its path.

        The output directory is created if needed. Extraction stops quietly
        at the first frame that cannot be reached (``EOFError``).
        """
        os.makedirs(self.img_save_path, exist_ok=True)
        # The context manager releases the file handle even if a save fails.
        with Image.open(self.path) as tiff:
            tiff.load()

            for i in range(tiff.n_frames):
                try:
                    save_path = os.path.join(self.img_save_path, 'page_{0}.jpeg'.format(i))
                    tiff.seek(i)
                    # Frames in a mode JPEG cannot store (e.g. with alpha or
                    # a palette) are converted to RGB before saving.
                    frame = tiff if tiff.mode in self._JPEG_MODES else tiff.convert('RGB')
                    frame.save(save_path)
                    self.img_path_list.append(save_path)
                except EOFError:
                    break
396
class Command(BaseCommand, LoggerMixin):
    """Management command that polls an input folder and OCR-processes the files in it.

    PDF and TIF files are split into page images, each image is sent to the
    OCR service, the extracted fields are written to an Excel workbook, and
    the source file is moved to an output folder. Anything else is moved to
    a ``failed`` folder. The loop stops after SIGTERM is received.
    """

    def __init__(self):
        super().__init__()
        self.log_base = '[folder wsc process]'
        # Processing switch; cleared by the SIGTERM handler.
        self.switch = True
        self.sheet_name = 'Wholesales Contract'
        self.finder = Finder()
        # Sleep time between polls of the input folder.
        self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
        # Input folder.
        self.input_dir = conf.WSC_DIR
        # OCR service endpoint.
        self.go_ocr_url = conf.WSC_GO_URL
        # Graceful shutdown on signal 15.
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, sig, frame):
        """Stop picking up new files; the current loop iteration finishes."""
        self.switch = False

    @staticmethod
    def parse_img_path(img_path):
        """Parse ``page_<p>_img_<i>.<ext>`` into ``(name, p + 1, i + 1)``.

        Names that do not follow the pattern yield ``(name, 1, 1)``.
        """
        img_name, _ = os.path.splitext(os.path.basename(img_path))
        if re.match(r'page_\d+_img_\d+', img_name):
            part_list = img_name.split('_')
            return img_name, int(part_list[1]) + 1, int(part_list[3]) + 1
        return img_name, 1, 1

    @staticmethod
    def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir):
        """Build timestamped output paths for one input file.

        Returns:
            tuple: ``(img_save_path, excel_path, pdf_save_path)``.
        """
        time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
        new_name = '{0}_{1}'.format(time_stamp, name)
        img_save_path = os.path.join(img_output_dir, new_name)
        pdf_save_path = os.path.join(pdf_output_dir, new_name)
        excel_name = '{0}.xlsx'.format(os.path.splitext(new_name)[0])
        excel_path = os.path.join(wb_output_dir, excel_name)
        return img_save_path, excel_path, pdf_save_path

    def res_process(self, all_res, excel_path):
        """Extract fields from ``all_res`` and save them as a workbook at ``excel_path``.

        Any failure is logged with its traceback and not propagated.
        """
        try:
            self.finder.ocr_results = all_res
            results = self.finder.get_info()

            wb = BSWorkbook(set(), set(), set(), set(), set())
            ws = wb.create_sheet(self.sheet_name)
            for write_field, field_value in results.items():
                # List values are spread over the columns after the field name.
                if isinstance(field_value, list):
                    ws.append((write_field, *field_value))
                else:
                    ws.append((write_field, field_value))
            wb.remove_base_sheet()
            wb.save(excel_path)
        except Exception:
            self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
                self.log_base, excel_path, traceback.format_exc()))

    def ocr_process(self, img_path):
        """Send one image to the OCR service and return the decoded JSON response.

        The request is retried up to ``consts.RETRY_TIMES`` times.

        Returns:
            The parsed JSON body on success, or ``None`` when the image does
            not exist or every attempt failed (both cases are logged).
        """
        if not os.path.exists(img_path):
            self.folder_log.warn('{0} [img not found] [img_path={1}]'.format(self.log_base, img_path))
            return None

        # TODO: validate the image.
        with open(img_path, 'rb') as f:
            # The service takes the file as a base64 string.
            file_data = base64.b64encode(f.read()).decode()
        json_data = {
            "file": file_data,
        }

        for times in range(consts.RETRY_TIMES):
            try:
                ocr_response = requests.post(self.go_ocr_url, json=json_data)
                if ocr_response.status_code != 200:
                    raise OCR1Exception('{0} go status code: {1}'.format(self.log_base, ocr_response.status_code))
            except Exception:
                self.folder_log.warn('{0} [go failed] [times={1}] [img_path={2}] [error={3}]'.format(
                    self.log_base, times, img_path, traceback.format_exc()))
            else:
                ocr_res = ocr_response.json()
                self.folder_log.info('{0} [ocr success] [img={1}]'.format(
                    self.log_base, img_path))
                return ocr_res

        # Every attempt failed.
        self.folder_log.warn('{0} [go failed] [img_path={1}]'.format(self.log_base, img_path))
        return None

    def get_pno(self, img_path):
        """Return the page index encoded in a ``page_<n>...`` image file name."""
        img_name, _ = os.path.splitext(os.path.basename(img_path))
        return int(img_name.split('_')[1])

    def images_process(self, img_path_list, excel_path):
        """OCR every page image and build the workbook from the combined results."""
        all_res = {}
        for img_path in img_path_list:
            ocr_res = self.ocr_process(img_path)
            pno = self.get_pno(img_path)
            # A failed page is stored as an empty result so one bad page does
            # not break field extraction for the whole document.
            all_res[pno] = ocr_res if ocr_res is not None else {}
        self.res_process(all_res, excel_path)

    def pdf_process(self, name, path, img_output_dir, wb_output_dir, pdf_output_dir):
        """Split a PDF into images, OCR them, then move the PDF to ``pdf_output_dir``.

        Conversion errors are logged and re-raised to the caller.
        """
        if os.path.exists(path):
            try:
                img_save_path, excel_path, pdf_save_path = self.get_path(
                    name, img_output_dir, wb_output_dir, pdf_output_dir)
                self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
                pdf_handler = PDFHandler(path, img_save_path)
                pdf_handler.extract_image()
                self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
            except Exception:
                self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
                    self.log_base, path, traceback.format_exc()))
                raise
            else:
                self.images_process(pdf_handler.img_path_list, excel_path)
                shutil.move(path, pdf_save_path)

    def tif_process(self, name, path, img_output_dir, wb_output_dir, tiff_output_dir):
        """Split a TIFF into images, OCR them, then move the TIFF to ``tiff_output_dir``.

        Conversion errors are logged and re-raised to the caller.
        """
        if os.path.exists(path):
            try:
                img_save_path, excel_path, tiff_save_path = self.get_path(
                    name, img_output_dir, wb_output_dir, tiff_output_dir)
                self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path))
                tiff_handler = TIFFHandler(path, img_save_path)
                tiff_handler.extract_image()
                self.folder_log.info('{0} [tiff to img end] [path={1}]'.format(self.log_base, path))
            except Exception:
                self.folder_log.error('{0} [tiff to img error] [path={1}] [error={2}]'.format(
                    self.log_base, path, traceback.format_exc()))
                raise
            else:
                self.images_process(tiff_handler.img_path_list, excel_path)
                shutil.move(path, tiff_save_path)

    def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
        """OCR a single image file, build its workbook and move the image away.

        ``classify`` is kept for signature compatibility; ``ocr_process`` and
        ``res_process`` in this class do not take it.
        """
        try:
            img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
        except Exception:
            self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
                self.log_base, path, traceback.format_exc()))
        else:
            ocr_res = self.ocr_process(path)
            # A failed OCR is stored as an empty result (see images_process).
            all_res = {path: ocr_res if ocr_res is not None else {}}
            self.res_process(all_res, excel_path)
            shutil.move(path, img_save_path)

    def folder_process(self, input_dir):
        """Poll ``input_dir`` and process each file until the switch is cleared.

        Output folders (image/excel/pdf/tiff/failed) are created under an
        ``Output`` directory next to ``input_dir``. Files that hit an
        ``OSError`` are remembered and retried once nothing else is pending.
        """
        # Wait for the input directory to appear, unless asked to stop.
        while not os.path.isdir(input_dir):
            self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
            if not self.switch:
                return
            time.sleep(self.sleep_time)

        output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
        img_output_dir = os.path.join(output_dir, 'image')
        wb_output_dir = os.path.join(output_dir, 'excel')
        pdf_output_dir = os.path.join(output_dir, 'pdf')
        tiff_output_dir = os.path.join(output_dir, 'tiff')
        failed_output_dir = os.path.join(output_dir, 'failed')
        for directory in (output_dir, img_output_dir, wb_output_dir,
                          pdf_output_dir, tiff_output_dir, failed_output_dir):
            os.makedirs(directory, exist_ok=True)

        os_error_filename_set = set()
        while self.switch:
            # 1. Pick up pdf / tif files from the input dir.
            list_dir = os.listdir(input_dir)
            if not list_dir and len(os_error_filename_set) == 0:
                self.folder_log.info('{0} [input dir empty] [input_dir={1}]'.format(self.log_base, input_dir))
                time.sleep(self.sleep_time)
                continue
            all_file_set = set(list_dir)
            true_file_set = all_file_set - os_error_filename_set
            # Nothing new: retry one of the files that failed with OSError.
            if len(true_file_set) == 0 and len(os_error_filename_set) > 0:
                true_file_set.add(os_error_filename_set.pop())
            for name in true_file_set:
                path = os.path.join(input_dir, name)

                try:
                    if os.path.isfile(path):
                        self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
                        if name.endswith('.pdf') or name.endswith('.PDF'):
                            self.pdf_process(name, path, img_output_dir, wb_output_dir, pdf_output_dir)
                        elif name.endswith('.tif') or name.endswith('.TIF'):
                            self.tif_process(name, path, img_output_dir, wb_output_dir, tiff_output_dir)
                        else:
                            # Log the offending file, not the input folder.
                            self.folder_log.info('{0} [path is not pdf or tif] [path={1}]'.format(
                                self.log_base, path))
                            failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
                            shutil.move(path, failed_path)
                        self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
                    else:
                        self.folder_log.info('{0} [path is dir] [path={1}]'.format(self.log_base, path))
                        failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
                        shutil.move(path, failed_path)
                except OSError:
                    # Remember the file so it is retried later.
                    os_error_filename_set.add(name)
                    self.folder_log.error('{0} [os error] [path={1}] [error={2}]'.format(
                        self.log_base, path, traceback.format_exc()))
                except Exception:
                    try:
                        self.folder_log.error('{0} [file error] [path={1}] [error={2}]'.format(
                            self.log_base, path, traceback.format_exc()))
                        failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
                        shutil.move(path, failed_path)
                    except Exception:
                        os_error_filename_set.add(name)
                        self.folder_log.error('{0} [file move error] [path={1}] [error={2}]'.format(
                            self.log_base, path, traceback.format_exc()))

    def handle(self, *args, **kwargs):
        """Command entry point: run the folder loop until stopped."""
        self.folder_process(self.input_dir)
        self.folder_log.info('{0} [stop safely]'.format(self.log_base))
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!