cf8331db by 周伟奇

Add TIFF support to folder processing

1 parent 6cf3b86d
...@@ -6,6 +6,7 @@ import base64 ...@@ -6,6 +6,7 @@ import base64
6 import signal 6 import signal
7 import requests 7 import requests
8 import traceback 8 import traceback
9 from PIL import Image
9 from datetime import datetime 10 from datetime import datetime
10 from django.core.management import BaseCommand 11 from django.core.management import BaseCommand
11 from multiprocessing import Process 12 from multiprocessing import Process
...@@ -18,6 +19,27 @@ from apps.doc.exceptions import OCR1Exception, OCR4Exception ...@@ -18,6 +19,27 @@ from apps.doc.exceptions import OCR1Exception, OCR4Exception
18 from apps.doc.ocr.wb import BSWorkbook 19 from apps.doc.ocr.wb import BSWorkbook
19 20
20 21
class TIFFHandler:
    """Split a (possibly multi-page) TIFF file into one image file per page.

    After ``extract_image`` has run, ``img_path_list`` holds the paths of the
    saved page images, in page order.
    """

    # Image modes that Pillow's PNG writer accepts as-is; any other mode
    # (e.g. CMYK) is converted to RGB before saving.
    PNG_MODES = frozenset(('1', 'L', 'LA', 'P', 'RGB', 'RGBA', 'I', 'I;16'))

    def __init__(self, path, img_save_path):
        """Remember the source TIFF and the directory pages are written to.

        :param path: path of the TIFF file to split.
        :param img_save_path: directory that receives the page images.
        """
        self.path = path
        self.img_save_path = img_save_path
        self.img_path_list = []

    def extract_image(self):
        """Save every frame of the TIFF as ``page_<index>.png``.

        Each saved path is appended to ``self.img_path_list``. Errors raised
        by Pillow while opening or saving (other than ``EOFError`` on seek)
        propagate to the caller.
        """
        os.makedirs(self.img_save_path, exist_ok=True)

        # The context manager closes the underlying file even when a page
        # fails to save.
        with Image.open(self.path) as tiff:
            # Single-frame images may not expose n_frames; treat them as one page.
            for i in range(getattr(tiff, 'n_frames', 1)):
                try:
                    tiff.seek(i)
                except EOFError:
                    # Fewer readable frames than advertised: keep what we have.
                    break

                # Pillow picks the output format from the file extension, so
                # the name must carry one (a bare 'page_0' is rejected).
                save_path = os.path.join(self.img_save_path, 'page_{0}.png'.format(i))
                frame = tiff if tiff.mode in self.PNG_MODES else tiff.convert('RGB')
                frame.save(save_path)
                self.img_path_list.append(save_path)
41
42
21 class Command(BaseCommand, LoggerMixin): 43 class Command(BaseCommand, LoggerMixin):
22 44
23 def __init__(self): 45 def __init__(self):
...@@ -225,6 +247,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -225,6 +247,7 @@ class Command(BaseCommand, LoggerMixin):
225 except Exception as e: 247 except Exception as e:
226 self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format( 248 self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
227 self.log_base, path, traceback.format_exc())) 249 self.log_base, path, traceback.format_exc()))
250 raise e
228 else: 251 else:
229 all_res = {} 252 all_res = {}
230 for img_path in pdf_handler.img_path_list: 253 for img_path in pdf_handler.img_path_list:
...@@ -233,6 +256,26 @@ class Command(BaseCommand, LoggerMixin): ...@@ -233,6 +256,26 @@ class Command(BaseCommand, LoggerMixin):
233 self.res_process(all_res, classify, excel_path) 256 self.res_process(all_res, classify, excel_path)
234 shutil.move(path, pdf_save_path) 257 shutil.move(path, pdf_save_path)
235 258
def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
    """Split a TIFF into page images, OCR each page and archive the source file.

    Does nothing when ``path`` does not exist. Any error raised while
    resolving the output paths or extracting the pages is logged with its
    traceback and re-raised. On success the per-page OCR results are handed
    to ``res_process`` and the original file is moved to the TIFF output
    location.
    """
    if not os.path.exists(path):
        return

    try:
        img_save_path, excel_path, tiff_save_path = self.get_path(
            name, img_output_dir, wb_output_dir, tiff_output_dir)
        start_msg = '{0} [tiff to img start] [path={1}]'.format(self.log_base, path)
        self.folder_log.info(start_msg)
        tiff_handler = TIFFHandler(path, img_save_path)
        tiff_handler.extract_image()
        end_msg = '{0} [tiff to img end] [path={1}]'.format(self.log_base, path)
        self.folder_log.info(end_msg)
    except Exception as e:
        error_msg = '{0} [tiff to img error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc())
        self.folder_log.error(error_msg)
        raise e

    # Reached only when extraction succeeded: OCR every extracted page,
    # keyed by the page image path.
    all_res = {
        page_path: self.ocr_process(page_path, classify)
        for page_path in tiff_handler.img_path_list
    }
    self.res_process(all_res, classify, excel_path)
    shutil.move(path, tiff_save_path)
278
236 def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir): 279 def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
237 ocr_res = self.ocr_process(path, classify) 280 ocr_res = self.ocr_process(path, classify)
238 all_res = {path: ocr_res} 281 all_res = {path: ocr_res}
...@@ -258,11 +301,13 @@ class Command(BaseCommand, LoggerMixin): ...@@ -258,11 +301,13 @@ class Command(BaseCommand, LoggerMixin):
258 img_output_dir = os.path.join(output_dir, 'image') 301 img_output_dir = os.path.join(output_dir, 'image')
259 wb_output_dir = os.path.join(output_dir, 'excel') 302 wb_output_dir = os.path.join(output_dir, 'excel')
260 pdf_output_dir = os.path.join(output_dir, 'pdf') 303 pdf_output_dir = os.path.join(output_dir, 'pdf')
304 tiff_output_dir = os.path.join(output_dir, 'tiff')
261 failed_output_dir = os.path.join(output_dir, 'failed') 305 failed_output_dir = os.path.join(output_dir, 'failed')
262 os.makedirs(output_dir, exist_ok=True) 306 os.makedirs(output_dir, exist_ok=True)
263 os.makedirs(img_output_dir, exist_ok=True) 307 os.makedirs(img_output_dir, exist_ok=True)
264 os.makedirs(wb_output_dir, exist_ok=True) 308 os.makedirs(wb_output_dir, exist_ok=True)
265 os.makedirs(pdf_output_dir, exist_ok=True) 309 os.makedirs(pdf_output_dir, exist_ok=True)
310 os.makedirs(tiff_output_dir, exist_ok=True)
266 os.makedirs(failed_output_dir, exist_ok=True) 311 os.makedirs(failed_output_dir, exist_ok=True)
267 while self.switch: 312 while self.switch:
268 if not os.path.isdir(input_dir): 313 if not os.path.isdir(input_dir):
...@@ -282,6 +327,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -282,6 +327,8 @@ class Command(BaseCommand, LoggerMixin):
282 self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path)) 327 self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
283 if name.endswith('.pdf'): 328 if name.endswith('.pdf'):
284 self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir) 329 self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir)
330 elif name.endswith('.tif'):
331 self.tif_process(name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir)
285 else: 332 else:
286 self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir) 333 self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir)
287 self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path)) 334 self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!