Merge branch 'feature/mvc' into feature/compare
Showing 1 changed file with 47 additions and 0 deletions
| ... | @@ -6,6 +6,7 @@ import base64 | ... | @@ -6,6 +6,7 @@ import base64 |
| 6 | import signal | 6 | import signal |
| 7 | import requests | 7 | import requests |
| 8 | import traceback | 8 | import traceback |
| 9 | from PIL import Image | ||
| 9 | from datetime import datetime | 10 | from datetime import datetime |
| 10 | from django.core.management import BaseCommand | 11 | from django.core.management import BaseCommand |
| 11 | from multiprocessing import Process | 12 | from multiprocessing import Process |
| ... | @@ -18,6 +19,27 @@ from apps.doc.exceptions import OCR1Exception, OCR4Exception | ... | @@ -18,6 +19,27 @@ from apps.doc.exceptions import OCR1Exception, OCR4Exception |
| 18 | from apps.doc.ocr.wb import BSWorkbook | 19 | from apps.doc.ocr.wb import BSWorkbook |
| 19 | 20 | ||
| 20 | 21 | ||
class TIFFHandler:
    """Split a (possibly multi-page) TIFF file into one image file per page.

    After ``extract_image`` has run, ``img_path_list`` holds the paths of the
    page images that were written, in page order.
    """

    # Extension given to every extracted page. Pillow chooses the output
    # format from the file extension when no explicit ``format`` is passed,
    # so a name without an extension makes ``Image.save`` raise ValueError.
    # NOTE(review): PNG cannot store every TIFF mode (e.g. CMYK) -- confirm
    # the modes of the expected input files.
    PAGE_EXTENSION = 'png'

    def __init__(self, path, img_save_path):
        """Remember the source file and the output directory.

        Args:
            path: Path of the source TIFF file.
            img_save_path: Directory the page images are written into.
        """
        self.path = path
        self.img_save_path = img_save_path
        self.img_path_list = []

    def _page_path(self, index):
        """Return the output path for the page at zero-based ``index``."""
        file_name = 'page_{0}.{1}'.format(index, self.PAGE_EXTENSION)
        return os.path.join(self.img_save_path, file_name)

    def extract_image(self):
        """Write every frame of the TIFF to its own file.

        Each written path is appended to ``img_path_list``. Extraction stops
        early if seeking or saving a frame raises ``EOFError``.
        """
        # The target directory may not exist yet; creating it is harmless
        # when it already does.
        os.makedirs(self.img_save_path, exist_ok=True)

        # The context manager closes the underlying file even if a frame
        # fails to save.
        with Image.open(self.path) as tiff:
            # Fall back to a single frame if the attribute is absent.
            for i in range(getattr(tiff, 'n_frames', 1)):
                try:
                    save_path = self._page_path(i)
                    tiff.seek(i)
                    tiff.save(save_path)
                    self.img_path_list.append(save_path)
                except EOFError:
                    break
| 41 | |||
| 42 | |||
| 21 | class Command(BaseCommand, LoggerMixin): | 43 | class Command(BaseCommand, LoggerMixin): |
| 22 | 44 | ||
| 23 | def __init__(self): | 45 | def __init__(self): |
| ... | @@ -225,6 +247,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -225,6 +247,7 @@ class Command(BaseCommand, LoggerMixin): |
| 225 | except Exception as e: | 247 | except Exception as e: |
| 226 | self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format( | 248 | self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format( |
| 227 | self.log_base, path, traceback.format_exc())) | 249 | self.log_base, path, traceback.format_exc())) |
| 250 | raise e | ||
| 228 | else: | 251 | else: |
| 229 | all_res = {} | 252 | all_res = {} |
| 230 | for img_path in pdf_handler.img_path_list: | 253 | for img_path in pdf_handler.img_path_list: |
| ... | @@ -233,6 +256,26 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -233,6 +256,26 @@ class Command(BaseCommand, LoggerMixin): |
| 233 | self.res_process(all_res, classify, excel_path) | 256 | self.res_process(all_res, classify, excel_path) |
| 234 | shutil.move(path, pdf_save_path) | 257 | shutil.move(path, pdf_save_path) |
| 235 | 258 | ||
def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
    """Split the TIFF at ``path`` into page images, OCR each page, then move the file.

    Does nothing when ``path`` does not exist. Any exception raised while
    resolving the output paths or extracting the pages is logged together
    with its traceback and then re-raised. On success, every extracted page
    is passed to ``ocr_process``, the collected results go to
    ``res_process``, and the source file is moved to the TIFF save path.
    """
    if not os.path.exists(path):
        return

    try:
        img_save_path, excel_path, tiff_save_path = self.get_path(
            name, img_output_dir, wb_output_dir, tiff_output_dir)
        self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path))
        handler = TIFFHandler(path, img_save_path)
        handler.extract_image()
        self.folder_log.info('{0} [tiff to img end] [path={1}]'.format(self.log_base, path))
    except Exception as e:
        self.folder_log.error('{0} [tiff to img error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc()))
        raise e

    # Reached only when extraction succeeded; results are keyed by page path.
    all_res = {
        page_path: self.ocr_process(page_path, classify)
        for page_path in handler.img_path_list
    }
    self.res_process(all_res, classify, excel_path)
    shutil.move(path, tiff_save_path)
| 278 | |||
| 236 | def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir): | 279 | def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir): |
| 237 | ocr_res = self.ocr_process(path, classify) | 280 | ocr_res = self.ocr_process(path, classify) |
| 238 | all_res = {path: ocr_res} | 281 | all_res = {path: ocr_res} |
| ... | @@ -258,11 +301,13 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -258,11 +301,13 @@ class Command(BaseCommand, LoggerMixin): |
| 258 | img_output_dir = os.path.join(output_dir, 'image') | 301 | img_output_dir = os.path.join(output_dir, 'image') |
| 259 | wb_output_dir = os.path.join(output_dir, 'excel') | 302 | wb_output_dir = os.path.join(output_dir, 'excel') |
| 260 | pdf_output_dir = os.path.join(output_dir, 'pdf') | 303 | pdf_output_dir = os.path.join(output_dir, 'pdf') |
| 304 | tiff_output_dir = os.path.join(output_dir, 'tiff') | ||
| 261 | failed_output_dir = os.path.join(output_dir, 'failed') | 305 | failed_output_dir = os.path.join(output_dir, 'failed') |
| 262 | os.makedirs(output_dir, exist_ok=True) | 306 | os.makedirs(output_dir, exist_ok=True) |
| 263 | os.makedirs(img_output_dir, exist_ok=True) | 307 | os.makedirs(img_output_dir, exist_ok=True) |
| 264 | os.makedirs(wb_output_dir, exist_ok=True) | 308 | os.makedirs(wb_output_dir, exist_ok=True) |
| 265 | os.makedirs(pdf_output_dir, exist_ok=True) | 309 | os.makedirs(pdf_output_dir, exist_ok=True) |
| 310 | os.makedirs(tiff_output_dir, exist_ok=True) | ||
| 266 | os.makedirs(failed_output_dir, exist_ok=True) | 311 | os.makedirs(failed_output_dir, exist_ok=True) |
| 267 | while self.switch: | 312 | while self.switch: |
| 268 | if not os.path.isdir(input_dir): | 313 | if not os.path.isdir(input_dir): |
| ... | @@ -282,6 +327,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -282,6 +327,8 @@ class Command(BaseCommand, LoggerMixin): |
| 282 | self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path)) | 327 | self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path)) |
| 283 | if name.endswith('.pdf'): | 328 | if name.endswith('.pdf'): |
| 284 | self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir) | 329 | self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir) |
| 330 | elif name.endswith('.tif'): | ||
| 331 | self.tif_process(name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir) | ||
| 285 | else: | 332 | else: |
| 286 | self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir) | 333 | self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir) |
| 287 | self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path)) | 334 | self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path)) | ... | ... |
-
Please register or sign in to post a comment