cf8331db by 周伟奇

folder add tiff

1 parent 6cf3b86d
......@@ -6,6 +6,7 @@ import base64
import signal
import requests
import traceback
from PIL import Image
from datetime import datetime
from django.core.management import BaseCommand
from multiprocessing import Process
......@@ -18,6 +19,27 @@ from apps.doc.exceptions import OCR1Exception, OCR4Exception
from apps.doc.ocr.wb import BSWorkbook
class TIFFHandler:
    """Split a (possibly multi-page) TIFF file into one image file per page.

    After ``extract_image()`` has run, ``img_path_list`` holds the paths of
    the page files that were written, in page order.
    """

    def __init__(self, path, img_save_path):
        """Remember the source file and the output directory.

        Args:
            path: Path of the TIFF file to split.
            img_save_path: Directory that receives the page files. It is not
                created here; presumably the caller prepares it -- confirm.
        """
        self.path = path
        self.img_save_path = img_save_path
        # Paths of the page files written so far, in page order.
        self.img_path_list = []

    def extract_image(self):
        """Write every frame of the TIFF to ``<img_save_path>/page_<index>``.

        Each written path is appended to ``self.img_path_list``. Extraction
        stops quietly at the first frame that raises ``EOFError`` (the file
        holds fewer readable frames than it advertises); any other error
        propagates to the caller.
        """
        # Image.open() is lazy and keeps the underlying file open so later
        # frames can be read; the context manager guarantees the handle is
        # released even when seeking or saving a frame fails.
        with Image.open(self.path) as tiff:
            tiff.load()
            # The page file names carry no extension, so Pillow cannot infer
            # the output format from them and save() would raise ValueError.
            # Pass the source format explicitly instead.
            # NOTE(review): pages are therefore written in the source format
            # (normally TIFF); confirm the downstream OCR step accepts it.
            page_format = tiff.format or 'TIFF'
            # Images without multi-frame support may not expose n_frames.
            frame_count = getattr(tiff, 'n_frames', 1)
            for i in range(frame_count):
                try:
                    save_path = os.path.join(self.img_save_path, 'page_{0}'.format(i))
                    tiff.seek(i)
                    tiff.save(save_path, format=page_format)
                    self.img_path_list.append(save_path)
                except EOFError:
                    break
class Command(BaseCommand, LoggerMixin):
def __init__(self):
......@@ -225,6 +247,7 @@ class Command(BaseCommand, LoggerMixin):
except Exception as e:
self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
self.log_base, path, traceback.format_exc()))
raise e
else:
all_res = {}
for img_path in pdf_handler.img_path_list:
......@@ -233,6 +256,26 @@ class Command(BaseCommand, LoggerMixin):
self.res_process(all_res, classify, excel_path)
shutil.move(path, pdf_save_path)
def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
    """Split a TIFF into page images, OCR each page, then archive the TIFF.

    Does nothing when ``path`` does not exist. Otherwise the pages are
    extracted with ``TIFFHandler``, each page is passed to
    ``self.ocr_process``, the collected results go to ``self.res_process``
    together with the Excel path, and the source file is finally moved to
    the TIFF save path returned by ``self.get_path``.

    Raises:
        Exception: Any error raised while resolving the paths or extracting
            the pages is logged with its traceback and re-raised.
    """
    if not os.path.exists(path):
        return

    try:
        resolved_paths = self.get_path(name, img_output_dir, wb_output_dir, tiff_output_dir)
        img_save_path, excel_path, tiff_save_path = resolved_paths
        start_msg = '{0} [tiff to img start] [path={1}]'.format(self.log_base, path)
        self.folder_log.info(start_msg)
        tiff_handler = TIFFHandler(path, img_save_path)
        tiff_handler.extract_image()
        end_msg = '{0} [tiff to img end] [path={1}]'.format(self.log_base, path)
        self.folder_log.info(end_msg)
    except Exception as err:
        error_msg = '{0} [tiff to img error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc())
        self.folder_log.error(error_msg)
        raise err

    # Reached only when extraction succeeded: the handler above always
    # re-raises, so this is the success path.
    all_res = {
        page_path: self.ocr_process(page_path, classify)
        for page_path in tiff_handler.img_path_list
    }
    self.res_process(all_res, classify, excel_path)
    shutil.move(path, tiff_save_path)
def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
ocr_res = self.ocr_process(path, classify)
all_res = {path: ocr_res}
......@@ -258,11 +301,13 @@ class Command(BaseCommand, LoggerMixin):
img_output_dir = os.path.join(output_dir, 'image')
wb_output_dir = os.path.join(output_dir, 'excel')
pdf_output_dir = os.path.join(output_dir, 'pdf')
tiff_output_dir = os.path.join(output_dir, 'tiff')
failed_output_dir = os.path.join(output_dir, 'failed')
os.makedirs(output_dir, exist_ok=True)
os.makedirs(img_output_dir, exist_ok=True)
os.makedirs(wb_output_dir, exist_ok=True)
os.makedirs(pdf_output_dir, exist_ok=True)
os.makedirs(tiff_output_dir, exist_ok=True)
os.makedirs(failed_output_dir, exist_ok=True)
while self.switch:
if not os.path.isdir(input_dir):
......@@ -282,6 +327,8 @@ class Command(BaseCommand, LoggerMixin):
self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
if name.endswith('.pdf'):
self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir)
elif name.endswith('.tif'):
self.tif_process(name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir)
else:
self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir)
self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!