import os
import signal
import time
from io import BytesIO

import fitz
from PIL import Image
from django.core.management import BaseCommand

from apps.doc.models import UploadDocRecords
from common.mixins import LoggerMixin
from common.redis_cache import redis_handler as rh
from settings import conf


class Command(BaseCommand, LoggerMixin):
    """Worker command: dequeue document tasks and extract images from their PDFs.

    The command loops until SIGTERM is received. Each iteration takes a task
    id from the redis queue, resolves the PDF path and writes the page images
    into an ``img`` directory located next to the PDF.
    """

    def __init__(self, *args, **kwargs):
        """Set up processing state and install the SIGTERM handler.

        Positional and keyword arguments are forwarded to the parent
        initializer, so the command can still be constructed with no
        arguments exactly as before.
        """
        super().__init__(*args, **kwargs)
        self.log_base = '[doc process]'
        # Processing switch: the main loop in handle() runs while this is True.
        self.switch = True
        # Data directory taken from the project configuration.
        self.data_dir = conf.DATA_DIR
        # Matrix used when rendering a whole PDF page to an image:
        # zoom factor 2 in each dimension, no rotation.
        self.zoom_x = 2.0
        self.zoom_y = 2.0
        self.trans = fitz.Matrix(self.zoom_x, self.zoom_y).preRotate(0)
        # Graceful shutdown on SIGTERM (signal 15).
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, sig, frame):
        """Signal callback: ask the main loop to stop after the current pass."""
        self.switch = False

    def get_task_info(self):
        """Fetch the next task from the queue and load its record.

        Returns:
            A dict with the keys ``id``, ``metadata_version_id`` and
            ``document_name``, or ``None`` when the queue is empty or no
            record exists for the dequeued id.
        """
        # TODO: priority queue & status modification
        task_id = rh.dequeue()
        if task_id is None:
            self.cronjob_log.info('{0} [get_task_info] [queue empty]'.format(self.log_base))
            return None

        task_info = UploadDocRecords.objects.filter(id=task_id).values(
            'id', 'metadata_version_id', 'document_name').first()
        if task_info is None:
            # NOTE(review): ``warn`` is a deprecated alias of ``warning`` on
            # stdlib loggers; kept because the type of ``cronjob_log`` is not
            # visible from this file.
            self.cronjob_log.warn('{0} [get_task_info] [task not found] [task_id={1}]'.format(
                self.log_base, task_id))
            # Stop here so a missing record is not also reported as a success.
            return None

        self.cronjob_log.info('{0} [get_task_info success] [task_info={1}]'.format(
            self.log_base, task_info))
        return task_info

    def pdf_download(self, task_info):
        """Return the local path of the PDF belonging to ``task_info``.

        Returns ``None`` when ``task_info`` is ``None``. The EDMS download is
        not implemented yet, so a fixed local path is returned for every task.
        """
        if task_info is None:
            return None
        # TODO: download the PDF from EDMS
        pdf_path = '/Users/clay/Desktop/biz/biz_logic/data/2/横版-表格-工商银行CH-B008802400.pdf'
        self.cronjob_log.info('{0} [pdf download success] [task_info={1}] [pdf_path={2}]'.format(
            self.log_base, task_info, pdf_path))
        return pdf_path

    @staticmethod
    def getimage(pix):
        """Return ``pix`` unchanged unless its colorspace has 4 components.

        A pixmap whose ``colorspace.n`` equals 4 is converted to an RGB pixmap.
        """
        if pix.colorspace.n != 4:
            return pix
        return fitz.Pixmap(fitz.csRGB, pix)

    def recoverpix(self, doc, item):
        """Recover one embedded image of ``doc``.

        Args:
            doc: the opened ``fitz`` document.
            item: an image-list entry; ``item[0]`` is the image xref,
                ``item[1]`` the xref of its /SMask (0 when absent) and
                ``item[5]`` the colorspace name.

        Returns:
            The dict produced by ``doc.extractImage`` for a DeviceRGB image
            without an SMask, otherwise a ``fitz.Pixmap``.
        """
        xref = item[0]   # xref of the PDF image
        smask = item[1]  # xref of its /SMask
        is_rgb = item[5] == 'DeviceRGB'

        # RGB image without a mask: the raw embedded image can be used directly.
        if is_rgb and smask == 0:
            return doc.extractImage(xref)

        base = fitz.Pixmap(doc, xref)
        pix = fitz.Pixmap(base)  # working copy of the base pixmap
        if smask != 0:
            # The alpha channel has to be reconstructed from the SMask.
            mask = fitz.Pixmap(doc, smask)
            # Sanity check: same rectangle, neither pixmap has alpha, and the
            # mask has a single component. Otherwise fall back to the base.
            if not (base.irect == mask.irect and base.alpha == mask.alpha == 0 and mask.n == 1):
                return self.getimage(base)
            pix.setAlpha(mask.samples)  # treat the mask samples as alpha values

        if not is_rgb:
            pix = fitz.Pixmap(fitz.csRGB, pix)  # GRAY/CMYK to RGB
        return self.getimage(pix)

    @staticmethod
    def get_img_data(pix):
        """Return ``(ext, img_data)`` for a recovered image.

        ``pix`` is either a raw-image dict holding ``"ext"`` and ``"image"``,
        or a pixmap, which is encoded as PNG.
        """
        if isinstance(pix, dict):
            # Raw image: use its own extension and bytes.
            return pix["ext"], pix["image"]
        # Pixmap: encode as PNG.
        return 'png', pix.getPNGData()

    @staticmethod
    def split_il(il):
        """Split a page image list into groups of consecutive entries.

        Entries are compared with their predecessor on index 2 and index 3
        (used as width and height by the caller):

        * the first entry of a group is never compared;
        * the last entry of ``il`` always closes the group that is open;
        * a change at index 2 closes the group *before* the current entry;
        * otherwise a change at index 3 closes the group *including* the
          current entry.

        Returns a list of slices of ``il``; an empty ``il`` yields ``[]``.
        """
        img_il_list = []
        start = 0
        length = len(il)
        for i in range(length):
            if i == start:
                # First entry of a group: emit it only if it is also the last.
                if i == length - 1:
                    img_il_list.append(il[start: length])
                continue
            elif i == length - 1:
                # Last entry: close whatever group is still open.
                img_il_list.append(il[start: length])
                continue
            if il[i][2] != il[i - 1][2]:
                img_il_list.append(il[start: i])
                start = i
            elif il[i][3] != il[i - 1][3]:
                img_il_list.append(il[start: i + 1])
                start = i + 1
        return img_il_list

    def handle(self, *args, **kwargs):
        """Main loop: process queued documents until the switch is turned off."""
        while self.switch:
            # Get the document info from the queue.
            task_info = self.get_task_info()
            # Get the PDF file (from EDMS).
            pdf_path = self.pdf_download(task_info)
            # Nothing to process: wait before polling again.
            if pdf_path is None:
                time.sleep(10)
                continue
            # Extract the images contained in the PDF.
            self._extract_images(pdf_path)
            # TODO: run the classifier to decide whether an image is a bank statement
            # TODO: run OCR to turn the images into excel files
            # TODO: merge the excel files and upload them to EDMS

    def _extract_images(self, pdf_path):
        """Write the images of every page of ``pdf_path`` into ``<pdf dir>/img``."""
        img_save_path = os.path.join(os.path.dirname(pdf_path), 'img')
        os.makedirs(img_save_path, exist_ok=True)
        with fitz.Document(pdf_path) as pdf:
            self.cronjob_log.info('{0} [pdf_path={1}] [pdf_metadata={2}]'.format(
                self.log_base, pdf_path, pdf.metadata))
            # TODO: de-duplicate images by xref
            for pno in range(pdf.pageCount):
                il = pdf.getPageImageList(pno)
                il.sort(key=lambda x: x[0])
                img_il_list = self.split_il(il)

                if len(img_il_list) > 3:
                    # Too many irregular small images on one page:
                    # render the whole page instead.
                    self._save_page_render(pdf, pno, img_save_path)
                    continue

                for img_count, img_il in enumerate(img_il_list):
                    if len(img_il) == 1:
                        # A single image needs no stitching.
                        self._save_single_image(pdf, img_il[0], pno, img_count, img_save_path)
                    else:
                        # Several images: stitch them vertically.
                        self._save_stitched_images(pdf, img_il, pno, img_count, img_save_path)

    def _save_page_render(self, pdf, pno, img_save_path):
        """Render page ``pno`` with ``self.trans`` and save it as one PNG."""
        page = pdf.loadPage(pno)
        pm = page.getPixmap(matrix=self.trans, alpha=False)
        save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
        pm.writeImage(save_path)

    def _save_single_image(self, pdf, img, pno, img_count, img_save_path):
        """Recover one embedded image and write its bytes to disk."""
        pix = self.recoverpix(pdf, img)
        ext, img_data = self.get_img_data(pix)
        save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
            pno, img_count, ext))
        with open(save_path, "wb") as f:
            f.write(img_data)

    def _save_stitched_images(self, pdf, img_il, pno, img_count, img_save_path):
        """Stack the images of ``img_il`` top to bottom and save the result.

        The canvas takes its width from the first entry and its height from
        the sum of all entry heights; the mode and the file extension come
        from the first recovered image.
        """
        height_sum = 0
        im_list = []
        width = img_il[0][2]
        for img in img_il:
            height = img[3]
            pix = self.recoverpix(pdf, img)
            ext, img_data = self.get_img_data(pix)
            im = Image.open(BytesIO(img_data))
            im_list.append((height, im, ext))
            height_sum += height

        save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
            pno, img_count, im_list[0][2]))
        res = Image.new(im_list[0][1].mode, (width, height_sum))
        h_now = 0
        for h, m, _ in im_list:
            res.paste(m, box=(0, h_now))
            h_now += h
        res.save(save_path)