c1f24adf by 周伟奇

Merge branch 'feature/0611' of gitlab.situdata.com:zhouweiqi/bmw-ocr into feature/0611

2 parents b8745dc6 906f258d
......@@ -585,7 +585,7 @@ class Command(BaseCommand, LoggerMixin):
img_save_path = os.path.join(doc_data_path, 'img')
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
pdf_handler = PDFHandler(pdf_path, img_save_path)
pdf_handler = PDFHandler(pdf_path, img_save_path, doc.document_name)
max_count_obj = Configs.objects.filter(id=2).first()
try:
max_img_count = int(max_count_obj.value)
......
......@@ -27,7 +27,7 @@ from apps.doc.named_enum import RequestTeam, RequestTrigger, ProcessName
from common.tools.comparison import cp
compare_log = logging.getLogger('compare')
log_base = '[CA Compare]'
log_base = '[Compare]'
def name_check(ocr_res_dict, second_ocr_field, second_compare_list, second_id_num, name):
......
import os
import shutil
import fitz
from PIL import Image
from io import BytesIO
......@@ -22,12 +23,25 @@ WH_COUPLE_5 = (100, 200)
class PDFHandler:
def __init__(self, path, img_dir_path):
def __init__(self, path, img_dir_path, document_name=None):
    """Set up a handler for one source document.

    Args:
        path: Location of the source file to process.
        img_dir_path: Directory that extracted images are written to.
        document_name: Optional original file name; only its extension
            is inspected (see ``get_suffix``).
    """
    # Where to read from and where to write to.
    self.path, self.img_dir_path = path, img_dir_path

    # Extraction state, filled in while images are produced.
    self.img_path_list = list()
    self.xref_set = set()
    self.img_count = 0

    # Extensions treated as plain image files. This must be assigned
    # before get_suffix() runs, because get_suffix() reads it.
    self.img_suffixs = {'.bmp', '.jpeg', '.jpg', '.png', '.webp'}
    # Lower-cased image extension of document_name, or None when the
    # name is missing or is not one of the extensions above.
    self.suffix = self.get_suffix(document_name)
def get_suffix(self, file_name):
    """Return the lower-cased image extension of *file_name*, or None.

    Args:
        file_name: A file name or path (may be None).

    Returns:
        The extension including the leading dot (e.g. ``'.jpg'``) when
        its lower-cased form is a member of ``self.img_suffixs``;
        otherwise None. None is also returned when *file_name* is None
        or is not something ``os.path.splitext`` accepts.
    """
    if file_name is None:
        return None
    try:
        # Only this call can fail on a bad argument, so keep the guarded
        # region to just this line.
        _, src_suffix = os.path.splitext(file_name)
    except TypeError:
        # file_name is not str, bytes or path-like: treat as "no suffix"
        # rather than failing.
        return None
    lower_suffix = src_suffix.lower()
    if lower_suffix in self.img_suffixs:
        return lower_suffix
    return None
def get_img_save_path(self, pno, img_index=0, ext='png'):
    """Build the output path for one image inside ``self.img_dir_path``.

    Args:
        pno: Page number used in the file name.
        img_index: Index of the image used in the file name.
        ext: File extension without the leading dot.

    Returns:
        ``<img_dir_path>/page_<pno>_img_<img_index>.<ext>``.
    """
    file_name = 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
    return os.path.join(self.img_dir_path, file_name)
......@@ -197,33 +211,39 @@ class PDFHandler:
self.img_path_list = []
self.xref_set = set()
os.makedirs(self.img_dir_path, exist_ok=True)
with fitz.Document(self.path) as pdf:
if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
self.img_count = pdf.pageCount
return
for pno in range(pdf.pageCount):
il = pdf.getPageImageList(pno) # 获取页面图片对象
# (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
# 1.页面图片对象数目为0时,保存整个页面为png图片
if len(il) == 0:
page = pdf.loadPage(pno)
self.page_to_png(page)
# 2.页面图片对象数目为1时:
# 小图(如电子账单的盖章):保存整个页面为png图片
# 大图:提取图片对象
elif len(il) == 1:
xref, smask, width, height, _, colorspace, _, _, _ = il[0]
# 小图
if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
if self.suffix in self.img_suffixs:
img_save_path = self.get_img_save_path(0, ext=self.suffix[1:])
shutil.copy(self.path, img_save_path)
self.img_path_list.append(img_save_path)
else:
with fitz.Document(self.path) as pdf:
if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
self.img_count = pdf.pageCount
return
for pno in range(pdf.pageCount):
il = pdf.getPageImageList(pno) # 获取页面图片对象
# (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
# 1.页面图片对象数目为0时,保存整个页面为png图片
if len(il) == 0:
page = pdf.loadPage(pno)
self.page_to_png(page)
# 大图
elif xref not in self.xref_set:
self.extract_single_image(pdf, xref, smask, colorspace, pno)
# 3.页面图片对象数目大于1时,特殊处理
else:
self.merge_il(pdf, pno, il)
# 2.页面图片对象数目为1时:
# 小图(如电子账单的盖章):保存整个页面为png图片
# 大图:提取图片对象
elif len(il) == 1:
xref, smask, width, height, _, colorspace, _, _, _ = il[0]
# 小图
if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
page = pdf.loadPage(pno)
self.page_to_png(page)
# 大图
elif xref not in self.xref_set:
self.extract_single_image(pdf, xref, smask, colorspace, pno)
# 3.页面图片对象数目大于1时,特殊处理
else:
self.merge_il(pdf, pno, il)
self.img_count = len(self.img_path_list)
def extract_page_image(self):
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!