c1f24adf by 周伟奇

Merge branch 'feature/0611' of gitlab.situdata.com:zhouweiqi/bmw-ocr into feature/0611

2 parents b8745dc6 906f258d
...@@ -585,7 +585,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -585,7 +585,7 @@ class Command(BaseCommand, LoggerMixin):
585 img_save_path = os.path.join(doc_data_path, 'img') 585 img_save_path = os.path.join(doc_data_path, 'img')
586 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) 586 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
587 587
588 pdf_handler = PDFHandler(pdf_path, img_save_path) 588 pdf_handler = PDFHandler(pdf_path, img_save_path, doc.document_name)
589 max_count_obj = Configs.objects.filter(id=2).first() 589 max_count_obj = Configs.objects.filter(id=2).first()
590 try: 590 try:
591 max_img_count = int(max_count_obj.value) 591 max_img_count = int(max_count_obj.value)
......
...@@ -27,7 +27,7 @@ from apps.doc.named_enum import RequestTeam, RequestTrigger, ProcessName ...@@ -27,7 +27,7 @@ from apps.doc.named_enum import RequestTeam, RequestTrigger, ProcessName
27 from common.tools.comparison import cp 27 from common.tools.comparison import cp
28 28
29 compare_log = logging.getLogger('compare') 29 compare_log = logging.getLogger('compare')
30 log_base = '[CA Compare]' 30 log_base = '[Compare]'
31 31
32 32
33 def name_check(ocr_res_dict, second_ocr_field, second_compare_list, second_id_num, name): 33 def name_check(ocr_res_dict, second_ocr_field, second_compare_list, second_id_num, name):
......
1 import os 1 import os
2 import shutil
2 import fitz 3 import fitz
3 from PIL import Image 4 from PIL import Image
4 from io import BytesIO 5 from io import BytesIO
...@@ -22,12 +23,25 @@ WH_COUPLE_5 = (100, 200) ...@@ -22,12 +23,25 @@ WH_COUPLE_5 = (100, 200)
22 23
23 class PDFHandler: 24 class PDFHandler:
24 25
def __init__(self, path, img_dir_path, document_name=None):
    """Set up the handler's paths and reset its extraction state.

    Args:
        path: Location of the source file to process.
        img_dir_path: Directory that extracted images are written into.
        document_name: Optional original file name; it is only used to
            derive ``self.suffix`` through ``get_suffix``.
    """
    # Where to read from and where to write to.
    self.path = path
    self.img_dir_path = img_dir_path

    # Extraction bookkeeping, all starting out empty.
    self.xref_set = set()
    self.img_path_list = []
    self.img_count = 0

    # Must be assigned before get_suffix() runs, because get_suffix()
    # checks the extension against this set.
    self.img_suffixs = {
        '.jpeg',
        '.jpg',
        '.png',
        '.webp',
        '.bmp',
    }
    self.suffix = self.get_suffix(document_name)
34
def get_suffix(self, file_name):
    """Return the lower-cased image extension of ``file_name``, or ``None``.

    Args:
        file_name: Original document name. ``None`` is accepted.

    Returns:
        The extension including its leading dot (for example ``'.jpg'``)
        when, after lower-casing, it is a member of ``self.img_suffixs``.
        ``None`` when ``file_name`` is ``None``, is not a path-like value,
        or has an extension outside ``self.img_suffixs``.
    """
    if file_name is None:
        return None
    try:
        _, src_suffix = os.path.splitext(file_name)
    except TypeError:
        # os.path.splitext raises TypeError for values that are not str,
        # bytes or os.PathLike; such a name is treated as "not an image"
        # instead of aborting construction of the handler.
        return None
    lower_suffix = src_suffix.lower()
    if lower_suffix in self.img_suffixs:
        return lower_suffix
    return None
31 45
def get_img_save_path(self, pno, img_index=0, ext='png'):
    """Build the output path for one extracted image.

    Args:
        pno: Page number placed in the file name.
        img_index: Index of the image within the page (default ``0``).
        ext: File extension without the leading dot (default ``'png'``).

    Returns:
        ``self.img_dir_path`` joined with ``page_<pno>_img_<img_index>.<ext>``.
    """
    name_template = 'page_{0}_img_{1}.{2}'
    file_name = name_template.format(pno, img_index, ext)
    return os.path.join(self.img_dir_path, file_name)
...@@ -197,33 +211,39 @@ class PDFHandler: ...@@ -197,33 +211,39 @@ class PDFHandler:
197 self.img_path_list = [] 211 self.img_path_list = []
198 self.xref_set = set() 212 self.xref_set = set()
199 os.makedirs(self.img_dir_path, exist_ok=True) 213 os.makedirs(self.img_dir_path, exist_ok=True)
200 with fitz.Document(self.path) as pdf: 214
201 if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count: 215 if self.suffix in self.img_suffixs:
202 self.img_count = pdf.pageCount 216 img_save_path = self.get_img_save_path(0, ext=self.suffix[1:])
203 return 217 shutil.copy(self.path, img_save_path)
204 for pno in range(pdf.pageCount): 218 self.img_path_list.append(img_save_path)
205 il = pdf.getPageImageList(pno) # 获取页面图片对象 219 else:
206 # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) 220 with fitz.Document(self.path) as pdf:
207 221 if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
208 # 1.页面图片对象数目为0时,保存整个页面为png图片 222 self.img_count = pdf.pageCount
209 if len(il) == 0: 223 return
210 page = pdf.loadPage(pno) 224 for pno in range(pdf.pageCount):
211 self.page_to_png(page) 225 il = pdf.getPageImageList(pno) # 获取页面图片对象
212 # 2.页面图片对象数目为1时: 226 # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
213 # 小图(如电子账单的盖章):保存整个页面为png图片 227
214 # 大图:提取图片对象 228 # 1.页面图片对象数目为0时,保存整个页面为png图片
215 elif len(il) == 1: 229 if len(il) == 0:
216 xref, smask, width, height, _, colorspace, _, _, _ = il[0]
217 # 小图
218 if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
219 page = pdf.loadPage(pno) 230 page = pdf.loadPage(pno)
220 self.page_to_png(page) 231 self.page_to_png(page)
221 # 大图 232 # 2.页面图片对象数目为1时:
222 elif xref not in self.xref_set: 233 # 小图(如电子账单的盖章):保存整个页面为png图片
223 self.extract_single_image(pdf, xref, smask, colorspace, pno) 234 # 大图:提取图片对象
224 # 3.页面图片对象数目大于1时,特殊处理 235 elif len(il) == 1:
225 else: 236 xref, smask, width, height, _, colorspace, _, _, _ = il[0]
226 self.merge_il(pdf, pno, il) 237 # 小图
238 if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
239 page = pdf.loadPage(pno)
240 self.page_to_png(page)
241 # 大图
242 elif xref not in self.xref_set:
243 self.extract_single_image(pdf, xref, smask, colorspace, pno)
244 # 3.页面图片对象数目大于1时,特殊处理
245 else:
246 self.merge_il(pdf, pno, il)
227 self.img_count = len(self.img_path_list) 247 self.img_count = len(self.img_path_list)
228 248
229 def extract_page_image(self): 249 def extract_page_image(self):
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!