c1f24adf by 周伟奇

Merge branch 'feature/0611' of gitlab.situdata.com:zhouweiqi/bmw-ocr into feature/0611

2 parents b8745dc6 906f258d
......@@ -585,7 +585,7 @@ class Command(BaseCommand, LoggerMixin):
img_save_path = os.path.join(doc_data_path, 'img')
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
pdf_handler = PDFHandler(pdf_path, img_save_path)
pdf_handler = PDFHandler(pdf_path, img_save_path, doc.document_name)
max_count_obj = Configs.objects.filter(id=2).first()
try:
max_img_count = int(max_count_obj.value)
......
......@@ -27,7 +27,7 @@ from apps.doc.named_enum import RequestTeam, RequestTrigger, ProcessName
from common.tools.comparison import cp
compare_log = logging.getLogger('compare')
log_base = '[CA Compare]'
log_base = '[Compare]'
def name_check(ocr_res_dict, second_ocr_field, second_compare_list, second_id_num, name):
......
import os
import shutil
import fitz
from PIL import Image
from io import BytesIO
......@@ -22,12 +23,25 @@ WH_COUPLE_5 = (100, 200)
class PDFHandler:
def __init__(self, path, img_dir_path):
def __init__(self, path, img_dir_path, document_name=None):
    """Initialise the handler state for one source file.

    Args:
        path: Location of the source file to process.
        img_dir_path: Directory used when building image save paths.
        document_name: Optional original file name. It is handed to
            ``get_suffix`` and the result is stored on ``self.suffix``.
    """
    # Bookkeeping that starts out empty.
    self.img_count = 0
    self.img_path_list = []
    self.xref_set = set()

    # Where to read from and where to write to.
    self.path = path
    self.img_dir_path = img_dir_path

    # Recognised image extensions (lower-case, leading dot). This must be
    # assigned before get_suffix() runs because that method reads it.
    self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'}
    self.suffix = self.get_suffix(document_name)
def get_suffix(self, file_name):
    """Return the lower-cased extension of *file_name* if it is an image.

    Args:
        file_name: A file name, or ``None``.

    Returns:
        The extension including the leading dot (e.g. ``'.jpg'``) when its
        lower-cased form is a member of ``self.img_suffixs``; ``None`` when
        *file_name* is ``None``, has a different or no extension, or is of a
        type ``os.path.splitext`` cannot handle.
    """
    if file_name is None:
        return None
    try:
        _, src_suffix = os.path.splitext(file_name)
    except TypeError:
        # splitext only accepts str, bytes or os.PathLike; anything else is
        # treated as "no usable suffix" rather than an error.
        return None
    lower_suffix = src_suffix.lower()
    if lower_suffix in self.img_suffixs:
        return lower_suffix
    return None
def get_img_save_path(self, pno, img_index=0, ext='png'):
    """Build the path of an image file inside ``self.img_dir_path``.

    Args:
        pno: Value substituted for the page part of the file name.
        img_index: Value substituted for the image part of the file name.
        ext: File extension without the leading dot.

    Returns:
        ``self.img_dir_path`` joined with ``page_<pno>_img_<img_index>.<ext>``.
    """
    file_name = 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
    return os.path.join(self.img_dir_path, file_name)
......@@ -197,6 +211,12 @@ class PDFHandler:
self.img_path_list = []
self.xref_set = set()
os.makedirs(self.img_dir_path, exist_ok=True)
if self.suffix in self.img_suffixs:
img_save_path = self.get_img_save_path(0, ext=self.suffix[1:])
shutil.copy(self.path, img_save_path)
self.img_path_list.append(img_save_path)
else:
with fitz.Document(self.path) as pdf:
if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
self.img_count = pdf.pageCount
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!