bb0678cb by 周伟奇

Merge branch 'feature/ebank' into feature/0918

2 parents 369697a8 4383b4d1
......@@ -730,11 +730,18 @@ class Command(BaseCommand, LoggerMixin):
else:
with lock:
todo_count_dict[task_str] = pdf_handler.img_count
for img_path in pdf_handler.img_path_list:
for img_idx, img_path in enumerate(pdf_handler.img_path_list):
while img_queue.full():
self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
time.sleep(self.sleep_time_img_put)
img_queue.put(img_path)
if pdf_handler.is_ebank:
try:
text_list = pdf_handler.page_text_list[img_idx].pop('rebuild_text')
except Exception as e:
text_list = []
else:
text_list = []
img_queue.put((img_path, text_list))
# except EDMSException as e:
# try:
# doc.status = DocStatus.PROCESS_FAILED.value
......@@ -779,7 +786,7 @@ class Command(BaseCommand, LoggerMixin):
def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list):
while len(error_list) == 0 or not img_queue.empty():
try:
img_path = img_queue.get(block=False)
img_path, text_list = img_queue.get(block=False)
except Exception as e:
# self.online_log.info('{0} [img_2_ocr_1] [queue empty]'.format(self.log_base))
time.sleep(self.sleep_time_img_get)
......@@ -797,6 +804,8 @@ class Command(BaseCommand, LoggerMixin):
json_data_1 = {
"file": file_data
}
if len(text_list) > 0:
json_data_1['text_list'] = text_list
start_time = time.time()
ocr_1_response = requests.post(url, json=json_data_1)
......
......@@ -31,6 +31,8 @@ class PDFHandler:
self.xref_set = set()
self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'}
self.suffix = self.get_suffix(document_name)
self.is_ebank = False
self.page_text_list = []
def get_suffix(self, file_name):
if file_name is None:
......@@ -46,6 +48,30 @@ class PDFHandler:
def get_img_save_path(self, pno, img_index=0, ext='png'):
    """Return the save path of one page image inside ``self.img_dir_path``.

    The file name has the form ``page_<pno>_img_<img_index>.<ext>``.
    """
    file_name = 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
    return os.path.join(self.img_dir_path, file_name)
def rebuild_bbox(self, src_width, src_height, pno):
    """Rescale the text boxes of page ``pno`` to a ``src_width`` x ``src_height`` image.

    Pops ``'width'``, ``'height'`` and ``'text'`` from
    ``self.page_text_list[pno]``, multiplies every ``(x0, y0, x1, y1)`` box
    by ``src_width / width`` horizontally and ``src_height / height``
    vertically, and stores the result under ``'rebuild_text'`` as a list of
    ``(polygon, text)`` pairs. Each polygon is the flat 8-tuple
    ``(x0, y0, x1, y0, x1, y1, x0, y1)``.

    Best effort: any exception is swallowed, in which case
    ``'rebuild_text'`` is not set (keys popped before the failure stay
    removed).
    """
    try:
        page_info = self.page_text_list[pno]
        page_width = page_info.pop('width')
        page_height = page_info.pop('height')
        spans = page_info.pop('text')
        scale_x = src_width / page_width
        scale_y = src_height / page_height

        scaled = []
        for (left, top, right, bottom), content in spans:
            left, right = left * scale_x, right * scale_x
            top, bottom = top * scale_y, bottom * scale_y
            # Expand the two-corner box into its four corner points.
            polygon = (left, top, right, top, right, bottom, left, bottom)
            scaled.append((polygon, content))
        page_info['rebuild_text'] = scaled
    except Exception:
        # Deliberately silent: the page then simply has no 'rebuild_text'.
        pass
def page_to_png(self, page):
if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500:
pm = page.getPixmap(matrix=trans_1, alpha=False)
......@@ -54,6 +80,8 @@ class PDFHandler:
img_save_path = self.get_img_save_path(page.number)
pm.writePNG(img_save_path)
self.img_path_list.append(img_save_path)
if self.is_ebank:
self.rebuild_bbox(pm.width, pm.height, page.number)
@staticmethod
def getimage(pix):
......@@ -207,6 +235,36 @@ class PDFHandler:
page = pdf.loadPage(pno)
self.page_to_png(page)
def check_ebank(self, pdf):
    """Detect whether ``pdf`` carries enough embedded text and collect it.

    Every page is loaded and its text page is dumped with ``extractDICT()``.
    All spans whose text is not blank are gathered as ``(bbox, text)``
    pairs. After each page the running total of such spans must be at
    least ``5 * pages_seen``; otherwise the method returns immediately and
    leaves ``self.is_ebank`` and ``self.page_text_list`` untouched.

    When every page passes, ``self.is_ebank`` is set to ``True`` and
    ``self.page_text_list`` receives one dict per page with the keys
    ``'width'``, ``'height'`` and ``'text'``.

    NOTE(review): "ebank" presumably means an electronically generated
    bank statement with a real text layer -- confirm with the author.

    :param pdf: opened document exposing ``pageCount`` and ``loadPage(pno)``.
    """
    min_spans_per_page = 5
    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page = pdf.loadPage(pno)
        text = page.getTextPage().extractDICT()
        text_list = []
        # ``or ()`` guards: a missing key makes ``.get`` return None, and
        # iterating None would raise TypeError. Image blocks, for example,
        # have no 'lines' entry.
        for block in text.get('blocks') or ():
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    char = span.get('text')
                    # Skip spans with missing, empty or whitespace-only text.
                    if not char or not char.strip():
                        continue
                    text_list.append((span.get('bbox'), char))
        text_item_sum += len(text_list)
        if text_item_sum < (pno + 1) * min_spans_per_page:
            # Too little text so far: not treated as an e-bank document.
            return
        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'text': text_list,
            }
        )
    self.is_ebank = True
    self.page_text_list = page_text_list
def extract_image(self, max_img_count=None):
self.img_path_list = []
self.xref_set = set()
......@@ -221,12 +279,13 @@ class PDFHandler:
if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
self.img_count = pdf.pageCount
return
self.check_ebank(pdf)
for pno in range(pdf.pageCount):
il = pdf.getPageImageList(pno) # 获取页面图片对象
# (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
# 1.页面图片对象数目为0时,保存整个页面为png图片
if len(il) == 0:
if self.is_ebank or len(il) == 0:
page = pdf.loadPage(pno)
self.page_to_png(page)
# 2.页面图片对象数目为1时:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!