4383b4d1 by 周伟奇

add ebank

1 parent db2ce2cf
...@@ -730,11 +730,18 @@ class Command(BaseCommand, LoggerMixin): ...@@ -730,11 +730,18 @@ class Command(BaseCommand, LoggerMixin):
730 else: 730 else:
731 with lock: 731 with lock:
732 todo_count_dict[task_str] = pdf_handler.img_count 732 todo_count_dict[task_str] = pdf_handler.img_count
733 for img_path in pdf_handler.img_path_list: 733 for img_idx, img_path in enumerate(pdf_handler.img_path_list):
734 while img_queue.full(): 734 while img_queue.full():
735 self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) 735 self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
736 time.sleep(self.sleep_time_img_put) 736 time.sleep(self.sleep_time_img_put)
737 img_queue.put(img_path) 737 if pdf_handler.is_ebank:
738 try:
739 text_list = pdf_handler.page_text_list[img_idx].pop('rebuild_text')
740 except Exception as e:
741 text_list = []
742 else:
743 text_list = []
744 img_queue.put((img_path, text_list))
738 # except EDMSException as e: 745 # except EDMSException as e:
739 # try: 746 # try:
740 # doc.status = DocStatus.PROCESS_FAILED.value 747 # doc.status = DocStatus.PROCESS_FAILED.value
...@@ -779,7 +786,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -779,7 +786,7 @@ class Command(BaseCommand, LoggerMixin):
779 def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list): 786 def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list):
780 while len(error_list) == 0 or not img_queue.empty(): 787 while len(error_list) == 0 or not img_queue.empty():
781 try: 788 try:
782 img_path = img_queue.get(block=False) 789 img_path, text_list = img_queue.get(block=False)
783 except Exception as e: 790 except Exception as e:
784 # self.online_log.info('{0} [img_2_ocr_1] [queue empty]'.format(self.log_base)) 791 # self.online_log.info('{0} [img_2_ocr_1] [queue empty]'.format(self.log_base))
785 time.sleep(self.sleep_time_img_get) 792 time.sleep(self.sleep_time_img_get)
...@@ -797,6 +804,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -797,6 +804,8 @@ class Command(BaseCommand, LoggerMixin):
797 json_data_1 = { 804 json_data_1 = {
798 "file": file_data 805 "file": file_data
799 } 806 }
807 if len(text_list) > 0:
808 json_data_1['text_list'] = text_list
800 809
801 start_time = time.time() 810 start_time = time.time()
802 ocr_1_response = requests.post(url, json=json_data_1) 811 ocr_1_response = requests.post(url, json=json_data_1)
......
...@@ -31,6 +31,8 @@ class PDFHandler: ...@@ -31,6 +31,8 @@ class PDFHandler:
31 self.xref_set = set() 31 self.xref_set = set()
32 self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'} 32 self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'}
33 self.suffix = self.get_suffix(document_name) 33 self.suffix = self.get_suffix(document_name)
34 self.is_ebank = False
35 self.page_text_list = []
34 36
35 def get_suffix(self, file_name): 37 def get_suffix(self, file_name):
36 if file_name is None: 38 if file_name is None:
...@@ -46,6 +48,30 @@ class PDFHandler: ...@@ -46,6 +48,30 @@ class PDFHandler:
def get_img_save_path(self, pno, img_index=0, ext='png'):
    """Build the save path of one image inside ``self.img_dir_path``.

    Args:
        pno: Page number embedded in the file name.
        img_index: Index of the image on that page (defaults to 0).
        ext: File extension without the leading dot (defaults to 'png').

    Returns:
        ``self.img_dir_path`` joined with ``page_<pno>_img_<img_index>.<ext>``.
    """
    file_name = 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
    return os.path.join(self.img_dir_path, file_name)
48 50
def rebuild_bbox(self, src_width, src_height, pno):
    """Rescale the text boxes recorded for page ``pno`` to a rendered image.

    Reads ``self.page_text_list[pno]``, a dict holding ``'width'``,
    ``'height'`` and ``'text'`` (an iterable of ``((x0, y0, x1, y1), text)``
    pairs).  Every box is scaled by ``src_width / width`` horizontally and
    ``src_height / height`` vertically and expanded to its four corners in
    the order top-left, top-right, bottom-right, bottom-left.

    On success the three source keys are removed from the page dict and the
    result is stored under ``'rebuild_text'``.  This is a best-effort step:
    if the page entry is missing or malformed, nothing is raised and the
    page dict is left untouched, so ``'rebuild_text'`` is simply absent.

    Args:
        src_width: Width of the rendered page image.
        src_height: Height of the rendered page image.
        pno: Index of the page inside ``self.page_text_list``.
    """
    try:
        page_info = self.page_text_list[pno]
        width_scale = src_width / page_info['width']
        height_scale = src_height / page_info['height']

        rebuild_text_list = []
        for (x0, y0, x1, y1), text in page_info['text']:
            left = x0 * width_scale
            top = y0 * height_scale
            right = x1 * width_scale
            bottom = y1 * height_scale
            rebuild_text_list.append(
                ((left, top, right, top, right, bottom, left, bottom), text)
            )
    except (ArithmeticError, LookupError, TypeError, ValueError):
        # Missing page / key, zero or non-numeric page size, or a malformed
        # bbox: skip the rebuild instead of failing the whole conversion.
        return

    # Commit only after the whole page converted, so a failure above never
    # leaves the entry stripped of its source data.
    for key in ('width', 'height', 'text'):
        page_info.pop(key, None)
    page_info['rebuild_text'] = rebuild_text_list
74
49 def page_to_png(self, page): 75 def page_to_png(self, page):
50 if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: 76 if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500:
51 pm = page.getPixmap(matrix=trans_1, alpha=False) 77 pm = page.getPixmap(matrix=trans_1, alpha=False)
...@@ -54,6 +80,8 @@ class PDFHandler: ...@@ -54,6 +80,8 @@ class PDFHandler:
54 img_save_path = self.get_img_save_path(page.number) 80 img_save_path = self.get_img_save_path(page.number)
55 pm.writePNG(img_save_path) 81 pm.writePNG(img_save_path)
56 self.img_path_list.append(img_save_path) 82 self.img_path_list.append(img_save_path)
83 if self.is_ebank:
84 self.rebuild_bbox(pm.width, pm.height, page.number)
57 85
58 @staticmethod 86 @staticmethod
59 def getimage(pix): 87 def getimage(pix):
...@@ -207,6 +235,36 @@ class PDFHandler: ...@@ -207,6 +235,36 @@ class PDFHandler:
207 page = pdf.loadPage(pno) 235 page = pdf.loadPage(pno)
208 self.page_to_png(page) 236 self.page_to_png(page)
209 237
def check_ebank(self, pdf):
    """Decide whether ``pdf`` carries enough embedded text to be "ebank".

    Walks the pages in order, collecting every non-blank text span as a
    ``(bbox, text)`` pair via ``page.getTextPage().extractDICT()``.  After
    each page the running span total must be at least 5 per page seen so
    far; as soon as it falls below that, the document is rejected.

    Results are stored on the instance:
        * accepted: ``self.is_ebank = True`` and ``self.page_text_list`` is
          a list with one ``{'width', 'height', 'text'}`` dict per page;
        * rejected (or the document has no pages): ``self.is_ebank = False``
          and ``self.page_text_list = []``.

    NOTE(review): "ebank" presumably means a digitally generated e-banking
    statement (as opposed to a scan) -- confirm with the feature owner.

    Args:
        pdf: Opened document exposing ``pageCount`` and ``loadPage(pno)``.
    """
    # Minimum average number of text spans per page.
    min_items_per_page = 5

    # Reset first so a verdict from an earlier document handled by the same
    # instance cannot survive the early return below.
    self.is_ebank = False
    self.page_text_list = []

    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        text = pdf.loadPage(pno).getTextPage().extractDICT()

        text_list = []
        # ``or ()`` guards blocks without 'lines' / lines without 'spans'.
        for block in text.get('blocks') or ():
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    char = span.get('text')
                    if not char or char.strip() == '':
                        continue
                    text_list.append((span.get('bbox'), char))

        text_item_sum += len(text_list)
        if text_item_sum < (pno + 1) * min_items_per_page:
            return
        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'text': text_list,
            }
        )

    if not page_text_list:
        # A document without pages has no text evidence at all.
        return

    self.is_ebank = True
    self.page_text_list = page_text_list
267
210 def extract_image(self, max_img_count=None): 268 def extract_image(self, max_img_count=None):
211 self.img_path_list = [] 269 self.img_path_list = []
212 self.xref_set = set() 270 self.xref_set = set()
...@@ -221,12 +279,13 @@ class PDFHandler: ...@@ -221,12 +279,13 @@ class PDFHandler:
221 if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count: 279 if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
222 self.img_count = pdf.pageCount 280 self.img_count = pdf.pageCount
223 return 281 return
282 self.check_ebank(pdf)
224 for pno in range(pdf.pageCount): 283 for pno in range(pdf.pageCount):
225 il = pdf.getPageImageList(pno) # 获取页面图片对象 284 il = pdf.getPageImageList(pno) # 获取页面图片对象
226 # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) 285 # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
227 286
228 # 1.页面图片对象数目为0时,保存整个页面为png图片 287 # 1.页面图片对象数目为0时,保存整个页面为png图片
229 if len(il) == 0: 288 if self.is_ebank or len(il) == 0:
230 page = pdf.loadPage(pno) 289 page = pdf.loadPage(pno)
231 self.page_to_png(page) 290 self.page_to_png(page)
232 # 2.页面图片对象数目为1时: 291 # 2.页面图片对象数目为1时:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!