add ebank
Showing
2 changed files
with
72 additions
and
4 deletions
... | @@ -730,11 +730,18 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -730,11 +730,18 @@ class Command(BaseCommand, LoggerMixin): |
730 | else: | 730 | else: |
731 | with lock: | 731 | with lock: |
732 | todo_count_dict[task_str] = pdf_handler.img_count | 732 | todo_count_dict[task_str] = pdf_handler.img_count |
733 | for img_path in pdf_handler.img_path_list: | 733 | for img_idx, img_path in enumerate(pdf_handler.img_path_list): |
734 | while img_queue.full(): | 734 | while img_queue.full(): |
735 | self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | 735 | self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) |
736 | time.sleep(self.sleep_time_img_put) | 736 | time.sleep(self.sleep_time_img_put) |
737 | img_queue.put(img_path) | 737 | if pdf_handler.is_ebank: |
738 | try: | ||
739 | text_list = pdf_handler.page_text_list[img_idx].pop('rebuild_text') | ||
740 | except Exception as e: | ||
741 | text_list = [] | ||
742 | else: | ||
743 | text_list = [] | ||
744 | img_queue.put((img_path, text_list)) | ||
738 | # except EDMSException as e: | 745 | # except EDMSException as e: |
739 | # try: | 746 | # try: |
740 | # doc.status = DocStatus.PROCESS_FAILED.value | 747 | # doc.status = DocStatus.PROCESS_FAILED.value |
... | @@ -779,7 +786,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -779,7 +786,7 @@ class Command(BaseCommand, LoggerMixin): |
779 | def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list): | 786 | def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list): |
780 | while len(error_list) == 0 or not img_queue.empty(): | 787 | while len(error_list) == 0 or not img_queue.empty(): |
781 | try: | 788 | try: |
782 | img_path = img_queue.get(block=False) | 789 | img_path, text_list = img_queue.get(block=False) |
783 | except Exception as e: | 790 | except Exception as e: |
784 | # self.online_log.info('{0} [img_2_ocr_1] [queue empty]'.format(self.log_base)) | 791 | # self.online_log.info('{0} [img_2_ocr_1] [queue empty]'.format(self.log_base)) |
785 | time.sleep(self.sleep_time_img_get) | 792 | time.sleep(self.sleep_time_img_get) |
... | @@ -797,6 +804,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -797,6 +804,8 @@ class Command(BaseCommand, LoggerMixin): |
797 | json_data_1 = { | 804 | json_data_1 = { |
798 | "file": file_data | 805 | "file": file_data |
799 | } | 806 | } |
807 | if len(text_list) > 0: | ||
808 | json_data_1['text_list'] = text_list | ||
800 | 809 | ||
801 | start_time = time.time() | 810 | start_time = time.time() |
802 | ocr_1_response = requests.post(url, json=json_data_1) | 811 | ocr_1_response = requests.post(url, json=json_data_1) | ... | ... |
... | @@ -31,6 +31,8 @@ class PDFHandler: | ... | @@ -31,6 +31,8 @@ class PDFHandler: |
31 | self.xref_set = set() | 31 | self.xref_set = set() |
32 | self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'} | 32 | self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'} |
33 | self.suffix = self.get_suffix(document_name) | 33 | self.suffix = self.get_suffix(document_name) |
34 | self.is_ebank = False | ||
35 | self.page_text_list = [] | ||
34 | 36 | ||
35 | def get_suffix(self, file_name): | 37 | def get_suffix(self, file_name): |
36 | if file_name is None: | 38 | if file_name is None: |
... | @@ -46,6 +48,30 @@ class PDFHandler: | ... | @@ -46,6 +48,30 @@ class PDFHandler: |
46 | def get_img_save_path(self, pno, img_index=0, ext='png'): | 48 | def get_img_save_path(self, pno, img_index=0, ext='png'): |
47 | return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)) | 49 | return os.path.join(self.img_dir_path, 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)) |
48 | 50 | ||
def rebuild_bbox(self, src_width, src_height, pno):
    """Rescale the text boxes stored for page ``pno`` to the rendered image size.

    Reads the ``'width'``, ``'height'`` and ``'text'`` entries of
    ``self.page_text_list[pno]`` and converts every ``(x0, y0, x1, y1)`` box
    into the 8-value corner tuple ``(x0, y0, x1, y0, x1, y1, x0, y1)``
    scaled by ``src_width / width`` and ``src_height / height``.

    On success the three raw entries are removed and the result is stored
    under ``'rebuild_text'`` as a list of ``(corners, text)`` tuples.
    On failure nothing is raised and the page entry is left untouched, so
    no ``'rebuild_text'`` key exists for that page.

    :param src_width: width of the rendered page image
    :param src_height: height of the rendered page image
    :param pno: index into ``self.page_text_list``
    """
    # Function-scope import: the file's top-level import block is not
    # visible from this block.
    import logging

    try:
        page_info = self.page_text_list[pno]
        width_scale = src_width / page_info['width']
        height_scale = src_height / page_info['height']

        rebuild_text_list = []
        for (x0, y0, x1, y1), text in page_info['text']:
            left, top = x0 * width_scale, y0 * height_scale
            right, bottom = x1 * width_scale, y1 * height_scale
            rebuild_text_list.append(
                ((left, top, right, top, right, bottom, left, bottom), text)
            )
    except (IndexError, KeyError, TypeError, ValueError, ZeroDivisionError) as e:
        # Best effort: a page whose text cannot be rebuilt simply gets no
        # 'rebuild_text', but the reason is logged instead of being dropped.
        logging.getLogger(__name__).warning(
            'rebuild_bbox failed for page %s: %r', pno, e
        )
        return

    # Only discard the raw data once the rebuild is known to have succeeded,
    # so a failure never leaves the page entry half-consumed.
    for key in ('width', 'height', 'text'):
        del page_info[key]
    page_info['rebuild_text'] = rebuild_text_list
74 | |||
49 | def page_to_png(self, page): | 75 | def page_to_png(self, page): |
50 | if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: | 76 | if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: |
51 | pm = page.getPixmap(matrix=trans_1, alpha=False) | 77 | pm = page.getPixmap(matrix=trans_1, alpha=False) |
... | @@ -54,6 +80,8 @@ class PDFHandler: | ... | @@ -54,6 +80,8 @@ class PDFHandler: |
54 | img_save_path = self.get_img_save_path(page.number) | 80 | img_save_path = self.get_img_save_path(page.number) |
55 | pm.writePNG(img_save_path) | 81 | pm.writePNG(img_save_path) |
56 | self.img_path_list.append(img_save_path) | 82 | self.img_path_list.append(img_save_path) |
83 | if self.is_ebank: | ||
84 | self.rebuild_bbox(pm.width, pm.height, page.number) | ||
57 | 85 | ||
58 | @staticmethod | 86 | @staticmethod |
59 | def getimage(pix): | 87 | def getimage(pix): |
... | @@ -207,6 +235,36 @@ class PDFHandler: | ... | @@ -207,6 +235,36 @@ class PDFHandler: |
207 | page = pdf.loadPage(pno) | 235 | page = pdf.loadPage(pno) |
208 | self.page_to_png(page) | 236 | self.page_to_png(page) |
209 | 237 | ||
def check_ebank(self, pdf, min_items_per_page=5):
    """Decide whether ``pdf`` carries enough embedded text to be treated as "ebank".

    Walks the pages in order and collects every non-blank text span as a
    ``(bbox, text)`` tuple. After each page the running total of collected
    spans must be at least ``(pno + 1) * min_items_per_page``; as soon as it
    falls below that, the document is rejected.

    When every page passes, ``self.is_ebank`` is set to ``True`` and
    ``self.page_text_list`` holds one dict per page with the keys
    ``'width'``, ``'height'`` and ``'text'``. Otherwise ``self.is_ebank`` is
    ``False`` and ``self.page_text_list`` is empty.

    :param pdf: document object exposing ``pageCount`` and ``loadPage(pno)``
    :param min_items_per_page: required average number of text spans per page
    """
    # Reset first so an early return never leaves a previous result behind.
    self.is_ebank = False
    self.page_text_list = []

    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        text = pdf.loadPage(pno).getTextPage().extractDICT()
        text_list = []
        for block in text.get('blocks') or ():
            # A block without a 'lines' entry (such as an image block in
            # PyMuPDF's dict output) carries no text and is skipped.
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    char = span.get('text') or ''
                    if char.strip() == '':
                        continue
                    text_list.append((span.get('bbox'), char))
        text_item_sum += len(text_list)
        if text_item_sum < (pno + 1) * min_items_per_page:
            return
        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'text': text_list,
            }
        )

    # A document with no pages has no text to reuse; leave it non-ebank.
    if not page_text_list:
        return
    self.is_ebank = True
    self.page_text_list = page_text_list
267 | |||
210 | def extract_image(self, max_img_count=None): | 268 | def extract_image(self, max_img_count=None): |
211 | self.img_path_list = [] | 269 | self.img_path_list = [] |
212 | self.xref_set = set() | 270 | self.xref_set = set() |
... | @@ -221,12 +279,13 @@ class PDFHandler: | ... | @@ -221,12 +279,13 @@ class PDFHandler: |
221 | if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count: | 279 | if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count: |
222 | self.img_count = pdf.pageCount | 280 | self.img_count = pdf.pageCount |
223 | return | 281 | return |
282 | self.check_ebank(pdf) | ||
224 | for pno in range(pdf.pageCount): | 283 | for pno in range(pdf.pageCount): |
225 | il = pdf.getPageImageList(pno) # 获取页面图片对象 | 284 | il = pdf.getPageImageList(pno) # 获取页面图片对象 |
226 | # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) | 285 | # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) |
227 | 286 | ||
228 | # 1.页面图片对象数目为0时,保存整个页面为png图片 | 287 | # 1.页面图片对象数目为0时,保存整个页面为png图片 |
229 | if len(il) == 0: | 288 | if self.is_ebank or len(il) == 0: |
230 | page = pdf.loadPage(pno) | 289 | page = pdf.loadPage(pno) |
231 | self.page_to_png(page) | 290 | self.page_to_png(page) |
232 | # 2.页面图片对象数目为1时: | 291 | # 2.页面图片对象数目为1时: | ... | ... |
-
Please register or sign in to post a comment