add ebank

周伟奇
Showing 2 changed files with 72 additions and 4 deletions
src/apps/doc/management/commands/ocr_process.py
src/common/tools/pdf_to_img.py
--- a/src/apps/doc/management/commands/ocr_process.py
View file @4383b4d
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @4383b4d
@@ -730,11 +730,18 @@ class Command(BaseCommand, LoggerMixin):
                    else:
                        with lock:
                            todo_count_dict[task_str] = pdf_handler.img_count
-                        for img_path in pdf_handler.img_path_list:
+                        for img_idx, img_path in enumerate(pdf_handler.img_path_list):
                            while img_queue.full():
                                self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
                                time.sleep(self.sleep_time_img_put)
-                            img_queue.put(img_path)
+                            if pdf_handler.is_ebank:
+                                try:
+                                    text_list = pdf_handler.page_text_list[img_idx].pop('rebuild_text')
+                                except Exception as e:
+                                    text_list = []
+                            else:
+                                text_list = []
+                            img_queue.put((img_path, text_list))
                # except EDMSException as e:
                #     try:
                #         doc.status = DocStatus.PROCESS_FAILED.value
@@ -779,7 +786,7 @@ class Command(BaseCommand, LoggerMixin):
    def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list):
        while len(error_list) == 0 or not img_queue.empty():
            try:
-                img_path = img_queue.get(block=False)
+                img_path, text_list = img_queue.get(block=False)
            except Exception as e:
                # self.online_log.info('{0} [img_2_ocr_1] [queue empty]'.format(self.log_base))
                time.sleep(self.sleep_time_img_get)
@@ -797,6 +804,8 @@ class Command(BaseCommand, LoggerMixin):
                            json_data_1 = {
                                "file": file_data
                            }
+                            if len(text_list) > 0:
+                                json_data_1['text_list'] = text_list

                            start_time = time.time()
                            ocr_1_response = requests.post(url, json=json_data_1)
--- a/src/common/tools/pdf_to_img.py
View file @4383b4d
+++ b/src/common/tools/pdf_to_img.py
View file @4383b4d
@@ -31,6 +31,8 @@ class PDFHandler:
        self.xref_set = set()
        self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'}
        self.suffix = self.get_suffix(document_name)
+        self.is_ebank = False
+        self.page_text_list = []

    def get_suffix(self, file_name):
        if file_name is None:
@@ -46,6 +48,30 @@ class PDFHandler:
    def get_img_save_path(self, pno, img_index=0, ext='png'):
        """Build the output path of one image inside ``self.img_dir_path``.

        :param pno: page number used in the file name.
        :param img_index: index of the image on that page (default 0).
        :param ext: file extension without the leading dot (default 'png').
        :return: ``<img_dir_path>/page_<pno>_img_<img_index>.<ext>``.
        """
        file_name = 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
        return os.path.join(self.img_dir_path, file_name)

+    def rebuild_bbox(self, src_width, src_height, pno):
+        """Scale the cached text boxes of page ``pno`` to the rendered image size.
+
+        Reads the ``width``, ``height`` and ``text`` entries of
+        ``self.page_text_list[pno]``, rescales every ``(x0, y0, x1, y1)`` box by
+        ``src_width / width`` and ``src_height / height`` and stores the result
+        under ``rebuild_text`` as ``((x0, y0, x1, y0, x1, y1, x0, y1), text)``
+        tuples (the four corners, clockwise from top-left).  On success the
+        three source entries are removed from the page dict.
+
+        This is best-effort: any failure (page index out of range, missing
+        key, zero page size, malformed box) is logged and the page entry is
+        left exactly as it was, without ``rebuild_text``.
+
+        :param src_width: width of the rendered image.
+        :param src_height: height of the rendered image.
+        :param pno: index into ``self.page_text_list``.
+        :return: None.
+        """
+        import logging
+
+        try:
+            page_info = self.page_text_list[pno]
+            width_scale = src_width / page_info['width']
+            height_scale = src_height / page_info['height']
+
+            rebuild_text_list = []
+            for (x0, y0, x1, y1), text in page_info['text']:
+                left, top = x0 * width_scale, y0 * height_scale
+                right, bottom = x1 * width_scale, y1 * height_scale
+                rebuild_text_list.append(
+                    ((left, top, right, top, right, bottom, left, bottom), text)
+                )
+        except Exception:
+            # Deliberately non-fatal: the page is still usable without text
+            # boxes, but the failure must not disappear silently.
+            logging.getLogger(__name__).warning(
+                'rebuild_bbox failed for page %s', pno, exc_info=True)
+            return
+
+        # Commit only after everything was computed, so a failure above never
+        # leaves the page entry half-consumed.
+        for key in ('width', 'height', 'text'):
+            page_info.pop(key, None)
+        page_info['rebuild_text'] = rebuild_text_list
+
+
    def page_to_png(self, page):
        if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500:
            pm = page.getPixmap(matrix=trans_1, alpha=False)
@@ -54,6 +80,8 @@ class PDFHandler:
        img_save_path = self.get_img_save_path(page.number)
        pm.writePNG(img_save_path)
        self.img_path_list.append(img_save_path)
+        if self.is_ebank:
+            self.rebuild_bbox(pm.width, pm.height, page.number)

    @staticmethod
    def getimage(pix):
@@ -207,6 +235,36 @@ class PDFHandler:
            page = pdf.loadPage(pno)
            self.page_to_png(page)

+    def check_ebank(self, pdf):
+        """Decide whether ``pdf`` carries enough embedded text and cache it.
+
+        Walks every page, collecting ``(bbox, text)`` for each span whose text
+        is not blank.  After each page the running total of collected spans
+        must be at least ``5 * pages_seen``; otherwise the method returns
+        early with ``self.is_ebank`` False and ``self.page_text_list`` empty.
+        If every page passes, ``self.is_ebank`` is set to True and
+        ``self.page_text_list`` holds one dict per page with the keys
+        ``width``, ``height`` and ``text``.
+
+        :param pdf: opened document exposing ``pageCount`` and ``loadPage``
+            (presumably a PyMuPDF document -- confirm against the caller).
+        :return: None; the result is stored on the instance.
+        """
+        # Reset first so a sparse document never inherits the flag or the
+        # text of an earlier call on the same handler.
+        self.is_ebank = False
+        self.page_text_list = []
+
+        page_text_list = []
+        text_item_sum = 0
+        for pno in range(pdf.pageCount):
+            page = pdf.loadPage(pno)
+            text = page.getTextPage().extractDICT()
+            text_list = []
+            for block in text.get('blocks') or ():
+                # A block may lack 'lines' (e.g. an image block); treat it as
+                # carrying no text instead of failing on None.
+                for line in block.get('lines') or ():
+                    for span in line.get('spans') or ():
+                        char = span.get('text')
+                        if not char or not char.strip():
+                            continue
+                        text_list.append((span.get('bbox'), char))
+            text_item_sum += len(text_list)
+            # Cumulative check: fewer than 5 spans per page seen so far means
+            # the document is not handled as text-based.
+            if text_item_sum < (pno + 1) * 5:
+                return
+            page_text_list.append(
+                {
+                    'width': text.get('width'),
+                    'height': text.get('height'),
+                    'text': text_list,
+                }
+            )
+        self.is_ebank = True
+        self.page_text_list = page_text_list
+
+
    def extract_image(self, max_img_count=None):
        self.img_path_list = []
        self.xref_set = set()
@@ -221,12 +279,13 @@ class PDFHandler:
                if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
                    self.img_count = pdf.pageCount
                    return
+                self.check_ebank(pdf)
                for pno in range(pdf.pageCount):
                    il = pdf.getPageImageList(pno)  # 获取页面图片对象
                    # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)

                    # 1.页面图片对象数目为0时，保存整个页面为png图片
-                    if len(il) == 0:
+                    if self.is_ebank or len(il) == 0:
                        page = pdf.loadPage(pno)
                        self.page_to_png(page)
                    # 2.页面图片对象数目为1时：