Merge branch 'feature/weixin-bs-2'

冯轩
Showing 4 changed files with 266 additions and 1 deletion
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/ocr/wb.py
src/apps/doc/views.py
src/common/tools/pdf_to_img.py
--- a/src/apps/doc/management/commands/ocr_process.py
View file @272692c
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @272692c
@@ -1287,7 +1287,10 @@ class Command(BaseCommand, LoggerMixin):
                    target_pdf_path = os.path.join(pdf_doc_data_path, '{0}.pdf'.format(pdf_doc.id))
                    shutil.move(pdf_path, target_pdf_path)
-                    pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0'])
+                    if '微信支付交易明细证明' in os.path.basename(pdf_path) or '微信流水' in os.path.basename(pdf_path): 
+                        pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '12'])
+                    else:
+                        pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0'])
                    pdf_task_str_list.append(pdf_task_str)
                except Exception as e:
                    self.online_log.warn('{0} [zip_2_pdfs] [recreate pdf task failed] [task={1}] [pdf_path={2}]'
@@ -1504,6 +1507,138 @@ class Command(BaseCommand, LoggerMixin):
                                self.log_base, traceback.format_exc()))
                            # error_list.append(1)
                            # return
+                elif classify_1_str == '12' or classify_1_str == '29': # weixin e-bs 或e-invoice 都走微信电子流水逻辑
+                    try:
+                        max_img_count = 500
+                        for times in range(consts.RETRY_TIMES):
+                            try:
+                                if doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
+                                    self.online_log.info('{0} [mo ni xia dan] [task={1}] [times={2}] '
+                                                         '[pdf_path={3}]'.format(self.log_base, task_str,
+                                                                                 times, pdf_path))
+                                elif os.path.exists(pdf_path):
+                                    self.online_log.info('{0} [pdf from zip file] [task={1}] [times={2}] '
+                                                         '[pdf_path={3}]'.format(self.log_base, task_str,
+                                                                                 times, pdf_path))
+                                else:
+                                    # self.edms.download(pdf_path, doc.metadata_version_id)
+                                    self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme,
+                                                       business_type)
+                                    self.online_log.info('{0} [ecm download success] [task={1}] [times={2}] '
+                                                         '[pdf_path={3}]'.format(self.log_base, task_str,
+                                                                                 times, pdf_path))
+                                # 3.PDF文件提取图片
+                                self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
+                                    self.log_base, task_str, times))
+                                start_time = time.time()
+                                pdf_handler.extract_image_for_weixin(max_img_count)
+                                end_time = time.time()
+                                speed_time = int(end_time - start_time)
+                                self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
+                                    self.log_base, task_str, times, speed_time, pdf_handler.is_new_modify))
+                            except Exception as e:
+                                self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
+                                                      '[error={3}]'.format(self.log_base, task_str, times,
+                                                                           traceback.format_exc()))
+                            else:
+                                break
+                        else:
+                            raise Exception('download or pdf to img failed')
+                        if pdf_handler.img_count == 0:
+                            self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
+                                self.log_base, task_str))
+                            raise Exception('pdf img empty')
+                        elif pdf_handler.img_count >= max_img_count:
+                            self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
+                                self.log_base, task_str, pdf_handler.img_count))
+                            try:
+                                report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
+                                report_table.objects.create(
+                                    case_number=doc.application_id,
+                                    request_team=RequestTeam.get_value(doc.document_scheme, 0),
+                                    request_trigger=RequestTrigger.get_value(doc.data_source, 0),
+                                    input_file=doc.document_name,
+                                    transaction_start=doc.start_time,
+                                    transaction_end=doc.start_time,
+                                    successful_at_this_level=False,
+                                    failure_reason=FailureReason.IMG_LIMIT.value,
+                                    process_name=ProcessName.ALL.value,
+                                    notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
+                                )
+                            except Exception as e:
+                                self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
+                                    self.log_base, traceback.format_exc()))
+                            try:
+                                doc.status = DocStatus.PROCESS_FAILED.value
+                                doc.page_count = pdf_handler.page_count
+                                doc.save()
+                            except Exception as e:
+                                self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
+                                    self.log_base, traceback.format_exc()))
+                        else:
+                            try:
+                                if pdf_handler.is_e_pdf:
+                                    doc.metadata = pdf_handler.metadata if pdf_handler.metadata is None else \
+                                        json.dumps(pdf_handler.metadata)
+                                doc.page_count = pdf_handler.page_count
+                                doc.save()
+                            except Exception as e:
+                                self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
+                                    self.log_base, traceback.format_exc()))
+                            with lock:
+                                todo_count_dict[task_str] = pdf_handler.img_count
+                            self.online_log.info('{0} [pdf_2_img_2_queue] [{1}] [is_ebank={2}]'.format(
+                                self.log_base, task_str, pdf_handler.is_ebank
+                            ))
+                            for img_idx, img_path in enumerate(pdf_handler.img_path_list):
+                                while img_queue.full():
+                                    self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
+                                    time.sleep(self.sleep_time_img_put)
+                                if pdf_handler.is_e_weixin_bs:
+                                    try:
+                                        #self.online_log.info('{0} [pdf_2_img_2_queue] [img_idx={1}] [page_text_list={2}]'.format(self.log_base, img_idx, pdf_handler.page_text_list))
+                                        text_list = pdf_handler.page_text_list[img_idx].pop('rebuild_text')
+                                    except Exception as e:
+                                        text_list = []
+                                else:
+                                    text_list = []
+                                img_queue.put((business_type, img_path, text_list))
+                    except Exception as e:
+                        try:
+                            end_time = timezone.now()
+                            report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
+                            report_table.objects.create(
+                                case_number=doc.application_id,
+                                request_team=RequestTeam.get_value(doc.document_scheme, 0),
+                                request_trigger=RequestTrigger.get_value(doc.data_source, 0),
+                                input_file=doc.document_name,
+                                transaction_start=doc.start_time,
+                                transaction_end=end_time,
+                                successful_at_this_level=False,
+                                failure_reason=FailureReason.PDF.value,
+                                process_name=ProcessName.ALL.value,
+                            )
+                        except Exception as e:
+                            self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
+                                self.log_base, traceback.format_exc()))
+                        try:
+                            doc.status = DocStatus.PROCESS_FAILED.value
+                            doc.page_count = pdf_handler.page_count
+                            doc.save()
+                            self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
+                                                  '[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
+                        except Exception as e:
+                            self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
+                                self.log_base, traceback.format_exc()))
+                            # error_list.append(1)
+                            # return
                else:  # e-contract or or e-fsm-contract or e-hmh
                    try:
                        # pdf下载 处理 图片存储 识别
@@ -1674,6 +1809,7 @@ class Command(BaseCommand, LoggerMixin):
                                json_data_1['text_list'] = text_list
                            start_time = time.time()
+                            self.online_log.info('{0} [ocr_1 api] [img={1}] [json_data_1={2}]'.format(self.log_base, img_path, json_data_1))
                            ocr_1_response = requests.post(url, json=json_data_1)
                            if ocr_1_response.status_code != 200:
                                raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code))
@@ -1684,6 +1820,7 @@ class Command(BaseCommand, LoggerMixin):
                        else:
                            self.online_log.info('{0} [ocr_1 start] [img={1}] [url={2}]'.format(self.log_base, img_path, url))
                            ocr_1_res = ocr_1_response.json()
+                            self.online_log.info('{0} [ocr_1 api res] [img={1}] [ocr_1_res={2}]'.format(self.log_base, img_path, ocr_1_res))
                            end_time = time.time()
                            speed_time = int(end_time - start_time)
                            self.online_log.info('{0} [ocr_1 success] [img={1}] [url={2}] [speed_time={3}]'.format(
--- a/src/apps/doc/ocr/wb.py
View file @272692c
+++ b/src/apps/doc/ocr/wb.py
View file @272692c
@@ -11,6 +11,8 @@ from openpyxl import Workbook
 from openpyxl.styles import PatternFill, numbers
 from openpyxl.utils import get_column_letter
 from apps.doc import consts
+import logging
+online_log = logging.getLogger('online')
 class BSWorkbook(Workbook):
@@ -562,6 +564,8 @@ class BSWorkbook(Workbook):
                borrow_cell = None if borrow_cell_idx is None or borrow_cell_idx >= length else rows[borrow_cell_idx]
                summary_cell_value = None if summary_cell is None else summary_cell.value
+                if summary_cell.value is not None:
+                    summary_cell_value = summary_cell_value.strip()
                date_cell_value = None if date_cell is None else date_cell.value
                amount_cell_value = None if amount_cell is None else amount_cell.value
                over_cell_value = None if over_cell is None else over_cell.value
@@ -638,6 +642,7 @@ class BSWorkbook(Workbook):
                # 3.2.提取信息、高亮
                # row = summary_cell.row
+                # online_log.info('[ti qu xin xi gao liang =========== >] [summary_cell_value={0}]'.format(summary_cell_value))
                if summary_cell is not None:
                    # 关键词1提取
                    if summary_cell_value in self.interest_keyword:
--- a/src/apps/doc/views.py
View file @272692c
+++ b/src/apps/doc/views.py
View file @272692c
@@ -693,6 +693,14 @@ class UploadDocView(GenericView, DocHandler):
                    classify_1 = classify_1_tmp
                    break 
+        if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
+            classify_1 = 12
+            self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
+        if classify_1 == 0 and (document_name.startswith("dzfp_")):
+            classify_1 = 0
+            self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
        if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
                or document_name.endswith('.RAR'):
@@ -1247,6 +1255,14 @@ class DocView(DocGenericView, DocHandler):
                if keyword in document_name:
                    classify_1 = classify_1_tmp
                    break 
+        if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
+            classify_1 = 12
+            self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
+        if classify_1 == 0 and (document_name.startswith("dzfp_")):
+            classify_1 = 0
+            self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
        # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
        task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
--- a/src/common/tools/pdf_to_img.py
View file @272692c
+++ b/src/common/tools/pdf_to_img.py
View file @272692c
@@ -69,6 +69,7 @@ class PDFHandler:
        self.suffix = self.get_suffix(document_name)
        self.is_ebank = False
        self.is_e_pdf = False
+        self.is_e_weixin_bs = False
        self.page_text_list = []
        self.pdf_info = {}
        self.img_path_pno_list = []
@@ -186,6 +187,8 @@ class PDFHandler:
        self.img_path_list.append(img_save_path)
        if self.is_ebank:
            self.rebuild_bbox(pm.width, pm.height, page.number)
+        if self.is_e_weixin_bs:
+            self.rebuild_bbox(pm.width, pm.height, page.number)
    @staticmethod
    def getimage(pix):
@@ -407,6 +410,57 @@ class PDFHandler:
        self.is_e_pdf = True
        self.page_text_list = page_text_list
+    def put_text(self, pdf):
+        """Collect the embedded text of every page of a WeChat statement PDF.
+
+        On success ``self.page_text_list`` receives one dict per page with the
+        keys ``width``, ``height``, ``rotation`` (number of quarter turns,
+        0-3) and ``text`` (a list of ``(bbox, text)`` tuples, one per
+        non-blank span), and both ``self.is_e_pdf`` and
+        ``self.is_e_weixin_bs`` are set to True.
+
+        The method returns early, leaving those attributes untouched, when a
+        page rotation is neither None nor an int multiple of 90, or when the
+        running total of text spans drops below five per page.
+
+        :param pdf: opened document exposing ``pageCount`` and ``loadPage``.
+        """
+        import sys
+
+        # Spans that the stdout codec cannot encode (special emoji) are
+        # skipped. The probe encodes without printing so that statement
+        # contents are not written to stdout.
+        stdout_encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
+        stdout_errors = getattr(sys.stdout, 'errors', None) or 'strict'
+        page_text_list = []
+        text_item_sum = 0
+        for pno in range(pdf.pageCount):
+            page = pdf.loadPage(pno)
+            if page.rotation is None:
+                rotation = 0
+            elif isinstance(page.rotation, int):
+                divisor, remainder = divmod(page.rotation, 90)
+                if remainder != 0:
+                    return
+                # Normalise to 0-3 quarter turns.
+                rotation = divmod(divisor, 4)[1]
+            else:
+                return
+            text = page.getTextPage().extractDICT()
+            text_list = []
+            # "or ()" tolerates entries that lack the nested key instead of
+            # failing on iteration over None.
+            for block in text.get('blocks') or ():
+                for line in block.get('lines') or ():
+                    for span in line.get('spans') or ():
+                        char = span.get('text') or ''
+                        if char.strip() == '':
+                            continue
+                        try:
+                            char.encode(stdout_encoding, stdout_errors)
+                        except UnicodeError:
+                            continue
+                        text_list.append((span.get('bbox'), char))
+            text_item_sum += len(text_list)
+            # Fewer than five spans per page on average so far: stop without
+            # marking the document as an electronic PDF.
+            if text_item_sum < (pno + 1) * 5:
+                return
+            page_text_list.append(
+                {
+                    'width': text.get('width'),
+                    'height': text.get('height'),
+                    'rotation': rotation,
+                    'text': text_list
+                }
+            )
+        self.is_e_pdf = True
+        self.is_e_weixin_bs = True
+        self.page_text_list = page_text_list
    def e_contract_process(self):
        os.makedirs(self.img_dir_path, exist_ok=True)
        with fitz.Document(self.path) as pdf:
@@ -473,6 +527,59 @@ class PDFHandler:
                        self.merge_il(pdf, pno, il)
        self.img_count = len(self.img_path_list)
+    def extract_image_for_weixin(self, max_img_count=None):
+        """Turn the source file into images for the WeChat statement flow.
+
+        An image input (suffix listed in ``self.img_suffixs``) is copied as
+        the single output image. Any other input is opened with ``fitz``:
+        the stored passwords are tried while the document reports itself
+        encrypted, ``self.metadata`` and ``self.page_count`` are recorded,
+        embedded text is gathered through ``self.put_text``, and each page is
+        rendered whole, has its single image object extracted, or is handed
+        to ``self.merge_il``.
+
+        :param max_img_count: optional int page limit; when the document has
+            at least this many pages, ``self.img_count`` is set to the page
+            count and no image is produced.
+        """
+        self.img_path_list = []
+        self.xref_set = set()
+        os.makedirs(self.img_dir_path, exist_ok=True)
+        if self.suffix in self.img_suffixs:
+            # Already an image: copy it straight to the output location.
+            copied_path = self.get_img_save_path(0, ext=self.suffix[1:])
+            shutil.copy(self.path, copied_path)
+            self.img_path_list.append(copied_path)
+            self.img_count = len(self.img_path_list)
+            return
+        with fitz.Document(self.path) as pdf:
+            # Decrypt: try each known password until the file is unlocked.
+            for password in self.pwd_list:
+                if not pdf.isEncrypted:
+                    break
+                pdf.authenticate(password)
+            self.metadata = pdf.metadata
+            self.page_count = pdf.pageCount
+            over_limit = isinstance(max_img_count, int) and pdf.pageCount >= max_img_count
+            if over_limit:
+                self.img_count = pdf.pageCount
+                return
+            self.put_text(pdf)
+            for page_no in range(pdf.pageCount):
+                # Image objects referenced by this page.
+                page_images = pdf.getPageImageList(page_no)
+                # 1. Electronic PDFs, e-bank statements and pages without any
+                #    image object: save the whole page as a PNG.
+                if self.is_e_pdf or self.is_ebank or len(page_images) == 0:
+                    self.page_to_png(pdf.loadPage(page_no))
+                    continue
+                # 3. More than one image object: special handling.
+                if len(page_images) != 1:
+                    self.merge_il(pdf, page_no, page_images)
+                    continue
+                # 2. Exactly one image object.
+                xref, smask, width, height, _, colorspace, _, _, _ = page_images[0]
+                if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
+                    # Small image (e.g. the stamp on an electronic bill):
+                    # save the whole page as a PNG.
+                    self.page_to_png(pdf.loadPage(page_no))
+                elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
+                    # Large image: render the page; the flag guards against
+                    # producing an oversized picture.
+                    self.is_new_modify = 1
+                    is_big_img = width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1]
+                    self.page_to_png(pdf.loadPage(page_no), is_big_img=is_big_img)
+                elif xref not in self.xref_set:
+                    # Medium image not seen before: extract the object itself.
+                    self.extract_single_image(pdf, xref, smask, colorspace, page_no)
+        self.img_count = len(self.img_path_list)
    def extract_page_image(self):
        self.img_path_list = []
        self.xref_set = set()