add edms retry

周伟奇
Showing 3 changed files with 165 additions and 124 deletions
src/apps/doc/consts.py
src/apps/doc/exceptions.py
src/apps/doc/management/commands/doc_ocr_process.py
--- a/src/apps/doc/consts.py
View file @e570371
+++ b/src/apps/doc/consts.py
View file @e570371
@@ -35,6 +35,8 @@ DEALER_CODE_META_FIELD_id = 13
 BUSINESS_TYPE_META_FIELD_id = 93
 DEALER_CODE = 'ocr_situ_group'

+RETRY_TIMES = 3
+
 # ---------银行流水模板相关--------------------------------------------------------------------------------------------

 TRANS_MAP = {
--- a/src/apps/doc/exceptions.py 0 → 100644
View file @e570371
+++ b/src/apps/doc/exceptions.py 0 → 100644
View file @e570371
+class EDMSException(Exception):
+    pass
--- a/src/apps/doc/management/commands/doc_ocr_process.py
View file @e570371
+++ b/src/apps/doc/management/commands/doc_ocr_process.py
View file @e570371
@@ -19,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
 from apps.doc.named_enum import KeywordsType
 from apps.doc import consts
 from apps.doc.ocr.edms import EDMS, rh
+from apps.doc.exceptions import EDMSException


 class Command(BaseCommand, LoggerMixin):
@@ -72,11 +73,21 @@ class Command(BaseCommand, LoggerMixin):
        os.makedirs(doc_data_path, exist_ok=True)
        pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
        if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
-            self.edms.download(pdf_path, doc.metadata_version_id)
+            for times in range(consts.RETRY_TIMES):
+                try:
+                    self.edms.download(pdf_path, doc.metadata_version_id)
+                except Exception as e:
+                    self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] '
+                                          '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
+                    edms_exc = str(e)
+                else:
+                    break
+            else:
+                raise EDMSException(edms_exc)

        excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id))
        src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
-        self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
+        self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
            self.log_base, business_type, doc.id, pdf_path))
        return doc_data_path, excel_path, src_excel_path, pdf_path

@@ -177,105 +188,21 @@ class Command(BaseCommand, LoggerMixin):
        else:
            skip_img.append(self.parse_img_path(img_path))

-    # async def fetch_ocr_result(self, url, json_data):
-    #     async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
-    #         async with session.post(url, json=json_data) as response:
-    #             if response.status == 200:
-    #                 return await response.json()
-    #
-    # async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary):
-    #     with open(img_path, 'rb') as f:
-    #         base64_data = base64.b64encode(f.read())
-    #         # 获取解码后的base64值
-    #         file_data = base64_data.decode()
-    #     json_data_1 = {
-    #         "file": file_data
-    #     }
-    #     ocr_res_1 = await self.fetch_ocr_result(self.ocr_url_1, json_data_1)
-    #     if ocr_res_1 is None:
-    #         raise Exception('ocr 1 error, img_path={0}'.format(img_path))
-    #     else:
-    #         self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
-    #             self.log_base, img_path, ocr_res_1))
-    #
-    #         if ocr_res_1.get('code') == 1:
-    #             ocr_data = ocr_res_1.get('data', {})
-    #             classify = ocr_data.get('classify')
-    #             if classify is None:
-    #                 return
-    #             elif classify in consts.OTHER_CLASSIFY_SET:  # 其他类
-    #                 return
-    #             elif classify in consts.LICENSE_CLASSIFY_SET_1:  # 证件1
-    #                 self.license1_process(ocr_data, license_summary, classify)
-    #             elif classify in consts.LICENSE_CLASSIFY_SET_2:  # 证件2
-    #                 pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
-    #                 json_data_2 = {
-    #                     "pid": str(pid),
-    #                     "key": conf.OCR_KEY,
-    #                     "secret": conf.OCR_SECRET,
-    #                     "file": file_data
-    #                 }
-    #                 ocr_res_2 = await self.fetch_ocr_result(self.ocr_url_2, json_data_2)
-    #                 if ocr_res_2 is None:
-    #                     raise Exception('ocr 2 error, img_path={0}'.format(img_path))
-    #                 else:
-    #                     # 识别结果
-    #                     self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
-    #                         self.log_base, img_path, ocr_res_2))
-    #                     self.license2_process(ocr_res_2, license_summary, pid, classify)
-    #             else:  # 流水处理
-    #                 self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify)
-
-    def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
-        # # 流水
-        # res = {
-        #     'code': 1,
-        #     'msg': 'success',
-        #     'data': {
-        #         'classify': 0,
-        #         'confidence': 0.999,
-        #         'data': [
-        #             {
-        #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
-        #                 'cells': []
-        #             },
-        #             {
-        #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
-        #                 'cells': []
-        #             }
-        #         ]
-        #     }
-        # }
-        #
-        # # 证件-1
-        # res = {
-        #     'code': 1,
-        #     'msg': 'success',
-        #     'data': {
-        #         'classify': 0,
-        #         'confidence': 0.999,
-        #         'data': [
-        #             {
-        #                 'cn_key': 'value',
-        #                 'cn_key': 'value',
-        #             },
-        #             {
-        #                 'cn_key': 'value',
-        #                 'cn_key': 'value',
-        #             },
-        #         ]
-        #     }
-        # }
-        #
-        # # 证件-2 or 其他类
-        # res = {
-        #     'code': 1,
-        #     'msg': 'success',
-        #     'data': {
-        #         'classify': 0,
-        #         'confidence': 0.999,
-        #     }
-        # }
+    @staticmethod
+    async def fetch_ocr_1_result(url, json_data):
+        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
+            async with session.post(url, json=json_data) as response:
+                if response.status == 200:
+                    return await response.json()
+
+    @staticmethod
+    async def fetch_ocr_2_result(url, json_data):
+        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
+            async with session.post(url, data=json_data) as response:
+                if response.status == 200:
+                    return await response.json()
+
+    async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
        with open(img_path, 'rb') as f:
            base64_data = base64.b64encode(f.read())
            # 获取解码后的base64值
@@ -283,9 +210,10 @@ class Command(BaseCommand, LoggerMixin):
        json_data_1 = {
            "file": file_data
        }
-        response_1 = requests.post(self.ocr_url_1, json=json_data_1)
-        if response_1.status_code == 200:
-            ocr_res_1 = response_1.json()
+        ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1)
+        if ocr_res_1 is None:
+            raise Exception('ocr 1 error, img_path={0}'.format(img_path))
+        else:
            self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
                self.log_base, img_path, ocr_res_1))

@@ -308,26 +236,121 @@ class Command(BaseCommand, LoggerMixin):
                        "secret": conf.OCR_SECRET,
                        "file": file_data
                    }
-                    response_2 = requests.post(self.ocr_url_2, data=json_data_2)
-                    if response_2.status_code == 200:
+                    ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
+                    if ocr_res_2 is None:
+                        raise Exception('ocr 2 error, img_path={0}'.format(img_path))
+                    else:
                        # 识别结果
-                        ocr_res_2 = response_2.json()
                        self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
                            self.log_base, img_path, ocr_res_2))
                        self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
-                    else:
-                        raise Exception('ocr 2 error, img_path={0}'.format(img_path))
                else:  # 流水处理
                    self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
            else:
                skip_img.append(self.parse_img_path(img_path))
-        else:
-            raise Exception('ocr 1 error, img_path={0}'.format(img_path))
+
+    # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
+    #     # # 流水
+    #     # res = {
+    #     #     'code': 1,
+    #     #     'msg': 'success',
+    #     #     'data': {
+    #     #         'classify': 0,
+    #     #         'confidence': 0.999,
+    #     #         'data': [
+    #     #             {
+    #     #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
+    #     #                 'cells': []
+    #     #             },
+    #     #             {
+    #     #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
+    #     #                 'cells': []
+    #     #             }
+    #     #         ]
+    #     #     }
+    #     # }
+    #     #
+    #     # # 证件-1
+    #     # res = {
+    #     #     'code': 1,
+    #     #     'msg': 'success',
+    #     #     'data': {
+    #     #         'classify': 0,
+    #     #         'confidence': 0.999,
+    #     #         'data': [
+    #     #             {
+    #     #                 'cn_key': 'value',
+    #     #                 'cn_key': 'value',
+    #     #             },
+    #     #             {
+    #     #                 'cn_key': 'value',
+    #     #                 'cn_key': 'value',
+    #     #             },
+    #     #         ]
+    #     #     }
+    #     # }
+    #     #
+    #     # # 证件-2 or 其他类
+    #     # res = {
+    #     #     'code': 1,
+    #     #     'msg': 'success',
+    #     #     'data': {
+    #     #         'classify': 0,
+    #     #         'confidence': 0.999,
+    #     #     }
+    #     # }
+    #     with open(img_path, 'rb') as f:
+    #         base64_data = base64.b64encode(f.read())
+    #         # 获取解码后的base64值
+    #         file_data = base64_data.decode()
+    #     json_data_1 = {
+    #         "file": file_data
+    #     }
+    #     response_1 = requests.post(self.ocr_url_1, json=json_data_1)
+    #     if response_1.status_code == 200:
+    #         ocr_res_1 = response_1.json()
+    #         self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
+    #             self.log_base, img_path, ocr_res_1))
+    #
+    #         if ocr_res_1.get('code') == 1:
+    #             ocr_data = ocr_res_1.get('data', {})
+    #             classify = ocr_data.get('classify')
+    #             if classify is None:
+    #                 skip_img.append(self.parse_img_path(img_path))
+    #                 return
+    #             elif classify in consts.OTHER_CLASSIFY_SET:  # 其他类
+    #                 skip_img.append(self.parse_img_path(img_path))
+    #                 return
+    #             elif classify in consts.LICENSE_CLASSIFY_SET_1:  # 证件1
+    #                 self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
+    #             elif classify in consts.LICENSE_CLASSIFY_SET_2:  # 证件2
+    #                 pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
+    #                 json_data_2 = {
+    #                     "pid": str(pid),
+    #                     "key": conf.OCR_KEY,
+    #                     "secret": conf.OCR_SECRET,
+    #                     "file": file_data
+    #                 }
+    #                 response_2 = requests.post(self.ocr_url_2, data=json_data_2)
+    #                 if response_2.status_code == 200:
+    #                     # 识别结果
+    #                     ocr_res_2 = response_2.json()
+    #                     self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
+    #                         self.log_base, img_path, ocr_res_2))
+    #                     self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
+    #                 else:
+    #                     raise Exception('ocr 2 error, img_path={0}'.format(img_path))
+    #             else:  # 流水处理
+    #                 self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
+    #         else:
+    #             skip_img.append(self.parse_img_path(img_path))
+    #     else:
+    #         raise Exception('ocr 1 error, img_path={0}'.format(img_path))

    @staticmethod
    def parse_img_path(img_path):
        img_name, _ = os.path.splitext(os.path.basename(img_path))
-        return img_name[5], img_name[11]
+        return int(img_name[5])+1, int(img_name[11])+1

    @staticmethod
    def get_most(value_list):
@@ -520,14 +543,14 @@ class Command(BaseCommand, LoggerMixin):
                # wb = Workbook()

                # 4.1 获取OCR结果
-                # loop = asyncio.get_event_loop()
-                # tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary)
-                #          for img_path in pdf_handler.img_path_list]
-                # loop.run_until_complete(asyncio.wait(tasks))
+                loop = asyncio.get_event_loop()
+                tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
+                         for img_path in pdf_handler.img_path_list]
+                loop.run_until_complete(asyncio.wait(tasks))
                # loop.close()

-                for img_path in pdf_handler.img_path_list:
-                    self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
+                # for img_path in pdf_handler.img_path_list:
+                #     self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)

                self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
                                      '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary,
@@ -545,23 +568,37 @@ class Command(BaseCommand, LoggerMixin):
                wb.save(src_excel_path)
                wb.rebuild(merged_bs_summary, license_summary, skip_img)
                wb.save(excel_path)
+            except EDMSException as e:
+                self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] '
+                                       '[err={3}]'.format(self.log_base, business_type, doc.id, e))
            except Exception as e:
                doc.status = DocStatus.PROCESS_FAILED.value
                doc.save()
-                self.cronjob_log.error('{0} [process failed] [business_type={1}] [doc_id={2}] [err={3}]'.format(
-                    self.log_base, business_type, doc.id, e))
+                self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] '
+                                       '[err={3}]'.format(self.log_base, business_type, doc.id, e))
            else:
                try:
                    # 5.上传至EDMS
-                    self.edms.upload(excel_path, doc, business_type)
-                    # print('upload pass')
+                    for times in range(consts.RETRY_TIMES):
+                        try:
+                            self.edms.upload(excel_path, doc, business_type)
+                        except Exception as e:
+                            self.cronjob_log.warn(
+                                '{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] '
+                                '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
+                            edms_exc = str(e)
+                        else:
+                            break
+                    else:
+                        raise EDMSException(edms_exc)
                except Exception as e:
                    doc.status = DocStatus.UPLOAD_FAILED.value
                    doc.save()
                    end_time = time.time()
                    speed_time = int(end_time - start_time)
-                    self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] '
-                                           '[err={4}]'.format(self.log_base, business_type, doc.id, speed_time, e))
+                    self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] '
+                                           '[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id,
+                                                                               speed_time, e))
                    write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))

                else: