Add asyncio: send OCR requests concurrently via aiohttp

周伟奇
Showing 2 changed files with 117 additions and 78 deletions
.gitignore
src/apps/doc/management/commands/doc_ocr_process.py
--- a/.gitignore
View file @6a5899f
+++ b/.gitignore
View file @6a5899f
@@ -33,6 +33,4 @@ data/*
 # 脚本
 src/*.sh

-test.py
-ocr_test.py
-ocr_test_2.py
\ No newline at end of file
+test*
\ No newline at end of file
--- a/src/apps/doc/management/commands/doc_ocr_process.py
View file @6a5899f
+++ b/src/apps/doc/management/commands/doc_ocr_process.py
View file @6a5899f
@@ -175,14 +175,12 @@ class Command(BaseCommand, LoggerMixin):
                            (field_dict.get('chn_key', ''), field_dict.get('value', '')))
                    license_summary.setdefault(classify, []).append(res_list)

-    # async def fetch_ocr_result(self, img_path):
-    #     async with aiohttp.ClientSession(
-    #             headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False)
-    #     ) as session:
-    #         json_data = self.get_ocr_json(img_path)
-    #         async with session.post(self.ocr_url, json=json_data) as response:
-    #             return await response.json()
-    #
+    async def fetch_ocr_result(self, url, json_data):
+        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
+            async with session.post(url, json=json_data) as response:
+                if response.status == 200:
+                    return await response.json()
+
    # async def img_2_ocr_2_wb(self, wb, img_path, summary):
    #     res = await self.fetch_ocr_result(img_path)
    #     self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
@@ -190,56 +188,7 @@ class Command(BaseCommand, LoggerMixin):
    #     img_name = os.path.basename(img_path)
    #     self.append_sheet(wb, sheets_list, img_name, summary)

-    def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary):
-        # # 流水
-        # res = {
-        #     'code': 1,
-        #     'msg': 'success',
-        #     'data': {
-        #         'classify': 0,
-        #         'confidence': 0.999,
-        #         'data': [
-        #             {
-        #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
-        #                 'cells': []
-        #             },
-        #             {
-        #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
-        #                 'cells': []
-        #             }
-        #         ]
-        #     }
-        # }
-        #
-        # # 证件-1
-        # res = {
-        #     'code': 1,
-        #     'msg': 'success',
-        #     'data': {
-        #         'classify': 0,
-        #         'confidence': 0.999,
-        #         'data': [
-        #             {
-        #                 'cn_key': 'value',
-        #                 'cn_key': 'value',
-        #             },
-        #             {
-        #                 'cn_key': 'value',
-        #                 'cn_key': 'value',
-        #             },
-        #         ]
-        #     }
-        # }
-        #
-        # # 证件-2 or 其他类
-        # res = {
-        #     'code': 1,
-        #     'msg': 'success',
-        #     'data': {
-        #         'classify': 0,
-        #         'confidence': 0.999,
-        #     }
-        # }
+    async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary):
        with open(img_path, 'rb') as f:
            base64_data = base64.b64encode(f.read())
            # 获取解码后的base64值
@@ -247,9 +196,10 @@ class Command(BaseCommand, LoggerMixin):
        json_data_1 = {
            "file": file_data
        }
-        response_1 = requests.post(self.ocr_url_1, json=json_data_1)
-        if response_1.status_code == 200:
-            ocr_res_1 = response_1.json()
+        ocr_res_1 = await self.fetch_ocr_result(self.ocr_url_1, json_data_1)
+        if ocr_res_1 is None:
+            raise Exception('ocr 1 error, img_path={0}'.format(img_path))
+        else:
            self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
                self.log_base, img_path, ocr_res_1))

@@ -270,22 +220,112 @@ class Command(BaseCommand, LoggerMixin):
                        "secret": conf.OCR_SECRET,
                        "file": file_data
                    }
-                    response_2 = requests.post(self.ocr_url_2, data=json_data_2)
-                    if response_2.status_code == 200:
+                    ocr_res_2 = await self.fetch_ocr_result(self.ocr_url_2, json_data_2)
+                    if ocr_res_2 is None:
+                        raise Exception('ocr 2 error, img_path={0}'.format(img_path))
+                    else:
                        # 识别结果
-                        ocr_res_2 = response_2.json()
                        self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
                            self.log_base, img_path, ocr_res_2))
                        self.license2_process(ocr_res_2, license_summary, pid, classify)
-                    else:
-                        raise Exception('ocr 2 error, img_path={0}'.format(img_path))
                else:  # 流水处理
                    self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify)
-            else:
-                pass
-        else:
-            raise Exception('ocr 1 error, img_path={0}'.format(img_path))

+    # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary):
+    #     # # 流水
+    #     # res = {
+    #     #     'code': 1,
+    #     #     'msg': 'success',
+    #     #     'data': {
+    #     #         'classify': 0,
+    #     #         'confidence': 0.999,
+    #     #         'data': [
+    #     #             {
+    #     #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
+    #     #                 'cells': []
+    #     #             },
+    #     #             {
+    #     #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
+    #     #                 'cells': []
+    #     #             }
+    #     #         ]
+    #     #     }
+    #     # }
+    #     #
+    #     # # 证件-1
+    #     # res = {
+    #     #     'code': 1,
+    #     #     'msg': 'success',
+    #     #     'data': {
+    #     #         'classify': 0,
+    #     #         'confidence': 0.999,
+    #     #         'data': [
+    #     #             {
+    #     #                 'cn_key': 'value',
+    #     #                 'cn_key': 'value',
+    #     #             },
+    #     #             {
+    #     #                 'cn_key': 'value',
+    #     #                 'cn_key': 'value',
+    #     #             },
+    #     #         ]
+    #     #     }
+    #     # }
+    #     #
+    #     # # 证件-2 or 其他类
+    #     # res = {
+    #     #     'code': 1,
+    #     #     'msg': 'success',
+    #     #     'data': {
+    #     #         'classify': 0,
+    #     #         'confidence': 0.999,
+    #     #     }
+    #     # }
+    #     with open(img_path, 'rb') as f:
+    #         base64_data = base64.b64encode(f.read())
+    #         # 获取解码后的base64值
+    #         file_data = base64_data.decode()
+    #     json_data_1 = {
+    #         "file": file_data
+    #     }
+    #     response_1 = requests.post(self.ocr_url_1, json=json_data_1)
+    #     if response_1.status_code == 200:
+    #         ocr_res_1 = response_1.json()
+    #         self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
+    #             self.log_base, img_path, ocr_res_1))
+    #
+    #         if ocr_res_1.get('code') == 1:
+    #             ocr_data = ocr_res_1.get('data', {})
+    #             classify = ocr_data.get('classify')
+    #             if classify is None:
+    #                 return
+    #             elif classify in consts.OTHER_CLASSIFY_SET:  # 其他类
+    #                 return
+    #             elif classify in consts.LICENSE_CLASSIFY_SET_1:  # 证件1
+    #                 self.license1_process(ocr_data, license_summary, classify)
+    #             elif classify in consts.LICENSE_CLASSIFY_SET_2:  # 证件2
+    #                 pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
+    #                 json_data_2 = {
+    #                     "pid": str(pid),
+    #                     "key": conf.OCR_KEY,
+    #                     "secret": conf.OCR_SECRET,
+    #                     "file": file_data
+    #                 }
+    #                 response_2 = requests.post(self.ocr_url_2, data=json_data_2)
+    #                 if response_2.status_code == 200:
+    #                     # 识别结果
+    #                     ocr_res_2 = response_2.json()
+    #                     self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
+    #                         self.log_base, img_path, ocr_res_2))
+    #                     self.license2_process(ocr_res_2, license_summary, pid, classify)
+    #                 else:
+    #                     raise Exception('ocr 2 error, img_path={0}'.format(img_path))
+    #             else:  # 流水处理
+    #                 self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify)
+    #         else:
+    #             pass
+    #     else:
+    #         raise Exception('ocr 1 error, img_path={0}'.format(img_path))

    @staticmethod
    def get_most(value_list):
@@ -475,13 +515,14 @@ class Command(BaseCommand, LoggerMixin):
                # wb = Workbook()

                # 4.1 获取OCR结果
-                # loop = asyncio.get_event_loop()
-                # tasks = [self.img_2_ocr_2_wb(wb, img_path, summary) for img_path in pdf_handler.img_path_list]
-                # loop.run_until_complete(asyncio.wait(tasks))
+                loop = asyncio.get_event_loop()
+                tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary)
+                         for img_path in pdf_handler.img_path_list]
+                loop.run_until_complete(asyncio.wait(tasks))
                # loop.close()

-                for img_path in pdf_handler.img_path_list:
-                    self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary)
+                # for img_path in pdf_handler.img_path_list:
+                #     self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary)

                self.cronjob_log.info('{0} [bs_summary={1}] [unknown_summary={2}] [license_summary={3}]'.format(
                    self.log_base, bs_summary, unknown_summary, license_summary))