init reocr

冯轩
Showing 2 changed files with 68 additions and 12 deletions
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/views.py
--- a/src/apps/doc/management/commands/ocr_process.py
View file @91ff815
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @91ff815
@@ -100,7 +100,7 @@ class Command(BaseCommand, LoggerMixin):
            if len(info_tuple) == 2:
                business_type, doc_id_str = info_tuple
            else:
-                business_type, doc_id_str, classify_1_str = info_tuple
+                business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple
            doc_id = int(doc_id_str)
            doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
            zip_doc = doc_class.objects.filter(id=doc_id).first()
@@ -124,7 +124,7 @@ class Command(BaseCommand, LoggerMixin):
        else:
            self.online_log.info('{0} [zip_2_pdfs] [db save end] [task_str={1}]'.format(
                self.log_base, task_str))
-            return zip_doc, business_type
+            return zip_doc, business_type, re_ocr_flag

    def get_doc_info(self, task_str, is_priority=False):
        try:
@@ -135,7 +135,7 @@ class Command(BaseCommand, LoggerMixin):
                classify_1_str = '0'
                rebuild_task_str = task_str
            else:
-                business_type, doc_id_str, classify_1_str = info_tuple
+                business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple
                rebuild_task_str = '{0}{1}{2}'.format(business_type, consts.SPLIT_STR, doc_id_str)
            doc_id = int(doc_id_str)
            doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
@@ -160,7 +160,7 @@ class Command(BaseCommand, LoggerMixin):
        else:
            self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format(
                self.log_base, task_str, is_priority))
-            return doc, business_type, rebuild_task_str, classify_1_str
+            return doc, business_type, rebuild_task_str, classify_1_str, re_ocr_flag

    # def pdf_download(self, doc, pdf_path):
    #     if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
@@ -1202,7 +1202,7 @@ class Command(BaseCommand, LoggerMixin):
            self.online_log.info('{0} [zip_2_pdfs] [task={1}]'.format(self.log_base, task_str))

            # 2. 修改doc状态: 识别中
-            zip_doc, business_type = self.get_zip_doc_info(task_str)
+            zip_doc, business_type, re_ocr_flag = self.get_zip_doc_info(task_str)
            if zip_doc is None:
                time.sleep(self.sleep_time_doc_get)
                continue
@@ -1287,7 +1287,7 @@ class Command(BaseCommand, LoggerMixin):
                    target_pdf_path = os.path.join(pdf_doc_data_path, '{0}.pdf'.format(pdf_doc.id))
                    shutil.move(pdf_path, target_pdf_path)

-                    pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0'])
+                    pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0', re_ocr_flag])
                    pdf_task_str_list.append(pdf_task_str)
                except Exception as e:
                    self.online_log.warn('{0} [zip_2_pdfs] [recreate pdf task failed] [task={1}] [pdf_path={2}]'
@@ -1336,7 +1336,7 @@ class Command(BaseCommand, LoggerMixin):

            try:
                # 1. 从队列获取文件信息
-                doc, business_type, task_str, classify_1_str = self.get_doc_info(task_str, is_priority)
+                doc, business_type, task_str, classify_1_str, re_ocr_flag = self.get_doc_info(task_str, is_priority)
                # 队列为空时的处理
                if doc is None:
                    time.sleep(self.sleep_time_doc_get)
@@ -1386,7 +1386,8 @@ class Command(BaseCommand, LoggerMixin):
                                self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
                                    self.log_base, task_str, times))
                                start_time = time.time()
-                                pdf_handler.extract_image(max_img_count)
+                                max_img_count_or_none = None if re_ocr_flag == 'Y' else max_img_count
+                                pdf_handler.extract_image(max_img_count_or_none)
                                end_time = time.time()
                                speed_time = int(end_time - start_time)
                                self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
@@ -1404,7 +1405,7 @@ class Command(BaseCommand, LoggerMixin):
                            self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
                                self.log_base, task_str))
                            raise Exception('pdf img empty')
-                        elif pdf_handler.img_count >= max_img_count:
+                        elif re_ocr_flag == 'N' and pdf_handler.img_count >= max_img_count:
                            self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
                                self.log_base, task_str, pdf_handler.img_count))

--- a/src/apps/doc/views.py
View file @91ff815
+++ b/src/apps/doc/views.py
View file @91ff815
@@ -589,6 +589,11 @@ invoice_download_args = {
    'application_ids':  fields.Str(required=True),
 }

+# Request arguments accepted by DocReOcrView.post; both are required integers.
+doc_reocr_args = dict(
+    doc_id=fields.Int(required=True),
+    application_entity=fields.Int(required=True),
+)
+

 class UploadDocView(GenericView, DocHandler):
    # permission_classes = []
@@ -698,7 +703,7 @@ class UploadDocView(GenericView, DocHandler):
                or document_name.endswith('.RAR'):
            is_zip = True

-        task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
+        task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N'])
        enqueue_res = rh.enqueue([task], is_priority, is_zip)
        self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
                              '[is_priority={3}] [enqueue_res={4}] [is_fsm={5} [classify_1={6}]]'.format(args, prefix, doc.id,
@@ -1249,7 +1254,7 @@ class DocView(DocGenericView, DocHandler):
                    break 

        # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
-        task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
+        task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N'])
        enqueue_res = rh.enqueue([task], is_priority)

        self.running_log.info('[mock doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
@@ -2068,4 +2073,54 @@ class InvoiceQueryInfoView(GenericView):
            return response2.ok(data=java_result)
        except Exception as e:
            self.running_log.error("invoice info request to java error, url:{0}, param:{1}, errorMsg:{2}".format(
-                url, json.dumps(body), traceback.format_exc()))
\ No newline at end of file
+                url, json.dumps(body), traceback.format_exc()))
+
+class DocReOcrView(GenericView, DocHandler):
+    """Re-submit an already stored document to the OCR task queue.
+
+    The enqueued task string ends with the re-OCR flag 'Y'; the regular
+    upload views in this file enqueue the same format with 'N'.
+    """
+    permission_classes = [IsAuthenticated]
+    authentication_classes = [OAuth2AuthenticationWithUser]
+
+    # required_scopes = ['write']
+
+    # Document name suffixes that send the task to the zip queue.
+    _ARCHIVE_SUFFIXES = ('.zip', '.rar', '.ZIP', '.RAR')
+
+    @staticmethod
+    def _match_classify_1(document_name, keyword_pairs):
+        """Return the classify_1 of the first keyword found in document_name.
+
+        keyword_pairs is an iterable of (keyword, classify_1) pairs; None is
+        treated as empty. Returns 0 when no keyword matches.
+        """
+        for keyword, classify_1 in keyword_pairs or ():
+            if keyword in document_name:
+                return classify_1
+        return 0
+
+    # Re-recognition (re-OCR) endpoint for an existing file
+    @use_args(doc_reocr_args, location='data')
+    def post(self, request, args):
+        """Enqueue a re-OCR task for the document identified by the request.
+
+        args carries 'doc_id' and 'application_entity' (see doc_reocr_args).
+        Raises Http404 when no document with that id exists; otherwise
+        returns response.ok() after the task has been enqueued.
+        """
+        # Imported locally so this block does not depend on the module's import list.
+        from django.http import Http404
+
+        application_entity = args.get('application_entity')
+        doc_id = args.get('doc_id')
+
+        doc_class, prefix = self.get_doc_class(application_entity)
+        doc = doc_class.objects.filter(id=doc_id).first()
+        if doc is None:
+            # .first() yields None for an unknown id; fail with 404 instead of
+            # an AttributeError on the attribute accesses below.
+            raise Http404('doc not found: doc_id={0}'.format(doc_id))
+
+        # 3. Choose which queue the task enters
+        is_priority = PriorityApplication.objects.filter(application_id=doc.application_id, on_off=True).exists()
+
+        classify_1 = 0
+        # E-contract: Econtract or OVP (FSM)
+        if doc.data_source == consts.DATA_SOURCE_LIST[2] or doc.data_source == consts.DATA_SOURCE_LIST[3]:
+            if doc.document_scheme == consts.DOC_SCHEME_LIST[1]:
+                classify_1 = self._match_classify_1(
+                    doc.document_name, consts.ECONTRACT_KEYWORDS_MAP.get(prefix))
+        # FSM contract: WEP/MSI/SC/SC2
+        elif doc.data_source == consts.DATA_SOURCE_LIST[0] and doc.document_scheme == consts.DOC_SCHEME_LIST[0]:
+            classify_1 = self._match_classify_1(
+                doc.document_name, consts.FSM_ECONTRACT_KEYWORDS_MAP.get(prefix))
+
+        is_zip = doc.document_name.endswith(self._ARCHIVE_SUFFIXES)
+
+        # task looks like 'AFC_11001_0_Y' / 'AFC_11001_0_N'; the trailing
+        # Y/N marks whether this is a re-OCR request (Y) or not (N).
+        task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'Y'])
+        enqueue_res = rh.enqueue([task], is_priority, is_zip)
+        self.running_log.info('[doc reocr success] [args={0}] [business_type={1}] [doc_id={2}] '
+                              '[is_priority={3}] [enqueue_res={4}] [classify_1={5}]'.format(
+                                  args, prefix, doc.id, is_priority, enqueue_res, classify_1))
+
+        return response.ok()
\ No newline at end of file