merge

冯轩
Showing 3 changed files with 12 additions and 8 deletions
src/apps/doc/internal_urls.py
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/views.py
--- a/src/apps/doc/internal_urls.py
View file @160ac57
+++ b/src/apps/doc/internal_urls.py
View file @160ac57
@@ -10,4 +10,7 @@ urlpatterns = [
    path(r'invoice/downloadExcel', views.InvoiceExcelView.as_view()),
    path(r'invoice/queryInfo', views.InvoiceQueryInfoView.as_view()),
    path(r'contract/v1', views.SEContractView.as_view()),
+    path(r'reocr', views.DocReOcrView.as_view()),
+    path(r'batch/reocr', views.BatchReOcrView.as_view()),
+    
 ]
--- a/src/apps/doc/management/commands/ocr_process.py
View file @160ac57
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @160ac57
@@ -100,7 +100,7 @@ class Command(BaseCommand, LoggerMixin):
            if len(info_tuple) == 2:
                business_type, doc_id_str = info_tuple
            else:
-                business_type, doc_id_str, classify_1_str = info_tuple
+                business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple
            doc_id = int(doc_id_str)
            doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
            zip_doc = doc_class.objects.filter(id=doc_id).first()
@@ -124,7 +124,7 @@ class Command(BaseCommand, LoggerMixin):
        else:
            self.online_log.info('{0} [zip_2_pdfs] [db save end] [task_str={1}]'.format(
                self.log_base, task_str))
-            return zip_doc, business_type
+            return zip_doc, business_type, re_ocr_flag

    def get_doc_info(self, task_str, is_priority=False):
        try:
@@ -135,7 +135,7 @@ class Command(BaseCommand, LoggerMixin):
                classify_1_str = '0'
                rebuild_task_str = task_str
            else:
-                business_type, doc_id_str, classify_1_str = info_tuple
+                business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple
                rebuild_task_str = '{0}{1}{2}'.format(business_type, consts.SPLIT_STR, doc_id_str)
            doc_id = int(doc_id_str)
            doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
@@ -160,7 +160,7 @@ class Command(BaseCommand, LoggerMixin):
        else:
            self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format(
                self.log_base, task_str, is_priority))
-            return doc, business_type, rebuild_task_str, classify_1_str
+            return doc, business_type, rebuild_task_str, classify_1_str, re_ocr_flag

    # def pdf_download(self, doc, pdf_path):
    #     if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
@@ -1202,7 +1202,7 @@ class Command(BaseCommand, LoggerMixin):
            self.online_log.info('{0} [zip_2_pdfs] [task={1}]'.format(self.log_base, task_str))

            # 2. 修改doc状态: 识别中
-            zip_doc, business_type = self.get_zip_doc_info(task_str)
+            zip_doc, business_type, re_ocr_flag = self.get_zip_doc_info(task_str)
            if zip_doc is None:
                time.sleep(self.sleep_time_doc_get)
                continue
@@ -1339,7 +1339,7 @@ class Command(BaseCommand, LoggerMixin):

            try:
                # 1. 从队列获取文件信息
-                doc, business_type, task_str, classify_1_str = self.get_doc_info(task_str, is_priority)
+                doc, business_type, task_str, classify_1_str, re_ocr_flag = self.get_doc_info(task_str, is_priority)
                # 队列为空时的处理
                if doc is None:
                    time.sleep(self.sleep_time_doc_get)
@@ -1389,7 +1389,8 @@ class Command(BaseCommand, LoggerMixin):
                                self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
                                    self.log_base, task_str, times))
                                start_time = time.time()
-                                pdf_handler.extract_image(max_img_count)
+                                max_img_count_or_none = None if re_ocr_flag == 'Y' else max_img_count
+                                pdf_handler.extract_image(max_img_count_or_none)
                                end_time = time.time()
                                speed_time = int(end_time - start_time)
                                self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
@@ -1407,7 +1408,7 @@ class Command(BaseCommand, LoggerMixin):
                            self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
                                self.log_base, task_str))
                            raise Exception('pdf img empty')
-                        elif pdf_handler.img_count >= max_img_count:
+                        elif re_ocr_flag == 'N' and pdf_handler.img_count >= max_img_count:
                            self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
                                self.log_base, task_str, pdf_handler.img_count))

--- a/src/apps/doc/views.py
View file @160ac57
+++ b/src/apps/doc/views.py
View file @160ac57