init

冯轩
Showing 2 changed files with 136 additions and 1 deletion
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/views.py
--- a/src/apps/doc/management/commands/ocr_process.py
View file @adb3724
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @adb3724
@@ -1504,6 +1504,134 @@ class Command(BaseCommand, LoggerMixin):
                                self.log_base, traceback.format_exc()))
                            # error_list.append(1)
                            # return
+                elif classify_1_str == '29': # e-invoice
+                    try:
+                        max_img_count = 500
+                        for times in range(consts.RETRY_TIMES):
+                            try:
+                                if doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
+                                    self.online_log.info('{0} [mo ni xia dan] [task={1}] [times={2}] '
+                                                         '[pdf_path={3}]'.format(self.log_base, task_str,
+                                                                                 times, pdf_path))
+                                elif os.path.exists(pdf_path):
+                                    self.online_log.info('{0} [pdf from zip file] [task={1}] [times={2}] '
+                                                         '[pdf_path={3}]'.format(self.log_base, task_str,
+                                                                                 times, pdf_path))
+                                else:
+                                    # self.edms.download(pdf_path, doc.metadata_version_id)
+                                    self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme,
+                                                       business_type)
+                                    self.online_log.info('{0} [ecm download success] [task={1}] [times={2}] '
+                                                         '[pdf_path={3}]'.format(self.log_base, task_str,
+                                                                                 times, pdf_path))
+                                # 3. Extract images from the PDF file
+                                self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
+                                    self.log_base, task_str, times))
+                                start_time = time.time()
+                                pdf_handler.extract_image_for_weixin(max_img_count) #沿用微信流程
+                                end_time = time.time()
+                                speed_time = int(end_time - start_time)
+                                self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
+                                    self.log_base, task_str, times, speed_time, pdf_handler.is_new_modify))
+                            except Exception as e:
+                                self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
+                                                      '[error={3}]'.format(self.log_base, task_str, times,
+                                                                           traceback.format_exc()))
+                            else:
+                                break
+                        else:
+                            raise Exception('download or pdf to img failed')
+                        if pdf_handler.img_count == 0:
+                            self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
+                                self.log_base, task_str))
+                            raise Exception('pdf img empty')
+                        elif pdf_handler.img_count >= max_img_count:
+                            self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
+                                self.log_base, task_str, pdf_handler.img_count))
+                            try:
+                                report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
+                                report_table.objects.create(
+                                    case_number=doc.application_id,
+                                    request_team=RequestTeam.get_value(doc.document_scheme, 0),
+                                    request_trigger=RequestTrigger.get_value(doc.data_source, 0),
+                                    input_file=doc.document_name,
+                                    transaction_start=doc.start_time,
+                                    transaction_end=doc.start_time,
+                                    successful_at_this_level=False,
+                                    failure_reason=FailureReason.IMG_LIMIT.value,
+                                    process_name=ProcessName.ALL.value,
+                                    notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
+                                )
+                            except Exception as e:
+                                self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
+                                    self.log_base, traceback.format_exc()))
+                            try:
+                                doc.status = DocStatus.PROCESS_FAILED.value
+                                doc.page_count = pdf_handler.page_count
+                                doc.save()
+                            except Exception as e:
+                                self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
+                                    self.log_base, traceback.format_exc()))
+                        else:
+                            try:
+                                if pdf_handler.is_e_pdf:
+                                    doc.metadata = pdf_handler.metadata if pdf_handler.metadata is None else \
+                                        json.dumps(pdf_handler.metadata)
+                                doc.page_count = pdf_handler.page_count
+                                doc.save()
+                            except Exception as e:
+                                self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
+                                    self.log_base, traceback.format_exc()))
+                            with lock:
+                                todo_count_dict[task_str] = pdf_handler.img_count
+                            self.online_log.info('{0} [pdf_2_img_2_queue] [{1}] [is_ebank={2}]'.format(
+                                self.log_base, task_str, pdf_handler.is_ebank
+                            ))
+                            for img_idx, img_path in enumerate(pdf_handler.img_path_list):
+                                while img_queue.full():
+                                    self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
+                                    time.sleep(self.sleep_time_img_put)
+                                if pdf_handler.is_e_weixin_bs:
+                                    try:
+                                        #self.online_log.info('{0} [pdf_2_img_2_queue] [img_idx={1}] [page_text_list={2}]'.format(self.log_base, img_idx, pdf_handler.page_text_list))
+                                        text_list = pdf_handler.page_text_list[img_idx].pop('rebuild_text')
+                                    except Exception as e:
+                                        text_list = []
+                                else:
+                                    text_list = []
+                                img_queue.put((business_type, img_path, text_list))
+                    except Exception as e:
+                        try:
+                            end_time = timezone.now()
+                            report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
+                            report_table.objects.create(
+                                case_number=doc.application_id,
+                                request_team=RequestTeam.get_value(doc.document_scheme, 0),
+                                request_trigger=RequestTrigger.get_value(doc.data_source, 0),
+                                input_file=doc.document_name,
+                                transaction_start=doc.start_time,
+                                transaction_end=end_time,
+                                successful_at_this_level=False,
+                                failure_reason=FailureReason.PDF.value,
+                                process_name=ProcessName.ALL.value,
+                            )
+                        except Exception as e:
+                            self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
+                                self.log_base, traceback.format_exc()))
+                        try:
+                            doc.status = DocStatus.PROCESS_FAILED.value
+                            doc.page_count = pdf_handler.page_count
+                            doc.save()
+                            self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
+                                                  '[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
+                        except Exception as e:
+                            self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
+                                self.log_base, traceback.format_exc()))
                else:  # e-contract or or e-fsm-contract or e-hmh
                    try:
                        # pdf下载 处理 图片存储 识别
--- a/src/apps/doc/views.py
View file @adb3724
+++ b/src/apps/doc/views.py
View file @adb3724
@@ -692,7 +692,10 @@ class UploadDocView(GenericView, DocHandler):
                if keyword in document_name:
                    classify_1 = classify_1_tmp
                    break 
+        if classify_1 == 0 and (document_name.startswith('dzfp_')):
+            classify_1 = 29
+            self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
        if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
                or document_name.endswith('.RAR'):
@@ -1248,6 +1251,10 @@ class DocView(DocGenericView, DocHandler):
                    classify_1 = classify_1_tmp
                    break 
+        if classify_1 == 0 and (document_name.startswith('dzfp_')):
+            classify_1 = 29
+            self.running_log.info('[dzfp process] [doc_id={0}]'.format(doc.id))
        # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
        task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
        enqueue_res = rh.enqueue([task], is_priority)