Merge branch 'feature/page_limit' into feature/0611

周伟奇
Showing 5 changed files with 60 additions and 6 deletions
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/models.py
src/apps/doc/named_enum.py
src/common/tools/mssql_script5.py
src/common/tools/pdf_to_img.py
--- a/src/apps/doc/management/commands/ocr_process.py
View file @647c603
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @647c603
@@ -23,7 +23,7 @@ from apps.doc.ocr.edms import EDMS, rh
 from apps.doc.named_enum import KeywordsType, FailureReason, WorkflowName, ProcessName, RequestTeam, RequestTrigger
 from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception, OCR4Exception
 from apps.doc.ocr.wb import BSWorkbook
-from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords, HILOCRResult, AFCOCRResult, HILOCRReport, AFCOCRReport, DDARecords, IDBCRecords
+from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords, HILOCRResult, AFCOCRResult, HILOCRReport, AFCOCRReport, DDARecords, IDBCRecords, Configs
 from celery_compare.tasks import compare
@@ -572,6 +572,11 @@ class Command(BaseCommand, LoggerMixin):
                    pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
                    pdf_handler = PDFHandler(pdf_path, img_save_path)
+                    max_count_obj = Configs.objects.filter(id=2).first()
+                    try:
+                        max_img_count = int(max_count_obj.value)
+                    except Exception as e:
+                        max_img_count = 500
                    for times in range(consts.RETRY_TIMES):
                        try:
@@ -584,7 +589,7 @@ class Command(BaseCommand, LoggerMixin):
                            self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
                                self.log_base, task_str, times))
                            start_time = time.time()
-                            pdf_handler.extract_image()
+                            pdf_handler.extract_image(max_img_count)
                            end_time = time.time()
                            speed_time = int(end_time - start_time)
                            self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
@@ -598,14 +603,34 @@ class Command(BaseCommand, LoggerMixin):
                    else:
                        raise Exception('download or pdf to img failed')
-                    img_count = len(pdf_handler.img_path_list)
+                    if pdf_handler.img_count == 0:
-                    if img_count == 0:
                        self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
                            self.log_base, task_str))
                        raise Exception('pdf img empty')
+                    elif pdf_handler.img_count >= max_img_count:
+                        self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
+                            self.log_base, task_str, pdf_handler.img_count))
+                        try:
+                            report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
+                            report_table.objects.create(
+                                case_number=doc.application_id,
+                                request_team=RequestTeam.get_value(doc.document_scheme, 0),
+                                request_trigger=RequestTrigger.get_value(doc.data_source, 0),
+                                input_file=doc.document_name,
+                                transaction_start=doc.start_time,
+                                transaction_end=time.time(),
+                                successful_at_this_level=False,
+                                failure_reason=FailureReason.IMG_LIMIT.value,
+                                process_name=ProcessName.ALL.value,
+                                notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
+                            )
+                        except Exception as e:
+                            self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
+                                self.log_base, traceback.format_exc()))
                    else:
                        with lock:
-                            todo_count_dict[task_str] = img_count
+                            todo_count_dict[task_str] = pdf_handler.img_count
                        for img_path in pdf_handler.img_path_list:
                            while img_queue.full():
                                self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
--- a/src/apps/doc/models.py
View file @647c603
+++ b/src/apps/doc/models.py
View file @647c603
@@ -282,6 +282,7 @@ class HILOCRReport(models.Model):
    process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称")
    total_fields = models.IntegerField(null=True, verbose_name='比对字段数目')
    workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程")
+    notes = models.CharField(null=True, max_length=2048, verbose_name="备注")
    class Meta:
        managed = False
@@ -301,6 +302,7 @@ class AFCOCRReport(models.Model):
    process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称")
    total_fields = models.IntegerField(null=True, verbose_name='比对字段数目')
    workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程")
+    notes = models.CharField(null=True, max_length=2048, verbose_name="备注")
    class Meta:
        managed = False
--- a/src/apps/doc/named_enum.py
View file @647c603
+++ b/src/apps/doc/named_enum.py
View file @647c603
@@ -42,6 +42,7 @@ class FailureReason(NamedEnum):
    PDF = (0, 'PDF处理失败')
    EXCEL = (1, '构建excel失败')
    EDMS = (2, 'EDMS上传失败')
+    IMG_LIMIT = (3, 'PDF图片过多')
 class ProcessName(NamedEnum):
--- a/src/common/tools/mssql_script5.py 0 → 100644
View file @647c603
+++ b/src/common/tools/mssql_script5.py 0 → 100644
View file @647c603
+import pyodbc
+hil_sql = "ALTER TABLE hil_ocr_report ADD notes nvarchar(2048)"
+afc_sql = "ALTER TABLE afc_ocr_report ADD notes nvarchar(2048)"
+hil_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True)
+hil_cursor = hil_cnxn.cursor()
+hil_cursor.execute(hil_sql)
+hil_cursor.close()
+hil_cnxn.close()
+afc_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True)
+afc_cursor = afc_cnxn.cursor()
+afc_cursor.execute(afc_sql)
+afc_cursor.close()
+afc_cnxn.close()
--- a/src/common/tools/pdf_to_img.py
View file @647c603
+++ b/src/common/tools/pdf_to_img.py
View file @647c603
@@ -26,6 +26,7 @@ class PDFHandler:
        self.path = path
        self.img_dir_path = img_dir_path
        self.img_path_list = []
+        self.img_count = 0
        self.xref_set = set()
    def get_img_save_path(self, pno, img_index=0, ext='png'):
@@ -192,11 +193,14 @@ class PDFHandler:
            page = pdf.loadPage(pno)
            self.page_to_png(page)
-    def extract_image(self):
+    def extract_image(self, max_img_count=None):
        self.img_path_list = []
        self.xref_set = set()
        os.makedirs(self.img_dir_path, exist_ok=True)
        with fitz.Document(self.path) as pdf:
+            if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
+                self.img_count = pdf.pageCount
+                return
            for pno in range(pdf.pageCount):
                il = pdf.getPageImageList(pno)  # 获取页面图片对象
                # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
@@ -220,3 +224,4 @@ class PDFHandler:
                # 3.页面图片对象数目大于1时，特殊处理
                else:
                    self.merge_il(pdf, pno, il)
+        self.img_count = len(self.img_path_list)