pdf page limit

周伟奇
Showing 4 changed files with 41 additions and 8 deletions
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/models.py
src/common/tools/mssql_script5.py
src/common/tools/pdf_to_img.py
--- a/src/apps/doc/management/commands/ocr_process.py
View file @24c87e7
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @24c87e7
@@ -23,7 +23,7 @@ from apps.doc.ocr.edms import EDMS, rh
 from apps.doc.named_enum import KeywordsType, FailureReason, WorkflowName, ProcessName, RequestTeam, RequestTrigger
 from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception, OCR4Exception
 from apps.doc.ocr.wb import BSWorkbook
-from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords, HILOCRResult, AFCOCRResult, HILOCRReport, AFCOCRReport, DDARecords, IDBCRecords
+from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords, HILOCRResult, AFCOCRResult, HILOCRReport, AFCOCRReport, DDARecords, IDBCRecords, Configs
 from celery_compare.tasks import compare


@@ -582,6 +582,11 @@ class Command(BaseCommand, LoggerMixin):
                    pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))

                    pdf_handler = PDFHandler(pdf_path, img_save_path)
+                    max_count_obj = Configs.objects.filter(id=2).first()
+                    try:
+                        max_img_count = int(max_count_obj.value)
+                    except Exception as e:
+                        max_img_count = 500

                    for times in range(consts.RETRY_TIMES):
                        try:
@@ -594,7 +599,7 @@ class Command(BaseCommand, LoggerMixin):
                            self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
                                self.log_base, task_str, times))
                            start_time = time.time()
-                            pdf_handler.extract_image()
+                            pdf_handler.extract_image(max_img_count)
                            end_time = time.time()
                            speed_time = int(end_time - start_time)
                            self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
@@ -608,14 +613,13 @@ class Command(BaseCommand, LoggerMixin):
                    else:
                        raise Exception('download or pdf to img failed')

-                    img_count = len(pdf_handler.img_path_list)
-                    if img_count == 0:
+                    if pdf_handler.img_count == 0:
                        self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
                            self.log_base, task_str))
                        raise Exception('pdf img empty')
-                    elif img_count >= max_img_count:
+                    elif pdf_handler.img_count >= max_img_count:
                        self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
-                            self.log_base, task_str, img_count))
+                            self.log_base, task_str, pdf_handler.img_count))

                        try:
                            report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
@@ -629,13 +633,14 @@ class Command(BaseCommand, LoggerMixin):
                                successful_at_this_level=False,
                                failure_reason=FailureReason.IMG_LIMIT.value,
                                process_name=ProcessName.ALL.value,
+                                notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
                            )
                        except Exception as e:
                            self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
                                self.log_base, traceback.format_exc()))
                    else:
                        with lock:
-                            todo_count_dict[task_str] = img_count
+                            todo_count_dict[task_str] = pdf_handler.img_count
                        for img_path in pdf_handler.img_path_list:
                            while img_queue.full():
                                self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
--- a/src/apps/doc/models.py
View file @24c87e7
+++ b/src/apps/doc/models.py
View file @24c87e7
@@ -282,6 +282,7 @@ class HILOCRReport(models.Model):
    process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称")
    total_fields = models.IntegerField(null=True, verbose_name='比对字段数目')
    workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程")
+    notes = models.CharField(null=True, max_length=2048, verbose_name="备注")

    class Meta:
        managed = False
@@ -301,6 +302,7 @@ class AFCOCRReport(models.Model):
    process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称")
    total_fields = models.IntegerField(null=True, verbose_name='比对字段数目')
    workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程")
+    notes = models.CharField(null=True, max_length=2048, verbose_name="备注")

    class Meta:
        managed = False
--- a/src/common/tools/mssql_script5.py 0 → 100644
View file @24c87e7
+++ b/src/common/tools/mssql_script5.py 0 → 100644
View file @24c87e7
+import pyodbc
+
+hil_sql = "ALTER TABLE hil_ocr_report ADD notes nvarchar(2048)"
+
+afc_sql = "ALTER TABLE afc_ocr_report ADD notes nvarchar(2048)"
+
+hil_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True)
+
+hil_cursor = hil_cnxn.cursor()
+hil_cursor.execute(hil_sql)
+
+hil_cursor.close()
+hil_cnxn.close()
+
+afc_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True)
+
+afc_cursor = afc_cnxn.cursor()
+afc_cursor.execute(afc_sql)
+
+afc_cursor.close()
+afc_cnxn.close()
--- a/src/common/tools/pdf_to_img.py
View file @24c87e7
+++ b/src/common/tools/pdf_to_img.py
View file @24c87e7
@@ -26,6 +26,7 @@ class PDFHandler:
        self.path = path
        self.img_dir_path = img_dir_path
        self.img_path_list = []
+        self.img_count = 0
        self.xref_set = set()

    def get_img_save_path(self, pno, img_index=0, ext='png'):
@@ -192,11 +193,14 @@ class PDFHandler:
            page = pdf.loadPage(pno)
            self.page_to_png(page)

-    def extract_image(self):
+    def extract_image(self, max_img_count=None):
        self.img_path_list = []
        self.xref_set = set()
        os.makedirs(self.img_dir_path, exist_ok=True)
        with fitz.Document(self.path) as pdf:
+            if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
+                self.img_count = pdf.pageCount
+                return
            for pno in range(pdf.pageCount):
                il = pdf.getPageImageList(pno)  # 获取页面图片对象
                # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
@@ -220,3 +224,4 @@ class PDFHandler:
                # 3.页面图片对象数目大于1时，特殊处理
                else:
                    self.merge_il(pdf, pno, il)
+        self.img_count = len(self.img_path_list)