24c87e7e by 周伟奇

pdf page limit

1 parent 96178db6
...@@ -23,7 +23,7 @@ from apps.doc.ocr.edms import EDMS, rh ...@@ -23,7 +23,7 @@ from apps.doc.ocr.edms import EDMS, rh
23 from apps.doc.named_enum import KeywordsType, FailureReason, WorkflowName, ProcessName, RequestTeam, RequestTrigger 23 from apps.doc.named_enum import KeywordsType, FailureReason, WorkflowName, ProcessName, RequestTeam, RequestTrigger
24 from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception, OCR4Exception 24 from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception, OCR4Exception
25 from apps.doc.ocr.wb import BSWorkbook 25 from apps.doc.ocr.wb import BSWorkbook
26 from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords, HILOCRResult, AFCOCRResult, HILOCRReport, AFCOCRReport, DDARecords, IDBCRecords 26 from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords, HILOCRResult, AFCOCRResult, HILOCRReport, AFCOCRReport, DDARecords, IDBCRecords, Configs
27 from celery_compare.tasks import compare 27 from celery_compare.tasks import compare
28 28
29 29
...@@ -582,6 +582,11 @@ class Command(BaseCommand, LoggerMixin): ...@@ -582,6 +582,11 @@ class Command(BaseCommand, LoggerMixin):
582 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) 582 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
583 583
584 pdf_handler = PDFHandler(pdf_path, img_save_path) 584 pdf_handler = PDFHandler(pdf_path, img_save_path)
585 max_count_obj = Configs.objects.filter(id=2).first()
586 try:
587 max_img_count = int(max_count_obj.value)
588 except Exception as e:
589 max_img_count = 500
585 590
586 for times in range(consts.RETRY_TIMES): 591 for times in range(consts.RETRY_TIMES):
587 try: 592 try:
...@@ -594,7 +599,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -594,7 +599,7 @@ class Command(BaseCommand, LoggerMixin):
594 self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( 599 self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
595 self.log_base, task_str, times)) 600 self.log_base, task_str, times))
596 start_time = time.time() 601 start_time = time.time()
597 pdf_handler.extract_image() 602 pdf_handler.extract_image(max_img_count)
598 end_time = time.time() 603 end_time = time.time()
599 speed_time = int(end_time - start_time) 604 speed_time = int(end_time - start_time)
600 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format( 605 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
...@@ -608,14 +613,13 @@ class Command(BaseCommand, LoggerMixin): ...@@ -608,14 +613,13 @@ class Command(BaseCommand, LoggerMixin):
608 else: 613 else:
609 raise Exception('download or pdf to img failed') 614 raise Exception('download or pdf to img failed')
610 615
611 img_count = len(pdf_handler.img_path_list) 616 if pdf_handler.img_count == 0:
612 if img_count == 0:
613 self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format( 617 self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
614 self.log_base, task_str)) 618 self.log_base, task_str))
615 raise Exception('pdf img empty') 619 raise Exception('pdf img empty')
616 elif img_count >= max_img_count: 620 elif pdf_handler.img_count >= max_img_count:
617 self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format( 621 self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
618 self.log_base, task_str, img_count)) 622 self.log_base, task_str, pdf_handler.img_count))
619 623
620 try: 624 try:
621 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport 625 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
...@@ -629,13 +633,14 @@ class Command(BaseCommand, LoggerMixin): ...@@ -629,13 +633,14 @@ class Command(BaseCommand, LoggerMixin):
629 successful_at_this_level=False, 633 successful_at_this_level=False,
630 failure_reason=FailureReason.IMG_LIMIT.value, 634 failure_reason=FailureReason.IMG_LIMIT.value,
631 process_name=ProcessName.ALL.value, 635 process_name=ProcessName.ALL.value,
636 notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
632 ) 637 )
633 except Exception as e: 638 except Exception as e:
634 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format( 639 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
635 self.log_base, traceback.format_exc())) 640 self.log_base, traceback.format_exc()))
636 else: 641 else:
637 with lock: 642 with lock:
638 todo_count_dict[task_str] = img_count 643 todo_count_dict[task_str] = pdf_handler.img_count
639 for img_path in pdf_handler.img_path_list: 644 for img_path in pdf_handler.img_path_list:
640 while img_queue.full(): 645 while img_queue.full():
641 self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) 646 self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
......
...@@ -282,6 +282,7 @@ class HILOCRReport(models.Model): ...@@ -282,6 +282,7 @@ class HILOCRReport(models.Model):
282 process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称") 282 process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称")
283 total_fields = models.IntegerField(null=True, verbose_name='比对字段数目') 283 total_fields = models.IntegerField(null=True, verbose_name='比对字段数目')
284 workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程") 284 workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程")
285 notes = models.CharField(null=True, max_length=2048, verbose_name="备注")
285 286
286 class Meta: 287 class Meta:
287 managed = False 288 managed = False
...@@ -301,6 +302,7 @@ class AFCOCRReport(models.Model): ...@@ -301,6 +302,7 @@ class AFCOCRReport(models.Model):
301 process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称") 302 process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称")
302 total_fields = models.IntegerField(null=True, verbose_name='比对字段数目') 303 total_fields = models.IntegerField(null=True, verbose_name='比对字段数目')
303 workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程") 304 workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程")
305 notes = models.CharField(null=True, max_length=2048, verbose_name="备注")
304 306
305 class Meta: 307 class Meta:
306 managed = False 308 managed = False
......
"""One-off schema migration: add a ``notes`` column to both OCR report tables.

Runs ``ALTER TABLE ... ADD notes nvarchar(2048)`` against ``hil_ocr_report``
and then ``afc_ocr_report``, each over its own autocommit connection.
"""
from contextlib import closing

import pyodbc

hil_sql = "ALTER TABLE hil_ocr_report ADD notes nvarchar(2048)"

afc_sql = "ALTER TABLE afc_ocr_report ADD notes nvarchar(2048)"

# NOTE(review): both connection strings are identical and name only the ODBC
# driver -- presumably server/database/credentials were stripped before this
# script was committed; confirm and fill them in before running.
hil_conn_str = 'DRIVER={ODBC Driver 17 for SQL Server};'
afc_conn_str = 'DRIVER={ODBC Driver 17 for SQL Server};'


def add_notes_column(conn_str, sql):
    """Open an autocommit connection, execute *sql* once, and always clean up.

    Args:
        conn_str: ODBC connection string passed to ``pyodbc.connect``.
        sql: The DDL statement to execute.

    Raises:
        pyodbc.Error: Propagated unchanged if connecting or executing fails.
    """
    # ``closing`` guarantees close() on both objects even when execute()
    # raises; a pyodbc connection used directly as a context manager does
    # not close itself on exit, so it is wrapped explicitly.
    with closing(pyodbc.connect(conn_str, autocommit=True)) as cnxn:
        with closing(cnxn.cursor()) as cursor:
            cursor.execute(sql)


# Same order as before: HIL first, then AFC. A failure on the HIL table
# still aborts the script before the AFC table is touched.
add_notes_column(hil_conn_str, hil_sql)
add_notes_column(afc_conn_str, afc_sql)
...@@ -26,6 +26,7 @@ class PDFHandler: ...@@ -26,6 +26,7 @@ class PDFHandler:
26 self.path = path 26 self.path = path
27 self.img_dir_path = img_dir_path 27 self.img_dir_path = img_dir_path
28 self.img_path_list = [] 28 self.img_path_list = []
29 self.img_count = 0
29 self.xref_set = set() 30 self.xref_set = set()
30 31
31 def get_img_save_path(self, pno, img_index=0, ext='png'): 32 def get_img_save_path(self, pno, img_index=0, ext='png'):
...@@ -192,11 +193,14 @@ class PDFHandler: ...@@ -192,11 +193,14 @@ class PDFHandler:
192 page = pdf.loadPage(pno) 193 page = pdf.loadPage(pno)
193 self.page_to_png(page) 194 self.page_to_png(page)
194 195
195 def extract_image(self): 196 def extract_image(self, max_img_count=None):
196 self.img_path_list = [] 197 self.img_path_list = []
197 self.xref_set = set() 198 self.xref_set = set()
198 os.makedirs(self.img_dir_path, exist_ok=True) 199 os.makedirs(self.img_dir_path, exist_ok=True)
199 with fitz.Document(self.path) as pdf: 200 with fitz.Document(self.path) as pdf:
201 if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
202 self.img_count = pdf.pageCount
203 return
200 for pno in range(pdf.pageCount): 204 for pno in range(pdf.pageCount):
201 il = pdf.getPageImageList(pno) # 获取页面图片对象 205 il = pdf.getPageImageList(pno) # 获取页面图片对象
202 # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) 206 # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
...@@ -220,3 +224,4 @@ class PDFHandler: ...@@ -220,3 +224,4 @@ class PDFHandler:
220 # 3.页面图片对象数目大于1时,特殊处理 224 # 3.页面图片对象数目大于1时,特殊处理
221 else: 225 else:
222 self.merge_il(pdf, pno, il) 226 self.merge_il(pdf, pno, il)
227 self.img_count = len(self.img_path_list)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!