647c603e by 周伟奇

Merge branch 'feature/page_limit' into feature/0611

2 parents 5c5a6351 24c87e7e
...@@ -23,7 +23,7 @@ from apps.doc.ocr.edms import EDMS, rh ...@@ -23,7 +23,7 @@ from apps.doc.ocr.edms import EDMS, rh
23 from apps.doc.named_enum import KeywordsType, FailureReason, WorkflowName, ProcessName, RequestTeam, RequestTrigger 23 from apps.doc.named_enum import KeywordsType, FailureReason, WorkflowName, ProcessName, RequestTeam, RequestTrigger
24 from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception, OCR4Exception 24 from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception, OCR4Exception
25 from apps.doc.ocr.wb import BSWorkbook 25 from apps.doc.ocr.wb import BSWorkbook
26 from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords, HILOCRResult, AFCOCRResult, HILOCRReport, AFCOCRReport, DDARecords, IDBCRecords 26 from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords, HILOCRResult, AFCOCRResult, HILOCRReport, AFCOCRReport, DDARecords, IDBCRecords, Configs
27 from celery_compare.tasks import compare 27 from celery_compare.tasks import compare
28 28
29 29
...@@ -572,6 +572,11 @@ class Command(BaseCommand, LoggerMixin): ...@@ -572,6 +572,11 @@ class Command(BaseCommand, LoggerMixin):
572 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) 572 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
573 573
574 pdf_handler = PDFHandler(pdf_path, img_save_path) 574 pdf_handler = PDFHandler(pdf_path, img_save_path)
575 max_count_obj = Configs.objects.filter(id=2).first()
576 try:
577 max_img_count = int(max_count_obj.value)
578 except Exception as e:
579 max_img_count = 500
575 580
576 for times in range(consts.RETRY_TIMES): 581 for times in range(consts.RETRY_TIMES):
577 try: 582 try:
...@@ -584,7 +589,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -584,7 +589,7 @@ class Command(BaseCommand, LoggerMixin):
584 self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( 589 self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
585 self.log_base, task_str, times)) 590 self.log_base, task_str, times))
586 start_time = time.time() 591 start_time = time.time()
587 pdf_handler.extract_image() 592 pdf_handler.extract_image(max_img_count)
588 end_time = time.time() 593 end_time = time.time()
589 speed_time = int(end_time - start_time) 594 speed_time = int(end_time - start_time)
590 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format( 595 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
...@@ -598,14 +603,34 @@ class Command(BaseCommand, LoggerMixin): ...@@ -598,14 +603,34 @@ class Command(BaseCommand, LoggerMixin):
598 else: 603 else:
599 raise Exception('download or pdf to img failed') 604 raise Exception('download or pdf to img failed')
600 605
601 img_count = len(pdf_handler.img_path_list) 606 if pdf_handler.img_count == 0:
602 if img_count == 0:
603 self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format( 607 self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
604 self.log_base, task_str)) 608 self.log_base, task_str))
605 raise Exception('pdf img empty') 609 raise Exception('pdf img empty')
610 elif pdf_handler.img_count >= max_img_count:
611 self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
612 self.log_base, task_str, pdf_handler.img_count))
613
614 try:
615 report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
616 report_table.objects.create(
617 case_number=doc.application_id,
618 request_team=RequestTeam.get_value(doc.document_scheme, 0),
619 request_trigger=RequestTrigger.get_value(doc.data_source, 0),
620 input_file=doc.document_name,
621 transaction_start=doc.start_time,
622 transaction_end=time.time(),
623 successful_at_this_level=False,
624 failure_reason=FailureReason.IMG_LIMIT.value,
625 process_name=ProcessName.ALL.value,
626 notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
627 )
628 except Exception as e:
629 self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
630 self.log_base, traceback.format_exc()))
606 else: 631 else:
607 with lock: 632 with lock:
608 todo_count_dict[task_str] = img_count 633 todo_count_dict[task_str] = pdf_handler.img_count
609 for img_path in pdf_handler.img_path_list: 634 for img_path in pdf_handler.img_path_list:
610 while img_queue.full(): 635 while img_queue.full():
611 self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) 636 self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
......
...@@ -282,6 +282,7 @@ class HILOCRReport(models.Model): ...@@ -282,6 +282,7 @@ class HILOCRReport(models.Model):
282 process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称") 282 process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称")
283 total_fields = models.IntegerField(null=True, verbose_name='比对字段数目') 283 total_fields = models.IntegerField(null=True, verbose_name='比对字段数目')
284 workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程") 284 workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程")
285 notes = models.CharField(null=True, max_length=2048, verbose_name="备注")
285 286
286 class Meta: 287 class Meta:
287 managed = False 288 managed = False
...@@ -301,6 +302,7 @@ class AFCOCRReport(models.Model): ...@@ -301,6 +302,7 @@ class AFCOCRReport(models.Model):
301 process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称") 302 process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称")
302 total_fields = models.IntegerField(null=True, verbose_name='比对字段数目') 303 total_fields = models.IntegerField(null=True, verbose_name='比对字段数目')
303 workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程") 304 workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程")
305 notes = models.CharField(null=True, max_length=2048, verbose_name="备注")
304 306
305 class Meta: 307 class Meta:
306 managed = False 308 managed = False
......
...@@ -42,6 +42,7 @@ class FailureReason(NamedEnum): ...@@ -42,6 +42,7 @@ class FailureReason(NamedEnum):
42 PDF = (0, 'PDF处理失败') 42 PDF = (0, 'PDF处理失败')
43 EXCEL = (1, '构建excel失败') 43 EXCEL = (1, '构建excel失败')
44 EDMS = (2, 'EDMS上传失败') 44 EDMS = (2, 'EDMS上传失败')
45 IMG_LIMIT = (3, 'PDF图片过多')
45 46
46 47
47 class ProcessName(NamedEnum): 48 class ProcessName(NamedEnum):
......
"""One-off schema migration: add a ``notes`` column to both OCR report tables.

The statements run as soon as the file is executed (there is no
``__main__`` guard): first the HIL table, then the AFC table, each over its
own autocommit connection.
"""
from contextlib import closing

import pyodbc

# NOTE(review): neither ADD is guarded by an existence check, so a second run
# presumably fails with a duplicate-column error - confirm before re-running.
hil_sql = "ALTER TABLE hil_ocr_report ADD notes nvarchar(2048)"

afc_sql = "ALTER TABLE afc_ocr_report ADD notes nvarchar(2048)"

# Kept as two separate values because each table is reached through its own
# connection. As written here both contain only the driver clause; presumably
# the server/database/credential parts are filled in per environment - confirm.
hil_conn_str = 'DRIVER={ODBC Driver 17 for SQL Server};'
afc_conn_str = 'DRIVER={ODBC Driver 17 for SQL Server};'


def _run_ddl(conn_str, sql):
    """Execute a single DDL statement over a fresh autocommit connection.

    Args:
        conn_str: ODBC connection string handed to ``pyodbc.connect``.
        sql: The statement to execute.

    Raises:
        pyodbc.Error: Propagated unchanged from ``connect`` or ``execute``.
    """
    # ``closing`` guarantees cursor.close() / cnxn.close() even when execute()
    # raises, so a failed ALTER no longer leaves an open ODBC handle behind.
    with closing(pyodbc.connect(conn_str, autocommit=True)) as cnxn:
        with closing(cnxn.cursor()) as cursor:
            cursor.execute(sql)


# Same order as before: HIL first, AFC second. A failure on HIL still aborts
# the script before the AFC statement is attempted.
_run_ddl(hil_conn_str, hil_sql)
_run_ddl(afc_conn_str, afc_sql)
...@@ -26,6 +26,7 @@ class PDFHandler: ...@@ -26,6 +26,7 @@ class PDFHandler:
26 self.path = path 26 self.path = path
27 self.img_dir_path = img_dir_path 27 self.img_dir_path = img_dir_path
28 self.img_path_list = [] 28 self.img_path_list = []
29 self.img_count = 0
29 self.xref_set = set() 30 self.xref_set = set()
30 31
31 def get_img_save_path(self, pno, img_index=0, ext='png'): 32 def get_img_save_path(self, pno, img_index=0, ext='png'):
...@@ -192,11 +193,14 @@ class PDFHandler: ...@@ -192,11 +193,14 @@ class PDFHandler:
192 page = pdf.loadPage(pno) 193 page = pdf.loadPage(pno)
193 self.page_to_png(page) 194 self.page_to_png(page)
194 195
195 def extract_image(self): 196 def extract_image(self, max_img_count=None):
196 self.img_path_list = [] 197 self.img_path_list = []
197 self.xref_set = set() 198 self.xref_set = set()
198 os.makedirs(self.img_dir_path, exist_ok=True) 199 os.makedirs(self.img_dir_path, exist_ok=True)
199 with fitz.Document(self.path) as pdf: 200 with fitz.Document(self.path) as pdf:
201 if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
202 self.img_count = pdf.pageCount
203 return
200 for pno in range(pdf.pageCount): 204 for pno in range(pdf.pageCount):
201 il = pdf.getPageImageList(pno) # 获取页面图片对象 205 il = pdf.getPageImageList(pno) # 获取页面图片对象
202 # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) 206 # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
...@@ -220,3 +224,4 @@ class PDFHandler: ...@@ -220,3 +224,4 @@ class PDFHandler:
220 # 3.页面图片对象数目大于1时,特殊处理 224 # 3.页面图片对象数目大于1时,特殊处理
221 else: 225 else:
222 self.merge_il(pdf, pno, il) 226 self.merge_il(pdf, pno, il)
227 self.img_count = len(self.img_path_list)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!