Merge branch 'feature/page_limit' into feature/0611
Showing 5 changed files with 60 additions and 6 deletions
| ... | @@ -23,7 +23,7 @@ from apps.doc.ocr.edms import EDMS, rh | ... | @@ -23,7 +23,7 @@ from apps.doc.ocr.edms import EDMS, rh |
| 23 | from apps.doc.named_enum import KeywordsType, FailureReason, WorkflowName, ProcessName, RequestTeam, RequestTrigger | 23 | from apps.doc.named_enum import KeywordsType, FailureReason, WorkflowName, ProcessName, RequestTeam, RequestTrigger |
| 24 | from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception, OCR4Exception | 24 | from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception, OCR4Exception |
| 25 | from apps.doc.ocr.wb import BSWorkbook | 25 | from apps.doc.ocr.wb import BSWorkbook |
| 26 | from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords, HILOCRResult, AFCOCRResult, HILOCRReport, AFCOCRReport, DDARecords, IDBCRecords | 26 | from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords, HILOCRResult, AFCOCRResult, HILOCRReport, AFCOCRReport, DDARecords, IDBCRecords, Configs |
| 27 | from celery_compare.tasks import compare | 27 | from celery_compare.tasks import compare |
| 28 | 28 | ||
| 29 | 29 | ||
| ... | @@ -572,6 +572,11 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -572,6 +572,11 @@ class Command(BaseCommand, LoggerMixin): |
| 572 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 572 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
| 573 | 573 | ||
| 574 | pdf_handler = PDFHandler(pdf_path, img_save_path) | 574 | pdf_handler = PDFHandler(pdf_path, img_save_path) |
| 575 | max_count_obj = Configs.objects.filter(id=2).first() | ||
| 576 | try: | ||
| 577 | max_img_count = int(max_count_obj.value) | ||
| 578 | except Exception as e: | ||
| 579 | max_img_count = 500 | ||
| 575 | 580 | ||
| 576 | for times in range(consts.RETRY_TIMES): | 581 | for times in range(consts.RETRY_TIMES): |
| 577 | try: | 582 | try: |
| ... | @@ -584,7 +589,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -584,7 +589,7 @@ class Command(BaseCommand, LoggerMixin): |
| 584 | self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( | 589 | self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( |
| 585 | self.log_base, task_str, times)) | 590 | self.log_base, task_str, times)) |
| 586 | start_time = time.time() | 591 | start_time = time.time() |
| 587 | pdf_handler.extract_image() | 592 | pdf_handler.extract_image(max_img_count) |
| 588 | end_time = time.time() | 593 | end_time = time.time() |
| 589 | speed_time = int(end_time - start_time) | 594 | speed_time = int(end_time - start_time) |
| 590 | self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format( | 595 | self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format( |
| ... | @@ -598,14 +603,34 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -598,14 +603,34 @@ class Command(BaseCommand, LoggerMixin): |
| 598 | else: | 603 | else: |
| 599 | raise Exception('download or pdf to img failed') | 604 | raise Exception('download or pdf to img failed') |
| 600 | 605 | ||
| 601 | img_count = len(pdf_handler.img_path_list) | 606 | if pdf_handler.img_count == 0: |
| 602 | if img_count == 0: | ||
| 603 | self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format( | 607 | self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format( |
| 604 | self.log_base, task_str)) | 608 | self.log_base, task_str)) |
| 605 | raise Exception('pdf img empty') | 609 | raise Exception('pdf img empty') |
| 610 | elif pdf_handler.img_count >= max_img_count: | ||
| 611 | self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format( | ||
| 612 | self.log_base, task_str, pdf_handler.img_count)) | ||
| 613 | |||
| 614 | try: | ||
| 615 | report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport | ||
| 616 | report_table.objects.create( | ||
| 617 | case_number=doc.application_id, | ||
| 618 | request_team=RequestTeam.get_value(doc.document_scheme, 0), | ||
| 619 | request_trigger=RequestTrigger.get_value(doc.data_source, 0), | ||
| 620 | input_file=doc.document_name, | ||
| 621 | transaction_start=doc.start_time, | ||
| 622 | transaction_end=time.time(), | ||
| 623 | successful_at_this_level=False, | ||
| 624 | failure_reason=FailureReason.IMG_LIMIT.value, | ||
| 625 | process_name=ProcessName.ALL.value, | ||
| 626 | notes='pdf page count: {0}'.format(str(pdf_handler.img_count)) | ||
| 627 | ) | ||
| 628 | except Exception as e: | ||
| 629 | self.online_log.error('{0} [process error (report db save)] [error={1}]'.format( | ||
| 630 | self.log_base, traceback.format_exc())) | ||
| 606 | else: | 631 | else: |
| 607 | with lock: | 632 | with lock: |
| 608 | todo_count_dict[task_str] = img_count | 633 | todo_count_dict[task_str] = pdf_handler.img_count |
| 609 | for img_path in pdf_handler.img_path_list: | 634 | for img_path in pdf_handler.img_path_list: |
| 610 | while img_queue.full(): | 635 | while img_queue.full(): |
| 611 | self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | 636 | self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | ... | ... |
| ... | @@ -282,6 +282,7 @@ class HILOCRReport(models.Model): | ... | @@ -282,6 +282,7 @@ class HILOCRReport(models.Model): |
| 282 | process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称") | 282 | process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称") |
| 283 | total_fields = models.IntegerField(null=True, verbose_name='比对字段数目') | 283 | total_fields = models.IntegerField(null=True, verbose_name='比对字段数目') |
| 284 | workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程") | 284 | workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程") |
| 285 | notes = models.CharField(null=True, max_length=2048, verbose_name="备注") | ||
| 285 | 286 | ||
| 286 | class Meta: | 287 | class Meta: |
| 287 | managed = False | 288 | managed = False |
| ... | @@ -301,6 +302,7 @@ class AFCOCRReport(models.Model): | ... | @@ -301,6 +302,7 @@ class AFCOCRReport(models.Model): |
| 301 | process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称") | 302 | process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称") |
| 302 | total_fields = models.IntegerField(null=True, verbose_name='比对字段数目') | 303 | total_fields = models.IntegerField(null=True, verbose_name='比对字段数目') |
| 303 | workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程") | 304 | workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程") |
| 305 | notes = models.CharField(null=True, max_length=2048, verbose_name="备注") | ||
| 304 | 306 | ||
| 305 | class Meta: | 307 | class Meta: |
| 306 | managed = False | 308 | managed = False | ... | ... |
| ... | @@ -42,6 +42,7 @@ class FailureReason(NamedEnum): | ... | @@ -42,6 +42,7 @@ class FailureReason(NamedEnum): |
| 42 | PDF = (0, 'PDF处理失败') | 42 | PDF = (0, 'PDF处理失败') |
| 43 | EXCEL = (1, '构建excel失败') | 43 | EXCEL = (1, '构建excel失败') |
| 44 | EDMS = (2, 'EDMS上传失败') | 44 | EDMS = (2, 'EDMS上传失败') |
| 45 | IMG_LIMIT = (3, 'PDF图片过多') | ||
| 45 | 46 | ||
| 46 | 47 | ||
| 47 | class ProcessName(NamedEnum): | 48 | class ProcessName(NamedEnum): | ... | ... |
src/common/tools/mssql_script5.py
0 → 100644
| 1 | import pyodbc | ||
| 2 | |||
| 3 | hil_sql = "ALTER TABLE hil_ocr_report ADD notes nvarchar(2048)" | ||
| 4 | |||
| 5 | afc_sql = "ALTER TABLE afc_ocr_report ADD notes nvarchar(2048)" | ||
| 6 | |||
| 7 | hil_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True) | ||
| 8 | |||
| 9 | hil_cursor = hil_cnxn.cursor() | ||
| 10 | hil_cursor.execute(hil_sql) | ||
| 11 | |||
| 12 | hil_cursor.close() | ||
| 13 | hil_cnxn.close() | ||
| 14 | |||
| 15 | afc_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True) | ||
| 16 | |||
| 17 | afc_cursor = afc_cnxn.cursor() | ||
| 18 | afc_cursor.execute(afc_sql) | ||
| 19 | |||
| 20 | afc_cursor.close() | ||
| 21 | afc_cnxn.close() |
| ... | @@ -26,6 +26,7 @@ class PDFHandler: | ... | @@ -26,6 +26,7 @@ class PDFHandler: |
| 26 | self.path = path | 26 | self.path = path |
| 27 | self.img_dir_path = img_dir_path | 27 | self.img_dir_path = img_dir_path |
| 28 | self.img_path_list = [] | 28 | self.img_path_list = [] |
| 29 | self.img_count = 0 | ||
| 29 | self.xref_set = set() | 30 | self.xref_set = set() |
| 30 | 31 | ||
| 31 | def get_img_save_path(self, pno, img_index=0, ext='png'): | 32 | def get_img_save_path(self, pno, img_index=0, ext='png'): |
| ... | @@ -192,11 +193,14 @@ class PDFHandler: | ... | @@ -192,11 +193,14 @@ class PDFHandler: |
| 192 | page = pdf.loadPage(pno) | 193 | page = pdf.loadPage(pno) |
| 193 | self.page_to_png(page) | 194 | self.page_to_png(page) |
| 194 | 195 | ||
| 195 | def extract_image(self): | 196 | def extract_image(self, max_img_count=None): |
| 196 | self.img_path_list = [] | 197 | self.img_path_list = [] |
| 197 | self.xref_set = set() | 198 | self.xref_set = set() |
| 198 | os.makedirs(self.img_dir_path, exist_ok=True) | 199 | os.makedirs(self.img_dir_path, exist_ok=True) |
| 199 | with fitz.Document(self.path) as pdf: | 200 | with fitz.Document(self.path) as pdf: |
| 201 | if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count: | ||
| 202 | self.img_count = pdf.pageCount | ||
| 203 | return | ||
| 200 | for pno in range(pdf.pageCount): | 204 | for pno in range(pdf.pageCount): |
| 201 | il = pdf.getPageImageList(pno) # 获取页面图片对象 | 205 | il = pdf.getPageImageList(pno) # 获取页面图片对象 |
| 202 | # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) | 206 | # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) |
| ... | @@ -220,3 +224,4 @@ class PDFHandler: | ... | @@ -220,3 +224,4 @@ class PDFHandler: |
| 220 | # 3.页面图片对象数目大于1时,特殊处理 | 224 | # 3.页面图片对象数目大于1时,特殊处理 |
| 221 | else: | 225 | else: |
| 222 | self.merge_il(pdf, pno, il) | 226 | self.merge_il(pdf, pno, il) |
| 227 | self.img_count = len(self.img_path_list) | ... | ... |
-
Please register or sign in to post a comment