24c87e7e by 周伟奇

pdf page limit

1 parent 96178db6
......@@ -23,7 +23,7 @@ from apps.doc.ocr.edms import EDMS, rh
from apps.doc.named_enum import KeywordsType, FailureReason, WorkflowName, ProcessName, RequestTeam, RequestTrigger
from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception, OCR4Exception
from apps.doc.ocr.wb import BSWorkbook
from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords, HILOCRResult, AFCOCRResult, HILOCRReport, AFCOCRReport, DDARecords, IDBCRecords
from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords, HILOCRResult, AFCOCRResult, HILOCRReport, AFCOCRReport, DDARecords, IDBCRecords, Configs
from celery_compare.tasks import compare
......@@ -582,6 +582,11 @@ class Command(BaseCommand, LoggerMixin):
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
pdf_handler = PDFHandler(pdf_path, img_save_path)
max_count_obj = Configs.objects.filter(id=2).first()
try:
max_img_count = int(max_count_obj.value)
except Exception as e:
max_img_count = 500
for times in range(consts.RETRY_TIMES):
try:
......@@ -594,7 +599,7 @@ class Command(BaseCommand, LoggerMixin):
self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
self.log_base, task_str, times))
start_time = time.time()
pdf_handler.extract_image()
pdf_handler.extract_image(max_img_count)
end_time = time.time()
speed_time = int(end_time - start_time)
self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
......@@ -608,14 +613,13 @@ class Command(BaseCommand, LoggerMixin):
else:
raise Exception('download or pdf to img failed')
img_count = len(pdf_handler.img_path_list)
if img_count == 0:
if pdf_handler.img_count == 0:
self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
self.log_base, task_str))
raise Exception('pdf img empty')
elif img_count >= max_img_count:
elif pdf_handler.img_count >= max_img_count:
self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
self.log_base, task_str, img_count))
self.log_base, task_str, pdf_handler.img_count))
try:
report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
......@@ -629,13 +633,14 @@ class Command(BaseCommand, LoggerMixin):
successful_at_this_level=False,
failure_reason=FailureReason.IMG_LIMIT.value,
process_name=ProcessName.ALL.value,
notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
)
except Exception as e:
self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
else:
with lock:
todo_count_dict[task_str] = img_count
todo_count_dict[task_str] = pdf_handler.img_count
for img_path in pdf_handler.img_path_list:
while img_queue.full():
self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
......
......@@ -282,6 +282,7 @@ class HILOCRReport(models.Model):
process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称")
total_fields = models.IntegerField(null=True, verbose_name='比对字段数目')
workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程")
notes = models.CharField(null=True, max_length=2048, verbose_name="备注")
class Meta:
managed = False
......@@ -301,6 +302,7 @@ class AFCOCRReport(models.Model):
process_name = models.SmallIntegerField(default=ProcessName.ALL.value, verbose_name="流程名称")
total_fields = models.IntegerField(null=True, verbose_name='比对字段数目')
workflow_name = models.SmallIntegerField(null=True, verbose_name="工作流程")
notes = models.CharField(null=True, max_length=2048, verbose_name="备注")
class Meta:
managed = False
......
import pyodbc
# One-off schema migration: add a nullable `notes` nvarchar(2048) column to
# the `hil_ocr_report` and `afc_ocr_report` tables.
#
# NOTE(review): the connection strings below carry only the ODBC driver name;
# server, database and credentials presumably have to be filled in per
# environment before running -- confirm.
# NOTE(review): the ALTER statements are unconditional, so re-running the
# script after the column exists is expected to raise -- confirm against the
# target database before re-executing.
hil_sql = "ALTER TABLE hil_ocr_report ADD notes nvarchar(2048)"
afc_sql = "ALTER TABLE afc_ocr_report ADD notes nvarchar(2048)"

# autocommit=True: the DDL is applied without an explicit commit() call.
hil_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True)
try:
    hil_cursor = hil_cnxn.cursor()
    try:
        hil_cursor.execute(hil_sql)
    finally:
        # Release the cursor even when the ALTER fails.
        hil_cursor.close()
finally:
    # Always release the connection; any error still propagates to the caller.
    hil_cnxn.close()

# The AFC table is altered over its own connection, and only after the HIL
# step has succeeded (an exception above stops the script before this point).
afc_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True)
try:
    afc_cursor = afc_cnxn.cursor()
    try:
        afc_cursor.execute(afc_sql)
    finally:
        afc_cursor.close()
finally:
    afc_cnxn.close()
......@@ -26,6 +26,7 @@ class PDFHandler:
self.path = path
self.img_dir_path = img_dir_path
self.img_path_list = []
self.img_count = 0
self.xref_set = set()
def get_img_save_path(self, pno, img_index=0, ext='png'):
......@@ -192,11 +193,14 @@ class PDFHandler:
page = pdf.loadPage(pno)
self.page_to_png(page)
def extract_image(self):
def extract_image(self, max_img_count=None):
self.img_path_list = []
self.xref_set = set()
os.makedirs(self.img_dir_path, exist_ok=True)
with fitz.Document(self.path) as pdf:
if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
self.img_count = pdf.pageCount
return
for pno in range(pdf.pageCount):
il = pdf.getPageImageList(pno) # 获取页面图片对象
# (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
......@@ -220,3 +224,4 @@ class PDFHandler:
# 3.页面图片对象数目大于1时,特殊处理
else:
self.merge_il(pdf, pno, il)
self.img_count = len(self.img_path_list)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!