2932c540 by 冯轩

init

1 parent 8ddb1d4c
......@@ -1504,6 +1504,137 @@ class Command(BaseCommand, LoggerMixin):
self.log_base, traceback.format_exc()))
# error_list.append(1)
# return
elif classify_1_str == '12': # weixin e-bs
try:
max_img_count = 500
for times in range(consts.RETRY_TIMES):
try:
if doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
self.online_log.info('{0} [mo ni xia dan] [task={1}] [times={2}] '
'[pdf_path={3}]'.format(self.log_base, task_str,
times, pdf_path))
elif os.path.exists(pdf_path):
self.online_log.info('{0} [pdf from zip file] [task={1}] [times={2}] '
'[pdf_path={3}]'.format(self.log_base, task_str,
times, pdf_path))
else:
# self.edms.download(pdf_path, doc.metadata_version_id)
self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme,
business_type)
self.online_log.info('{0} [ecm download success] [task={1}] [times={2}] '
'[pdf_path={3}]'.format(self.log_base, task_str,
times, pdf_path))
# 3.PDF文件提取图片
self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
self.log_base, task_str, times))
start_time = time.time()
pdf_handler.extract_image_for_weixin(max_img_count)
end_time = time.time()
speed_time = int(end_time - start_time)
self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
self.log_base, task_str, times, speed_time, pdf_handler.is_new_modify))
except Exception as e:
self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
'[error={3}]'.format(self.log_base, task_str, times,
traceback.format_exc()))
else:
break
else:
raise Exception('download or pdf to img failed')
if pdf_handler.img_count == 0:
self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
self.log_base, task_str))
raise Exception('pdf img empty')
elif pdf_handler.img_count >= max_img_count:
self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
self.log_base, task_str, pdf_handler.img_count))
try:
report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
report_table.objects.create(
case_number=doc.application_id,
request_team=RequestTeam.get_value(doc.document_scheme, 0),
request_trigger=RequestTrigger.get_value(doc.data_source, 0),
input_file=doc.document_name,
transaction_start=doc.start_time,
transaction_end=doc.start_time,
successful_at_this_level=False,
failure_reason=FailureReason.IMG_LIMIT.value,
process_name=ProcessName.ALL.value,
notes='pdf page count: {0}'.format(str(pdf_handler.img_count))
)
except Exception as e:
self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
try:
doc.status = DocStatus.PROCESS_FAILED.value
doc.page_count = pdf_handler.page_count
doc.save()
except Exception as e:
self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
else:
try:
if pdf_handler.is_e_pdf:
doc.metadata = pdf_handler.metadata if pdf_handler.metadata is None else \
json.dumps(pdf_handler.metadata)
doc.page_count = pdf_handler.page_count
doc.save()
except Exception as e:
self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
with lock:
todo_count_dict[task_str] = pdf_handler.img_count
self.online_log.info('{0} [pdf_2_img_2_queue] [{1}] [is_ebank={2}]'.format(
self.log_base, task_str, pdf_handler.is_ebank
))
for img_idx, img_path in enumerate(pdf_handler.img_path_list):
while img_queue.full():
self.online_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
time.sleep(self.sleep_time_img_put)
if pdf_handler.is_e_weixin_bs:
try:
text_list = pdf_handler.page_text_list
except Exception as e:
text_list = []
else:
text_list = []
img_queue.put((business_type, img_path, text_list))
except Exception as e:
try:
end_time = timezone.now()
report_table = HILOCRReport if business_type == consts.HIL_PREFIX else AFCOCRReport
report_table.objects.create(
case_number=doc.application_id,
request_team=RequestTeam.get_value(doc.document_scheme, 0),
request_trigger=RequestTrigger.get_value(doc.data_source, 0),
input_file=doc.document_name,
transaction_start=doc.start_time,
transaction_end=end_time,
successful_at_this_level=False,
failure_reason=FailureReason.PDF.value,
process_name=ProcessName.ALL.value,
)
except Exception as e:
self.online_log.error('{0} [process error (report db save)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
try:
doc.status = DocStatus.PROCESS_FAILED.value
doc.page_count = pdf_handler.page_count
doc.save()
self.online_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
'[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
except Exception as e:
self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
# error_list.append(1)
# return
else: # e-contract or or e-fsm-contract or e-hmh
try:
# pdf下载 处理 图片存储 识别
......@@ -1674,6 +1805,7 @@ class Command(BaseCommand, LoggerMixin):
json_data_1['text_list'] = text_list
start_time = time.time()
self.online_log.info('{0} [ocr_1 api] [img={1}] [json_data_1={2}]'.format(self.log_base, img_path, json_data_1))
ocr_1_response = requests.post(url, json=json_data_1)
if ocr_1_response.status_code != 200:
raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code))
......
......@@ -684,6 +684,10 @@ class UploadDocView(GenericView, DocHandler):
classify_1 = classify_1_tmp
break
if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
classify_1 = 12
self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
if document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
or document_name.endswith('.RAR'):
......@@ -1239,6 +1243,10 @@ class DocView(DocGenericView, DocHandler):
classify_1 = classify_1_tmp
break
if classify_1 == 0 and ('微信支付交易明细证明' in document_name or '微信流水' in document_name):
classify_1 = 12
self.running_log.info('[weixin bs process] [doc_id={0}]'.format(doc.id))
# tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
enqueue_res = rh.enqueue([task], is_priority)
......
......@@ -69,6 +69,7 @@ class PDFHandler:
self.suffix = self.get_suffix(document_name)
self.is_ebank = False
self.is_e_pdf = False
self.is_e_weixin_bs = False
self.page_text_list = []
self.pdf_info = {}
self.img_path_pno_list = []
......@@ -407,6 +408,57 @@ class PDFHandler:
self.is_e_pdf = True
self.page_text_list = page_text_list
def put_text(self, pdf):
    """Collect the embedded text spans of every page of ``pdf``.

    For each page the spans returned by ``page.getTextPage().extractDICT()``
    are gathered as ``(bbox, text)`` tuples. When every page passes the
    checks below, the method sets ``self.is_e_pdf`` and
    ``self.is_e_weixin_bs`` to ``True`` and stores one dict per page
    (``width``, ``height``, ``rotation``, ``text``) in
    ``self.page_text_list``.

    The method returns early, leaving those three attributes untouched, when:

    * a page rotation is neither ``None`` nor an int multiple of 90, or
    * after page ``pno`` the running total of kept spans is below
      ``(pno + 1) * 5`` (presumably a heuristic for "this is a scanned
      document, not a text PDF" -- confirm with the author).

    :param pdf: an opened document exposing ``pageCount`` and
        ``loadPage(pno)`` (the legacy PyMuPDF camelCase API).
    :return: ``None``; results are published through instance attributes.
    """
    # Local import so this block does not depend on the module's import list.
    import sys

    # The original probed every span with ``print(span_text)`` and skipped the
    # span when printing raised. That wrote the whole document text to stdout.
    # Encoding with stdout's codec keeps the same filter without the output.
    stdout_encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'

    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page = pdf.loadPage(pno)

        # Normalise the rotation to a number of quarter turns in 0..3.
        if page.rotation is None:
            rotation = 0
        elif isinstance(page.rotation, int):
            quarter_turns, remainder = divmod(page.rotation, 90)
            if remainder != 0:
                return
            rotation = quarter_turns % 4
        else:
            return

        text = page.getTextPage().extractDICT()
        text_list = []
        # ``or []`` guards: a block without 'lines' (e.g. an image block) or a
        # missing key must not abort the whole extraction with a TypeError.
        for block in text.get('blocks') or []:
            for line in block.get('lines') or []:
                for span in line.get('spans') or []:
                    span_text = span.get('text') or ''
                    if span_text.strip() == '':
                        continue
                    # Skip spans stdout could not have printed (e.g. special
                    # emoji / lone surrogates), as the print-probe did.
                    try:
                        span_text.encode(stdout_encoding)
                    except UnicodeError:
                        continue
                    bbox = span.get('bbox')
                    if pno == 0:
                        # NOTE(review): the result was only stored in an unused
                        # local (``in_ebank_set``) and never affected the
                        # outcome. The call is kept in case title_is_ebank has
                        # side effects -- confirm and drop or wire it up.
                        self.title_is_ebank(span_text)
                    text_list.append((bbox, span_text))

        text_item_sum += len(text_list)
        if text_item_sum < (pno + 1) * 5:
            # Too little text so far: give up without setting any flag.
            return
        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'rotation': rotation,
                'text': text_list,
            }
        )

    self.is_e_pdf = True
    self.is_e_weixin_bs = True
    self.page_text_list = page_text_list
def e_contract_process(self):
os.makedirs(self.img_dir_path, exist_ok=True)
with fitz.Document(self.path) as pdf:
......@@ -473,6 +525,59 @@ class PDFHandler:
self.merge_il(pdf, pno, il)
self.img_count = len(self.img_path_list)
def extract_image_for_weixin(self, max_img_count=None):
    """Turn the input file into a list of page images (weixin statement flow).

    Resets ``self.img_path_list`` / ``self.xref_set``, creates
    ``self.img_dir_path`` and then:

    * if ``self.suffix`` is one of ``self.img_suffixs``, copies the file
      itself as the single image;
    * otherwise opens it with ``fitz.Document``, tries each password in
      ``self.pwd_list`` while the document is still encrypted, records
      ``self.metadata`` / ``self.page_count``, calls ``self.put_text(pdf)``
      and renders or extracts one image per page.

    ``self.img_count`` ends up as ``len(self.img_path_list)``, except when
    the page-count limit is hit (see below).

    :param max_img_count: optional int limit. When the PDF has at least this
        many pages, ``self.img_count`` is set to the page count and the
        method returns without producing any image.
    :return: ``None``; results are published through instance attributes.
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    if self.suffix in self.img_suffixs:
        # Input is already an image: keep a copy as "page 0".
        img_save_path = self.get_img_save_path(0, ext=self.suffix[1:])
        shutil.copy(self.path, img_save_path)
        self.img_path_list.append(img_save_path)
    else:
        with fitz.Document(self.path) as pdf:
            # Decrypt: try the known passwords until the document opens.
            for pwd in self.pwd_list:
                if not pdf.isEncrypted:
                    break
                pdf.authenticate(pwd)

            self.metadata = pdf.metadata
            self.page_count = pdf.pageCount

            if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
                # Too many pages: report the page count so the caller can see
                # the limit was hit, and skip all rendering.
                self.img_count = pdf.pageCount
                return

            # May set self.is_e_pdf, which forces whole-page rendering below.
            self.put_text(pdf)

            for pno in range(pdf.pageCount):
                # Image objects on the page; each item starts with
                # (xref, smask, width, height, bpc, colorspace, ...).
                il = pdf.getPageImageList(pno)

                # 1. Text PDF / e-bank PDF / no image object: render the page.
                if self.is_e_pdf or self.is_ebank or len(il) == 0:
                    page = pdf.loadPage(pno)
                    self.page_to_png(page)
                # 2. Exactly one image object on the page.
                elif len(il) == 1:
                    # Index/slice instead of fixed-arity unpacking: the tuple
                    # has 9 or 10 fields depending on how the list was built.
                    xref, smask, width, height = il[0][:4]
                    colorspace = il[0][5]
                    # Small image (e.g. a stamp on an e-statement): render the
                    # whole page instead.
                    if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                        page = pdf.loadPage(pno)
                        self.page_to_png(page)
                    # Large image: render the page; is_big_img guards against
                    # oversized output.
                    elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
                        self.is_new_modify = 1
                        is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1])
                        page = pdf.loadPage(pno)
                        self.page_to_png(page, is_big_img=is_big_img)
                    # Medium image not seen before: extract the image object.
                    elif xref not in self.xref_set:
                        self.extract_single_image(pdf, xref, smask, colorspace, pno)
                # 3. Several image objects: special merge handling.
                else:
                    self.merge_il(pdf, pno, il)

    self.img_count = len(self.img_path_list)
def extract_page_image(self):
self.img_path_list = []
self.xref_set = set()
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!