97994674 by 周伟奇

OCR: upload Excel result files to eDMS

1 parent 7aa0284c
......@@ -12,13 +12,16 @@ Django==2.1
django-oauth-toolkit==1.3.2
djangorestframework==3.9.0
djangorestframework-jwt==1.11.0
et-xmlfile==1.0.1
idna==2.9
idna-ssl==1.1.0
isodate==0.6.0
jdcal==1.4.1
lxml==4.5.1
marshmallow==3.6.1
multidict==4.7.6
oauthlib==3.1.0
openpyxl==3.0.4
pdfminer3k==1.3.4
Pillow==7.1.2
ply==3.11
......
PAGE_DEFAULT = 1
PAGE_SIZE_DEFAULT = 10
DOC_SCHEME_LIST = ['Acceptance', 'Settlement', 'Contract Management']
DATA_SOURCE_LIST = ['POS', 'EAPP', 'Econtract']
BUSINESS_TYPE_LIST = ['HIL', 'AFC']
HIL_SET = {'HIL', 'HIl', 'HiL', 'Hil', 'hIL', 'hIl', 'hiL', 'hil', 'CO00002'}
FIXED_APPLICATION_ID = '手工单'
DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT']
DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT']
HIL_PREFIX = 'HIL'
AFC_PREFIX = 'AFC'
SPLIT_STR = '_'
BUSINESS_TYPE_LIST = [HIL_PREFIX, AFC_PREFIX]
HIL_SET = {'HIL', 'HIl', 'HiL', 'Hil', 'hIL', 'hIl', 'hiL', 'hil', 'CO00002'}
SESSION_PREFIX = 'FHLSID'
CUSTOM_CLIENT = 'CustomClient'
......@@ -15,12 +19,22 @@ FIXED_FILE_SIZE = 0
DOWNLOAD_ACTION_TYPE = 'Downloaded'
DOC_SCHEMA_ID_FILL = {
'Acceptance': (1, 'DFE-AutoFilingScript'),
'Settlement': (20, 'DFE-AutoFilingScript'),
'Contract Management': (86, 'Schema-Based')
'ACCEPTANCE': (1, 'DFE-AutoFilingScript'),
'SETTLEMENT': (20, 'DFE-AutoFilingScript'),
'CONTRACT MANAGEMENT': (86, 'Schema-Based')
}
BUSINESS_TYPE_DICT = {
HIL_PREFIX: 'CO00002',
AFC_PREFIX: 'CO00001'
}
DOC_SCHEMA_TYPE = 'ElectronicRecord'
APPLICATION_ID_META_FIELD_id = 1
DEALER_CODE_META_FIELD_id = 13
BUSINESS_TYPE_META_FIELD_id = 93
DEALER_CODE = 'ocr_situ_group'
AMOUNT_COL_TITLE_SET = {"交易金额", "金额", "收入/支出金额", "发生额"}
OVERAGE_COL_TITLE_SET = {"账户余额", "余额"}
PROOF_COL_TITLE = '核对结果'
PROOF_RES = ('对', '错')
META_SHEET_TITLE = '关键信息提取和展示'
......
import os
import requests
from zeep import Client, xsd
from settings import conf
......@@ -65,9 +66,9 @@ class EDMS:
params = {'token': token}
self.download_handler(params, headers, save_path)
def create_upload_token(self, headers, file_size):
def create_upload_token(self, headers):
with self.rc_client.settings(extra_http_headers=headers):
token = self.rc_client.service.CreateUploadToken(fileSize=file_size)
token = self.rc_client.service.CreateUploadToken(fileSize=consts.FIXED_FILE_SIZE)
return token
def upload_handler(self, file_path, params, headers):
......@@ -80,11 +81,19 @@ class EDMS:
else:
raise Exception
def get_doc_info(self, token, doc_info):
doc_schema_id, auto_filing = consts.DOC_SCHEMA_ID_FILL.get(doc_info.get('document_scheme'))
application_id = doc_info.get('application_id')
doc_file_name = doc_info.get('doc_file_name')
business_type = doc_info.get('business_type')
@staticmethod
def get_doc_file_name(doc_name):
if doc_name.endswith('pdf'):
name, _ = os.path.splitext(doc_name)
return name
return doc_name
def get_doc_info(self, token, doc, business_type, file_path):
business_type = consts.BUSINESS_TYPE_DICT.get(business_type)
doc_schema_id, auto_filing = consts.DOC_SCHEMA_ID_FILL.get(doc.document_scheme)
application_id = doc.application_id
doc_file_name = self.get_doc_file_name(doc.document_name)
origin_file_name = os.path.basename(file_path)
fields_with_value = [
{'FieldId': consts.APPLICATION_ID_META_FIELD_id,
'FieldValue': xsd.AnyObject(xsd.String(), application_id)},
......@@ -99,20 +108,20 @@ class EDMS:
'DocumentName': doc_file_name,
'FieldsWithValues': fields_with_values,
'UploadToken': token,
'OriginalFileName': doc_file_name,
'OriginalFileName': origin_file_name,
'SendEmailToMembers': False,
'AutoFilingScriptToUse': auto_filing,
'DocumentSchemaType': consts.DOC_SCHEMA_TYPE,
}
return info
def add_doc_info(self, headers, token, doc_info):
info = self.get_doc_info(token, doc_info)
def add_doc_info(self, headers, token, doc, business_type, file_path):
    """Register the document's metadata with the DM SOAP service.

    Builds the metadata payload from *doc* and the upload *token*, sends
    it with the given HTTP *headers*, and returns the metadata version id
    assigned by the service.
    """
    payload = self.get_doc_info(token, doc, business_type, file_path)
    with self.dm_client.settings(extra_http_headers=headers):
        return self.dm_client.service.AddDocumentInfo(info=payload)
def upload(self, file_path, file_size, doc_info):
def upload(self, file_path, doc, business_type):
# file_path = '/Users/clay/Postman/files/OCRuploadTest4.txt'
# file_size = 16
# doc_info = {
......@@ -122,12 +131,12 @@ class EDMS:
# 'business_type': 'CO00001',
# }
headers = self.get_headers()
token = self.create_upload_token(headers, file_size)
token = self.create_upload_token(headers)
headers.update({'Content-Type': 'application/octet-stream'})
params = {'token': token}
self.upload_handler(file_path, params, headers)
headers.pop('Content-Type')
metadata_version_id = self.add_doc_info(headers, token, doc_info)
metadata_version_id = self.add_doc_info(headers, token, doc, business_type, file_path)
return metadata_version_id
......
import os
import time
import fitz
import xlwt
import signal
import base64
import asyncio
import aiohttp
import locale
from PIL import Image
from io import BytesIO
from zeep import Client
from openpyxl import Workbook
from openpyxl.styles import numbers
from openpyxl.utils import get_column_letter
from django.core.management import BaseCommand
from common.mixins import LoggerMixin
......@@ -23,7 +25,7 @@ class Command(BaseCommand, LoggerMixin):
def __init__(self):
super().__init__()
self.log_base = '[doc process]'
self.log_base = '[doc ocr process]'
# 处理文件开关
self.switch = True
# 数据目录
......@@ -50,46 +52,54 @@ class Command(BaseCommand, LoggerMixin):
task_str, is_priority = rh.dequeue()
if task_str is None:
self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
return None, None, None, None
return None, None
business_type, doc_id_str = task_str.split('_')
business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
doc_id = int(doc_id_str)
doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values(
'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first()
if doc_info is None:
self.cronjob_log.warn('{0} [get_doc_info] [doc completed] [task_str={1}] [is_priority={2}]'.format(
# doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values(
# 'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first()
doc = doc_class.objects.filter(id=doc_id).first()
if doc is None:
self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
self.log_base, task_str, is_priority))
return None, None, None, None
doc_class.objects.filter(id=doc_id).update(status=DocStatus.PROCESSING.value)
self.cronjob_log.info('{0} [get_doc_info] [task_str={1}] [is_priority={2}] [doc_info={3}]'.format(
self.log_base, task_str, is_priority, doc_info))
return doc_info, doc_class, doc_id, business_type
def pdf_download(self, doc_id, doc_info, business_type):
if doc_info is None:
return None, None
elif doc.status != DocStatus.INIT.value:
self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
'[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status))
return None, None
doc.status = DocStatus.PROCESSING.value
doc.save()
self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format(
self.log_base, task_str, is_priority))
return doc, business_type
def pdf_download(self, doc, business_type):
if doc is None:
return None, None, None
# TODO EDMS下载pdf
doc_data_path = os.path.join(self.data_dir, business_type, str(doc_id))
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc_id))
excel_path = os.path.join(doc_data_path, '{0}.xls'.format(doc_id))
self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_info={2}] [pdf_path={3}]'.format(
self.log_base, business_type, doc_info, pdf_path))
doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
if doc.application_id != consts.FIXED_APPLICATION_ID:
self.edms.download(pdf_path, doc.metadata_version_id)
excel_path = os.path.join(doc_data_path, '{0}.xls'.format(doc.id))
self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
self.log_base, business_type, doc.id, pdf_path))
return doc_data_path, excel_path, pdf_path
@staticmethod
def append_sheet(wb, sheets_list, img_name):
for i, sheet in enumerate(sheets_list):
ws = wb.add_sheet('{0}_{1}'.format(img_name, i))
ws = wb.create_sheet('{0}_{1}'.format(img_name, i))
cells = sheet.get('cells')
for cell in cells:
c1 = cell.get('start_column')
c2 = cell.get('end_column')
# c2 = cell.get('end_column')
r1 = cell.get('start_row')
r2 = cell.get('end_row')
# r2 = cell.get('end_row')
label = cell.get('words')
ws.write_merge(r1, r2, c1, c2, label=label)
ws.cell(row=r1+1, column=c1+1, value=label)
@staticmethod
def get_ocr_json(img_path):
......@@ -112,6 +122,46 @@ class Command(BaseCommand, LoggerMixin):
img_name = os.path.basename(img_path)
self.append_sheet(wb, sheets_list, img_name)
def proof(self, ws):
    """Add a cross-check ('核对结果') column to a bank-statement worksheet.

    Assumes row 1 holds the column headers — TODO confirm against the OCR
    output layout. If no amount or balance column is found, the sheet is
    left untouched.
    """
    # Locate the amount and balance columns by matching the row-1 header
    # text against the known title sets.
    amount_col = overage_col = None
    for i in ws[1]:
        if i.value in consts.AMOUNT_COL_TITLE_SET:
            amount_col = i.column
            amount_col_letter = get_column_letter(amount_col)
        elif i.value in consts.OVERAGE_COL_TITLE_SET:
            overage_col = i.column
            overage_col_letter = get_column_letter(overage_col)
    if amount_col is None or overage_col is None:
        return
    # Convert OCR'd text to numbers in the amount..balance column span.
    # NOTE(review): iter_cols(min_col=amount_col, max_col=overage_col)
    # presumes the amount column sits left of the balance column — verify.
    # Cells that fail locale.atof (non-numeric text, None) are skipped.
    for col_tuple in ws.iter_cols(min_row=2, min_col=amount_col, max_col=overage_col):
        for c in col_tuple:
            try:
                c.value = locale.atof(c.value)
                c.number_format = numbers.FORMAT_NUMBER_00
            except Exception:
                continue
    # Append the proof-result column: for each data row (row 3 onward)
    # an Excel formula checks balance == amount + previous balance and
    # shows PROOF_RES[0]/PROOF_RES[1] ('对'/'错'). Row 2 has no previous
    # balance, so it is deliberately left blank.
    proof_col_letter = get_column_letter(ws.max_column + 1)
    for c in ws[proof_col_letter]:
        if c.row == 1:
            c.value = consts.PROOF_COL_TITLE
        elif c.row == 2:
            continue
        else:
            c.value = '=IF({3}{0}=SUM({2}{0},{3}{1}), "{4}", "{5}")'.format(
                c.row, c.row - 1, amount_col_letter, overage_col_letter, *consts.PROOF_RES)
def wb_process(self, wb, excel_path):
    """Post-process the OCR workbook and save it to *excel_path*.

    The default 'Sheet' is renamed to the meta-information title; every
    other worksheet gets the amount/balance cross-check via proof().
    """
    # locale.atof (used inside proof) needs en_US digit-grouping rules
    locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8')
    for sheet in wb.worksheets:
        if sheet.title != 'Sheet':
            self.proof(sheet)
        else:
            sheet.title = consts.META_SHEET_TITLE
    # TODO no sheet (res always [])
    wb.save(excel_path)
@staticmethod
def getimage(pix):
if pix.colorspace.n != 4:
......@@ -124,7 +174,7 @@ class Command(BaseCommand, LoggerMixin):
s = item[1] # xref of its /SMask
is_rgb = True if item[5] == 'DeviceRGB' else False
# GRAY/RGB # TODO 颜色空间不同处理
# RGB
if is_rgb:
if s == 0:
return doc.extractImage(x)
......@@ -158,7 +208,7 @@ class Command(BaseCommand, LoggerMixin):
pix1 = pix2 = None # free temp pixmaps
pix = fitz.Pixmap(fitz.csRGB, pix) # CMYK to RGB
pix = fitz.Pixmap(fitz.csRGB, pix) # GRAY/CMYK to RGB
return self.getimage(pix)
@staticmethod
......@@ -200,10 +250,11 @@ class Command(BaseCommand, LoggerMixin):
while self.switch:
# 1. 从队列获取文件信息
doc_info, doc_class, doc_id, business_type = self.get_doc_info()
doc, business_type = self.get_doc_info()
try:
# 2. 从EDMS获取PDF文件
doc_data_path, excel_path, pdf_path = self.pdf_download(doc_id, doc_info, business_type)
doc_data_path, excel_path, pdf_path = self.pdf_download(doc, business_type)
# 队列为空时的处理
if pdf_path is None:
......@@ -212,7 +263,7 @@ class Command(BaseCommand, LoggerMixin):
continue
sleep_second = int(conf.SLEEP_SECOND)
try:
# 3.PDF文件提取图片
img_save_path = os.path.join(doc_data_path, 'img')
os.makedirs(img_save_path, exist_ok=True)
......@@ -233,8 +284,8 @@ class Command(BaseCommand, LoggerMixin):
save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
pm.writePNG(save_path)
img_path_list.append(save_path)
self.cronjob_log.info('{0} [page to img success] [doc_id={1}] [pdf_path={2}] '
'[page={3}]'.format(self.log_base, doc_id, pdf_path, page.number))
self.cronjob_log.info('{0} [page to img success] [pdf_path={1}] [page={2}]'.format(
self.log_base, pdf_path, page.number))
else: # 提取图片
for img_index, img_il in enumerate(img_il_list):
if len(img_il) == 1: # 当只有一张图片时, 简化处理
......@@ -246,8 +297,8 @@ class Command(BaseCommand, LoggerMixin):
f.write(img_data)
img_path_list.append(save_path)
self.cronjob_log.info(
'{0} [extract img success] [doc_id={1}] [pdf_path={2}] [page={3}] '
'[img_index={4}]'.format(self.log_base, doc_id, pdf_path, pno, img_index))
'{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format(
self.log_base, pdf_path, pno, img_index))
else: # 多张图片,竖向拼接
height_sum = 0
im_list = []
......@@ -276,28 +327,41 @@ class Command(BaseCommand, LoggerMixin):
res.save(save_path)
img_path_list.append(save_path)
self.cronjob_log.info(
'{0} [extract img success] [doc_id={1}] [pdf_path={2}] [page={3}] '
'[img_index={4}]'.format(self.log_base, doc_id, pdf_path, pno, img_index))
self.cronjob_log.info('{0} [pdf to img success] [doc_id={1}]'.format(self.log_base, doc_id))
'{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format(
self.log_base, pdf_path, pno, img_index))
self.cronjob_log.info('{0} [pdf to img success] [business_type={1}] [doc_id={2}]'.format(
self.log_base, business_type, doc.id))
write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc_id)))
write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
# 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件
wb = xlwt.Workbook()
wb = Workbook()
loop = asyncio.get_event_loop()
tasks = [self.img_ocr_excel(wb, img_path) for img_path in img_path_list]
loop.run_until_complete(asyncio.wait(tasks))
# loop.close()
wb.save(excel_path) # TODO no sheet (res always [])
# 整合excel文件
# 整合excel文件
# self.wb_process(wb, excel_path)
wb.save(excel_path)
except Exception as e:
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.error('{0} [process failed] [business_type={1}] [doc_id={2}] [err={3}]'.format(
self.log_base, business_type, doc.id, e))
else:
try:
# 5.上传至EDMS
self.edms.upload(excel_path, doc, business_type)
except Exception as e:
doc_class.objects.filter(id=doc_id).update(status=DocStatus.PROCESS_FAILED.value)
self.cronjob_log.error('{0} [process failed] [doc_id={1}] [err={2}]'.format(self.log_base, doc_id, e))
doc.status = DocStatus.UPLOAD_FAILED.value
doc.save()
self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [err={3}]'.format(
self.log_base, business_type, doc.id, e))
else:
doc_class.objects.filter(id=doc_id).update(status=DocStatus.COMPLETE.value)
self.cronjob_log.info('{0} [doc process complete] [doc_id={1}]'.format(self.log_base, doc_id))
doc.status = DocStatus.COMPLETE.value
doc.save()
self.cronjob_log.info('{0} [doc process complete] [business_type={1}] [doc_id={2}]'.format(
self.log_base, business_type, doc.id))
self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
......
......@@ -86,73 +86,143 @@ class Command(BaseCommand, LoggerMixin):
@staticmethod
def split_il(il):
img_il_list = []
small_img_il_list = []
big_img_il_list = []
start = 0
index = 0
length = len(il)
for i in range(length):
if il[i][2] >= 700 and il[i][3] >= 647:
if start < i:
small_img_il_list.append((il[start: i], index))
index += 1
else:
start += 1
big_img_il_list.append((il[i], index))
index += 1
continue
if i == start:
if i == length - 1:
img_il_list.append(il[start: length])
small_img_il_list.append((il[start: length], index))
continue
elif i == length - 1:
img_il_list.append(il[start: length])
if il[i][2] == il[i - 1][2]:
small_img_il_list.append((il[start: length], index))
else:
small_img_il_list.append((il[start: i], index))
small_img_il_list.append((il[i: length], index+1))
continue
if il[i][2] != il[i - 1][2]:
img_il_list.append(il[start: i])
small_img_il_list.append((il[start: i], index))
index += 1
start = i
elif il[i][3] != il[i - 1][3]:
img_il_list.append(il[start: i + 1])
elif il[i][3] != il[i - 1][3] and il[i][2] < 1200:
small_img_il_list.append((il[start: i + 1], index))
index += 1
start = i + 1
return img_il_list
return small_img_il_list, big_img_il_list
def handle(self, *args, **kwargs):
pdf_dir = '/Users/clay/Desktop/普通打印-部分无线/竖版-无表格-农业银行'
img_dir = '/Users/clay/Desktop/普通打印-部分无线_img/竖版-无表格-农业银行'
os.makedirs(img_dir, exist_ok=True)
pdf_dir = '/Users/clay/Desktop/问题PDF'
img_dir = '/Users/clay/Desktop/问题PDF'
for d in os.listdir(pdf_dir):
# if d in ['.DS_Store', 'CH-B008486764.pdf', 'CH-B008003736.pdf', 'CH-B008487476.pdf', 'CH-B006763780.pdf',
# 'CH-B009000564.pdf', 'CH-B009020488.pdf']:
if d in ['.DS_Store', '1竖版-无表格-农业银行样例.PNG']:
# if d in ['.DS_Store', 'CH-B008003736.pdf', 'CH-B006317088.pdf', 'CH-B008487476.pdf', 'CH-B006337608.pdf',
# 'CH-B006391612.pdf', 'CH-B006536124.pdf', 'CH-B006526652.pdf', 'CH-B009003592.pdf']:
# continue
# if d != 'CH-B006393152.PDF':
# if d != 'CH-B006526652.pdf':
if d != 'CH-B008487944.pdf':
continue
pdf_path = os.path.join(pdf_dir, d)
# pdf_path = '/Users/clay/Desktop/普通打印part2/工商银行(标准版)/CH-B006754676.pdf'
if os.path.isfile(pdf_path):
img_save_path = os.path.join(img_dir, d)
if os.path.exists(img_save_path):
continue
img_save_path = os.path.join(img_dir, d[:-4])
# if os.path.exists(img_save_path):
# continue
os.makedirs(img_save_path, exist_ok=True)
with fitz.Document(pdf_path) as pdf:
self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format(
self.log_base, pdf_path, pdf.metadata))
# xref_list = []
xref_set = set()
for pno in range(pdf.pageCount):
print('---------------------------------------')
il = pdf.getPageImageList(pno)
# (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
print(il)
# for img_index, img in enumerate(il):
# pix = self.recoverpix(pdf, img)
# ext, img_data = self.get_img_data(pix)
# save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
# pno, img_index, ext))
# with open(save_path, "wb") as f:
# f.write(img_data)
if len(il) == 0:
page = pdf.loadPage(pno)
pm = page.getPixmap(matrix=self.trans, alpha=False)
save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
pm.writePNG(save_path)
elif len(il) == 1:
width = il[0][2]
height = il[0][3]
colorspace = il[0][5]
adobe_filter = il[0][-1]
if colorspace == '' or adobe_filter in ['', '']:
continue
# 小图
if width < 500 and height < 500:
page = pdf.loadPage(pno)
pm = page.getPixmap(matrix=self.trans, alpha=False)
save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
pm.writePNG(save_path)
# 大图
elif il[0][0] not in xref_set:
pix = self.recoverpix(pdf, il[0])
ext, img_data = self.get_img_data(pix)
save_path = os.path.join(img_save_path, 'page_{0}_img_0.{1}'.format(pno, ext))
with open(save_path, "wb") as f:
f.write(img_data)
xref_set.add(il[0][0])
else:
il.sort(key=lambda x: x[0])
img_il_list = self.split_il(il)
del il
small_img_il_list, big_img_il_list = self.split_il(il)
print(small_img_il_list)
print(big_img_il_list)
print('+++++++++++++++++++++++++++++++++++')
print(img_il_list)
if len(img_il_list) > 3: # 单页无规律小图过多时,使用页面转图片
if len(small_img_il_list) > 2: # 单页无规律小图过多时,使用页面转图片
page = pdf.loadPage(pno)
pm = page.getPixmap(matrix=self.trans, alpha=False)
save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
pm.writePNG(save_path)
# img_path_list.append(save_path)
# self.cronjob_log.info('{0} [page to img success] [doc_id={1}] [pdf_path={2}] '
# '[page={3}]'.format(self.log_base, doc_id, pdf_path, page.number))
else: # 提取图片
for img_index, img_il in enumerate(img_il_list):
if len(img_il) == 1: # 当只有一张图片时, 简化处理
for img_il, img_index in big_img_il_list:
if img_il[0] in xref_set:
continue
pix = self.recoverpix(pdf, img_il)
ext, img_data = self.get_img_data(pix)
save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
pno, img_index, ext))
with open(save_path, "wb") as f:
f.write(img_data)
xref_set.add(img_il[0])
for img_il, img_index in small_img_il_list:
# 小图
if len(img_il) == 1 and img_il[0][2] < 500 and img_il[0][3] < 500:
page = pdf.loadPage(pno)
pm = page.getPixmap(matrix=self.trans, alpha=False)
save_path = os.path.join(img_save_path,
'page_{0}_img_0.png'.format(page.number))
pm.writePNG(save_path)
elif len(img_il) == 1 and img_il[0][0] not in xref_set: # 当只有一张图片时, 简化处理
pix = self.recoverpix(pdf, img_il[0])
ext, img_data = self.get_img_data(pix)
save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
pno, img_index, ext))
with open(save_path, "wb") as f:
f.write(img_data)
# img_path_list.append(save_path)
# self.cronjob_log.info(
# '{0} [extract img success] [doc_id={1}] [pdf_path={2}] [page={3}] '
# '[img_index={4}]'.format(self.log_base, doc_id, pdf_path, pno, img_index))
xref_set.add(img_il[0][0])
else: # 多张图片,竖向拼接
height_sum = 0
im_list = []
......@@ -179,6 +249,3 @@ class Command(BaseCommand, LoggerMixin):
res.paste(m, box=(0, h_now))
h_now += h
res.save(save_path)
# else:
# img_dir_path = os.path.join(img_dir, d)
# os.makedirs(img_dir_path, exist_ok=True)
......
......@@ -26,7 +26,21 @@ class DocHandler:
@staticmethod
def get_doc_class(business_type):
    """Map a raw business-type string to (document model class, queue prefix).

    Any value in consts.HIL_SET (case variants of 'HIL' plus the
    'CO00002' entity code) selects the HIL model; everything else
    falls back to AFC.

    Note: the diff left the superseded implementation in place ahead of
    the new single-line return, making the latter unreachable; this body
    keeps only the one (behaviorally identical) implementation.
    """
    if business_type in consts.HIL_SET:
        return HILDoc, consts.HIL_PREFIX
    return AFCDoc, consts.AFC_PREFIX
def fix_scheme(self, scheme):
    """Normalize *scheme* to a member of consts.DOC_SCHEME_LIST.

    An exact match passes through unchanged; otherwise the upper-cased
    form is tried; any still-unknown value falls back to the first
    known scheme.
    """
    for candidate in (scheme, scheme.upper()):
        if candidate in consts.DOC_SCHEME_LIST:
            return candidate
    return consts.DOC_SCHEME_LIST[0]
def fix_data_source(self, data_source):
    """Normalize *data_source* to a member of consts.DATA_SOURCE_LIST.

    An exact match passes through unchanged; otherwise the upper-cased
    form is tried; any still-unknown value falls back to the first
    known source.
    """
    for candidate in (data_source, data_source.upper()):
        if candidate in consts.DATA_SOURCE_LIST:
            return candidate
    return consts.DATA_SOURCE_LIST[0]
......
......@@ -7,20 +7,3 @@ class DocStatus(NamedEnum):
PROCESS_FAILED = (2, '识别失败')
UPLOAD_FAILED = (3, '同步失败')
COMPLETE = (4, '已完成')
class DocScheme(NamedEnum):
    """Document scheme constants (mixed-case upstream spelling).

    Members are (value, label) pairs — presumably NamedEnum maps these
    to value/name lookups; confirm against the NamedEnum base class.
    """
    ACCEPTANCE = (0, "Acceptance")
    SETTLEMENT = (1, 'Settlement')
    CONTRACT_MANAGEMENT = (2, 'Contract Management')
class BusinessType(NamedEnum):
    """Business lines mapped to their entity codes.

    NOTE(review): 'CO00001'/'CO00002' look like upstream entity ids —
    confirm; they also appear in HIL_SET / BUSINESS_TYPE_DICT constants.
    """
    AFC = (0, "CO00001")
    HIL = (1, 'CO00002')
class DataSource(NamedEnum):
    """Upload data-source channels (mixed-case upstream spelling)."""
    POS = (0, "POS")
    EAPP = (1, 'EAPP')
    ECONTRACT = (2, 'Econtract')
......
......@@ -60,7 +60,7 @@ doc_list_args = {
'status': fields.Int(required=False,
validate=validate.OneOf(DocStatus.get_value_lst())),
'application_id': fields.Str(required=False, validate=validate.Length(max=64)),
'data_source': fields.Str(required=False, validate=validate.Length(max=64)),
'data_source': fields.Str(required=False, validate=validate.OneOf(consts.DATA_SOURCE_LIST)),
'business_type': fields.Str(required=True, validate=validate.OneOf(consts.BUSINESS_TYPE_LIST)),
'upload_time_start': fields.Date(required=False),
'upload_time_end': fields.Date(required=False),
......@@ -100,6 +100,8 @@ class UploadDocView(GenericView, DocHandler):
document = args.get('document')
business_type = document.get('businessType')
application_id = application_data.get('applicationId')
document_scheme = document.get('documentScheme')
data_source = document.get('dataSource')
try:
# 1. 上传信息记录
record = UploadDocRecords.objects.create(
......@@ -110,9 +112,9 @@ class UploadDocView(GenericView, DocHandler):
guarantor_1=applicant_data.get('guarantor1Name'),
guarantor_2=applicant_data.get('guarantor2Name'),
document_name=document.get('documentName'),
document_scheme=document.get('documentScheme'),
document_scheme=document_scheme,
business_type=business_type,
data_source=document.get('dataSource'),
data_source=data_source,
upload_finish_time=document.get('uploadFinishTime'),
)
except IntegrityError as e:
......@@ -130,17 +132,17 @@ class UploadDocView(GenericView, DocHandler):
guarantor_1=applicant_data.get('guarantor1Name'),
guarantor_2=applicant_data.get('guarantor2Name'),
document_name=document.get('documentName'),
document_scheme=document.get('documentScheme'),
data_source=document.get('dataSource'),
document_scheme=self.fix_scheme(document_scheme),
data_source=self.fix_data_source(data_source),
upload_finish_time=document.get('uploadFinishTime'),
)
# 3. 选择队列进入
is_priority = PriorityApplication.objects.filter(application_id=application_id, on_off=True).exists()
value = ['{0}_{1}'.format(prefix, doc.id)]
redis_res = rh.enqueue(value, is_priority)
self.running_log.info('[doc upload success] [args={0}] [record_id={1}] [prefix={2}] [doc_id={3}] '
tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
enqueue_res = rh.enqueue(tasks, is_priority)
self.running_log.info('[doc upload success] [args={0}] [record_id={1}] [business_type={2}] [doc_id={3}] '
'[is_priority={4}] [enqueue_res={5}]'.format(args, record.id, prefix, doc.id,
is_priority, redis_res))
is_priority, enqueue_res))
return response.ok()
post.openapi_doc = '''
......@@ -174,6 +176,7 @@ class PriorityDocView(GenericView, DocHandler):
application_id = application_info.get('APPLICATION_ID')
submit_datetime = application_info.get('SUBMIT_DATETIME')
entity = application_info.get('ENTITY')
if submit_datetime.utcoffset() is not None:
submit_datetime = timezone.make_naive(submit_datetime, timezone.get_current_timezone())
GCAPRecords.objects.create(
entity=entity,
......@@ -190,14 +193,14 @@ class PriorityDocView(GenericView, DocHandler):
doc_class, prefix = self.get_doc_class(entity)
doc_ids = doc_class.objects.filter(application_id=application_id,
status=DocStatus.INIT.value).values_list('id', flat=True)
task_str_list = ['{0}_{1}'.format(prefix, doc_id) for doc_id in doc_ids]
if not task_str_list:
tasks_list = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc_id) for doc_id in doc_ids]
if not tasks_list:
self.running_log.info(
'[priority doc success] [args={0}] [task_str_list={1}]'.format(args, task_str_list))
'[priority doc success] [args={0}]'.format(args))
else:
enqueue_res = rh.enqueue(task_str_list, is_priority=True)
self.running_log.info('[priority doc success] [args={0}] [task_str_list={1}] [enqueue_res={2}]'.format(
args, task_str_list, enqueue_res))
enqueue_res = rh.enqueue(tasks_list, is_priority=True)
self.running_log.info('[priority doc success] [args={0}] [tasks_list={1}] [enqueue_res={2}]'.format(
args, tasks_list, enqueue_res))
return response.ok()
post.openapi_doc = '''
......@@ -268,7 +271,7 @@ class DocView(GenericView, DocHandler):
@use_args(upload_pdf_args, location='files')
def post(self, request, args):
# 1. 上传信息记录
const_str = '手工单'
const_str = consts.FIXED_APPLICATION_ID
metadata_version_id = str(int(time.time()))
upload_finish_time = timezone.now()
document_scheme = random.choice(consts.DOC_SCHEME_LIST)
......@@ -305,8 +308,8 @@ class DocView(GenericView, DocHandler):
)
# 3. 选择队列进入
is_priority = False
value = ['{0}_{1}'.format(prefix, doc.id)]
redis_res = rh.enqueue(value, is_priority)
tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
enqueue_res = rh.enqueue(tasks, is_priority)
pdf_file = args.get('pdf_file')
save_dir_path = os.path.join(conf.DATA_DIR, business_type, str(doc.id))
......@@ -314,7 +317,7 @@ class DocView(GenericView, DocHandler):
os.makedirs(save_dir_path, exist_ok=True)
file_write(pdf_file, save_file_path)
self.running_log.info('[mock doc upload success] [args={0}] [record_id={1}] [prefix={2}] [doc_id={3}] '
self.running_log.info('[mock doc upload success] [args={0}] [record_id={1}] [business_type={2}] [doc_id={3}] '
'[is_priority={4}] [enqueue_res={5}]'.format(args, record.id, prefix, doc.id,
is_priority, redis_res))
is_priority, enqueue_res))
return response.ok()
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!