周伟奇 / bmw-ocr
Commit 97994674 authored 2020-07-22 17:14:09 +0800 by 周伟奇
ocr excel upload eDMS
1 parent 7aa0284c
Showing 8 changed files with 299 additions and 142 deletions
requirements/base.txt
src/apps/doc/consts.py
src/apps/doc/edms.py
src/apps/doc/management/commands/doc_process.py
src/apps/doc/management/commands/pdf_to_img.py
src/apps/doc/mixins.py
src/apps/doc/named_enum.py
src/apps/doc/views.py
requirements/base.txt
@@ -12,13 +12,16 @@ Django==2.1
 django-oauth-toolkit==1.3.2
 djangorestframework==3.9.0
 djangorestframework-jwt==1.11.0
+et-xmlfile==1.0.1
 idna==2.9
 idna-ssl==1.1.0
 isodate==0.6.0
+jdcal==1.4.1
 lxml==4.5.1
 marshmallow==3.6.1
 multidict==4.7.6
 oauthlib==3.1.0
+openpyxl==3.0.4
 pdfminer3k==1.3.4
 Pillow==7.1.2
 ply==3.11
...
src/apps/doc/consts.py
 PAGE_DEFAULT = 1
 PAGE_SIZE_DEFAULT = 10
-DOC_SCHEME_LIST = ['Acceptance', 'Settlement', 'Contract Management']
-DATA_SOURCE_LIST = ['POS', 'EAPP', 'Econtract']
-BUSINESS_TYPE_LIST = ['HIL', 'AFC']
-HIL_SET = {'HIL', 'HIl', 'HiL', 'Hil', 'hIL', 'hIl', 'hiL', 'hil', 'CO00002'}
 FIXED_APPLICATION_ID = '手工单'
+DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT']
+DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT']
+HIL_PREFIX = 'HIL'
+AFC_PREFIX = 'AFC'
+SPLIT_STR = '_'
+BUSINESS_TYPE_LIST = [HIL_PREFIX, AFC_PREFIX]
+HIL_SET = {'HIL', 'HIl', 'HiL', 'Hil', 'hIL', 'hIl', 'hiL', 'hil', 'CO00002'}
 SESSION_PREFIX = 'FHLSID'
 CUSTOM_CLIENT = 'CustomClient'
...
@@ -15,12 +19,22 @@ FIXED_FILE_SIZE = 0
 DOWNLOAD_ACTION_TYPE = 'Downloaded'
 DOC_SCHEMA_ID_FILL = {
-    'Acceptance': (1, 'DFE-AutoFilingScript'),
-    'Settlement': (20, 'DFE-AutoFilingScript'),
-    'Contract Management': (86, 'Schema-Based')
+    'ACCEPTANCE': (1, 'DFE-AutoFilingScript'),
+    'SETTLEMENT': (20, 'DFE-AutoFilingScript'),
+    'CONTRACT MANAGEMENT': (86, 'Schema-Based')
 }
+BUSINESS_TYPE_DICT = {
+    HIL_PREFIX: 'CO00002',
+    AFC_PREFIX: 'CO00001'
+}
 DOC_SCHEMA_TYPE = 'ElectronicRecord'
 APPLICATION_ID_META_FIELD_id = 1
 DEALER_CODE_META_FIELD_id = 13
 BUSINESS_TYPE_META_FIELD_id = 93
 DEALER_CODE = 'ocr_situ_group'
+AMOUNT_COL_TITLE_SET = {"交易金额", "金额", "收入/支出金额", "发生额"}
+OVERAGE_COL_TITLE_SET = {"账户余额", "余额"}
+PROOF_COL_TITLE = '核对结果'
+PROOF_RES = ('对', '错')
+META_SHEET_TITLE = '关键信息提取和展示'
...
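The new HIL_PREFIX/AFC_PREFIX and SPLIT_STR constants centralize the queue task-string convention that views.py and doc_process.py now share. A minimal sketch of that round trip, with the constants inlined from above and an invented doc id:

# Sketch of the task-string convention shared by views.py and doc_process.py;
# the prefix and separator mirror consts.py, the doc id (42) is made up.
HIL_PREFIX = 'HIL'
SPLIT_STR = '_'

def build_task_str(prefix, doc_id):
    # views.py enqueues '{prefix}{SPLIT_STR}{doc_id}', e.g. 'HIL_42'
    return '{0}{1}{2}'.format(prefix, SPLIT_STR, doc_id)

def parse_task_str(task_str):
    # doc_process.py splits on consts.SPLIT_STR to recover both parts
    business_type, doc_id_str = task_str.split(SPLIT_STR)
    return business_type, int(doc_id_str)

assert parse_task_str(build_task_str(HIL_PREFIX, 42)) == ('HIL', 42)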
src/apps/doc/edms.py
+import os
 import requests
 from zeep import Client, xsd
 from settings import conf
...
@@ -65,9 +66,9 @@ class EDMS:
         params = {'token': token}
         self.download_handler(params, headers, save_path)

-    def create_upload_token(self, headers, file_size):
+    def create_upload_token(self, headers):
         with self.rc_client.settings(extra_http_headers=headers):
-            token = self.rc_client.service.CreateUploadToken(fileSize=file_size)
+            token = self.rc_client.service.CreateUploadToken(fileSize=consts.FIXED_FILE_SIZE)
         return token

     def upload_handler(self, file_path, params, headers):
...
@@ -80,11 +81,19 @@ class EDMS:
         else:
             raise Exception

-    def get_doc_info(self, token, doc_info):
-        doc_schema_id, auto_filing = consts.DOC_SCHEMA_ID_FILL.get(doc_info.get('document_scheme'))
-        application_id = doc_info.get('application_id')
-        doc_file_name = doc_info.get('doc_file_name')
-        business_type = doc_info.get('business_type')
+    @staticmethod
+    def get_doc_file_name(doc_name):
+        if doc_name.endswith('pdf'):
+            name, _ = os.path.splitext(doc_name)
+            return name
+        return doc_name
+
+    def get_doc_info(self, token, doc, business_type, file_path):
+        business_type = consts.BUSINESS_TYPE_DICT.get(business_type)
+        doc_schema_id, auto_filing = consts.DOC_SCHEMA_ID_FILL.get(doc.document_scheme)
+        application_id = doc.application_id
+        doc_file_name = self.get_doc_file_name(doc.document_name)
+        origin_file_name = os.path.basename(file_path)
         fields_with_value = [
             {'FieldId': consts.APPLICATION_ID_META_FIELD_id,
              'FieldValue': xsd.AnyObject(xsd.String(), application_id)},
...
@@ -99,20 +108,20 @@ class EDMS:
             'DocumentName': doc_file_name,
             'FieldsWithValues': fields_with_values,
             'UploadToken': token,
-            'OriginalFileName': doc_file_name,
+            'OriginalFileName': origin_file_name,
             'SendEmailToMembers': False,
             'AutoFilingScriptToUse': auto_filing,
             'DocumentSchemaType': consts.DOC_SCHEMA_TYPE,
         }
         return info

-    def add_doc_info(self, headers, token, doc_info):
-        info = self.get_doc_info(token, doc_info)
+    def add_doc_info(self, headers, token, doc, business_type, file_path):
+        info = self.get_doc_info(token, doc, business_type, file_path)
         with self.dm_client.settings(extra_http_headers=headers):
             metadata_version_id = self.dm_client.service.AddDocumentInfo(info=info)
         return metadata_version_id

-    def upload(self, file_path, file_size, doc_info):
+    def upload(self, file_path, doc, business_type):
         # file_path = '/Users/clay/Postman/files/OCRuploadTest4.txt'
         # file_size = 16
         # doc_info = {
...
@@ -122,12 +131,12 @@ class EDMS:
         #     'business_type': 'CO00001',
         # }
         headers = self.get_headers()
-        token = self.create_upload_token(headers, file_size)
+        token = self.create_upload_token(headers)
         headers.update({'Content-Type': 'application/octet-stream'})
         params = {'token': token}
         self.upload_handler(file_path, params, headers)
         headers.pop('Content-Type')
-        metadata_version_id = self.add_doc_info(headers, token, doc_info)
+        metadata_version_id = self.add_doc_info(headers, token, doc, business_type, file_path)
         return metadata_version_id
...
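The upload metadata now distinguishes the display name from the physical file: DocumentName strips a trailing pdf extension from the stored document name, while OriginalFileName comes from the file actually being uploaded. A standalone sketch of those two pieces (sample names and path are invented):

import os

def get_doc_file_name(doc_name):
    # mirrors EDMS.get_doc_file_name: drop a trailing pdf extension
    if doc_name.endswith('pdf'):
        name, _ = os.path.splitext(doc_name)
        return name
    return doc_name

# DocumentName comes from the doc record, OriginalFileName from the upload path
assert get_doc_file_name('statement.pdf') == 'statement'
assert get_doc_file_name('statement.xls') == 'statement.xls'
assert os.path.basename('/tmp/42/42.xls') == '42.xls'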
src/apps/doc/management/commands/doc_process.py
 import os
 import time
 import fitz
-import xlwt
 import signal
 import base64
 import asyncio
 import aiohttp
+import locale
 from PIL import Image
 from io import BytesIO
 from zeep import Client
+from openpyxl import Workbook
+from openpyxl.styles import numbers
+from openpyxl.utils import get_column_letter
 from django.core.management import BaseCommand
 from common.mixins import LoggerMixin
...
@@ -23,7 +25,7 @@ class Command(BaseCommand, LoggerMixin):
     def __init__(self):
         super().__init__()
-        self.log_base = '[doc process]'
+        self.log_base = '[doc ocr process]'
         # file-processing switch
         self.switch = True
         # data directory
...
@@ -50,46 +52,54 @@ class Command(BaseCommand, LoggerMixin):
         task_str, is_priority = rh.dequeue()
         if task_str is None:
             self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
-            return None, None, None, None
+            return None, None

-        business_type, doc_id_str = task_str.split('_')
+        business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
         doc_id = int(doc_id_str)
         doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
-        doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values(
-            'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first()
-        if doc_info is None:
-            self.cronjob_log.warn('{0} [get_doc_info] [doc completed] [task_str={1}] [is_priority={2}]'.format(
-                self.log_base, task_str, is_priority))
-            return None, None, None, None
-        doc_class.objects.filter(id=doc_id).update(status=DocStatus.PROCESSING.value)
-        self.cronjob_log.info('{0} [get_doc_info] [task_str={1}] [is_priority={2}] [doc_info={3}]'.format(
-            self.log_base, task_str, is_priority, doc_info))
-        return doc_info, doc_class, doc_id, business_type
+        # doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values(
+        #     'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first()
+        doc = doc_class.objects.filter(id=doc_id).first()
+        if doc is None:
+            self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
+                self.log_base, task_str, is_priority))
+            return None, None
+        elif doc.status != DocStatus.INIT.value:
+            self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
+                                  '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status))
+            return None, None
+        doc.status = DocStatus.PROCESSING.value
+        doc.save()
+        self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format(
+            self.log_base, task_str, is_priority))
+        return doc, business_type

-    def pdf_download(self, doc_id, doc_info, business_type):
-        if doc_info is None:
-            return None, None
+    def pdf_download(self, doc, business_type):
+        if doc is None:
+            return None, None, None
         # TODO download the pdf from EDMS
-        doc_data_path = os.path.join(self.data_dir, business_type, str(doc_id))
-        pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc_id))
-        excel_path = os.path.join(doc_data_path, '{0}.xls'.format(doc_id))
-        self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_info={2}] [pdf_path={3}]'.format(
-            self.log_base, business_type, doc_info, pdf_path))
+        doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
+        pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
+        if doc.application_id != consts.FIXED_APPLICATION_ID:
+            self.edms.download(pdf_path, doc.metadata_version_id)
+        excel_path = os.path.join(doc_data_path, '{0}.xls'.format(doc.id))
+        self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
+            self.log_base, business_type, doc.id, pdf_path))
         return doc_data_path, excel_path, pdf_path

     @staticmethod
     def append_sheet(wb, sheets_list, img_name):
         for i, sheet in enumerate(sheets_list):
-            ws = wb.add_sheet('{0}_{1}'.format(img_name, i))
+            ws = wb.create_sheet('{0}_{1}'.format(img_name, i))
             cells = sheet.get('cells')
             for cell in cells:
                 c1 = cell.get('start_column')
-                c2 = cell.get('end_column')
+                # c2 = cell.get('end_column')
                 r1 = cell.get('start_row')
-                r2 = cell.get('end_row')
+                # r2 = cell.get('end_row')
                 label = cell.get('words')
-                ws.write_merge(r1, r2, c1, c2, label=label)
+                ws.cell(row=r1 + 1, column=c1 + 1, value=label)

     @staticmethod
     def get_ocr_json(img_path):
...
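append_sheet moves from xlwt (wb.add_sheet / ws.write_merge, 0-based) to openpyxl (wb.create_sheet / ws.cell, 1-based), which is why the new code adds 1 to the OCR row and column indices and drops the merged write. A minimal sketch under those assumptions; the cell dicts imitate the OCR payload shape used above, with invented values:

from openpyxl import Workbook

# cells as the OCR service appears to return them: 0-based coordinates
ocr_cells = [
    {'start_row': 0, 'start_column': 0, 'words': '交易金额'},
    {'start_row': 1, 'start_column': 0, 'words': '100.00'},
]

wb = Workbook()
ws = wb.create_sheet('demo_img_0')  # openpyxl: create_sheet, not xlwt's add_sheet
for cell in ocr_cells:
    # openpyxl is 1-based, hence the +1 on the 0-based OCR indices
    ws.cell(row=cell['start_row'] + 1, column=cell['start_column'] + 1,
            value=cell['words'])
wb.save('demo.xlsx')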
@@ -112,6 +122,46 @@ class Command(BaseCommand, LoggerMixin):
         img_name = os.path.basename(img_path)
         self.append_sheet(wb, sheets_list, img_name)

+    def proof(self, ws):
+        # locate the amount and balance columns
+        amount_col = overage_col = None
+        for i in ws[1]:
+            if i.value in consts.AMOUNT_COL_TITLE_SET:
+                amount_col = i.column
+                amount_col_letter = get_column_letter(amount_col)
+            elif i.value in consts.OVERAGE_COL_TITLE_SET:
+                overage_col = i.column
+                overage_col_letter = get_column_letter(overage_col)
+        if amount_col is None or overage_col is None:
+            return
+        # convert text to numeric values
+        for col_tuple in ws.iter_cols(min_row=2, min_col=amount_col, max_col=overage_col):
+            for c in col_tuple:
+                try:
+                    c.value = locale.atof(c.value)
+                    c.number_format = numbers.FORMAT_NUMBER_00
+                except Exception:
+                    continue
+        # append the verification-result column
+        proof_col_letter = get_column_letter(ws.max_column + 1)
+        for c in ws[proof_col_letter]:
+            if c.row == 1:
+                c.value = consts.PROOF_COL_TITLE
+            elif c.row == 2:
+                continue
+            else:
+                c.value = '=IF({3}{0}=SUM({2}{0},{3}{1}), "{4}", "{5}")'.format(
+                    c.row, c.row - 1, amount_col_letter, overage_col_letter, *consts.PROOF_RES)
+
+    def wb_process(self, wb, excel_path):
+        locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8')
+        for ws in wb.worksheets:
+            if ws.title == 'Sheet':
+                ws.title = consts.META_SHEET_TITLE
+            else:
+                self.proof(ws)
+        wb.save(excel_path)
+
+    # TODO no sheet (res always [])
     @staticmethod
     def getimage(pix):
         if pix.colorspace.n != 4:
...
@@ -124,7 +174,7 @@ class Command(BaseCommand, LoggerMixin):
             s = item[1]  # xref of its /SMask
             is_rgb = True if item[5] == 'DeviceRGB' else False
-            # GRAY/RGB  # TODO handle differing colorspaces
+            # RGB
             if is_rgb:
                 if s == 0:
                     return doc.extractImage(x)
...
@@ -158,7 +208,7 @@ class Command(BaseCommand, LoggerMixin):
             pix1 = pix2 = None  # free temp pixmaps
-        pix = fitz.Pixmap(fitz.csRGB, pix)  # CMYK to RGB
+        pix = fitz.Pixmap(fitz.csRGB, pix)  # GRAY/CMYK to RGB
         return self.getimage(pix)

     @staticmethod
...
@@ -200,10 +250,11 @@ class Command(BaseCommand, LoggerMixin):
         while self.switch:
             # 1. fetch the doc info from the queue
-            doc_info, doc_class, doc_id, business_type = self.get_doc_info()
+            doc, business_type = self.get_doc_info()
             try:
                 # 2. fetch the PDF from EDMS
-                doc_data_path, excel_path, pdf_path = self.pdf_download(doc_id, doc_info, business_type)
+                doc_data_path, excel_path, pdf_path = self.pdf_download(doc, business_type)
                 # handling for an empty queue
                 if pdf_path is None:
...
@@ -212,7 +263,7 @@ class Command(BaseCommand, LoggerMixin):
                     continue
                 sleep_second = int(conf.SLEEP_SECOND)
             try:
                 # 3. extract images from the PDF
                 img_save_path = os.path.join(doc_data_path, 'img')
                 os.makedirs(img_save_path, exist_ok=True)
...
@@ -233,8 +284,8 @@ class Command(BaseCommand, LoggerMixin):
                         save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
                         pm.writePNG(save_path)
                         img_path_list.append(save_path)
-                        self.cronjob_log.info('{0} [page to img success] [doc_id={1}] [pdf_path={2}] '
-                                              '[page={3}]'.format(self.log_base, doc_id, pdf_path, page.number))
+                        self.cronjob_log.info('{0} [page to img success] [pdf_path={1}] [page={2}]'.format(
+                            self.log_base, pdf_path, page.number))
                     else:
                         # extract the images
                         for img_index, img_il in enumerate(img_il_list):
                             if len(img_il) == 1:
                                 # simplified handling when there is only one image
...
@@ -246,8 +297,8 @@ class Command(BaseCommand, LoggerMixin):
                                     f.write(img_data)
                                 img_path_list.append(save_path)
                                 self.cronjob_log.info(
-                                    '{0} [extract img success] [doc_id={1}] [pdf_path={2}] [page={3}] '
-                                    '[img_index={4}]'.format(self.log_base, doc_id, pdf_path, pno, img_index))
+                                    '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format(
+                                        self.log_base, pdf_path, pno, img_index))
                             else:
                                 # multiple images: stitch vertically
                                 height_sum = 0
                                 im_list = []
...
@@ -276,28 +327,41 @@ class Command(BaseCommand, LoggerMixin):
                                 res.save(save_path)
                                 img_path_list.append(save_path)
                                 self.cronjob_log.info(
-                                    '{0} [extract img success] [doc_id={1}] [pdf_path={2}] [page={3}] '
-                                    '[img_index={4}]'.format(self.log_base, doc_id, pdf_path, pno, img_index))
-                self.cronjob_log.info('{0} [pdf to img success] [doc_id={1}]'.format(self.log_base, doc_id))
-                write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc_id)))
+                                    '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format(
+                                        self.log_base, pdf_path, pno, img_index))
+                self.cronjob_log.info('{0} [pdf to img success] [business_type={1}] [doc_id={2}]'.format(
+                    self.log_base, business_type, doc.id))
+                write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
                 # 4. run the algorithm over the images: detect bank statements, OCR them into an excel file
-                wb = xlwt.Workbook()
+                wb = Workbook()
                 loop = asyncio.get_event_loop()
                 tasks = [self.img_ocr_excel(wb, img_path) for img_path in img_path_list]
                 loop.run_until_complete(asyncio.wait(tasks))
                 # loop.close()
-                wb.save(excel_path)
-                # consolidate the excel file
+                # TODO no sheet (res always [])
+                # consolidate the excel file
+                # self.wb_process(wb, excel_path)
+                wb.save(excel_path)
             except Exception as e:
-                doc_class.objects.filter(id=doc_id).update(status=DocStatus.PROCESS_FAILED.value)
-                self.cronjob_log.error('{0} [process failed] [doc_id={1}] [err={2}]'.format(
-                    self.log_base, doc_id, e))
+                doc.status = DocStatus.PROCESS_FAILED.value
+                doc.save()
+                self.cronjob_log.error('{0} [process failed] [business_type={1}] [doc_id={2}] [err={3}]'.format(
+                    self.log_base, business_type, doc.id, e))
             else:
-                doc_class.objects.filter(id=doc_id).update(status=DocStatus.COMPLETE.value)
-                self.cronjob_log.info('{0} [doc process complete] [doc_id={1}]'.format(self.log_base, doc_id))
+                try:
+                    # 5. upload to EDMS
+                    self.edms.upload(excel_path, doc, business_type)
+                except Exception as e:
+                    doc.status = DocStatus.UPLOAD_FAILED.value
+                    doc.save()
+                    self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [err={3}]'.format(
+                        self.log_base, business_type, doc.id, e))
+                else:
+                    doc.status = DocStatus.COMPLETE.value
+                    doc.save()
+                    self.cronjob_log.info('{0} [doc process complete] [business_type={1}] [doc_id={2}]'.format(
                        self.log_base, business_type, doc.id))

         self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
...
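The new proof() hook (added here but still commented out in handle) validates each statement row with an Excel formula: the balance in row n should equal that row's amount plus the previous row's balance. A sketch of the exact string the format call above produces, with invented column letters and row number:

# Reproduces the proof() formula string; column letters and row are examples.
PROOF_RES = ('对', '错')
amount_col_letter, overage_col_letter = 'B', 'C'  # e.g. 金额 in B, 余额 in C
row = 3

formula = '=IF({3}{0}=SUM({2}{0},{3}{1}), "{4}", "{5}")'.format(
    row, row - 1, amount_col_letter, overage_col_letter, *PROOF_RES)
# balance C3 must equal amount B3 plus the previous balance C2
assert formula == '=IF(C3=SUM(B3,C2), "对", "错")'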
src/apps/doc/management/commands/pdf_to_img.py
...
@@ -86,73 +86,143 @@ class Command(BaseCommand, LoggerMixin):
     @staticmethod
     def split_il(il):
-        img_il_list = []
+        small_img_il_list = []
+        big_img_il_list = []
         start = 0
+        index = 0
         length = len(il)
         for i in range(length):
+            if il[i][2] >= 700 and il[i][3] >= 647:
+                if start < i:
+                    small_img_il_list.append((il[start: i], index))
+                    index += 1
+                else:
+                    start += 1
+                big_img_il_list.append((il[i], index))
+                index += 1
+                continue
             if i == start:
                 if i == length - 1:
-                    img_il_list.append(il[start: length])
+                    small_img_il_list.append((il[start: length], index))
                 continue
             elif i == length - 1:
-                img_il_list.append(il[start: length])
+                if il[i][2] == il[i - 1][2]:
+                    small_img_il_list.append((il[start: length], index))
+                else:
+                    small_img_il_list.append((il[start: i], index))
+                    small_img_il_list.append((il[i: length], index + 1))
+                continue
             if il[i][2] != il[i - 1][2]:
-                img_il_list.append(il[start: i])
+                small_img_il_list.append((il[start: i], index))
+                index += 1
                 start = i
-            elif il[i][3] != il[i - 1][3]:
-                img_il_list.append(il[start: i + 1])
+            elif il[i][3] != il[i - 1][3] and il[i][2] < 1200:
+                small_img_il_list.append((il[start: i + 1], index))
+                index += 1
                 start = i + 1
-        return img_il_list
+        return small_img_il_list, big_img_il_list

     def handle(self, *args, **kwargs):
-        pdf_dir = '/Users/clay/Desktop/普通打印-部分无线/竖版-无表格-农业银行'
-        img_dir = '/Users/clay/Desktop/普通打印-部分无线_img/竖版-无表格-农业银行'
-        os.makedirs(img_dir, exist_ok=True)
+        pdf_dir = '/Users/clay/Desktop/问题PDF'
+        img_dir = '/Users/clay/Desktop/问题PDF'
         for d in os.listdir(pdf_dir):
-            # if d in ['.DS_Store', 'CH-B008486764.pdf', 'CH-B008003736.pdf', 'CH-B008487476.pdf', 'CH-B006763780.pdf',
-            #          'CH-B009000564.pdf', 'CH-B009020488.pdf']:
-            if d in ['.DS_Store', '1竖版-无表格-农业银行样例.PNG']:
+            # if d in ['.DS_Store', 'CH-B008003736.pdf', 'CH-B006317088.pdf', 'CH-B008487476.pdf', 'CH-B006337608.pdf',
+            #          'CH-B006391612.pdf', 'CH-B006536124.pdf', 'CH-B006526652.pdf', 'CH-B009003592.pdf']:
+            #     continue
+            # if d != 'CH-B006393152.PDF':
+            # if d != 'CH-B006526652.pdf':
+            if d != 'CH-B008487944.pdf':
                 continue
             pdf_path = os.path.join(pdf_dir, d)
             # pdf_path = '/Users/clay/Desktop/普通打印part2/工商银行(标准版)/CH-B006754676.pdf'
             if os.path.isfile(pdf_path):
-                img_save_path = os.path.join(img_dir, d)
-                if os.path.exists(img_save_path):
-                    continue
+                img_save_path = os.path.join(img_dir, d[:-4])
+                # if os.path.exists(img_save_path):
+                #     continue
                 os.makedirs(img_save_path, exist_ok=True)
                 with fitz.Document(pdf_path) as pdf:
                     self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format(
                         self.log_base, pdf_path, pdf.metadata))
                     # xref_list = []
+                    xref_set = set()
                     for pno in range(pdf.pageCount):
                         print('---------------------------------------')
                         il = pdf.getPageImageList(pno)  # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
                         print(il)
                         # for img_index, img in enumerate(il):
                         #     pix = self.recoverpix(pdf, img)
                         #     ext, img_data = self.get_img_data(pix)
                         #     save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
                         #         pno, img_index, ext))
                         #     with open(save_path, "wb") as f:
                         #         f.write(img_data)
                         if len(il) == 0:
                             page = pdf.loadPage(pno)
                             pm = page.getPixmap(matrix=self.trans, alpha=False)
                             save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
                             pm.writePNG(save_path)
+                        elif len(il) == 1:
+                            width = il[0][2]
+                            height = il[0][3]
+                            colorspace = il[0][5]
+                            adobe_filter = il[0][-1]
+                            if colorspace == '' or adobe_filter in ['', '']:
+                                continue
+                            # small image: render the page instead
+                            if width < 500 and height < 500:
+                                page = pdf.loadPage(pno)
+                                pm = page.getPixmap(matrix=self.trans, alpha=False)
+                                save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
+                                pm.writePNG(save_path)
+                            # large image: extract it directly
+                            elif il[0][0] not in xref_set:
+                                pix = self.recoverpix(pdf, il[0])
+                                ext, img_data = self.get_img_data(pix)
+                                save_path = os.path.join(img_save_path, 'page_{0}_img_0.{1}'.format(pno, ext))
+                                with open(save_path, "wb") as f:
+                                    f.write(img_data)
+                                xref_set.add(il[0][0])
                         else:
                             il.sort(key=lambda x: x[0])
-                            img_il_list = self.split_il(il)
+                            small_img_il_list, big_img_il_list = self.split_il(il)
                             del il
+                            print(small_img_il_list)
+                            print(big_img_il_list)
                             print('+++++++++++++++++++++++++++++++++++')
-                            print(img_il_list)
-                            if len(img_il_list) > 3:
+                            if len(small_img_il_list) > 2:
                                 # too many irregular small images on one page: render the page instead
                                 page = pdf.loadPage(pno)
                                 pm = page.getPixmap(matrix=self.trans, alpha=False)
                                 save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
                                 pm.writePNG(save_path)
                                 # img_path_list.append(save_path)
                                 # self.cronjob_log.info('{0} [page to img success] [doc_id={1}] [pdf_path={2}] '
                                 #                       '[page={3}]'.format(self.log_base, doc_id, pdf_path, page.number))
                             else:
-                                # extract the images
-                                for img_index, img_il in enumerate(img_il_list):
-                                    if len(img_il) == 1:
-                                        # simplified handling when there is only one image
+                                for img_il, img_index in big_img_il_list:
+                                    if img_il[0] in xref_set:
+                                        continue
+                                    pix = self.recoverpix(pdf, img_il)
+                                    ext, img_data = self.get_img_data(pix)
+                                    save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
+                                        pno, img_index, ext))
+                                    with open(save_path, "wb") as f:
+                                        f.write(img_data)
+                                    xref_set.add(img_il[0])
+                                for img_il, img_index in small_img_il_list:
+                                    # small image: render the page instead
+                                    if len(img_il) == 1 and img_il[0][2] < 500 and img_il[0][3] < 500:
+                                        page = pdf.loadPage(pno)
+                                        pm = page.getPixmap(matrix=self.trans, alpha=False)
+                                        save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
+                                        pm.writePNG(save_path)
+                                    elif len(img_il) == 1 and img_il[0][0] not in xref_set:
+                                        # only one image: extract it directly
                                         pix = self.recoverpix(pdf, img_il[0])
                                         ext, img_data = self.get_img_data(pix)
                                         save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
                                             pno, img_index, ext))
                                         with open(save_path, "wb") as f:
                                             f.write(img_data)
                                         # img_path_list.append(save_path)
                                         # self.cronjob_log.info(
                                         #     '{0} [extract img success] [doc_id={1}] [pdf_path={2}] [page={3}] '
                                         #     '[img_index={4}]'.format(self.log_base, doc_id, pdf_path, pno, img_index))
+                                        xref_set.add(img_il[0][0])
                                     else:
                                         # multiple images: stitch vertically
                                         height_sum = 0
                                         im_list = []
...
@@ -179,6 +249,3 @@ class Command(BaseCommand, LoggerMixin):
                             res.paste(m, box=(0, h_now))
                             h_now += h
                         res.save(save_path)
-        # else:
-        #     img_dir_path = os.path.join(img_dir, d)
-        #     os.makedirs(img_dir_path, exist_ok=True)
...
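split_il now partitions a page's sorted image list into "big" entries (width >= 700 and height >= 647, extracted one by one) and runs of "small" entries (stitched or rendered later), each tagged with an output index. A simplified, standalone sketch of that partitioning rule on fake (xref, smask, width, height) tuples; it keeps only the big/small threshold, not the width/height run-splitting heuristics:

# Fake PyMuPDF image-list entries: (xref, smask, width, height).
il = [(10, 0, 640, 320), (11, 0, 640, 320), (12, 0, 1240, 1750)]

def split_il_simplified(il):
    # big images are extracted individually; consecutive small ones form runs
    small_runs, big_imgs, run = [], [], []
    for item in il:
        if item[2] >= 700 and item[3] >= 647:
            if run:
                small_runs.append(run)
                run = []
            big_imgs.append(item)
        else:
            run.append(item)
    if run:
        small_runs.append(run)
    return small_runs, big_imgs

small, big = split_il_simplified(il)
assert big == [(12, 0, 1240, 1750)]
assert small == [[(10, 0, 640, 320), (11, 0, 640, 320)]]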
src/apps/doc/mixins.py
...
@@ -26,7 +26,21 @@ class DocHandler:
     @staticmethod
     def get_doc_class(business_type):
-        is_hil = business_type in consts.HIL_SET
-        doc_class, prefix = (HILDoc, consts.HIL_PREFIX) if is_hil else (AFCDoc, consts.AFC_PREFIX)
-        return doc_class, prefix
+        return (HILDoc, consts.HIL_PREFIX) if business_type in consts.HIL_SET else (AFCDoc, consts.AFC_PREFIX)
+
+    def fix_scheme(self, scheme):
+        if scheme in consts.DOC_SCHEME_LIST:
+            return scheme
+        elif scheme.upper() in consts.DOC_SCHEME_LIST:
+            return scheme.upper()
+        else:
+            return consts.DOC_SCHEME_LIST[0]
+
+    def fix_data_source(self, data_source):
+        if data_source in consts.DATA_SOURCE_LIST:
+            return data_source
+        elif data_source.upper() in consts.DATA_SOURCE_LIST:
+            return data_source.upper()
+        else:
+            return consts.DATA_SOURCE_LIST[0]
...
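fix_scheme and fix_data_source make the upload callback tolerant of casing: a value is kept if it is already whitelisted, upper-cased if that form is whitelisted, and otherwise replaced by the list's first entry. A standalone sketch with the list inlined from consts.py:

DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT']

def fix_scheme(scheme):
    # keep exact matches, accept any casing, fall back to the first entry
    if scheme in DOC_SCHEME_LIST:
        return scheme
    elif scheme.upper() in DOC_SCHEME_LIST:
        return scheme.upper()
    else:
        return DOC_SCHEME_LIST[0]

assert fix_scheme('Settlement') == 'SETTLEMENT'
assert fix_scheme('unknown') == 'ACCEPTANCE'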
src/apps/doc/named_enum.py
...
@@ -7,20 +7,3 @@ class DocStatus(NamedEnum):
     PROCESS_FAILED = (2, '识别失败')
     UPLOAD_FAILED = (3, '同步失败')
     COMPLETE = (4, '已完成')
-
-
-class DocScheme(NamedEnum):
-    ACCEPTANCE = (0, "Acceptance")
-    SETTLEMENT = (1, 'Settlement')
-    CONTRACT_MANAGEMENT = (2, 'Contract Management')
-
-
-class BusinessType(NamedEnum):
-    AFC = (0, "CO00001")
-    HIL = (1, 'CO00002')
-
-
-class DataSource(NamedEnum):
-    POS = (0, "POS")
-    EAPP = (1, 'EAPP')
-    ECONTRACT = (2, 'Econtract')
...
src/apps/doc/views.py
...
@@ -60,7 +60,7 @@ doc_list_args = {
     'status': fields.Int(required=False, validate=validate.OneOf(DocStatus.get_value_lst())),
     'application_id': fields.Str(required=False, validate=validate.Length(max=64)),
-    'data_source': fields.Str(required=False, validate=validate.Length(max=64)),
+    'data_source': fields.Str(required=False, validate=validate.OneOf(consts.DATA_SOURCE_LIST)),
     'business_type': fields.Str(required=True, validate=validate.OneOf(consts.BUSINESS_TYPE_LIST)),
     'upload_time_start': fields.Date(required=False),
     'upload_time_end': fields.Date(required=False),
...
@@ -100,6 +100,8 @@ class UploadDocView(GenericView, DocHandler):
         document = args.get('document')
         business_type = document.get('businessType')
         application_id = application_data.get('applicationId')
+        document_scheme = document.get('documentScheme')
+        data_source = document.get('dataSource')
         try:
             # 1. record the upload info
             record = UploadDocRecords.objects.create(
...
@@ -110,9 +112,9 @@ class UploadDocView(GenericView, DocHandler):
                 guarantor_1=applicant_data.get('guarantor1Name'),
                 guarantor_2=applicant_data.get('guarantor2Name'),
                 document_name=document.get('documentName'),
-                document_scheme=document.get('documentScheme'),
+                document_scheme=document_scheme,
                 business_type=business_type,
-                data_source=document.get('dataSource'),
+                data_source=data_source,
                 upload_finish_time=document.get('uploadFinishTime'),
             )
         except IntegrityError as e:
...
@@ -130,17 +132,17 @@ class UploadDocView(GenericView, DocHandler):
                 guarantor_1=applicant_data.get('guarantor1Name'),
                 guarantor_2=applicant_data.get('guarantor2Name'),
                 document_name=document.get('documentName'),
-                document_scheme=document.get('documentScheme'),
-                data_source=document.get('dataSource'),
+                document_scheme=self.fix_scheme(document_scheme),
+                data_source=self.fix_data_source(data_source),
                 upload_finish_time=document.get('uploadFinishTime'),
             )
             # 3. pick the queue to enter
             is_priority = PriorityApplication.objects.filter(application_id=application_id, on_off=True).exists()
-            value = ['{0}_{1}'.format(prefix, doc.id)]
-            redis_res = rh.enqueue(value, is_priority)
-            self.running_log.info('[doc upload success] [args={0}] [record_id={1}] [prefix={2}] [doc_id={3}] '
+            tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
+            enqueue_res = rh.enqueue(tasks, is_priority)
+            self.running_log.info('[doc upload success] [args={0}] [record_id={1}] [business_type={2}] [doc_id={3}] '
                                   '[is_priority={4}] [enqueue_res={5}]'.format(
-                                      args, record.id, prefix, doc.id, is_priority, redis_res))
+                                      args, record.id, prefix, doc.id, is_priority, enqueue_res))
         return response.ok()

     post.openapi_doc = '''
...
@@ -174,6 +176,7 @@ class PriorityDocView(GenericView, DocHandler):
         application_id = application_info.get('APPLICATION_ID')
         submit_datetime = application_info.get('SUBMIT_DATETIME')
         entity = application_info.get('ENTITY')
+        if submit_datetime.utcoffset() is not None:
+            submit_datetime = timezone.make_naive(submit_datetime, timezone.get_current_timezone())
         GCAPRecords.objects.create(
             entity=entity,
...
@@ -190,14 +193,14 @@ class PriorityDocView(GenericView, DocHandler):
         doc_class, prefix = self.get_doc_class(entity)
         doc_ids = doc_class.objects.filter(application_id=application_id,
                                            status=DocStatus.INIT.value).values_list('id', flat=True)
-        task_str_list = ['{0}_{1}'.format(prefix, doc_id) for doc_id in doc_ids]
-        if not task_str_list:
+        tasks_list = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc_id) for doc_id in doc_ids]
+        if not tasks_list:
             self.running_log.info(
-                '[priority doc success] [args={0}] [task_str_list={1}]'.format(args, task_str_list))
+                '[priority doc success] [args={0}]'.format(args))
         else:
-            enqueue_res = rh.enqueue(task_str_list, is_priority=True)
-            self.running_log.info('[priority doc success] [args={0}] [task_str_list={1}] [enqueue_res={2}]'.format(
-                args, task_str_list, enqueue_res))
+            enqueue_res = rh.enqueue(tasks_list, is_priority=True)
+            self.running_log.info('[priority doc success] [args={0}] [tasks_list={1}] [enqueue_res={2}]'.format(
+                args, tasks_list, enqueue_res))
         return response.ok()

     post.openapi_doc = '''
...
@@ -268,7 +271,7 @@ class DocView(GenericView, DocHandler):
     @use_args(upload_pdf_args, location='files')
     def post(self, request, args):
         # 1. record the upload info
-        const_str = '手工单'
+        const_str = consts.FIXED_APPLICATION_ID
         metadata_version_id = str(int(time.time()))
         upload_finish_time = timezone.now()
         document_scheme = random.choice(consts.DOC_SCHEME_LIST)
...
@@ -305,8 +308,8 @@ class DocView(GenericView, DocHandler):
         )
         # 3. pick the queue to enter
         is_priority = False
-        value = ['{0}_{1}'.format(prefix, doc.id)]
-        redis_res = rh.enqueue(value, is_priority)
+        tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
+        enqueue_res = rh.enqueue(tasks, is_priority)
         pdf_file = args.get('pdf_file')
         save_dir_path = os.path.join(conf.DATA_DIR, business_type, str(doc.id))
...
@@ -314,7 +317,7 @@ class DocView(GenericView, DocHandler):
         os.makedirs(save_dir_path, exist_ok=True)
         file_write(pdf_file, save_file_path)
-        self.running_log.info('[mock doc upload success] [args={0}] [record_id={1}] [prefix={2}] [doc_id={3}] '
+        self.running_log.info('[mock doc upload success] [args={0}] [record_id={1}] [business_type={2}] [doc_id={3}] '
                               '[is_priority={4}] [enqueue_res={5}]'.format(
-                                  args, record.id, prefix, doc.id, is_priority, redis_res))
+                                  args, record.id, prefix, doc.id, is_priority, enqueue_res))
         return response.ok()
...
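PriorityDocView now normalizes SUBMIT_DATETIME before storing it: if the value is timezone-aware (utcoffset() is not None), Django's timezone.make_naive converts it to a naive datetime in the current timezone. A plain-datetime sketch of the same check and conversion, assuming a fixed UTC+8 zone purely for illustration:

from datetime import datetime, timedelta, timezone as tz

cst = tz(timedelta(hours=8))  # stand-in for the Django current timezone
submit_datetime = datetime(2020, 7, 22, 9, 14, tzinfo=tz.utc)

# same check views.py performs before calling timezone.make_naive(...)
if submit_datetime.utcoffset() is not None:
    submit_datetime = submit_datetime.astimezone(cst).replace(tzinfo=None)

assert submit_datetime == datetime(2020, 7, 22, 17, 14)  # naive local wall time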