Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
9e8023a0
authored
2021-01-19 10:53:02 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/main2' into feature/main
2 parents
548dc7a0
c001972a
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
89 additions
and
663 deletions
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/management/commands/idcard_statistics.py
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/management/commands/doc_ocr_process.py
deleted
100644 → 0
View file @
548dc7a
import
os
import
time
import
json
import
signal
import
asyncio
import
aiohttp
import
difflib
import
base64
import
requests
from
datetime
import
datetime
,
date
from
collections
import
Counter
from
apps.doc.ocr.wb
import
BSWorkbook
,
Workbook
from
django.core.management
import
BaseCommand
from
settings
import
conf
from
common.mixins
import
LoggerMixin
from
common.tools.file_tools
import
write_zip_file
from
common.tools.pdf_to_img
import
PDFHandler
from
apps.doc.models
import
DocStatus
,
HILDoc
,
AFCDoc
,
Keywords
from
apps.doc.named_enum
import
KeywordsType
from
apps.doc
import
consts
from
apps.doc.ocr.edms
import
EDMS
,
rh
from
apps.doc.exceptions
import
EDMSException
class Command(BaseCommand, LoggerMixin):
    """Queue-driven OCR worker: pulls one document task at a time, downloads
    the PDF from EDMS, runs OCR on its page images, rebuilds an Excel
    workbook and uploads the result back to EDMS.
    """

    def __init__(self):
        super().__init__()
        self.log_base = '[doc ocr process]'
        # Processing switch: the main loop in handle() runs while this is
        # True; cleared by the SIGTERM handler for a graceful shutdown.
        self.switch = True
        # Root data directory for per-document working files.
        self.data_dir = conf.DATA_DIR
        # OCR service endpoints: url_1 = classify/bank-statement service,
        # url_2 = licence recognition service, url_3 = bank-card name check.
        self.ocr_url_1 = conf.OCR_URL_1
        self.ocr_url_2 = conf.OCR_URL_2
        self.ocr_url_3 = conf.BC_URL
        # EDMS web_service_api client (PDF download / result upload).
        self.edms = EDMS()
        # Graceful exit on signal 15 (SIGTERM).
        signal.signal(signal.SIGTERM, self.signal_handler)
    def signal_handler(self, sig, frame):
        """SIGTERM handler: clear the loop switch so handle() stops picking
        up new documents and exits after the current one (no hard kill)."""
        self.switch = False  # stop processing files
    def get_doc_info(self):
        """Dequeue one task and resolve it to a document model instance.

        Returns ``(doc, business_type)`` on success, or ``(None, None)`` when
        the queue is empty, the document does not exist, or it is not in the
        INIT state.  On success the document is marked PROCESSING and saved,
        so other workers will skip it.
        """
        task_str, is_priority = rh.dequeue()
        if task_str is None:
            self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
            return None, None

        # Task format: "<business_type><SPLIT_STR><doc_id>".
        business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
        doc_id = int(doc_id_str)
        # HIL and AFC documents live in different tables.
        doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
        # doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values(
        #     'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first()
        doc = doc_class.objects.filter(id=doc_id).first()
        if doc is None:
            self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
                self.log_base, task_str, is_priority))
            return None, None
        elif doc.status != DocStatus.INIT.value:
            # Already picked up (or finished/failed) elsewhere — skip it.
            self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
                                  '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status))
            return None, None
        # Claim the document before doing any heavy work.
        doc.status = DocStatus.PROCESSING.value
        doc.save()
        self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format(
            self.log_base, task_str, is_priority))
        return doc, business_type
    def pdf_download(self, doc, business_type):
        """Prepare the working directory and fetch the document's PDF.

        Creates ``<data_dir>/<business_type>/<doc.id>/`` and downloads the
        PDF from EDMS with up to ``consts.RETRY_TIMES`` attempts (documents
        whose application_id carries the fixed test prefix are not
        downloaded).  Raises :class:`EDMSException` when every attempt fails.

        Returns ``(doc_data_path, excel_path, src_excel_path, pdf_path)``.
        """
        doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
        os.makedirs(doc_data_path, exist_ok=True)
        pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
        if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
            # Retry loop: `break` on first success; the for-else runs only
            # when every attempt raised, re-raising as EDMSException.
            for times in range(consts.RETRY_TIMES):
                try:
                    self.edms.download(pdf_path, doc.metadata_version_id)
                except Exception as e:
                    self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] '
                                          '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
                    edms_exc = str(e)
                else:
                    break
            else:
                raise EDMSException(edms_exc)

        # Output workbook paths: the final rebuilt sheet and the raw dump.
        excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id))
        src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
        self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
            self.log_base, business_type, doc.id, pdf_path))
        return doc_data_path, excel_path, src_excel_path, pdf_path
    def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino):
        """Write one image's bank-statement OCR tables into the workbook and
        accumulate per-card metadata.

        Each sheet in ``ocr_data['data']`` becomes a worksheet named
        ``page_<pno>_img_<ino>_<i>``.  Sheets with a recognized card number
        are tallied into ``bs_summary`` (keyed by card); sheets without one
        go into ``unknown_summary`` (keyed by classify, then role).  A
        ``(pno, ino, result)`` tuple is appended to ``res_list``.
        All four summary/result arguments are mutated in place.
        """
        sheets = ocr_data.get('data', [])
        if not sheets:
            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
            return
        confidence = ocr_data.get('confidence', 1)
        img_name = 'page_{0}_img_{1}'.format(pno, ino)
        cells_exists = False
        for i, sheet in enumerate(sheets):
            cells = sheet.get('cells')
            if not cells:
                # Sheet recognized but empty — skip, it contributes nothing.
                continue
            cells_exists = True
            sheet_name = '{0}_{1}'.format(img_name, i)
            ws = wb.create_sheet(sheet_name)
            for cell in cells:
                c1 = cell.get('start_column')
                r1 = cell.get('start_row')
                words = cell.get('words')
                # OCR coordinates are 0-based; openpyxl rows/columns are 1-based.
                ws.cell(row=r1 + 1, column=c1 + 1, value=words)

            # summary layout (fixed positions):
            # [0] account holder, [1] card number, [2] page number,
            # [3] receipt verification code, [4] print time,
            # [5] start date, [6] end date
            summary = sheet.get('summary')
            card = summary[1]
            if card is None:
                # No card number: group under (classify, role) instead.
                classify_dict = unknown_summary.setdefault(classify, {})
                role = consts.UNKNOWN_ROLE if summary[0] is None else summary[0]
                role_dict = classify_dict.setdefault(role, {})
                role_dict['classify'] = classify
                role_dict['role'] = role
                role_dict.setdefault('sheet', []).append(sheet_name)
                role_dict.setdefault('confidence', []).append(confidence)
                code_list = role_dict.setdefault('code', [])
                pt_list = role_dict.setdefault('print_time', [])
                sd_list = role_dict.setdefault('start_date', [])
                ed_list = role_dict.setdefault('end_date', [])
                if summary[3] is not None:
                    code_list.append((summary[2], summary[3]))
                if summary[4] is not None:
                    pt_list.append(summary[4])
                if summary[5] is not None:
                    sd_list.append(summary[5])
                if summary[6] is not None:
                    ed_list.append(summary[6])
            else:
                # Card number present: accumulate votes under that card.
                card_dict = bs_summary.setdefault(card, {})
                card_dict['count'] = card_dict.get('count', 0) + 1
                card_dict.setdefault('classify', []).append(classify)
                card_dict.setdefault('confidence', []).append(confidence)
                card_dict.setdefault('sheet', []).append(sheet_name)
                role_list = card_dict.setdefault('role', [])
                role_set = card_dict.setdefault('role_set', set())
                code_list = card_dict.setdefault('code', [])
                pt_list = card_dict.setdefault('print_time', [])
                sd_list = card_dict.setdefault('start_date', [])
                ed_list = card_dict.setdefault('end_date', [])
                if summary[0] is not None:
                    role_list.append(summary[0])
                    role_set.add(summary[0])
                if summary[3] is not None:
                    code_list.append((summary[2], summary[3]))
                if summary[4] is not None:
                    pt_list.append(summary[4])
                if summary[5] is not None:
                    sd_list.append(summary[5])
                if summary[6] is not None:
                    ed_list.append(summary[6])

        if cells_exists:
            res_list.append((pno, ino, consts.RES_SUCCESS))
        else:
            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
    def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino):
        """Collect results for licence-type-1 documents.

        Category note (from the OCR payload): '0' = ID card, '1' = residence
        permit.  Appends a result tuple to ``res_list`` and extends
        ``license_summary[classify]`` with the recognized records; both
        arguments are mutated in place.
        """
        license_data = ocr_data.get('data', [])
        if not license_data:
            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
            return
        res_list.append((pno, ino, consts.RES_SUCCESS))
        license_summary.setdefault(classify, []).extend(license_data)
    def license2_process(self, ocr_res_2, license_summary, pid, classify, res_list, pno, ino):
        """Collect results for licence-type-2 documents (second OCR service).

        On a success ``ErrorCode``: bank cards (``pid == consts.BC_PID``) are
        stored as the raw response dict; other licences (e.g. business
        licences) are flattened from ``ResultList``/``FieldList`` into
        ``{chn_key: value}`` dicts.  Otherwise a RES_FAILED tuple is
        appended.  ``license_summary`` and ``res_list`` are mutated in place.
        """
        if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
            res_list.append((pno, ino, consts.RES_SUCCESS))
            if pid == consts.BC_PID:
                # Bank card: keep the whole response as-is.
                # res_dict = {}
                # for en_key, chn_key in consts.BC_FIELD:
                #     res_dict[chn_key] = ocr_res_2.get(en_key, '')
                license_summary.setdefault(classify, []).append(ocr_res_2)
            else:
                # Business licence and similar: flatten field lists.
                for result_dict in ocr_res_2.get('ResultList', []):
                    res_dict = {}
                    for field_dict in result_dict.get('FieldList', []):
                        res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
                    license_summary.setdefault(classify, []).append(res_dict)
        else:
            res_list.append((pno, ino, consts.RES_FAILED))
    @staticmethod
    async def fetch_ocr_1_result(url, json_data):
        """POST *json_data* as a JSON body to the first OCR service and
        return the decoded JSON response; None on any non-200 status.
        TLS verification is disabled (internal service)."""
        async with aiohttp.ClientSession(
                connector=aiohttp.TCPConnector(ssl=False)) as session:
            async with session.post(url, json=json_data) as response:
                if response.status == 200:
                    return await response.json()
    @staticmethod
    async def fetch_ocr_2_result(url, json_data):
        """POST *json_data* as form data to the second OCR service and
        return the raw response text (the caller json.loads() it);
        None on any non-200 status.  TLS verification is disabled."""
        async with aiohttp.ClientSession(
                connector=aiohttp.TCPConnector(ssl=False)) as session:
            async with session.post(url, data=json_data) as response:
                if response.status == 200:
                    return await response.text()
    @staticmethod
    async def fetch_bc_name_result(url, json_data):
        """POST *json_data* as a JSON body to the bank-card name-check
        service and return the decoded JSON response; None on any non-200
        status.  TLS verification is disabled."""
        async with aiohttp.ClientSession(
                connector=aiohttp.TCPConnector(ssl=False)) as session:
            async with session.post(url, json=json_data) as response:
                if response.status == 200:
                    return await response.json()
    async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, res_list):
        """Run the full OCR pipeline for one page image.

        Sends the base64-encoded image to the classify service (ocr_url_1)
        and dispatches on the returned ``classify``: other → skip, licence
        set 1 → :meth:`license1_process`, licence set 2 → second OCR service
        then :meth:`license2_process` (with an extra name-presence check for
        bank cards), anything else → :meth:`bs_process`.  All summary/result
        arguments are mutated in place; failures are recorded in
        ``res_list`` rather than raised.
        """
        pno, ino = self.parse_img_path(img_path)
        with open(img_path, 'rb') as f:
            base64_data = base64.b64encode(f.read())
            # decoded base64 payload sent to the OCR services
            file_data = base64_data.decode()
        json_data_1 = {
            "file": file_data
        }

        ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1)
        if ocr_res_1 is None:
            res_list.append((pno, ino, consts.RES_FAILED))
            self.cronjob_log.info('{0} [ocr_1 failed] [img={1}]'.format(self.log_base, img_path))
            # raise Exception('ocr 1 error, img_path={0}'.format(img_path))
        else:
            self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}]'.format(
                self.log_base, img_path, ocr_res_1))
            if ocr_res_1.get('code') == 1:
                ocr_data = ocr_res_1.get('data', {})
                classify = ocr_data.get('classify')
                if classify is None:
                    res_list.append((pno, ino, consts.RES_FAILED))
                    self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
                        self.log_base, img_path, ocr_res_1))
                    return
                elif classify in consts.OTHER_CLASSIFY_SET:
                    # "other" category — nothing to extract
                    res_list.append((pno, ino, consts.RES_SUCCESS_OTHER))
                    return
                elif classify in consts.LICENSE_CLASSIFY_SET_1:
                    # licence type 1
                    self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino)
                elif classify in consts.LICENSE_CLASSIFY_SET_2:
                    # licence type 2 — needs a second OCR pass
                    pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
                    json_data_2 = {
                        "pid": str(pid),
                        # "key": conf.OCR_KEY,
                        # "secret": conf.OCR_SECRET,
                        "filedata": file_data
                    }
                    ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
                    if ocr_res_2 is None:
                        res_list.append((pno, ino, consts.RES_FAILED))
                        self.cronjob_log.info('{0} [ocr_2 failed] [img={1}]'.format(self.log_base, img_path))
                        # raise Exception('ocr 2 error, img_path={0}'.format(img_path))
                    else:
                        # Second service replies with text; decode JSON here.
                        ocr_res_2 = json.loads(ocr_res_2)
                        self.cronjob_log.info('{0} [ocr_2 success] [img={1}] [res={2}]'.format(
                            self.log_base, img_path, ocr_res_2))
                        if classify == consts.BC_CLASSIFY:
                            # Bank card: third service checks whether the
                            # holder name is printed on the card ('有'/'无').
                            name = '有'
                            json_data_1['card_res'] = ocr_res_2
                            card_name_res = await self.fetch_bc_name_result(self.ocr_url_3, json_data_1)
                            if isinstance(card_name_res, dict) and \
                                    card_name_res.get('data', {}).get('is_exists_name') == 0:
                                name = '无'
                            ocr_res_2['Name'] = name
                        self.license2_process(ocr_res_2, license_summary, pid, classify, res_list, pno, ino)
                else:
                    # bank statement sheets
                    self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino)
            else:
                res_list.append((pno, ino, consts.RES_FAILED))
                self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
                    self.log_base, img_path, ocr_res_1))

    # NOTE: a ~100-line commented-out synchronous (requests-based) legacy
    # version of this method was removed here; see VCS history if needed.
@staticmethod
def
parse_img_path
(
img_path
):
img_name
,
_
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
img_path
))
part_list
=
img_name
.
split
(
'_'
)
# page_7_img_11_0
return
int
(
part_list
[
1
])
+
1
,
int
(
part_list
[
3
])
+
1
@staticmethod
def
get_most
(
value_list
):
if
value_list
:
most_common
=
Counter
(
value_list
)
.
most_common
(
1
)
return
most_common
[
0
][
0
]
if
most_common
else
None
@staticmethod
def
date_format
(
date_str
,
format_str
):
try
:
date_res
=
datetime
.
strptime
(
date_str
,
format_str
)
.
date
()
except
Exception
as
e
:
return
else
:
return
date_res
def
get_validate_date
(
self
,
date_list
):
for
date_str
in
date_list
:
for
format_str
in
consts
.
DATE_FORMAT
:
date_res
=
self
.
date_format
(
date_str
,
format_str
)
if
isinstance
(
date_res
,
date
):
return
date_res
    def merge_card(self, bs_summary):
        """Merge near-duplicate card numbers (OCR noise) in *bs_summary*.

        Cards are visited in descending ``count`` order; any remaining card
        whose quick_ratio similarity to the current main card exceeds
        ``consts.CARD_RATIO`` is folded into it (all vote lists extended),
        then removed.  ``classify``/``role`` are reduced to their most
        frequent value.  *bs_summary* is consumed destructively; the merged
        mapping is returned.
        """
        merged_bs_summary = {}
        # Highest-count card wins the role of "main" card for its cluster.
        sorted_card = sorted(bs_summary.keys(), key=lambda x: bs_summary[x]['count'], reverse=True)
        for main_card in sorted_card:
            # Already absorbed into an earlier main card — skip.
            if bs_summary.get(main_card) is None:
                continue
            merged_bs_summary[main_card] = bs_summary.pop(main_card)
            del merged_bs_summary[main_card]['count']
            merge_cards = []
            for card in bs_summary.keys():
                # quick_ratio is an upper-bound similarity — cheap and good
                # enough to cluster OCR variants of the same card number.
                if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO:
                    merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify'])
                    merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence'])
                    merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet'])
                    merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role'])
                    merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set'])
                    merged_bs_summary[main_card]['code'].extend(bs_summary[card]['code'])
                    merged_bs_summary[main_card]['print_time'].extend(bs_summary[card]['print_time'])
                    merged_bs_summary[main_card]['start_date'].extend(bs_summary[card]['start_date'])
                    merged_bs_summary[main_card]['end_date'].extend(bs_summary[card]['end_date'])
                    merge_cards.append(card)
            # Deferred deletion: cannot delete while iterating bs_summary.
            for card in merge_cards:
                del bs_summary[card]
            # Collapse vote lists to the single most frequent value.
            merged_bs_summary[main_card]['classify'] = self.get_most(merged_bs_summary[main_card]['classify'])
            merged_bs_summary[main_card]['role'] = self.get_most(merged_bs_summary[main_card]['role'])
        del bs_summary
        return merged_bs_summary
def
prune_bs_summary
(
self
,
bs_summary
):
for
summary
in
bs_summary
.
values
():
del
summary
[
'count'
]
summary
[
'classify'
]
=
self
.
get_most
(
summary
[
'classify'
])
summary
[
'role'
]
=
self
.
get_most
(
summary
[
'role'
])
return
bs_summary
    def rebuild_bs_summary(self, bs_summary, unknown_summary):
        """Fold *unknown_summary* (card-less sheets) into *bs_summary* and
        normalize every entry for the Excel rebuild.

        Expected shapes:

        # bs_summary = {
        #     '<card>': {
        #         'count': 100,
        #         'classify': [], 'confidence': [], 'role': [],
        #         'code': [('page', 'code')],
        #         'print_time': [], 'start_date': [], 'end_date': [],
        #         'sheet': ['sheet_name']
        #     }
        # }
        #
        # unknown_summary = {
        #     <classify>: {
        #         '<role>': {
        #             'classify': 0, 'confidence': [], 'role': '<role>',
        #             'code': [('page', 'code')],
        #             'print_time': [], 'start_date': [], 'end_date': [],
        #             'sheet': ['sheet_name']
        #         }
        #     }
        # }

        Both inputs are consumed destructively.  Returns the merged mapping
        with dates validated and 'confidence' reduced to its maximum.
        """
        # Case 1: no card number was recognized anywhere.
        if len(bs_summary) == 0:
            del bs_summary
            merged_bs_summary = {}
            card_num = 1
            for role_dict in unknown_summary.values():
                if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict:
                    # Exactly one named role plus the unknown bucket: merge
                    # the named role's votes into the unknown entry.
                    summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {})
                    for summary in role_dict.values():
                        summary_dict['confidence'].extend(summary['confidence'])
                        summary_dict['role'] = summary['role']
                        summary_dict['code'].extend(summary['code'])
                        summary_dict['print_time'].extend(summary['print_time'])
                        summary_dict['start_date'].extend(summary['start_date'])
                        summary_dict['end_date'].extend(summary['end_date'])
                        summary_dict['sheet'].extend(summary['sheet'])
                    card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
                    # NOTE(review): card_num is never incremented on this
                    # branch — two classify groups taking this path would
                    # share (overwrite) the same synthetic card key. Confirm
                    # whether that is intended.
                    merged_bs_summary[card] = summary_dict
                else:
                    for summary in role_dict.values():
                        card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
                        card_num += 1
                        merged_bs_summary[card] = summary
        else:
            # Case 2: exactly one card.
            one_card = False
            if len(bs_summary) == 1:
                merged_bs_summary = self.prune_bs_summary(bs_summary)
                one_card = True
            # Case 3: several cards — merge near-duplicates first.
            else:
                merged_bs_summary = self.merge_card(bs_summary)

            # Attach card-less sheets to a card when the role matches (any
            # role is acceptable in the single-card case).
            for card_summary in merged_bs_summary.values():
                merge_role = []
                classify_summary = unknown_summary.get(card_summary['classify'], {})
                for role, summary in classify_summary.items():
                    if one_card or role in card_summary['role_set']:
                        merge_role.append(role)
                        card_summary['confidence'].extend(summary['confidence'])
                        card_summary['sheet'].extend(summary['sheet'])
                        card_summary['code'].extend(summary['code'])
                        card_summary['print_time'].extend(summary['print_time'])
                        card_summary['start_date'].extend(summary['start_date'])
                        card_summary['end_date'].extend(summary['end_date'])
                # Deferred deletion: cannot delete while iterating items().
                for role in merge_role:
                    del classify_summary[role]

            # Whatever could not be attached becomes its own synthetic card.
            card_num = 1
            for role_dict in unknown_summary.values():
                for summary in role_dict.values():
                    card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
                    card_num += 1
                    merged_bs_summary[card] = summary

        del unknown_summary

        # Final normalization: drop helper sets, validate dates, collapse
        # confidence votes to the maximum.
        for summary in merged_bs_summary.values():
            if summary.get('role_set') is not None:
                del summary['role_set']
            summary['print_time'] = self.get_validate_date(summary['print_time'])
            summary['start_date'] = self.get_validate_date(summary['start_date'])
            summary['end_date'] = self.get_validate_date(summary['end_date'])
            summary['confidence'] = max(summary['confidence'])
        return merged_bs_summary
    # TODO refine per-document states: distinct failure states, return to
    #      queue, and different handling on retry
    # TODO exception e-mail notification:
    #      recognition failure: generic exception (broken PDF, build errors)
    #      EDMS failure: download error -> requeue -> mail;
    #                    upload error -> re-upload queue -> mail
    #      algorithm failure: stage-1 / stage-2 error -> failed -> mail
    # TODO retry OCR API calls
    def handle(self, *args, **kwargs):
        """Main worker loop: pull tasks until SIGTERM clears ``self.switch``.

        Per document: download PDF, split into images, OCR every image
        concurrently, rebuild the Excel workbook, upload to EDMS and archive
        the images; document status is updated to PROCESS_FAILED /
        UPLOAD_FAILED / COMPLETE accordingly.
        """
        sleep_second = int(conf.SLEEP_SECOND)
        max_sleep_second = int(conf.MAX_SLEEP_SECOND)

        while self.switch:
            # 1. Fetch the next document task from the queue.
            doc, business_type = self.get_doc_info()
            # Queue empty: back off linearly up to the configured maximum.
            if doc is None:
                time.sleep(sleep_second)
                sleep_second = min(max_sleep_second, sleep_second + 5)
                continue
            # Work found: reset the back-off.
            sleep_second = int(conf.SLEEP_SECOND)

            try:
                start_time = time.time()
                # 2. Download the PDF from EDMS.
                doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type)
                # 3. Extract page images from the PDF.
                img_save_path = os.path.join(doc_data_path, 'img')
                self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format(
                    self.log_base, business_type, doc.id))
                pdf_handler = PDFHandler(pdf_path, img_save_path)
                pdf_handler.extract_image()
                self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format(
                    self.log_base, business_type, doc.id))
                # 4. Run OCR and build the Excel workbook.
                bs_summary = {}
                license_summary = {}
                unknown_summary = {}
                res_list = []
                # Keyword lists drive sheet classification in BSWorkbook.
                interest_keyword = Keywords.objects.filter(
                    type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
                salary_keyword = Keywords.objects.filter(
                    type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True)
                loan_keyword = Keywords.objects.filter(
                    type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value],
                    on_off=True).values_list('keyword', flat=True)
                wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)
                # wb = Workbook()
                # 4.1 Gather OCR results for all images concurrently; the
                # coroutines mutate the shared summary dicts in place.
                loop = asyncio.get_event_loop()
                tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary,
                                             license_summary, res_list)
                         for img_path in pdf_handler.img_path_list]
                loop.run_until_complete(asyncio.wait(tasks))
                # loop.close()
                # for img_path in pdf_handler.img_path_list:
                #     self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)

                self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
                                      '[license_summary={5}]'.format(self.log_base, business_type, doc.id,
                                                                     bs_summary, unknown_summary, license_summary))
                merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
                self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] '
                                      '[unknown_summary={4}] [res_list={5}]'.format(
                                          self.log_base, business_type, doc.id, merged_bs_summary,
                                          unknown_summary, res_list))
                del unknown_summary
                # 4.2 Rebuild the Excel file: keep the raw dump, then the
                # restructured final workbook.
                wb.save(src_excel_path)
                wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme)
                wb.save(excel_path)
            except EDMSException as e:
                doc.status = DocStatus.PROCESS_FAILED.value
                doc.save()
                self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] '
                                       '[err={3}]'.format(self.log_base, business_type, doc.id, e))
            except Exception as e:
                doc.status = DocStatus.PROCESS_FAILED.value
                doc.save()
                self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] '
                                       '[err={3}]'.format(self.log_base, business_type, doc.id, e))
            else:
                try:
                    # 5. Upload the result to EDMS, with retries (for-else
                    # raises only when every attempt failed).
                    for times in range(consts.RETRY_TIMES):
                        try:
                            self.edms.upload(excel_path, doc, business_type)
                        except Exception as e:
                            self.cronjob_log.warn(
                                '{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] '
                                '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
                            edms_exc = str(e)
                        else:
                            break
                    else:
                        raise EDMSException(edms_exc)
                except Exception as e:
                    doc.status = DocStatus.UPLOAD_FAILED.value
                    doc.save()
                    end_time = time.time()
                    speed_time = int(end_time - start_time)
                    self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] '
                                           '[speed_time={3}] [err={4}]'.format(self.log_base, business_type,
                                                                               doc.id, speed_time, e))
                    # Archive the extracted images alongside the document.
                    write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
                else:
                    doc.status = DocStatus.COMPLETE.value
                    doc.save()
                    end_time = time.time()
                    speed_time = int(end_time - start_time)
                    self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] '
                                          '[speed_time={3}]'.format(self.log_base, business_type,
                                                                    doc.id, speed_time))
                    write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))

        self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
src/apps/doc/management/commands/idcard_statistics.py
0 → 100644
View file @
9e8023a
import
re
import
os
import
ast
import
datetime
from
openpyxl
import
Workbook
from
django.core.management
import
BaseCommand
from
settings
import
conf
from
common.mixins
import
LoggerMixin
from
apps.doc.models
import
HILDoc
,
AFCDoc
from
apps.doc
import
consts
class Command(BaseCommand, LoggerMixin):
    """Offline statistics command: re-reads one day's idcard log and dumps
    the recognized ID-card records per business line into an Excel file.
    """

    def __init__(self):
        super().__init__()
        # One worksheet per business line.
        self.sheet_names = ('AFC', 'HIL')
        # Header row: application id, ID-card number, ethnicity, timestamp.
        self.header = ('申请号', '身份证号', '民族', '时间戳')
    def add_arguments(self, parser):
        """Register the ``--date`` option; defaults to yesterday.

        Accepts either a ``datetime.date`` (the default) or a
        ``YYYY-MM-DD`` string supplied on the command line.
        """
        parser.add_argument(
            '--date',
            default=datetime.date.today() - datetime.timedelta(days=1),
            dest='date',
            help='将要计算的日期,格式: 2018-01-01')
def
handle
(
self
,
*
args
,
**
kwargs
):
date
=
kwargs
.
get
(
'date'
)
if
isinstance
(
date
,
str
):
if
not
re
.
match
(
r'\d{4}-\d{2}-\d{2}'
,
date
):
print
(
'date format error'
)
return
date_str
=
date
else
:
date_str
=
date
.
strftime
(
'
%
Y-
%
m-
%
d'
)
excel_dir
=
os
.
path
.
join
(
conf
.
DATA_DIR
,
'AFC'
,
'Logs'
)
if
not
os
.
path
.
exists
(
excel_dir
):
print
(
'excel dir not exists'
)
return
excel_path
=
os
.
path
.
join
(
excel_dir
,
'idcard_{0}.xlsx'
.
format
(
date_str
))
log_path
=
os
.
path
.
join
(
conf
.
LOG_DIR
,
'idcard.log.{0}'
.
format
(
date_str
))
if
not
os
.
path
.
exists
(
log_path
):
print
(
'log_path not exists'
)
return
wb
=
Workbook
()
for
name
in
self
.
sheet_names
:
ws
=
wb
.
create_sheet
(
name
)
ws
.
append
(
self
.
header
)
wb
.
remove
(
wb
.
get_sheet_by_name
(
'Sheet'
))
with
open
(
log_path
,
'r'
,
encoding
=
'utf-8'
)
as
fp
:
for
line
in
fp
:
search_obj
=
re
.
search
(
r'[(.*)] [task=(.*)] [idcard=(.*)]'
,
line
)
task_str
=
search_obj
.
group
(
1
)
license_summary
=
ast
.
literal_eval
(
search_obj
.
group
(
2
))
business_type
,
doc_id_str
=
task_str
.
split
(
consts
.
SPLIT_STR
)
doc_id
=
int
(
doc_id_str
)
doc_class
=
HILDoc
if
business_type
==
consts
.
HIL_PREFIX
else
AFCDoc
application_id
=
doc_class
.
objects
.
filter
(
id
=
doc_id
)
.
values_list
(
'application_id'
,
flat
=
True
)
for
classify
,
(
_
,
name
,
field_order
,
side_diff
,
_
,
_
)
in
consts
.
LICENSE_ORDER
:
license_list
=
license_summary
.
get
(
classify
)
if
not
license_list
:
continue
ws
=
wb
.
get_sheet_by_name
(
name
)
for
license_dict
in
license_list
:
if
classify
==
consts
.
IC_CLASSIFY
and
license_dict
.
get
(
'类别'
)
==
'1'
:
# 居住证处理
license_summary
.
setdefault
(
consts
.
RP_CLASSIFY
,
[])
.
append
(
license_dict
)
continue
if
side_diff
:
key
,
field_order_yes
,
field_order_no
=
consts
.
FIELD_ORDER_MAP
.
get
(
classify
)
field_order
=
field_order_yes
if
key
in
license_dict
else
field_order_no
all_value
=
[]
for
search_field
,
write_field
in
field_order
:
if
write_field
is
None
:
continue
field_value
=
license_dict
.
get
(
search_field
,
''
)
if
isinstance
(
field_value
,
list
):
all_value
.
append
(
'
\n
'
.
join
(
field_value
))
else
:
all_value
.
append
(
field_value
)
ws
.
append
((
application_id
[
0
],
*
all_value
))
wb
.
save
(
excel_path
)
src/apps/doc/management/commands/ocr_process.py
View file @
9e8023a
...
...
@@ -641,14 +641,14 @@ class Command(BaseCommand, LoggerMixin):
'[license_summary={4}]'
.
format
(
self
.
log_base
,
task_str
,
bs_summary
,
unknown_summary
,
license_summary
))
self
.
license_log
.
info
(
'[
license_summary={0}]'
.
format
(
license_summary
))
self
.
license_log
.
info
(
'[
task={0}] [license_summary={1}]'
.
format
(
task_str
,
license_summary
))
idcard_list
=
license_summary
.
get
(
consts
.
IC_CLASSIFY
)
if
idcard_list
:
self
.
idcard_log
.
info
(
'[
idcard={0}]'
.
format
(
idcard_list
))
self
.
idcard_log
.
info
(
'[
task={0}] [idcard={1}]'
.
format
(
task_str
,
idcard_list
))
merged_bs_summary
=
self
.
rebuild_bs_summary
(
bs_summary
,
unknown_summary
)
self
.
bs_log
.
info
(
'[
bs_summary={0}]'
.
format
(
merged_bs_summary
))
self
.
bs_log
.
info
(
'[
task={0}] [bs_summary={1}]'
.
format
(
task_str
,
merged_bs_summary
))
self
.
cronjob_log
.
info
(
'{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] '
'[res_list={4}]'
.
format
(
self
.
log_base
,
task_str
,
merged_bs_summary
,
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment