Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
bffd2595
authored
2020-11-02 19:35:47 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add res sheet
1 parent
01efbccb
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
56 additions
and
36 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/ocr/wb.py
src/apps/doc/consts.py
View file @
bffd259
...
...
@@ -72,8 +72,12 @@ TRANS_MAP = {
}
TRANS
=
str
.
maketrans
(
TRANS_MAP
)
ERROR_CHARS
=
{
'.'
,
'。'
,
':'
,
':'
,
'•'
,
'·'
,
','
,
','
}
SKIP_IMG_SHEET_NAME
=
'未处理图片'
SKIP_IMG_SHEET_HEADER
=
(
'页码'
,
'序号'
)
# Title and header row of the per-image result summary sheet.
RES_SHEET_NAME = '结果统计'
RES_SHEET_HEADER = ('页码', '序号', '结果')

# Outcome labels stored in the third column of the result sheet.
RES_SUCCESS = '识别成功'
RES_SUCCESS_OTHER = '识别成功(其他类)'
RES_SUCCESS_EMPTY = '识别成功(空数据)'
# Fixed typo: the label previously read '识别识别' instead of "recognition failed".
RES_FAILED = '识别失败'
CARD_RATIO
=
0.9
UNKNOWN_CARD
=
'未知卡号'
...
...
src/apps/doc/management/commands/doc_ocr_process.py
View file @
bffd259
...
...
@@ -93,18 +93,19 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
business_type
,
doc
.
id
,
pdf_path
))
return
doc_data_path
,
excel_path
,
src_excel_path
,
pdf_path
def
bs_process
(
self
,
wb
,
ocr_data
,
bs_summary
,
unknown_summary
,
img_path
,
classify
,
skip_img
):
def
bs_process
(
self
,
wb
,
ocr_data
,
bs_summary
,
unknown_summary
,
classify
,
res_list
,
pno
,
ino
):
sheets
=
ocr_data
.
get
(
'data'
,
[])
if
not
sheets
:
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
res_list
.
append
((
pno
,
ino
,
consts
.
RES_SUCCESS_EMPTY
))
return
confidence
=
ocr_data
.
get
(
'confidence'
,
1
)
img_name
,
_
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
img_path
))
img_name
=
'page_{0}_img_{1}'
.
format
(
pno
,
ino
)
cells_exists
=
False
for
i
,
sheet
in
enumerate
(
sheets
):
cells
=
sheet
.
get
(
'cells'
)
if
not
cells
:
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
continue
cells_exists
=
True
sheet_name
=
'{0}_{1}'
.
format
(
img_name
,
i
)
ws
=
wb
.
create_sheet
(
sheet_name
)
for
cell
in
cells
:
...
...
@@ -160,16 +161,23 @@ class Command(BaseCommand, LoggerMixin):
if
summary
[
6
]
is
not
None
:
ed_list
.
append
(
summary
[
6
])
def
license1_process
(
self
,
ocr_data
,
license_summary
,
classify
,
skip_img
,
img_path
):
if
cells_exists
:
res_list
.
append
((
pno
,
ino
,
consts
.
RES_SUCCESS
))
else
:
res_list
.
append
((
pno
,
ino
,
consts
.
RES_SUCCESS_EMPTY
))
def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino):
    """Collect type-1 license OCR data and record the outcome for one image.

    Reads ``ocr_data['data']`` (defaulting to an empty list). When it is
    empty, ``(pno, ino, consts.RES_SUCCESS_EMPTY)`` is appended to
    ``res_list`` and nothing else happens. Otherwise
    ``(pno, ino, consts.RES_SUCCESS)`` is appended and the data is added to
    ``license_summary[classify]``, creating that list on first use.

    The stale ``skip_img.append(self.parse_img_path(img_path))`` call was
    removed: neither ``skip_img`` nor ``img_path`` is a parameter any more,
    so the empty-data branch would have raised ``NameError``.
    """
    # classify: '0' ID card, '1' residence permit
    license_data = ocr_data.get('data', [])
    if not license_data:
        res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
        return
    res_list.append((pno, ino, consts.RES_SUCCESS))
    license_summary.setdefault(classify, []).extend(license_data)
def
license2_process
(
self
,
ocr_res_2
,
license_summary
,
pid
,
classify
,
skip_img
,
img_path
):
def
license2_process
(
self
,
ocr_res_2
,
license_summary
,
pid
,
classify
,
res_list
,
pno
,
ino
):
if
ocr_res_2
.
get
(
'ErrorCode'
)
in
consts
.
SUCCESS_CODE_SET
:
res_list
.
append
((
pno
,
ino
,
consts
.
RES_SUCCESS
))
if
pid
==
consts
.
BC_PID
:
# 银行卡
# res_dict = {}
...
...
@@ -184,7 +192,7 @@ class Command(BaseCommand, LoggerMixin):
res_dict
[
field_dict
.
get
(
'chn_key'
,
''
)]
=
field_dict
.
get
(
'value'
,
''
)
license_summary
.
setdefault
(
classify
,
[])
.
append
(
res_dict
)
else
:
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
res_list
.
append
((
pno
,
ino
,
consts
.
RES_FAILED
))
@staticmethod
async
def
fetch_ocr_1_result
(
url
,
json_data
):
...
...
@@ -207,7 +215,8 @@ class Command(BaseCommand, LoggerMixin):
if
response
.
status
==
200
:
return
await
response
.
json
()
async
def
img_2_ocr_2_wb
(
self
,
wb
,
img_path
,
bs_summary
,
unknown_summary
,
license_summary
,
skip_img
):
async
def
img_2_ocr_2_wb
(
self
,
wb
,
img_path
,
bs_summary
,
unknown_summary
,
license_summary
,
res_list
):
pno
,
ino
=
self
.
parse_img_path
(
img_path
)
with
open
(
img_path
,
'rb'
)
as
f
:
base64_data
=
base64
.
b64encode
(
f
.
read
())
# 获取解码后的base64值
...
...
@@ -217,23 +226,26 @@ class Command(BaseCommand, LoggerMixin):
}
ocr_res_1
=
await
self
.
fetch_ocr_1_result
(
self
.
ocr_url_1
,
json_data_1
)
if
ocr_res_1
is
None
:
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
raise
Exception
(
'ocr 1 error, img_path={0}'
.
format
(
img_path
))
res_list
.
append
((
pno
,
ino
,
consts
.
RES_FAILED
))
self
.
cronjob_log
.
info
(
'{0} [ocr_1 failed] [img={1}]'
.
format
(
self
.
log_base
,
img_path
))
# raise Exception('ocr 1 error, img_path={0}'.format(img_path))
else
:
self
.
cronjob_log
.
info
(
'{0} [ocr_1
result
] [img={1}] [res={2}]'
.
format
(
self
.
cronjob_log
.
info
(
'{0} [ocr_1
success
] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_path
,
ocr_res_1
))
if
ocr_res_1
.
get
(
'code'
)
==
1
:
ocr_data
=
ocr_res_1
.
get
(
'data'
,
{})
classify
=
ocr_data
.
get
(
'classify'
)
if
classify
is
None
:
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
res_list
.
append
((
pno
,
ino
,
consts
.
RES_FAILED
))
self
.
cronjob_log
.
info
(
'{0} [ocr_1 res error] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_path
,
ocr_res_1
))
return
elif
classify
in
consts
.
OTHER_CLASSIFY_SET
:
# 其他类
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
res_list
.
append
((
pno
,
ino
,
consts
.
RES_SUCCESS_OTHER
))
return
elif
classify
in
consts
.
LICENSE_CLASSIFY_SET_1
:
# 证件1
self
.
license1_process
(
ocr_data
,
license_summary
,
classify
,
skip_img
,
img_path
)
self
.
license1_process
(
ocr_data
,
license_summary
,
classify
,
res_list
,
pno
,
ino
)
elif
classify
in
consts
.
LICENSE_CLASSIFY_SET_2
:
# 证件2
pid
,
_
,
_
,
_
,
_
=
consts
.
LICENSE_CLASSIFY_MAPPING
.
get
(
classify
)
json_data_2
=
{
...
...
@@ -244,11 +256,13 @@ class Command(BaseCommand, LoggerMixin):
}
ocr_res_2
=
await
self
.
fetch_ocr_2_result
(
self
.
ocr_url_2
,
json_data_2
)
if
ocr_res_2
is
None
:
raise
Exception
(
'ocr 2 error, img_path={0}'
.
format
(
img_path
))
res_list
.
append
((
pno
,
ino
,
consts
.
RES_FAILED
))
self
.
cronjob_log
.
info
(
'{0} [ocr_2 failed] [img={1}]'
.
format
(
self
.
log_base
,
img_path
))
# raise Exception('ocr 2 error, img_path={0}'.format(img_path))
else
:
# 识别结果
ocr_res_2
=
json
.
loads
(
ocr_res_2
)
self
.
cronjob_log
.
info
(
'{0} [ocr_2
result
] [img={1}] [res={2}]'
.
format
(
self
.
cronjob_log
.
info
(
'{0} [ocr_2
success
] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_path
,
ocr_res_2
))
if
classify
==
consts
.
BC_CLASSIFY
:
name
=
'有'
...
...
@@ -258,11 +272,13 @@ class Command(BaseCommand, LoggerMixin):
card_name_res
.
get
(
'data'
,
{})
.
get
(
'is_exists_name'
)
==
0
:
name
=
'无'
ocr_res_2
[
'Name'
]
=
name
self
.
license2_process
(
ocr_res_2
,
license_summary
,
pid
,
classify
,
skip_img
,
img_path
)
self
.
license2_process
(
ocr_res_2
,
license_summary
,
pid
,
classify
,
res_list
,
pno
,
ino
)
else
:
# 流水处理
self
.
bs_process
(
wb
,
ocr_data
,
bs_summary
,
unknown_summary
,
img_path
,
classify
,
skip_img
)
self
.
bs_process
(
wb
,
ocr_data
,
bs_summary
,
unknown_summary
,
classify
,
res_list
,
pno
,
ino
)
else
:
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
res_list
.
append
((
pno
,
ino
,
consts
.
RES_FAILED
))
self
.
cronjob_log
.
info
(
'{0} [ocr_1 res error] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_path
,
ocr_res_1
))
# def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
# # # 流水
...
...
@@ -559,7 +575,7 @@ class Command(BaseCommand, LoggerMixin):
bs_summary
=
{}
license_summary
=
{}
unknown_summary
=
{}
skip_img
=
[]
res_list
=
[]
interest_keyword
=
Keywords
.
objects
.
filter
(
type
=
KeywordsType
.
INTEREST
.
value
,
on_off
=
True
)
.
values_list
(
'keyword'
,
flat
=
True
)
salary_keyword
=
Keywords
.
objects
.
filter
(
...
...
@@ -573,13 +589,13 @@ class Command(BaseCommand, LoggerMixin):
# 4.1 获取OCR结果
loop
=
asyncio
.
get_event_loop
()
tasks
=
[
self
.
img_2_ocr_2_wb
(
wb
,
img_path
,
bs_summary
,
unknown_summary
,
license_summary
,
skip_img
)
tasks
=
[
self
.
img_2_ocr_2_wb
(
wb
,
img_path
,
bs_summary
,
unknown_summary
,
license_summary
,
res_list
)
for
img_path
in
pdf_handler
.
img_path_list
]
loop
.
run_until_complete
(
asyncio
.
wait
(
tasks
))
# loop.close()
# for img_path in pdf_handler.img_path_list:
# self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary,
skip_img
)
# self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary,
res_list
)
self
.
cronjob_log
.
info
(
'{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
'[license_summary={5}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
bs_summary
,
...
...
@@ -588,14 +604,14 @@ class Command(BaseCommand, LoggerMixin):
merged_bs_summary
=
self
.
rebuild_bs_summary
(
bs_summary
,
unknown_summary
)
self
.
cronjob_log
.
info
(
'{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] '
'[unknown_summary={4}] [
skip_img
={5}]'
.
format
(
self
.
log_base
,
business_type
,
'[unknown_summary={4}] [
res_list
={5}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
merged_bs_summary
,
unknown_summary
,
skip_img
))
unknown_summary
,
res_list
))
del
unknown_summary
# 4.2 重构Excel文件
wb
.
save
(
src_excel_path
)
wb
.
rebuild
(
merged_bs_summary
,
license_summary
,
skip_img
,
doc
.
document_scheme
)
wb
.
rebuild
(
merged_bs_summary
,
license_summary
,
res_list
,
doc
.
document_scheme
)
wb
.
save
(
excel_path
)
except
EDMSException
as
e
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
...
...
src/apps/doc/ocr/wb.py
View file @
bffd259
...
...
@@ -502,19 +502,19 @@ class BSWorkbook(Workbook):
ws
.
append
((
write_field
,
license_dict
.
get
(
search_field
,
''
)))
ws
.
append
((
None
,
))
def
skip_img_sheet
(
self
,
skip_img
):
if
skip_img
:
ws
=
self
.
create_sheet
(
consts
.
SKIP_IMG
_SHEET_NAME
)
ws
.
append
(
consts
.
SKIP_IMG
_SHEET_HEADER
)
for
img_tuple
in
skip_img
:
ws
.
append
(
img
_tuple
)
def res_sheet(self, res_list):
    """Write the result summary sheet when there is at least one result.

    Does nothing for an empty ``res_list``. Otherwise creates a sheet named
    ``consts.RES_SHEET_NAME``, appends ``consts.RES_SHEET_HEADER`` as the
    first row, then appends each entry of ``res_list`` as one row.
    """
    if not res_list:
        return
    summary_ws = self.create_sheet(consts.RES_SHEET_NAME)
    summary_ws.append(consts.RES_SHEET_HEADER)
    for row in res_list:
        summary_ws.append(row)
def remove_base_sheet(self):
    """Remove the sheet titled 'Sheet' when the workbook has other sheets.

    Nothing is removed when the workbook holds a single sheet, or when no
    sheet is titled 'Sheet' (previously the lookup raised ``KeyError`` in
    that case). The sheet is fetched with ``self['Sheet']`` instead of the
    deprecated ``get_sheet_by_name``.
    """
    names = self.sheetnames
    if len(names) > 1 and 'Sheet' in names:
        self.remove(self['Sheet'])
def rebuild(self, bs_summary, license_summary, res_list, document_scheme):
    """Run the workbook rebuild steps in a fixed order.

    Calls, in sequence: ``bs_rebuild(bs_summary)``,
    ``license_rebuild(license_summary, document_scheme)``,
    ``res_sheet(res_list)`` and ``remove_base_sheet()``.

    The duplicated old-signature header and the leftover
    ``self.skip_img_sheet(skip_img)`` call were dropped: ``skip_img`` is not
    a parameter of this method, so that call would raise ``NameError``.
    """
    self.bs_rebuild(bs_summary)
    self.license_rebuild(license_summary, document_scheme)
    self.res_sheet(res_list)
    self.remove_base_sheet()
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment