Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
2134d7e0
authored
2021-09-26 18:37:14 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/smart_dda' into feature/0918
2 parents
d37597e9
4ce34ec0
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
39 additions
and
65 deletions
src/apps/doc/management/commands/folder_dda_process.py
src/apps/doc/management/commands/folder_dda_process.py
View file @
2134d7e
...
...
@@ -60,13 +60,6 @@ class Command(BaseCommand, LoggerMixin):
self
.
sleep_time
=
float
(
conf
.
SLEEP_SECOND_FOLDER
)
# input folder
self
.
input_dirs
=
conf
.
get_namespace
(
'DDA_DIR_'
)
# seperate folder name
self
.
seperate_map
=
{
consts
.
IC_CLASSIFY
:
'IDCard'
,
consts
.
BC_CLASSIFY
:
'BankCard'
,
consts
.
PP_CLASSIFY
:
'Passport'
,
consts
.
EEP_CLASSIFY
:
'EntryPermit'
,
}
self
.
field_map
=
{
consts
.
IC_CLASSIFY
:
(
consts
.
IC_CN_NAME
,
'有效期限'
,
consts
.
IC_FIELD_ORDER_3
,
consts
.
IC_FIELD_ORDER_2
),
consts
.
BC_CLASSIFY
:
(
consts
.
BC_CN_NAME
,
None
,
None
,
consts
.
BC_FIELD_ORDER_2
),
...
...
@@ -77,6 +70,7 @@ class Command(BaseCommand, LoggerMixin):
self
.
ocr_url
=
conf
.
OCR_URL_FOLDER
self
.
ocr_url_2
=
conf
.
OCR2_URL_FOLDER
# self.ocr_url_4 = conf.IC_URL
self
.
classify_set
=
{
consts
.
IC_CLASSIFY
,
consts
.
PP_CLASSIFY
,
consts
.
EEP_CLASSIFY
,
consts
.
BC_CLASSIFY
}
# 优雅退出信号:15
signal
.
signal
(
signal
.
SIGTERM
,
self
.
signal_handler
)
...
...
@@ -94,7 +88,7 @@ class Command(BaseCommand, LoggerMixin):
id_card_dict
.
pop
(
'base64_img'
)
except
Exception
as
e
:
continue
all_res
.
extend
(
license_data
)
all_res
.
setdefault
(
classify
,
[])
.
extend
(
license_data
)
def
license2_process
(
self
,
ocr_data
,
all_res
,
classify
,
img_path
):
pid
,
_
,
_
,
_
,
_
,
_
=
consts
.
LICENSE_CLASSIFY_MAPPING
.
get
(
classify
)
...
...
@@ -129,14 +123,14 @@ class Command(BaseCommand, LoggerMixin):
if
ocr_res_2
.
get
(
'ErrorCode'
)
in
consts
.
SUCCESS_CODE_SET
:
if
pid
==
consts
.
BC_PID
:
all_res
.
append
(
ocr_res_2
)
all_res
.
setdefault
(
classify
,
[])
.
append
(
ocr_res_2
)
else
:
# 营业执照等
for
result_dict
in
ocr_res_2
.
get
(
'ResultList'
,
[]):
res_dict
=
{}
for
field_dict
in
result_dict
.
get
(
'FieldList'
,
[]):
res_dict
[
field_dict
.
get
(
'chn_key'
,
''
)]
=
field_dict
.
get
(
'value'
,
''
)
all_res
.
append
(
res_dict
)
all_res
.
setdefault
(
classify
,
[])
.
append
(
res_dict
)
break
@staticmethod
...
...
@@ -150,22 +144,22 @@ class Command(BaseCommand, LoggerMixin):
return
img_name
,
1
,
1
@staticmethod
def
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
,
seperate_dir
):
def
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
):
time_stamp
=
datetime
.
now
()
.
strftime
(
'
%
Y-
%
m-
%
d_
%
H:
%
M:
%
S'
)
new_name
=
'{0}_{1}'
.
format
(
time_stamp
,
name
)
img_save_path
=
os
.
path
.
join
(
img_output_dir
,
new_name
)
pdf_save_path
=
os
.
path
.
join
(
pdf_output_dir
,
new_name
)
excel_name
=
'{0}.xlsx'
.
format
(
os
.
path
.
splitext
(
new_name
)[
0
])
excel_path
=
os
.
path
.
join
(
wb_output_dir
,
excel_name
)
seperate_path
=
None
if
seperate_dir
is
None
else
os
.
path
.
join
(
seperate_dir
,
new_name
)
return
img_save_path
,
excel_path
,
pdf_save_path
,
seperate_path
return
img_save_path
,
excel_path
,
pdf_save_path
def
res_process
(
self
,
all_res
,
excel_path
,
classify
):
def
res_process
(
self
,
all_res
,
excel_path
):
try
:
wb
=
BSWorkbook
(
set
(),
set
(),
set
(),
set
(),
set
())
for
classify
,
ress
in
all_res
.
items
():
sheet_name
,
key_field
,
side_field_order
,
src_field_order
=
self
.
field_map
.
get
(
classify
)
ws
=
wb
.
create_sheet
(
sheet_name
)
for
res
in
all_re
s
:
for
res
in
res
s
:
if
key_field
is
not
None
and
key_field
in
res
:
field_order
=
side_field_order
else
:
...
...
@@ -189,7 +183,7 @@ class Command(BaseCommand, LoggerMixin):
sep
=
os
.
path
.
sep
+
(
os
.
path
.
altsep
or
''
)
return
os
.
path
.
basename
(
path
.
rstrip
(
sep
))
def
ocr_process
(
self
,
img_path
,
classify
,
all_res
,
seperate_dir
):
def
ocr_process
(
self
,
img_path
,
all_res
):
if
os
.
path
.
exists
(
img_path
):
# TODO 图片验证
with
open
(
img_path
,
'rb'
)
as
f
:
...
...
@@ -199,9 +193,6 @@ class Command(BaseCommand, LoggerMixin):
json_data
=
{
"file"
:
file_data
,
}
if
seperate_dir
is
None
:
json_data
[
"classify"
]
=
classify
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
start_time
=
time
.
time
()
...
...
@@ -223,12 +214,9 @@ class Command(BaseCommand, LoggerMixin):
data_list
=
ocr_res
.
get
(
'data'
,
[])
if
isinstance
(
data_list
,
list
):
for
ocr_data
in
data_list
:
if
ocr_data
.
get
(
'classify'
)
==
classify
:
if
seperate_dir
is
not
None
:
os
.
makedirs
(
seperate_dir
,
exist_ok
=
True
)
real_dst
=
os
.
path
.
join
(
seperate_dir
,
self
.
basename
(
img_path
))
if
not
os
.
path
.
exists
(
real_dst
):
shutil
.
move
(
img_path
,
seperate_dir
)
classify
=
ocr_data
.
get
(
'classify'
)
if
classify
not
in
self
.
classify_set
:
continue
if
classify
in
consts
.
LICENSE_CLASSIFY_SET_1
:
self
.
license1_process
(
ocr_data
,
all_res
,
classify
)
elif
classify
in
consts
.
LICENSE_CLASSIFY_SET_2
:
...
...
@@ -237,20 +225,20 @@ class Command(BaseCommand, LoggerMixin):
else
:
self
.
folder_log
.
warn
(
'{0} [ocr failed] [img_path={1}]'
.
format
(
self
.
log_base
,
img_path
))
def
images_process
(
self
,
img_path_list
,
classify
,
excel_path
,
seperate_dir
):
all_res
=
[]
def
images_process
(
self
,
img_path_list
,
excel_path
):
all_res
=
{}
for
img_path
in
img_path_list
:
self
.
ocr_process
(
img_path
,
classify
,
all_res
,
seperate_dir
)
self
.
ocr_process
(
img_path
,
all_res
)
# if len(all_res) > 0:
self
.
res_process
(
all_res
,
excel_path
,
classify
)
self
.
res_process
(
all_res
,
excel_path
)
return
all_res
def
pdf_process
(
self
,
name
,
path
,
classify
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
,
seperate
_dir
):
def
pdf_process
(
self
,
name
,
path
,
img_output_dir
,
wb_output_dir
,
pdf_output
_dir
):
if
os
.
path
.
exists
(
path
):
rebuild_res
=
None
try
:
img_save_path
,
excel_path
,
pdf_save_path
,
seperate_path
=
self
.
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
,
seperate_dir
)
img_save_path
,
excel_path
,
pdf_save_path
=
self
.
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
)
self
.
folder_log
.
info
(
'{0} [pdf to img start] [path={1}]'
.
format
(
self
.
log_base
,
path
))
pdf_handler
=
PDFHandler
(
path
,
img_save_path
)
pdf_handler
.
extract_image
()
...
...
@@ -260,16 +248,16 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
path
,
traceback
.
format_exc
()))
raise
e
else
:
rebuild_res
=
self
.
images_process
(
pdf_handler
.
img_path_list
,
classify
,
excel_path
,
seperate
_path
)
rebuild_res
=
self
.
images_process
(
pdf_handler
.
img_path_list
,
excel
_path
)
shutil
.
move
(
path
,
pdf_save_path
)
return
rebuild_res
def
tif_process
(
self
,
name
,
path
,
classify
,
img_output_dir
,
wb_output_dir
,
tiff_output_dir
,
seperate
_dir
):
def
tif_process
(
self
,
name
,
path
,
img_output_dir
,
wb_output_dir
,
tiff_output
_dir
):
if
os
.
path
.
exists
(
path
):
rebuild_res
=
None
try
:
img_save_path
,
excel_path
,
tiff_save_path
,
seperate_path
=
self
.
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
tiff_output_dir
,
seperate_dir
)
img_save_path
,
excel_path
,
tiff_save_path
=
self
.
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
tiff_output_dir
)
self
.
folder_log
.
info
(
'{0} [tiff to img start] [path={1}]'
.
format
(
self
.
log_base
,
path
))
tiff_handler
=
TIFFHandler
(
path
,
img_save_path
)
tiff_handler
.
extract_image
()
...
...
@@ -279,20 +267,19 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
path
,
traceback
.
format_exc
()))
raise
e
else
:
rebuild_res
=
self
.
images_process
(
tiff_handler
.
img_path_list
,
classify
,
excel_path
,
seperate
_path
)
rebuild_res
=
self
.
images_process
(
tiff_handler
.
img_path_list
,
excel
_path
)
shutil
.
move
(
path
,
tiff_save_path
)
return
rebuild_res
def
img_process
(
self
,
name
,
path
,
classify
,
wb_output_dir
,
img_output_dir
,
pdf_output_dir
,
seperate
_dir
):
def
img_process
(
self
,
name
,
path
,
wb_output_dir
,
img_output_dir
,
pdf_output
_dir
):
rebuild_res
=
None
try
:
img_save_path
,
excel_path
,
_
,
seperate_path
=
self
.
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
,
seperate_dir
)
img_save_path
,
excel_path
,
_
=
self
.
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
)
except
Exception
as
e
:
self
.
folder_log
.
error
(
'{0} [get path error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
else
:
rebuild_res
=
self
.
images_process
([
path
],
classify
,
excel_path
,
seperate
_path
)
rebuild_res
=
self
.
images_process
([
path
],
excel
_path
)
shutil
.
move
(
path
,
img_save_path
)
return
rebuild_res
...
...
@@ -344,7 +331,7 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
result
,
traceback
.
format_exc
()))
wb
.
save
(
wb_path
)
def
folder_process
(
self
,
input_dir
,
classify
,
is_combined
,
result_queue
):
def
folder_process
(
self
,
input_dir
,
result_queue
):
while
not
os
.
path
.
isdir
(
input_dir
):
self
.
folder_log
.
info
(
'{0} [input dir is not dir] [input_dir={1}]'
.
format
(
self
.
log_base
,
input_dir
))
if
self
.
switch
:
...
...
@@ -353,7 +340,6 @@ class Command(BaseCommand, LoggerMixin):
else
:
return
output_dir
=
os
.
path
.
join
(
os
.
path
.
dirname
(
input_dir
),
'Output'
)
seperate_dir
=
os
.
path
.
join
(
output_dir
,
self
.
seperate_map
.
get
(
classify
,
'Unknown'
))
if
is_combined
else
None
img_output_dir
=
os
.
path
.
join
(
output_dir
,
'image'
)
wb_output_dir
=
os
.
path
.
join
(
output_dir
,
'excel'
)
pdf_output_dir
=
os
.
path
.
join
(
output_dir
,
'pdf'
)
...
...
@@ -365,8 +351,6 @@ class Command(BaseCommand, LoggerMixin):
os
.
makedirs
(
pdf_output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
tiff_output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
failed_output_dir
,
exist_ok
=
True
)
if
seperate_dir
is
not
None
:
os
.
makedirs
(
seperate_dir
,
exist_ok
=
True
)
os_error_filename_set
=
set
()
while
self
.
switch
:
# if not os.path.isdir(input_dir):
...
...
@@ -389,14 +373,14 @@ class Command(BaseCommand, LoggerMixin):
if
os
.
path
.
isfile
(
path
):
self
.
folder_log
.
info
(
'{0} [file start] [path={1}]'
.
format
(
self
.
log_base
,
path
))
if
name
.
endswith
(
'.pdf'
)
or
name
.
endswith
(
'.PDF'
):
result
=
self
.
pdf_process
(
name
,
path
,
classify
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
,
seperate_dir
)
result
=
self
.
pdf_process
(
name
,
path
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
)
elif
name
.
endswith
(
'.tif'
)
or
name
.
endswith
(
'.TIF'
):
result
=
self
.
tif_process
(
name
,
path
,
classify
,
img_output_dir
,
wb_output_dir
,
tiff_output_dir
,
seperate_dir
)
result
=
self
.
tif_process
(
name
,
path
,
img_output_dir
,
wb_output_dir
,
tiff_output_dir
)
else
:
result
=
self
.
img_process
(
name
,
path
,
classify
,
wb_output_dir
,
img_output_dir
,
pdf_output_dir
,
seperate_dir
)
result
=
self
.
img_process
(
name
,
path
,
wb_output_dir
,
img_output_dir
,
pdf_output_dir
)
self
.
folder_log
.
info
(
'{0} [file end] [path={1}]'
.
format
(
self
.
log_base
,
path
))
else
:
result
=
None
...
...
@@ -420,16 +404,8 @@ class Command(BaseCommand, LoggerMixin):
else
:
if
isinstance
(
result
,
dict
)
and
len
(
result
)
>
0
:
date_str
=
time
.
strftime
(
"
%
Y-
%
m-
%
d"
)
result_queue
.
put
(
{
self
.
CLASSIFY_KEY
:
classify
,
self
.
RESULT_KEY
:
result
,
self
.
DATE_KEY
:
date_str
}
)
elif
isinstance
(
result
,
list
)
and
len
(
result
)
>
0
:
date_str
=
time
.
strftime
(
"
%
Y-
%
m-
%
d"
)
for
res
in
result
:
for
classify
,
res_list
in
result
.
items
():
for
res
in
res_list
:
result_queue
.
put
(
{
self
.
CLASSIFY_KEY
:
classify
,
...
...
@@ -444,12 +420,10 @@ class Command(BaseCommand, LoggerMixin):
result_queue
=
Queue
()
process_list
=
[]
one_input_dir
=
None
for
classify_idx
,
input_dir
in
self
.
input_dirs
.
items
():
for
_
,
input_dir
in
self
.
input_dirs
.
items
():
if
one_input_dir
is
None
:
one_input_dir
=
input_dir
classify
=
int
(
classify_idx
.
split
(
'_'
)[
0
])
is_combined
=
True
if
int
(
classify_idx
.
split
(
'_'
)[
2
])
==
1
else
False
process
=
Process
(
target
=
self
.
folder_process
,
args
=
(
input_dir
,
classify
,
is_combined
,
result_queue
))
process
=
Process
(
target
=
self
.
folder_process
,
args
=
(
input_dir
,
result_queue
))
process_list
.
append
(
process
)
wb_dir
=
os
.
path
.
dirname
(
os
.
path
.
dirname
(
one_input_dir
))
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment