Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
c1ca6fa5
authored
2021-07-26 16:11:28 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add ltgt wb daily
1 parent
c39b3051
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
509 additions
and
72 deletions
src/apps/doc/management/commands/folder_ltgt_process.py
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/management/commands/folder_ltgt_process.py
0 → 100644
View file @
c1ca6fa
import
os
import
re
import
time
import
shutil
import
base64
import
signal
import
requests
import
traceback
from
PIL
import
Image
from
datetime
import
datetime
from
django.core.management
import
BaseCommand
from
multiprocessing
import
Process
,
Queue
from
openpyxl
import
load_workbook
,
Workbook
from
settings
import
conf
from
common.mixins
import
LoggerMixin
from
common.tools.pdf_to_img
import
PDFHandler
from
apps.doc
import
consts
from
apps.doc.exceptions
import
OCR1Exception
,
OCR4Exception
,
LTGTException
from
apps.doc.ocr.wb
import
BSWorkbook
class TIFFHandler:
    """Split a TIFF file into one image file per frame.

    After ``extract_image`` has run, ``img_path_list`` holds the paths of the
    saved pages in frame order.
    """

    def __init__(self, path, img_save_path):
        """Remember the source file and the output directory.

        Args:
            path: path of the TIFF file to split.
            img_save_path: directory that receives the extracted pages.
        """
        self.path = path
        self.img_save_path = img_save_path
        self.img_path_list = []

    def extract_image(self):
        """Save every frame as ``page_<index>.jpeg`` inside ``img_save_path``.

        The output directory is created when missing. Each saved path is
        appended to ``img_path_list``.
        """
        os.makedirs(self.img_save_path, exist_ok=True)
        # Open through a context manager so the underlying file handle is
        # released even when saving one of the pages raises.
        with Image.open(self.path) as tiff:
            tiff.load()
            for i in range(tiff.n_frames):
                try:
                    save_path = os.path.join(self.img_save_path, 'page_{0}.jpeg'.format(i))
                    tiff.seek(i)
                    tiff.save(save_path)
                    self.img_path_list.append(save_path)
                except EOFError:
                    # Fewer frames than announced: stop at the last readable one.
                    break
class
Command
(
BaseCommand
,
LoggerMixin
):
def __init__(self):
    """Set up constants, configuration values and the SIGTERM handler."""
    super().__init__()
    self.log_base = '[folder ltgt process]'
    # Switch that keeps the polling loops running
    self.switch = True
    # classify id -> document label sent to the LTGT OCR service
    self.ltgt_classify_mapping = {
        128: '执行裁定书',
        129: '民事判决书',
        130: '民事调解书'
    }
    # classify id -> (sheet name, header fields) of the daily workbook
    self.sheet_content = {
        128: ('执行裁定书', ('承办法院', '案号/标号', '被执行人', '债权金额', '诉讼时间')),
        129: ('民事判决书', ('承办法院', '案号/标号', '被告', '判决结果: 贷款本金', '判决结果: 罚息',
                        '判决结果: 律师费', '判决结果: 案件受理费', '诉讼时间')),
        130: ('民事调解书', ('承办法院', '案号/标号', '被告', '协议内容: 支付金额', '协议内容: 案件受理费', '诉讼时间')),
    }
    # Keys of the messages exchanged through the result queue
    self.DATE_KEY = 'date'
    self.CLASSIFY_KEY = 'classify'
    self.RESULT_KEY = 'result'
    self.daily_wb_name = 'Output_{0}.xlsx'
    self.short_sleep_time = 10
    self.long_sleep_time = 3600
    # Sleep time between two scans of an input folder
    self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
    # input folder
    self.input_dirs = conf.get_namespace('LTGT_DIR_')
    # OCR endpoints.
    # ocr_process() and license1_process() still read ocr_url / ocr_url_4, so
    # the attributes must exist; getattr keeps start-up working when the
    # settings do not define them (the value is then None).
    self.ocr_url = getattr(conf, 'OCR_URL_FOLDER', None)
    self.ocr_url_4 = getattr(conf, 'IC_URL', None)
    self.ltgt_ocr_url = conf.LTGT_URL
    # Graceful-exit signal: 15
    signal.signal(signal.SIGTERM, self.signal_handler)
def signal_handler(self, sig, frame):
    """Handle the termination signal by switching file processing off.

    Args:
        sig: number of the received signal (not used).
        frame: interrupted stack frame (not used).
    """
    # The ``while self.switch`` loops stop at their next check.
    self.switch = False
def license1_process(self, ocr_data, license_summary, classify, img_path):
    """Post-process one OCR result and merge it into ``license_summary``.

    Args:
        ocr_data: dict whose ``'data'`` entry is the list of recognised
            documents; nothing happens when that list is missing or empty.
        license_summary: dict updated in place; the documents are appended to
            the list stored under ``classify``.
        classify: classification id, compared with ``consts.MVC_CLASSIFY``
            and ``consts.IC_CLASSIFY``.
        img_path: path of the source image, only used in log messages.

    NOTE(review): the nesting below was rebuilt from a flattened listing;
    confirm the ``for ... else`` placement against the repository.
    """
    # Category: '0' ID card, '1' residence permit
    license_data = ocr_data.get('data', [])
    if not license_data:
        return
    if classify == consts.MVC_CLASSIFY:  # Vehicle registration certificate: merge the page 3/4 results
        for mvc_dict in license_data:
            try:
                mvc_page = mvc_dict.pop('page')
            except Exception as e:
                # No 'page' entry: leave this document untouched.
                pass
            else:
                if mvc_page == 'VehicleRegArea':
                    mvc_res = mvc_dict.pop('results', {})
                    mvc_dict['机动车登记证书编号'] = mvc_res.get('register_no', {}).get('words', '')
                    # Flatten every registration detail into a list keyed by its 'chinese_key'.
                    for register_info in mvc_res.get('register_info', []):
                        for detail_dict in register_info.get('details', {}).values():
                            mvc_dict.setdefault(detail_dict.get('chinese_key', '未知'), []).append(
                                detail_dict.get('words', ''))
                    del mvc_res
    if classify == consts.IC_CLASSIFY:
        for id_card_dict in license_data:
            try:
                base64_img = id_card_dict.pop('base64_img')
            except Exception as e:
                # Without the image there is nothing to send to the second service.
                continue
            else:
                # -1 is kept when the second OCR service gives no usable answer.
                card_type = -1
                json_data_4 = {
                    'mode': 1,
                    'user_info': {
                        'image_content': base64_img,
                    },
                    'options': {
                        'distinguish_type': 1,
                        'auto_rotate': True,
                    },
                }
                for times in range(consts.RETRY_TIMES):
                    try:
                        start_time = time.time()
                        ocr_4_response = requests.post(self.ocr_url_4, json=json_data_4)
                        if ocr_4_response.status_code != 200:
                            raise OCR4Exception('ocr_4 status code: {0}'.format(ocr_4_response.status_code))
                    except Exception as e:
                        # Log and fall through to the next attempt.
                        self.folder_log.warn('{0} [ocr_4 failed] [times={1}] [img_path={2}] [error={3}]'.format(
                            self.log_base, times, img_path, traceback.format_exc()))
                    else:
                        ocr_4_res = ocr_4_response.json()
                        end_time = time.time()
                        speed_time = int(end_time - start_time)
                        if ocr_4_res.get('code') == 0 and ocr_4_res.get('result', {}).get('rtn') == 0:
                            card_type = ocr_4_res.get('result', {}).get(
                                'idcard_distinguish_result', {}).get('result', -1)
                        self.folder_log.info('{0} [ocr_4 success] [img_path={1}] [speed_time={2}]'.format(
                            self.log_base, img_path, speed_time))
                        break
                else:
                    # Every attempt raised.
                    self.folder_log.warn('{0} [ocr_4 failed] [img_path={1}]'.format(self.log_base, img_path))
                id_card_dict[consts.IC_TURE_OR_FALSE] = consts.IC_RES_MAPPING.get(card_type)
    license_summary.setdefault(classify, []).extend(license_data)
@staticmethod
def parse_img_path(img_path):
    """Return the image name and its 1-based page and image numbers.

    Args:
        img_path: path of an image, normally named
            ``page_<pno>_img_<index>.<ext>`` with 0-based numbers.

    Returns:
        ``(img_name, page_number, image_number)`` where ``img_name`` is the
        file name without extension. Names that do not start with the
        ``page_<n>_img_<n>`` pattern yield ``(img_name, 1, 1)``.
    """
    # 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
    img_name, _ = os.path.splitext(os.path.basename(img_path))
    # Read the numbers from the regex groups rather than from split('_'):
    # a name such as 'page_2_img_5x' matches the prefix, but int('5x') on the
    # split part used to raise ValueError.
    matched = re.match(r'page_(\d+)_img_(\d+)', img_name)
    if matched is None:
        return img_name, 1, 1
    return img_name, int(matched.group(1)) + 1, int(matched.group(2)) + 1
@staticmethod
def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir):
    """Build the timestamped output paths for one input file.

    Args:
        name: file name of the input file.
        img_output_dir: directory for extracted or moved images.
        wb_output_dir: directory for the result workbook.
        pdf_output_dir: directory the processed source file is moved to.

    Returns:
        ``(img_save_path, excel_path, pdf_save_path)``. All three share the
        ``<timestamp>_<name>`` base; the workbook path replaces the extension
        with ``.xlsx``.
    """
    stamped_name = '{0}_{1}'.format(datetime.now().strftime('%Y-%m-%d_%H:%M:%S'), name)
    stem = os.path.splitext(stamped_name)[0]
    return (
        os.path.join(img_output_dir, stamped_name),
        os.path.join(wb_output_dir, '{0}.xlsx'.format(stem)),
        os.path.join(pdf_output_dir, stamped_name),
    )
def res_process(self, all_res, classify, excel_path):
    """Collect the OCR results of all images and save them as a workbook.

    Args:
        all_res: mapping of image path to the OCR response of that image;
            nothing is written when it is empty.
        classify: classification id handed on to ``license1_process``.
        excel_path: where the workbook is saved.

    Any exception is logged and swallowed.
    """
    try:
        if not all_res:
            return
        license_summary = {}
        for img_path, ocr_res in all_res.items():
            # Only successful dict responses (code == 1) carry usable data.
            if not isinstance(ocr_res, dict) or ocr_res.get('code') != 1:
                continue
            data_list = ocr_res.get('data', [])
            if not isinstance(data_list, list):
                continue
            for ocr_data in data_list:
                self.license1_process(ocr_data, license_summary, classify, img_path)
        wb = BSWorkbook(set(), set(), set(), set(), set())
        wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0])
        wb.remove_base_sheet()
        wb.save(excel_path)
    except Exception:
        self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
            self.log_base, excel_path, traceback.format_exc()))
def ocr_process(self, img_path, classify):
    """Send one image to the OCR service and return the decoded response.

    Args:
        img_path: path of the image; a missing file yields ``None``.
        classify: classification id sent along with the image.

    Returns:
        The JSON body of the first response with status 200, or ``None``
        when the file is missing or every attempt fails.
    """
    if not os.path.exists(img_path):
        return None
    # TODO validate the image
    with open(img_path, 'rb') as img_file:
        # base64 text of the file content
        file_data = base64.b64encode(img_file.read()).decode()
    payload = {
        "file": file_data,
        "classify": classify
    }
    for attempt in range(consts.RETRY_TIMES):
        try:
            began = time.time()
            response = requests.post(self.ocr_url, json=payload)
            if response.status_code != 200:
                raise OCR1Exception('{0} ocr status code: {1}'.format(self.log_base, response.status_code))
        except Exception:
            self.folder_log.warn('{0} [ocr failed] [times={1}] [img_path={2}] [error={3}]'.format(
                self.log_base, attempt, img_path, traceback.format_exc()))
            continue
        ocr_res = response.json()
        elapsed = int(time.time() - began)
        self.folder_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format(
            self.log_base, img_path, ocr_res, elapsed))
        return ocr_res
    # Reached only when no attempt returned.
    self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
    return None
def ltgt_ocr_process(self, img_path_list, label, path):
    """Send all page images of one document to the LTGT OCR service.

    Args:
        img_path_list: paths of the page images; missing files are skipped.
        label: document label sent with the request.
        path: path of the source document, only used in log messages.

    Returns:
        The JSON body of the first response with status 200, or ``None``
        when every attempt fails.
    """
    encoded_pages = []
    for img_path in img_path_list:
        if not os.path.exists(img_path):
            continue
        with open(img_path, 'rb') as img_file:
            # base64 text of the file content
            encoded_pages.append(base64.b64encode(img_file.read()).decode())
    payload = {
        "label": label,
        "img_data_list": encoded_pages
    }
    for attempt in range(consts.RETRY_TIMES):
        try:
            began = time.time()
            response = requests.post(self.ltgt_ocr_url, json=payload)
            if response.status_code != 200:
                raise LTGTException('{0} ltgt ocr status code: {1}'.format(self.log_base, response.status_code))
        except Exception:
            self.folder_log.warn('{0} [ltgt ocr failed] [times={1}] [path={2}] [error={3}]'.format(
                self.log_base, attempt, path, traceback.format_exc()))
            continue
        ocr_res = response.json()
        elapsed = int(time.time() - began)
        self.folder_log.info('{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format(
            self.log_base, path, ocr_res, elapsed))
        return ocr_res
    # Reached only when no attempt returned.
    self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path))
    return None
def ltgt_res_process(self, ocr_res, label, excel_path):
    """Turn a successful LTGT OCR response into a workbook.

    Args:
        ocr_res: decoded OCR response; anything but a dict with
            ``code == 1`` is ignored.
        label: document label handed to ``BSWorkbook.ltgt_build``.
        excel_path: where the workbook is saved.

    Returns:
        The value returned by ``ltgt_build``, or ``None`` when the response
        is unusable or building the workbook raises (the error is logged).
    """
    try:
        if not isinstance(ocr_res, dict) or ocr_res.get('code') != 1:
            return None
        result_dict = ocr_res.get('data', {})
        wb = BSWorkbook(set(), set(), set(), set(), set())
        rebuild_res = wb.ltgt_build(label, result_dict)
        wb.remove_base_sheet()
        wb.save(excel_path)
        return rebuild_res
    except Exception:
        self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
            self.log_base, excel_path, traceback.format_exc()))
def ltgt_process(self, img_path_list, label, excel_path, path):
    """Run LTGT OCR on the page images and build the workbook from the result.

    Args:
        img_path_list: paths of the page images.
        label: document label.
        excel_path: where the workbook is saved.
        path: path of the source document, used for logging.

    Returns:
        Whatever ``ltgt_res_process`` returns for the OCR response.
    """
    response = self.ltgt_ocr_process(img_path_list, label, path)
    return self.ltgt_res_process(response, label, excel_path)
def images_process(self, img_path_list, classify, excel_path):
    """OCR every image and write the combined result to one workbook.

    Args:
        img_path_list: paths of the images to recognise.
        classify: classification id passed to the OCR service.
        excel_path: where the workbook is saved.
    """
    # Images are recognised in list order; the mapping keeps path -> response.
    responses = {img_path: self.ocr_process(img_path, classify) for img_path in img_path_list}
    self.res_process(responses, classify, excel_path)
def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir):
    """Convert a PDF to images, run OCR and archive the PDF.

    Args:
        name: file name of the PDF.
        path: full path of the PDF; a missing file yields ``None``.
        classify: classification id; ids in ``ltgt_classify_mapping`` use
            the LTGT flow, all others the per-image flow.
        img_output_dir: directory for the extracted images.
        wb_output_dir: directory for the result workbook.
        pdf_output_dir: directory the PDF is moved to afterwards.

    Returns:
        The LTGT result for LTGT classifications, otherwise ``None``.

    Raises:
        Exception: whatever the image extraction raised, after logging it.
    """
    if not os.path.exists(path):
        return None
    try:
        img_save_path, excel_path, pdf_save_path = self.get_path(
            name, img_output_dir, wb_output_dir, pdf_output_dir)
        self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
        pdf_handler = PDFHandler(path, img_save_path)
        # LTGT documents are extracted with extract_page_image, the rest with extract_image.
        if classify in self.ltgt_classify_mapping:
            pdf_handler.extract_page_image()
        else:
            pdf_handler.extract_image()
        self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
    except Exception as e:
        self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc()))
        raise e
    rebuild_res = None
    if classify in self.ltgt_classify_mapping:
        rebuild_res = self.ltgt_process(
            pdf_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path)
    else:
        self.images_process(pdf_handler.img_path_list, classify, excel_path)
    shutil.move(path, pdf_save_path)
    return rebuild_res
def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
    """Split a TIFF into page images, run OCR and archive the TIFF.

    Args:
        name: file name of the TIFF.
        path: full path of the TIFF; a missing file yields ``None``.
        classify: classification id; ids in ``ltgt_classify_mapping`` use
            the LTGT flow, all others the per-image flow.
        img_output_dir: directory for the extracted images.
        wb_output_dir: directory for the result workbook.
        tiff_output_dir: directory the TIFF is moved to afterwards.

    Returns:
        The LTGT result for LTGT classifications, otherwise ``None``.

    Raises:
        Exception: whatever the image extraction raised, after logging it.
    """
    if not os.path.exists(path):
        return None
    try:
        img_save_path, excel_path, tiff_save_path = self.get_path(
            name, img_output_dir, wb_output_dir, tiff_output_dir)
        self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path))
        tiff_handler = TIFFHandler(path, img_save_path)
        tiff_handler.extract_image()
        self.folder_log.info('{0} [tiff to img end] [path={1}]'.format(self.log_base, path))
    except Exception as e:
        self.folder_log.error('{0} [tiff to img error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc()))
        raise e
    rebuild_res = None
    if classify in self.ltgt_classify_mapping:
        rebuild_res = self.ltgt_process(
            tiff_handler.img_path_list, self.ltgt_classify_mapping[classify], excel_path, path)
    else:
        self.images_process(tiff_handler.img_path_list, classify, excel_path)
    shutil.move(path, tiff_save_path)
    return rebuild_res
def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
    """Run OCR on a single image file and archive it.

    Args:
        name: file name of the image.
        path: full path of the image.
        classify: classification id; ids in ``ltgt_classify_mapping`` use
            the LTGT flow, all others the plain OCR flow.
        wb_output_dir: directory for the result workbook.
        img_output_dir: directory the image is moved to afterwards.
        pdf_output_dir: only forwarded to ``get_path``.

    Returns:
        The LTGT result for LTGT classifications, otherwise ``None``. A
        failure while building the paths is logged and also yields ``None``.
    """
    try:
        img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
    except Exception:
        self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc()))
        return None
    rebuild_res = None
    if classify in self.ltgt_classify_mapping:
        rebuild_res = self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path)
    else:
        ocr_res = self.ocr_process(path, classify)
        self.res_process({path: ocr_res}, classify, excel_path)
    shutil.move(path, img_save_path)
    return rebuild_res
def wb_process(self, wb_dir, result_queue):
    """Drain the result queue into one workbook per day.

    Runs until ``self.switch`` is cleared. Each round polls the queue up to
    100 times, groups the results that share the date of the first one and
    appends them to ``Output_<date>.xlsx`` in ``wb_dir``, one sheet per
    classification.

    Args:
        wb_dir: directory of the daily workbooks.
        result_queue: queue of dicts holding ``DATE_KEY``, ``CLASSIFY_KEY``
            and ``RESULT_KEY`` entries.
    """
    # First result of the next day, read while collecting the current day.
    # It used to be dropped by the ``break`` below; it now opens the next batch.
    pending = None
    while self.switch:
        result_list = []
        date_str = None
        if pending is not None:
            date_str = pending[self.DATE_KEY]
            result_list.append(pending)
            pending = None
        for _ in range(100):
            try:
                result = result_queue.get(block=False)
            except Exception:
                # Nothing to read right now: wait before polling again.
                time.sleep(self.short_sleep_time)
                continue
            if date_str is None:
                date_str = result[self.DATE_KEY]
            elif result[self.DATE_KEY] != date_str:
                pending = result
                break
            result_list.append(result)
        if date_str is None:
            time.sleep(self.long_sleep_time)
            continue
        wb_path = os.path.join(wb_dir, self.daily_wb_name.format(date_str))
        if os.path.isfile(wb_path):
            wb = load_workbook(wb_path)
        else:
            wb = Workbook()
        for result in result_list:
            sheet_name, head_fields = self.sheet_content[result[self.CLASSIFY_KEY]]
            row = [result[self.RESULT_KEY].get(field) for field in head_fields]
            if sheet_name in wb.sheetnames:
                # Index access replaces the deprecated get_sheet_by_name().
                ws = wb[sheet_name]
            else:
                ws = wb.create_sheet(sheet_name)
                ws.append(head_fields)
            ws.append(row)
        wb.save(wb_path)
def folder_process(self, input_dir, classify, result_queue):
    """Poll one input folder, process each file and queue the results.

    Waits until ``input_dir`` exists, creates the ``Output`` tree next to
    it, then loops while ``self.switch`` is set. PDF and TIF files go
    through their converters, every other file is treated as an image, and
    sub-directories are moved to the ``failed`` folder. Non-empty dict or
    list results are put on ``result_queue`` together with the
    classification and the current date.

    Args:
        input_dir: folder to watch.
        classify: classification id of every file found in ``input_dir``.
        result_queue: queue that receives the per-file results.
    """
    while not os.path.isdir(input_dir):
        self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
        if not self.switch:
            return
        time.sleep(self.sleep_time)

    output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
    img_output_dir = os.path.join(output_dir, 'image')
    wb_output_dir = os.path.join(output_dir, 'excel')
    pdf_output_dir = os.path.join(output_dir, 'pdf')
    tiff_output_dir = os.path.join(output_dir, 'tiff')
    failed_output_dir = os.path.join(output_dir, 'failed')
    for directory in (output_dir, img_output_dir, wb_output_dir,
                      pdf_output_dir, tiff_output_dir, failed_output_dir):
        os.makedirs(directory, exist_ok=True)

    # Names that raised an OS error or could not be moved; retried one at a
    # time once nothing else is waiting.
    os_error_filename_set = set()
    while self.switch:
        # 1. Fetch the pdf / image files from the input dir
        list_dir = os.listdir(input_dir)
        if not list_dir and len(os_error_filename_set) == 0:
            self.folder_log.info('{0} [input dir empty] [input_dir={1}]'.format(self.log_base, input_dir))
            time.sleep(self.sleep_time)
            continue
        true_file_set = set(list_dir) - os_error_filename_set
        if len(true_file_set) == 0 and len(os_error_filename_set) > 0:
            # Only previously failed files remain: pause before retrying one
            # so a persistent error does not turn this loop into a busy spin.
            time.sleep(self.sleep_time)
            true_file_set.add(os_error_filename_set.pop())
        for name in true_file_set:
            path = os.path.join(input_dir, name)
            try:
                if os.path.isfile(path):
                    self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
                    # Compare extensions case-insensitively ('.Pdf' counts as a PDF).
                    lowered = name.lower()
                    if lowered.endswith('.pdf'):
                        result = self.pdf_process(name, path, classify, img_output_dir,
                                                  wb_output_dir, pdf_output_dir)
                    elif lowered.endswith('.tif'):
                        result = self.tif_process(name, path, classify, img_output_dir,
                                                  wb_output_dir, tiff_output_dir)
                    else:
                        result = self.img_process(name, path, classify, wb_output_dir,
                                                  img_output_dir, pdf_output_dir)
                    self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
                else:
                    result = None
                    # Log the offending entry itself (this used to print input_dir).
                    self.folder_log.info('{0} [path is dir] [path={1}]'.format(self.log_base, path))
                    failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
                    shutil.move(path, failed_path)
            except OSError:
                os_error_filename_set.add(name)
                self.folder_log.error('{0} [os error] [path={1}] [error={2}]'.format(
                    self.log_base, path, traceback.format_exc()))
            except Exception:
                try:
                    self.folder_log.error('{0} [file error] [path={1}] [error={2}]'.format(
                        self.log_base, path, traceback.format_exc()))
                    failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
                    shutil.move(path, failed_path)
                except Exception:
                    os_error_filename_set.add(name)
                    self.folder_log.error('{0} [file move error] [path={1}] [error={2}]'.format(
                        self.log_base, path, traceback.format_exc()))
            else:
                # A dict is one record, a list is several; anything else is skipped.
                if isinstance(result, dict):
                    records = [result] if len(result) > 0 else []
                elif isinstance(result, list):
                    records = result
                else:
                    records = []
                if records:
                    date_str = time.strftime("%Y-%m-%d")
                    for record in records:
                        result_queue.put(
                            {
                                self.CLASSIFY_KEY: classify,
                                self.RESULT_KEY: record,
                                self.DATE_KEY: date_str
                            }
                        )
def handle(self, *args, **kwargs):
    """Start one worker per input folder plus the workbook writer.

    Returns at once when no input folder is configured. Otherwise one
    process per entry of ``self.input_dirs`` runs ``folder_process`` and one
    extra process runs ``wb_process``; the call blocks until all of them
    have ended.
    """
    if not self.input_dirs:
        return
    result_queue = Queue()
    workers = []
    one_input_dir = None
    for classify_idx, input_dir in self.input_dirs.items():
        if one_input_dir is None:
            one_input_dir = input_dir
        # The key starts with the numeric classification id, e.g. '128_...'.
        classify = int(classify_idx.split('_')[0])
        workers.append(Process(target=self.folder_process, args=(input_dir, classify, result_queue)))
    # The daily workbooks live two levels above the first input folder.
    wb_dir = os.path.dirname(os.path.dirname(one_input_dir))
    workers.append(Process(target=self.wb_process, args=(wb_dir, result_queue, )))
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    self.folder_log.info('{0} [stop safely]'.format(self.log_base))
src/apps/doc/management/commands/folder_ocr_process.py
View file @
c1ca6fa
...
...
@@ -15,7 +15,7 @@ from settings import conf
from
common.mixins
import
LoggerMixin
from
common.tools.pdf_to_img
import
PDFHandler
from
apps.doc
import
consts
from
apps.doc.exceptions
import
OCR1Exception
,
OCR4Exception
,
LTGTException
from
apps.doc.exceptions
import
OCR1Exception
,
OCR4Exception
from
apps.doc.ocr.wb
import
BSWorkbook
...
...
@@ -48,11 +48,6 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
=
'[folder ocr process]'
# 处理文件开关
self
.
switch
=
True
self
.
ltgt_classify_mapping
=
{
128
:
'执行裁定书'
,
129
:
'民事判决书'
,
130
:
'民事调解书'
}
# 睡眠时间
self
.
sleep_time
=
float
(
conf
.
SLEEP_SECOND_FOLDER
)
# input folder
...
...
@@ -60,7 +55,6 @@ class Command(BaseCommand, LoggerMixin):
# ocr相关
self
.
ocr_url
=
conf
.
OCR_URL_FOLDER
self
.
ocr_url_4
=
conf
.
IC_URL
self
.
ltgt_ocr_url
=
conf
.
LTGT_URL
# 优雅退出信号:15
signal
.
signal
(
signal
.
SIGTERM
,
self
.
signal_handler
)
...
...
@@ -213,59 +207,6 @@ class Command(BaseCommand, LoggerMixin):
else
:
self
.
folder_log
.
warn
(
'{0} [ocr failed] [img_path={1}]'
.
format
(
self
.
log_base
,
img_path
))
def
ltgt_ocr_process
(
self
,
img_path_list
,
label
,
path
):
img_data_list
=
[]
for
img_path
in
img_path_list
:
if
os
.
path
.
exists
(
img_path
):
with
open
(
img_path
,
'rb'
)
as
f
:
base64_data
=
base64
.
b64encode
(
f
.
read
())
# 获取解码后的base64值
file_data
=
base64_data
.
decode
()
img_data_list
.
append
(
file_data
)
json_data
=
{
"label"
:
label
,
"img_data_list"
:
img_data_list
}
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
start_time
=
time
.
time
()
ocr_response
=
requests
.
post
(
self
.
ltgt_ocr_url
,
json
=
json_data
)
if
ocr_response
.
status_code
!=
200
:
raise
LTGTException
(
'{0} ltgt ocr status code: {1}'
.
format
(
self
.
log_base
,
ocr_response
.
status_code
))
except
Exception
as
e
:
self
.
folder_log
.
warn
(
'{0} [ltgt ocr failed] [times={1}] [path={2}] [error={3}]'
.
format
(
self
.
log_base
,
times
,
path
,
traceback
.
format_exc
()))
else
:
ocr_res
=
ocr_response
.
json
()
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
folder_log
.
info
(
'{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'
.
format
(
self
.
log_base
,
path
,
ocr_res
,
speed_time
))
return
ocr_res
else
:
self
.
folder_log
.
warn
(
'{0} [ltgt ocr failed] [path={1}]'
.
format
(
self
.
log_base
,
path
))
def
ltgt_res_process
(
self
,
ocr_res
,
label
,
excel_path
):
try
:
if
isinstance
(
ocr_res
,
dict
):
if
ocr_res
.
get
(
'code'
)
==
1
:
result_dict
=
ocr_res
.
get
(
'data'
,
{})
wb
=
BSWorkbook
(
set
(),
set
(),
set
(),
set
(),
set
())
rebuild_res
=
wb
.
ltgt_build
(
label
,
result_dict
)
wb
.
remove_base_sheet
()
wb
.
save
(
excel_path
)
except
Exception
as
e
:
self
.
folder_log
.
error
(
'{0} [wb build error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
excel_path
,
traceback
.
format_exc
()))
def
ltgt_process
(
self
,
img_path_list
,
label
,
excel_path
,
path
):
ocr_res
=
self
.
ltgt_ocr_process
(
img_path_list
,
label
,
path
)
self
.
ltgt_res_process
(
ocr_res
,
label
,
excel_path
)
def
images_process
(
self
,
img_path_list
,
classify
,
excel_path
):
all_res
=
{}
for
img_path
in
img_path_list
:
...
...
@@ -279,9 +220,6 @@ class Command(BaseCommand, LoggerMixin):
img_save_path
,
excel_path
,
pdf_save_path
=
self
.
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
)
self
.
folder_log
.
info
(
'{0} [pdf to img start] [path={1}]'
.
format
(
self
.
log_base
,
path
))
pdf_handler
=
PDFHandler
(
path
,
img_save_path
)
if
classify
in
self
.
ltgt_classify_mapping
:
pdf_handler
.
extract_page_image
()
else
:
pdf_handler
.
extract_image
()
self
.
folder_log
.
info
(
'{0} [pdf to img end] [path={1}]'
.
format
(
self
.
log_base
,
path
))
except
Exception
as
e
:
...
...
@@ -289,9 +227,6 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
path
,
traceback
.
format_exc
()))
raise
e
else
:
if
classify
in
self
.
ltgt_classify_mapping
:
self
.
ltgt_process
(
pdf_handler
.
img_path_list
,
self
.
ltgt_classify_mapping
[
classify
],
excel_path
,
path
)
else
:
self
.
images_process
(
pdf_handler
.
img_path_list
,
classify
,
excel_path
)
shutil
.
move
(
path
,
pdf_save_path
)
...
...
@@ -308,9 +243,6 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
path
,
traceback
.
format_exc
()))
raise
e
else
:
if
classify
in
self
.
ltgt_classify_mapping
:
self
.
ltgt_process
(
tiff_handler
.
img_path_list
,
self
.
ltgt_classify_mapping
[
classify
],
excel_path
,
path
)
else
:
self
.
images_process
(
tiff_handler
.
img_path_list
,
classify
,
excel_path
)
shutil
.
move
(
path
,
tiff_save_path
)
...
...
@@ -321,9 +253,6 @@ class Command(BaseCommand, LoggerMixin):
self
.
folder_log
.
error
(
'{0} [get path error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
else
:
if
classify
in
self
.
ltgt_classify_mapping
:
self
.
ltgt_process
([
path
],
self
.
ltgt_classify_mapping
[
classify
],
excel_path
,
path
)
else
:
ocr_res
=
self
.
ocr_process
(
path
,
classify
)
all_res
=
{
path
:
ocr_res
}
self
.
res_process
(
all_res
,
classify
,
excel_path
)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment