周伟奇 / bmw-ocr
Commit b8745dc6, authored 2021-08-09 10:21:13 +0800 by 周伟奇
Merge branch 'feature/ltgt' into feature/0611
2 parents: d78669c5, f63f9c2a
Showing 6 changed files with 613 additions and 31 deletions
src/apps/doc/consts.py
src/apps/doc/exceptions.py
src/apps/doc/management/commands/folder_ltgt_process.py
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/ocr/wb.py
src/common/tools/pdf_to_img.py
src/apps/doc/consts.py
@@ -1448,4 +1448,16 @@ SE_SECOND_ID_FIELD_MAPPING = {

HEAD_LIST = ['Info', 'Index', 'License', 'Field', 'Input', 'OCR', 'Result', 'Position', 'Image', 'errorType']

# ----------------litigation------------------------
IC_FIELD_ORDER_2 = (('姓名', '姓名'),
                    ('公民身份号码', '公民身份号码'),
                    ('出生年月', '出生年月'),
                    ('住址', '住址'),
                    ('性别', '性别'),
                    ('民族', '民族'),)

IC_FIELD_ORDER_3 = (('有效期限', '有效期限'),
                    ('签发机关', '签发机关'),)

BC_FIELD_ORDER_2 = (('BankName', '发卡行名称'),
                    ('CardNum', '银行卡号'),
                    ('CardType', '银行卡类型'),)
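The field-order tuples added here are (search_field, write_field) pairs: the first element is looked up in an OCR result dict, the second is the label written into the output sheet, which is exactly how res_process in the new folder_ltgt_process.py consumes them. A minimal sketch of that lookup, with an invented sample result (the real values come from the OCR service):

# Illustration only: sample_res is made-up data standing in for one OCR result.
IC_FIELD_ORDER_2 = (('姓名', '姓名'), ('公民身份号码', '公民身份号码'))

sample_res = {'姓名': '张三', '公民身份号码': '110101199001010000'}

rows = []
for search_field, write_field in IC_FIELD_ORDER_2:
    # Missing fields fall back to an empty string, mirroring res_process.
    rows.append((write_field, sample_res.get(search_field, '')))

print(rows)  # [('姓名', '张三'), ('公民身份号码', '110101199001010000')]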
src/apps/doc/exceptions.py
@@ -13,6 +13,9 @@ class OCR2Exception(Exception):

class OCR4Exception(Exception):
    pass


class LTGTException(Exception):
    pass


class GCAPException(Exception):
    pass
src/apps/doc/management/commands/folder_ltgt_process.py
0 → 100644
import os
import re
import time
import json
import shutil
import base64
import signal
import requests
import traceback
from PIL import Image
from datetime import datetime
from django.core.management import BaseCommand
from multiprocessing import Process, Queue
from openpyxl import load_workbook, Workbook
from settings import conf
from common.mixins import LoggerMixin
from common.tools.pdf_to_img import PDFHandler
from apps.doc import consts
from apps.doc.exceptions import OCR1Exception, OCR2Exception, LTGTException
from apps.doc.ocr.wb import BSWorkbook


class TIFFHandler:

    def __init__(self, path, img_save_path):
        self.path = path
        self.img_save_path = img_save_path
        self.img_path_list = []

    def extract_image(self):
        os.makedirs(self.img_save_path, exist_ok=True)
        tiff = Image.open(self.path)
        tiff.load()
        for i in range(tiff.n_frames):
            try:
                save_path = os.path.join(self.img_save_path, 'page_{0}.jpeg'.format(i))
                tiff.seek(i)
                tiff.save(save_path)
                self.img_path_list.append(save_path)
            except EOFError:
                break


class Command(BaseCommand, LoggerMixin):

    def __init__(self):
        super().__init__()
        self.log_base = '[folder ltgt process]'
        # 处理文件开关
        self.switch = True
        self.ltgt_classify_mapping = {
            128: '执行裁定书',
            129: '民事判决书',
            130: '民事调解书'
        }
        self.sheet_content = {
            128: ('执行裁定书', ('承办法院', '案号/标号', '被执行人', '债权金额', '诉讼时间')),
            129: ('民事判决书', ('承办法院', '案号/标号', '被告', '判决结果: 贷款本金', '判决结果: 罚息',
                            '判决结果: 律师费', '判决结果: 案件受理费', '诉讼时间')),
            130: ('民事调解书', ('承办法院', '案号/标号', '被告', '协议内容: 支付金额', '协议内容: 案件受理费', '诉讼时间')),
        }
        self.DATE_KEY = 'date'
        self.CLASSIFY_KEY = 'classify'
        self.RESULT_KEY = 'result'
        self.daily_wb_name = 'Output_{0}.xlsx'
        self.short_sleep_time = 10
        self.long_sleep_time = 3600
        # 睡眠时间
        self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
        # input folder
        self.input_dirs = conf.get_namespace('LTGT_DIR_')
        # seperate folder name
        self.seperate_map = {
            consts.IC_CLASSIFY: 'IDCard',
            consts.BC_CLASSIFY: 'BankCard'
        }
        self.field_map = {
            consts.VAT_CLASSIFY: (consts.VAT_CN_NAME, None, None, consts.VATS_FIELD_ORDER),
            consts.IC_CLASSIFY: (consts.IC_CN_NAME, '有效期限', consts.IC_FIELD_ORDER_3, consts.IC_FIELD_ORDER_2),
            consts.BC_CLASSIFY: (consts.BC_CN_NAME, None, None, consts.BC_FIELD_ORDER_2)
        }
        # ocr相关
        self.ocr_url = conf.OCR_URL_FOLDER
        self.ocr_url_2 = conf.OCR2_URL_FOLDER
        # self.ocr_url_4 = conf.IC_URL
        self.ltgt_ocr_url = conf.LTGT_URL
        # 优雅退出信号:15
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, sig, frame):
        self.switch = False  # 停止处理文件

    def license1_process(self, ocr_data, all_res, classify):
        # 类别:'0'身份证, '1'居住证
        license_data = ocr_data.get('data', [])
        if not license_data:
            return
        if classify == consts.IC_CLASSIFY:
            for id_card_dict in license_data:
                try:
                    id_card_dict.pop('base64_img')
                except Exception as e:
                    continue
        all_res.extend(license_data)

    def license2_process(self, ocr_data, all_res, classify, img_path):
        pid, _, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
        file_data = ocr_data.get('section_img')
        if file_data is None:
            with open(img_path, 'rb') as f:
                base64_data = base64.b64encode(f.read())
                # 获取解码后的base64值
                file_data = base64_data.decode()
        json_data_2 = {
            "pid": str(pid),
            "filedata": file_data
        }

        for times in range(consts.RETRY_TIMES):
            try:
                start_time = time.time()
                ocr_2_response = requests.post(self.ocr_url_2, data=json_data_2)
                if ocr_2_response.status_code != 200:
                    raise OCR2Exception('ocr_2 status code: {0}'.format(ocr_2_response.status_code))
            except Exception as e:
                self.folder_log.warn('{0} [ocr_2 failed] [times={1}] [img_path={2}] [error={3}]'.format(
                    self.log_base, times, img_path, traceback.format_exc()))
            else:
                ocr_res_2 = json.loads(ocr_2_response.text)
                end_time = time.time()
                speed_time = int(end_time - start_time)
                self.folder_log.info('{0} [ocr_2 success] [img={1}] [speed_time={2}]'.format(
                    self.log_base, img_path, speed_time))

                if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
                    if pid == consts.BC_PID:
                        all_res.append(ocr_res_2)
                    else:
                        # 营业执照等
                        for result_dict in ocr_res_2.get('ResultList', []):
                            res_dict = {}
                            for field_dict in result_dict.get('FieldList', []):
                                res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
                            all_res.append(res_dict)

    @staticmethod
    def parse_img_path(img_path):
        # 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
        img_name, _ = os.path.splitext(os.path.basename(img_path))
        if re.match(r'page_\d+_img_\d+', img_name):
            part_list = img_name.split('_')
            return img_name, int(part_list[1]) + 1, int(part_list[3]) + 1
        else:
            return img_name, 1, 1

    @staticmethod
    def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir):
        time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
        new_name = '{0}_{1}'.format(time_stamp, name)
        img_save_path = os.path.join(img_output_dir, new_name)
        pdf_save_path = os.path.join(pdf_output_dir, new_name)
        excel_name = '{0}.xlsx'.format(os.path.splitext(new_name)[0])
        excel_path = os.path.join(wb_output_dir, excel_name)
        seperate_path = None if seperate_dir is None else os.path.join(seperate_dir, new_name)
        return img_save_path, excel_path, pdf_save_path, seperate_path

    def res_process(self, all_res, excel_path, classify):
        try:
            wb = BSWorkbook(set(), set(), set(), set(), set())
            sheet_name, key_field, side_field_order, src_field_order = self.field_map.get(classify)
            ws = wb.create_sheet(sheet_name)
            for res in all_res:
                if key_field is not None and key_field in res:
                    field_order = side_field_order
                else:
                    field_order = src_field_order
                for search_field, write_field in field_order:
                    field_value = res.get(search_field, '')
                    if isinstance(field_value, list):
                        ws.append((write_field, *field_value))
                    else:
                        ws.append((write_field, field_value))
                ws.append((None,))
            wb.remove_base_sheet()
            wb.save(excel_path)
        except Exception as e:
            self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
                self.log_base, excel_path, traceback.format_exc()))

    def basename(self, path):
        # A basename() variant which first strips the trailing slash, if present.
        # Thus we always get the last component of the path, even for directories.
        sep = os.path.sep + (os.path.altsep or '')
        return os.path.basename(path.rstrip(sep))

    def ocr_process(self, img_path, classify, all_res, seperate_dir):
        if os.path.exists(img_path):
            # TODO 图片验证
            with open(img_path, 'rb') as f:
                base64_data = base64.b64encode(f.read())
                # 获取解码后的base64值
                file_data = base64_data.decode()
            json_data = {
                "file": file_data,
            }
            if seperate_dir is None:
                json_data["classify"] = classify

            for times in range(consts.RETRY_TIMES):
                try:
                    start_time = time.time()
                    ocr_response = requests.post(self.ocr_url, json=json_data)
                    if ocr_response.status_code != 200:
                        raise OCR1Exception('{0} ocr status code: {1}'.format(self.log_base, ocr_response.status_code))
                except Exception as e:
                    self.folder_log.warn('{0} [ocr failed] [times={1}] [img_path={2}] [error={3}]'.format(
                        self.log_base, times, img_path, traceback.format_exc()))
                else:
                    ocr_res = ocr_response.json()
                    end_time = time.time()
                    speed_time = int(end_time - start_time)
                    self.folder_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format(
                        self.log_base, img_path, ocr_res, speed_time))

                    if isinstance(ocr_res, dict):
                        if ocr_res.get('code') == 1:
                            data_list = ocr_res.get('data', [])
                            if isinstance(data_list, list):
                                for ocr_data in data_list:
                                    if ocr_data.get('classify') == classify:
                                        if seperate_dir is not None:
                                            os.makedirs(seperate_dir, exist_ok=True)
                                            real_dst = os.path.join(seperate_dir, self.basename(img_path))
                                            if not os.path.exists(real_dst):
                                                shutil.move(img_path, seperate_dir)
                                        if classify in consts.LICENSE_CLASSIFY_SET_1:
                                            self.license1_process(ocr_data, all_res, classify)
                                        elif classify in consts.LICENSE_CLASSIFY_SET_2:
                                            self.license2_process(ocr_data, all_res, classify, img_path)
                    break
        else:
            self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))

    def ltgt_ocr_process(self, img_path_list, label, path):
        img_data_list = []
        for img_path in img_path_list:
            if os.path.exists(img_path):
                with open(img_path, 'rb') as f:
                    base64_data = base64.b64encode(f.read())
                    # 获取解码后的base64值
                    file_data = base64_data.decode()
                img_data_list.append(file_data)

        json_data = {
            "label": label,
            "img_data_list": img_data_list
        }

        for times in range(consts.RETRY_TIMES):
            try:
                start_time = time.time()
                ocr_response = requests.post(self.ltgt_ocr_url, json=json_data)
                if ocr_response.status_code != 200:
                    raise LTGTException('{0} ltgt ocr status code: {1}'.format(self.log_base, ocr_response.status_code))
            except Exception as e:
                self.folder_log.warn('{0} [ltgt ocr failed] [times={1}] [path={2}] [error={3}]'.format(
                    self.log_base, times, path, traceback.format_exc()))
            else:
                ocr_res = ocr_response.json()
                end_time = time.time()
                speed_time = int(end_time - start_time)
                self.folder_log.info('{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format(
                    self.log_base, path, ocr_res, speed_time))
                return ocr_res
        else:
            self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path))

    def ltgt_res_process(self, ocr_res, label, excel_path):
        try:
            if isinstance(ocr_res, dict):
                if ocr_res.get('code') == 1:
                    result_dict = ocr_res.get('data', {})
                    wb = BSWorkbook(set(), set(), set(), set(), set())
                    rebuild_res = wb.ltgt_build(label, result_dict)
                    wb.remove_base_sheet()
                    wb.save(excel_path)
                    return rebuild_res
        except Exception as e:
            self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
                self.log_base, excel_path, traceback.format_exc()))

    def ltgt_process(self, img_path_list, label, excel_path, path):
        ocr_res = self.ltgt_ocr_process(img_path_list, label, path)
        rebuild_res = self.ltgt_res_process(ocr_res, label, excel_path)
        return rebuild_res

    def images_process(self, img_path_list, classify, excel_path, seperate_dir):
        all_res = []
        for img_path in img_path_list:
            self.ocr_process(img_path, classify, all_res, seperate_dir)
        # if len(all_res) > 0:
        self.res_process(all_res, excel_path, classify)
        return all_res

    def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir):
        if os.path.exists(path):
            rebuild_res = None
            try:
                img_save_path, excel_path, pdf_save_path, seperate_path = self.get_path(
                    name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir)
                self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
                pdf_handler = PDFHandler(path, img_save_path)
                if classify in self.ltgt_classify_mapping:
                    pdf_handler.extract_page_image()
                else:
                    pdf_handler.extract_image()
                self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
            except Exception as e:
                self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
                    self.log_base, path, traceback.format_exc()))
                raise e
            else:
                if classify in self.ltgt_classify_mapping:
                    rebuild_res = self.ltgt_process(pdf_handler.img_path_list,
                                                    self.ltgt_classify_mapping[classify],
                                                    excel_path, path)
                else:
                    rebuild_res = self.images_process(pdf_handler.img_path_list, classify,
                                                      excel_path, seperate_path)
                shutil.move(path, pdf_save_path)
            return rebuild_res

    def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir):
        if os.path.exists(path):
            rebuild_res = None
            try:
                img_save_path, excel_path, tiff_save_path, seperate_path = self.get_path(
                    name, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir)
                self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path))
                tiff_handler = TIFFHandler(path, img_save_path)
                tiff_handler.extract_image()
                self.folder_log.info('{0} [tiff to img end] [path={1}]'.format(self.log_base, path))
            except Exception as e:
                self.folder_log.error('{0} [tiff to img error] [path={1}] [error={2}]'.format(
                    self.log_base, path, traceback.format_exc()))
                raise e
            else:
                if classify in self.ltgt_classify_mapping:
                    rebuild_res = self.ltgt_process(tiff_handler.img_path_list,
                                                    self.ltgt_classify_mapping[classify],
                                                    excel_path, path)
                else:
                    rebuild_res = self.images_process(tiff_handler.img_path_list, classify,
                                                      excel_path, seperate_path)
                shutil.move(path, tiff_save_path)
            return rebuild_res

    def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir, seperate_dir):
        rebuild_res = None
        try:
            img_save_path, excel_path, _, seperate_path = self.get_path(
                name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir)
        except Exception as e:
            self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
                self.log_base, path, traceback.format_exc()))
        else:
            if classify in self.ltgt_classify_mapping:
                rebuild_res = self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path)
            else:
                rebuild_res = self.images_process([path], classify, excel_path, seperate_path)
            shutil.move(path, img_save_path)
        return rebuild_res

    def wb_process(self, wb_dir, result_queue):
        while self.switch:
            result_list = []
            date_str = None
            for i in range(100):
                try:
                    result = result_queue.get(block=False)
                except Exception as e:
                    time.sleep(self.short_sleep_time)
                else:
                    if date_str is None:
                        date_str = result[self.DATE_KEY]
                        result_list.append(result)
                    elif result[self.DATE_KEY] == date_str:
                        result_list.append(result)
                    else:
                        break
            if date_str is None:
                time.sleep(self.long_sleep_time)
                continue
            else:
                wb_name = self.daily_wb_name.format(date_str)
                wb_path = os.path.join(wb_dir, wb_name)
                if os.path.isfile(wb_path):
                    wb = load_workbook(wb_path)
                else:
                    wb = Workbook()
                for result in result_list:
                    try:
                        if result[self.CLASSIFY_KEY] in self.sheet_content:
                            sheet_name, head_fields = self.sheet_content[result[self.CLASSIFY_KEY]]
                        else:
                            sheet_name, key_field, side_field_order, field_order = self.field_map[result[self.CLASSIFY_KEY]]
                            if key_field is not None and key_field in result[self.RESULT_KEY]:
                                head_fields = [a for a, _ in side_field_order]
                            else:
                                head_fields = [a for a, _ in field_order]
                        row = []
                        for field in head_fields:
                            row.append(result[self.RESULT_KEY].get(field))
                        if sheet_name in wb.sheetnames:
                            ws = wb.get_sheet_by_name(sheet_name)
                        else:
                            ws = wb.create_sheet(sheet_name)
                            ws.append(head_fields)
                        ws.append(row)
                    except Exception as e:
                        self.folder_log.info('{0} [daily wb failed] [result={1}] [error={2}]'.format(
                            self.log_base, result, traceback.format_exc()))
                wb.save(wb_path)

    def folder_process(self, input_dir, classify, is_combined, result_queue):
        while not os.path.isdir(input_dir):
            self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
            if self.switch:
                time.sleep(self.sleep_time)
                continue
            else:
                return

        output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
        seperate_dir = os.path.join(output_dir, self.seperate_map.get(classify, 'Unknown')) if is_combined else None
        img_output_dir = os.path.join(output_dir, 'image')
        wb_output_dir = os.path.join(output_dir, 'excel')
        pdf_output_dir = os.path.join(output_dir, 'pdf')
        tiff_output_dir = os.path.join(output_dir, 'tiff')
        failed_output_dir = os.path.join(output_dir, 'failed')
        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(img_output_dir, exist_ok=True)
        os.makedirs(wb_output_dir, exist_ok=True)
        os.makedirs(pdf_output_dir, exist_ok=True)
        os.makedirs(tiff_output_dir, exist_ok=True)
        os.makedirs(failed_output_dir, exist_ok=True)
        if seperate_dir is not None:
            os.makedirs(seperate_dir, exist_ok=True)
        os_error_filename_set = set()

        while self.switch:
            # if not os.path.isdir(input_dir):
            #     self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
            #     time.sleep(self.sleep_time)
            #     continue

            # 1. 从input dir获取pdf or image
            list_dir = os.listdir(input_dir)
            if not list_dir and len(os_error_filename_set) == 0:
                self.folder_log.info('{0} [input dir empty] [input_dir={1}]'.format(self.log_base, input_dir))
                time.sleep(self.sleep_time)
                continue
            all_file_set = set(list_dir)
            true_file_set = all_file_set - os_error_filename_set
            if len(true_file_set) == 0 and len(os_error_filename_set) > 0:
                true_file_set.add(os_error_filename_set.pop())
            for name in true_file_set:
                path = os.path.join(input_dir, name)
                try:
                    if os.path.isfile(path):
                        self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
                        if name.endswith('.pdf') or name.endswith('.PDF'):
                            result = self.pdf_process(name, path, classify, img_output_dir, wb_output_dir,
                                                      pdf_output_dir, seperate_dir)
                        elif name.endswith('.tif') or name.endswith('.TIF'):
                            result = self.tif_process(name, path, classify, img_output_dir, wb_output_dir,
                                                      tiff_output_dir, seperate_dir)
                        else:
                            result = self.img_process(name, path, classify, wb_output_dir, img_output_dir,
                                                      pdf_output_dir, seperate_dir)
                        self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
                    else:
                        result = None
                        self.folder_log.info('{0} [path is dir] [path={1}]'.format(self.log_base, input_dir))
                        failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
                        shutil.move(path, failed_path)
                except OSError:
                    os_error_filename_set.add(name)
                    self.folder_log.error('{0} [os error] [path={1}] [error={2}]'.format(
                        self.log_base, path, traceback.format_exc()))
                except Exception as e:
                    try:
                        self.folder_log.error('{0} [file error] [path={1}] [error={2}]'.format(
                            self.log_base, path, traceback.format_exc()))
                        failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
                        shutil.move(path, failed_path)
                    except Exception as e:
                        os_error_filename_set.add(name)
                        self.folder_log.error('{0} [file move error] [path={1}] [error={2}]'.format(
                            self.log_base, path, traceback.format_exc()))
                else:
                    if isinstance(result, dict) and len(result) > 0:
                        date_str = time.strftime("%Y-%m-%d")
                        result_queue.put({
                            self.CLASSIFY_KEY: classify,
                            self.RESULT_KEY: result,
                            self.DATE_KEY: date_str
                        })
                    elif isinstance(result, list) and len(result) > 0:
                        date_str = time.strftime("%Y-%m-%d")
                        for res in result:
                            result_queue.put({
                                self.CLASSIFY_KEY: classify,
                                self.RESULT_KEY: res,
                                self.DATE_KEY: date_str
                            })

    def handle(self, *args, **kwargs):
        if len(self.input_dirs) == 0:
            return
        result_queue = Queue()
        process_list = []
        one_input_dir = None
        for classify_idx, input_dir in self.input_dirs.items():
            if one_input_dir is None:
                one_input_dir = input_dir
            classify = int(classify_idx.split('_')[0])
            is_combined = True if int(classify_idx.split('_')[2]) == 1 else False
            process = Process(target=self.folder_process, args=(input_dir, classify, is_combined, result_queue))
            process_list.append(process)

        wb_dir = os.path.dirname(os.path.dirname(one_input_dir))
        wb_process = Process(target=self.wb_process, args=(wb_dir, result_queue,))
        process_list.append(wb_process)

        for p in process_list:
            p.start()
        for p in process_list:
            p.join()

        self.folder_log.info('{0} [stop safely]'.format(self.log_base))
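handle() starts one folder_process worker per entry of conf.get_namespace('LTGT_DIR_'): the first underscore-separated segment of the key is parsed as the classify id (128/129/130 for the litigation documents, or the licence classifies) and the third segment decides is_combined, which toggles the seperate_dir output. A minimal sketch of that parsing under an assumed key layout; the real key names and directories live in the project settings and are not part of this commit:

# Hypothetical namespace, shaped like what conf.get_namespace('LTGT_DIR_') could return.
input_dirs = {
    '128_0_1': '/data/ltgt/zhixing/Input',
    '129_0_0': '/data/ltgt/panjue/Input',
}

for classify_idx, input_dir in input_dirs.items():
    classify = int(classify_idx.split('_')[0])          # 128 -> 执行裁定书, 129 -> 民事判决书, ...
    is_combined = int(classify_idx.split('_')[2]) == 1   # third segment toggles the seperate_dir
    print(classify, is_combined, input_dir)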
src/apps/doc/management/commands/folder_ocr_process.py
@@ -61,13 +61,11 @@ class Command(BaseCommand, LoggerMixin):
     def signal_handler(self, sig, frame):
         self.switch = False  # 停止处理文件

-    def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino, part_idx, img_path):
+    def license1_process(self, ocr_data, license_summary, classify, img_path):
         # 类别:'0'身份证, '1'居住证
         license_data = ocr_data.get('data', [])
         if not license_data:
-            res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY))
             return
-        res_list.append((pno, ino, part_idx, consts.RES_SUCCESS))
         if classify == consts.MVC_CLASSIFY:
             # 车辆登记证 3/4页结果整合
             for mvc_dict in license_data:
                 try:

@@ -154,29 +152,21 @@ class Command(BaseCommand, LoggerMixin):
     def res_process(self, all_res, classify, excel_path):
         try:
             license_summary = {}
             res_list = []
             if not all_res:
                 return
             else:
                 for img_path, ocr_res in all_res.items():
-                    img_name, pno, ino = self.parse_img_path(img_path)
-                    part_idx = 1
+                    # img_name, pno, ino = self.parse_img_path(img_path)
+                    # part_idx = 1
                     if isinstance(ocr_res, dict):
                         if ocr_res.get('code') == 1:
                             data_list = ocr_res.get('data', [])
                             if isinstance(data_list, list):
-                                for part_idx, ocr_data in enumerate(data_list):
-                                    part_idx = part_idx + 1
-                                    self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino, part_idx, img_path)
-                            else:
-                                res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
-                        else:
-                            res_list.append((pno, ino, part_idx, consts.RES_FAILED))
-                    else:
-                        res_list.append((pno, ino, part_idx, consts.RES_FAILED))
+                                for ocr_data in data_list:
+                                    # part_idx = part_idx + 1
+                                    self.license1_process(ocr_data, license_summary, classify, img_path)
             wb = BSWorkbook(set(), set(), set(), set(), set())
             wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0])

@@ -217,6 +207,13 @@ class Command(BaseCommand, LoggerMixin):
         else:
             self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))

+    def images_process(self, img_path_list, classify, excel_path):
+        all_res = {}
+        for img_path in img_path_list:
+            ocr_res = self.ocr_process(img_path, classify)
+            all_res[img_path] = ocr_res
+        self.res_process(all_res, classify, excel_path)
+
     def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir):
         if os.path.exists(path):
             try:

@@ -230,11 +227,7 @@ class Command(BaseCommand, LoggerMixin):
                     self.log_base, path, traceback.format_exc()))
                 raise e
             else:
-                all_res = {}
-                for img_path in pdf_handler.img_path_list:
-                    ocr_res = self.ocr_process(img_path, classify)
-                    all_res[img_path] = ocr_res
-                self.res_process(all_res, classify, excel_path)
+                self.images_process(pdf_handler.img_path_list, classify, excel_path)
                 shutil.move(path, pdf_save_path)

     def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):

@@ -250,23 +243,18 @@ class Command(BaseCommand, LoggerMixin):
                     self.log_base, path, traceback.format_exc()))
                 raise e
             else:
-                all_res = {}
-                for img_path in tiff_handler.img_path_list:
-                    ocr_res = self.ocr_process(img_path, classify)
-                    all_res[img_path] = ocr_res
-                self.res_process(all_res, classify, excel_path)
+                self.images_process(tiff_handler.img_path_list, classify, excel_path)
                 shutil.move(path, tiff_save_path)

     def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
-        ocr_res = self.ocr_process(path, classify)
-        all_res = {path: ocr_res}
         try:
             img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
         except Exception as e:
             self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
                 self.log_base, path, traceback.format_exc()))
         else:
+            ocr_res = self.ocr_process(path, classify)
+            all_res = {path: ocr_res}
             self.res_process(all_res, classify, excel_path)
             shutil.move(path, img_save_path)

@@ -312,9 +300,9 @@ class Command(BaseCommand, LoggerMixin):
                 try:
                     if os.path.isfile(path):
                         self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
-                        if name.endswith('.pdf'):
+                        if name.endswith('.pdf') or name.endswith('.PDF'):
                             self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir)
-                        elif name.endswith('.tif'):
+                        elif name.endswith('.tif') or name.endswith('.TIF'):
                             self.tif_process(name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir)
                         else:
                             self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir)
src/apps/doc/ocr/wb.py
@@ -702,6 +702,31 @@ class BSWorkbook(Workbook):
        if field_str is not None:
            count_list.append((field_str, count))

    def ltgt_build(self, label, result_dict):
        ws = self.create_sheet(label)
        rebuild_res = {}
        for key, value in result_dict.items():
            if isinstance(value, list):
                value_list = [dict_item.get('words') for dict_item in value]
                ws.append((key, '、'.join(value_list)))
                rebuild_res[key] = '、'.join(value_list)
            elif isinstance(value, dict):
                if 'words' in value:
                    ws.append((key, value['words']))
                    rebuild_res[key] = value['words']
                else:
                    for sub_key, sub_value in value.items():
                        if isinstance(sub_value, dict):
                            ws.append(('{0}: {1}'.format(key, sub_key), sub_value.get('words', '')))
                            rebuild_res['{0}: {1}'.format(key, sub_key)] = sub_value.get('words', '')
                        else:
                            ws.append(('{0}: {1}'.format(key, sub_key), sub_value))
                            rebuild_res['{0}: {1}'.format(key, sub_key)] = sub_value
            else:
                ws.append((key, value))
                rebuild_res[key] = value
        return rebuild_res

    def simple_license_rebuild(self, license_summary, document_scheme):
        # for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []):
        #     if ic_license_dict.get('类别') == '1':
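ltgt_build flattens the OCR payload for one litigation document into a two-column sheet plus a flat dict: list values are joined with '、', {'words': ...} dicts are unwrapped, and nested dicts become '外层key: 内层key' rows (matching the sheet_content headers in folder_ltgt_process.py). A small self-contained sketch of just the rebuild_res part, with an invented result_dict standing in for the LTGT OCR response:

# Sketch of the flattening rules in BSWorkbook.ltgt_build; result_dict is made-up sample data.
result_dict = {
    '承办法院': {'words': '某某人民法院'},
    '被告': [{'words': '甲'}, {'words': '乙'}],
    '判决结果': {'贷款本金': {'words': '100000'}, '罚息': {'words': '5000'}},
    '诉讼时间': '2021-06-11',
}

rebuild_res = {}
for key, value in result_dict.items():
    if isinstance(value, list):
        rebuild_res[key] = '、'.join(item.get('words') for item in value)
    elif isinstance(value, dict):
        if 'words' in value:
            rebuild_res[key] = value['words']
        else:
            for sub_key, sub_value in value.items():
                words = sub_value.get('words', '') if isinstance(sub_value, dict) else sub_value
                rebuild_res['{0}: {1}'.format(key, sub_key)] = words
    else:
        rebuild_res[key] = value

print(rebuild_res)
# {'承办法院': '某某人民法院', '被告': '甲、乙',
#  '判决结果: 贷款本金': '100000', '判决结果: 罚息': '5000', '诉讼时间': '2021-06-11'}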
src/common/tools/pdf_to_img.py
@@ -225,3 +225,13 @@ class PDFHandler:
            else:
                self.merge_il(pdf, pno, il)
        self.img_count = len(self.img_path_list)

    def extract_page_image(self):
        self.img_path_list = []
        self.xref_set = set()
        os.makedirs(self.img_dir_path, exist_ok=True)
        with fitz.Document(self.path) as pdf:
            for pno in range(pdf.pageCount):
                page = pdf.loadPage(pno)
                self.page_to_png(page)
        self.img_count = len(self.img_path_list)
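extract_page_image renders every PDF page to an image via page_to_png, unlike extract_image, which pulls the images embedded in the PDF; folder_ltgt_process.py chooses it for the litigation classes, apparently so whole-page renders rather than embedded pictures are sent to the LTGT OCR endpoint. A minimal usage sketch, with placeholder paths and assuming PyMuPDF (fitz) is installed:

# Usage sketch only; '/tmp/sample.pdf' and '/tmp/sample_imgs' are placeholder paths.
from common.tools.pdf_to_img import PDFHandler

pdf_handler = PDFHandler('/tmp/sample.pdf', '/tmp/sample_imgs')
pdf_handler.extract_page_image()   # one rendered image per page, via page_to_png
print(pdf_handler.img_count, pdf_handler.img_path_list)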