Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
d9b0ae8c
authored
2020-11-11 19:40:45 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add folder ocr process
1 parent
4076848e
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
243 additions
and
2 deletions
.gitignore
src/apps/doc/consts.py
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/ocr/wb.py
src/settings/conf/prd.ini
src/settings/conf/sit.ini
src/settings/conf/uat.ini
.gitignore
View file @
d9b0ae8
...
...
@@ -34,5 +34,4 @@ ocr/*
# 脚本
src/*.sh
test*
folder_ocr_process.py
\ No newline at end of file
test*
\ No newline at end of file
...
...
src/apps/doc/consts.py
View file @
d9b0ae8
...
...
@@ -83,6 +83,7 @@ RES_SHEET_HEADER = ('页码', '图片序号', '检测图片序号', '结果')
# Recognition result labels. The string values are runtime text and stay in
# Chinese; the trailing comments give an English reading of each one.
RES_SUCCESS = '识别成功'  # recognition succeeded
RES_SUCCESS_OTHER = '识别成功(其他类)'  # recognition succeeded (other category)
RES_SUCCESS_EMPTY = '识别成功(空数据)'  # recognition succeeded (empty data)
RES_FAILED = '识别失败'  # recognition failed
RES_FAILED_1 = '识别失败(阶段1)'  # recognition failed (stage 1)
RES_FAILED_2 = '识别失败(阶段2)'  # recognition failed (stage 2)
RES_FAILED_3 = '识别失败(阶段1数据格式错误)'  # recognition failed (stage 1 data format error)
...
...
src/apps/doc/management/commands/folder_ocr_process.py
0 → 100644
View file @
d9b0ae8
import
os
import
re
import
time
import
json
import
shutil
import
base64
import
signal
import
asyncio
import
aiohttp
import
difflib
import
requests
import
traceback
from
collections
import
Counter
from
datetime
import
datetime
,
date
from
django.utils
import
timezone
from
django.core.management
import
BaseCommand
from
multiprocessing
import
Process
,
Queue
,
Manager
,
Lock
from
settings
import
conf
from
common.mixins
import
LoggerMixin
from
common.tools.file_tools
import
write_zip_file
from
common.tools.pdf_to_img
import
PDFHandler
from
apps.doc
import
consts
from
apps.doc.ocr.edms
import
EDMS
,
rh
from
apps.doc.named_enum
import
KeywordsType
from
apps.doc.exceptions
import
EDMSException
,
OCR1Exception
,
OCR2Exception
from
apps.doc.ocr.wb
import
BSWorkbook
,
Workbook
from
apps.doc.models
import
DocStatus
,
HILDoc
,
AFCDoc
,
Keywords
class
Command
(
BaseCommand
,
LoggerMixin
):
def __init__(self):
    """Read the folder-OCR settings from ``conf`` and install the SIGTERM handler."""
    super().__init__()
    self.log_base = '[folder ocr process]'
    # File-processing switch: the polling loop in folder_process runs
    # while this stays True.
    self.switch = True
    # OCR service URL used by ocr_process.
    self.ocr_url = conf.OCR_URL_FOLDER
    # Input folders: whatever conf.get_namespace returns for the
    # 'INPUT_DIR_' prefix; handle() iterates it with .items().
    self.input_dirs = conf.get_namespace('INPUT_DIR_')
    # Sleep time, passed to time.sleep() between polls.
    self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
    # Graceful-exit signal: 15 (SIGTERM).
    signal.signal(signal.SIGTERM, self.signal_handler)
def
signal_handler
(
self
,
sig
,
frame
):
self
.
switch
=
False
# 停止处理文件
def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino, part_idx):
    """Record the outcome for one detected part and collect its license data.

    Appends ``(pno, ino, part_idx, <result label>)`` to ``res_list``. When
    ``ocr_data['data']`` is non-empty, its items are also added to
    ``license_summary[classify]``. Both containers are mutated in place;
    nothing is returned.
    """
    # Category codes noted by the original author: '0' = ID card,
    # '1' = residence permit.
    license_data = ocr_data.get('data', [])
    if license_data:
        res_list.append((pno, ino, part_idx, consts.RES_SUCCESS))
        license_summary.setdefault(classify, []).extend(license_data)
    else:
        res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY))
@staticmethod
def
parse_img_path
(
img_path
):
# 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
img_name
,
_
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
img_path
))
if
re
.
match
(
r'page_\d+_img_\d+'
,
img_name
):
part_list
=
img_name
.
split
(
'_'
)
return
img_name
,
int
(
part_list
[
1
])
+
1
,
int
(
part_list
[
3
])
+
1
else
:
return
img_name
,
1
,
1
@staticmethod
def
get_path
(
name
,
img_output_dir
,
wb_output_dir
):
time_stamp
=
int
(
time
.
time
())
new_name
=
'{0}_{1}'
.
format
(
time_stamp
,
name
)
img_save_path
=
os
.
path
.
join
(
img_output_dir
,
new_name
)
excel_name
=
'{0}.xlsx'
.
format
(
os
.
path
.
splitext
(
new_name
)[
0
])
excel_path
=
os
.
path
.
join
(
wb_output_dir
,
excel_name
)
return
img_save_path
,
excel_path
def
res_process
(
self
,
all_res
,
classify
,
excel_path
):
try
:
license_summary
=
{}
res_list
=
[]
if
not
all_res
:
return
else
:
for
img_path
,
ocr_res
in
all_res
.
items
():
img_name
,
pno
,
ino
=
self
.
parse_img_path
(
img_path
)
part_idx
=
1
if
isinstance
(
ocr_res
,
dict
):
if
ocr_res
.
get
(
'code'
)
==
1
:
data_list
=
ocr_res
.
get
(
'data'
,
[])
if
isinstance
(
data_list
,
list
):
for
part_idx
,
ocr_data
in
enumerate
(
data_list
):
part_idx
=
part_idx
+
1
self
.
license1_process
(
ocr_data
,
license_summary
,
classify
,
res_list
,
pno
,
ino
,
part_idx
)
else
:
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_FAILED_3
))
else
:
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_FAILED
))
else
:
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_FAILED
))
wb
=
BSWorkbook
(
set
(),
set
(),
set
())
wb
.
simple_license_rebuild
(
license_summary
,
consts
.
DOC_SCHEME_LIST
[
0
])
wb
.
save
(
excel_path
)
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [wb build error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
excel_path
,
traceback
.
format_exc
()))
def
ocr_process
(
self
,
img_path
,
classify
):
if
os
.
path
.
exists
(
img_path
):
# TODO 图片验证
with
open
(
img_path
,
'rb'
)
as
f
:
base64_data
=
base64
.
b64encode
(
f
.
read
())
# 获取解码后的base64值
file_data
=
base64_data
.
decode
()
json_data
=
{
"file"
:
file_data
,
"classify"
:
classify
}
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
start_time
=
time
.
time
()
ocr_response
=
requests
.
post
(
self
.
ocr_url
,
json
=
json_data
)
if
ocr_response
.
status_code
!=
200
:
raise
OCR1Exception
(
'{0} ocr status code: {0}'
.
format
(
self
.
log_base
,
ocr_response
.
status_code
))
except
Exception
as
e
:
self
.
cronjob_log
.
warn
(
'{0} [ocr failed] [times={1}] [img_path={2}] [error={3}]'
.
format
(
self
.
log_base
,
times
,
img_path
,
traceback
.
format_exc
()))
else
:
ocr_res
=
ocr_response
.
json
()
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
cronjob_log
.
info
(
'{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'
.
format
(
self
.
log_base
,
img_path
,
ocr_res
,
speed_time
))
return
ocr_res
else
:
self
.
cronjob_log
.
warn
(
'{0} [ocr failed] [img_path={1}]'
.
format
(
self
.
log_base
,
img_path
))
def
pdf_process
(
self
,
name
,
path
,
classify
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
):
if
os
.
path
.
exists
(
path
):
try
:
img_save_path
,
excel_path
=
self
.
get_path
(
name
,
img_output_dir
,
wb_output_dir
)
self
.
cronjob_log
.
info
(
'{0} [pdf to img start] [path={1}]'
.
format
(
self
.
log_base
,
path
))
pdf_handler
=
PDFHandler
(
path
,
img_save_path
)
pdf_handler
.
extract_image
()
self
.
cronjob_log
.
info
(
'{0} [pdf to img end] [path={1}]'
.
format
(
self
.
log_base
,
path
))
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [pdf to img error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
else
:
all_res
=
{}
for
img_path
in
pdf_handler
.
img_path_list
:
ocr_res
=
self
.
ocr_process
(
img_path
,
classify
)
all_res
[
img_path
]
=
ocr_res
self
.
res_process
(
all_res
,
classify
,
excel_path
)
shutil
.
move
(
path
,
pdf_output_dir
)
def img_process(self, name, path, classify, wb_output_dir, img_output_dir):
    """OCR a single image file, build its workbook, then move the image to the output folder.

    Args:
        name: file name of the image.
        path: full path of the image in the input folder.
        classify: classification value forwarded to the OCR service.
        wb_output_dir: directory for the Excel workbook.
        img_output_dir: directory the processed image is moved into.

    If the output paths cannot be computed, the error is logged and the
    image is neither reported nor moved.
    """
    all_res = {path: self.ocr_process(path, classify)}

    try:
        img_save_path, excel_path = self.get_path(name, img_output_dir, wb_output_dir)
    except Exception:
        self.cronjob_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
            self.log_base, path, traceback.format_exc()))
        return

    self.res_process(all_res, classify, excel_path)
    # The destination carries a timestamp prefix (see get_path).
    shutil.move(path, img_save_path)
def
folder_process
(
self
,
input_dir
,
classify
):
output_dir
=
os
.
path
.
join
(
os
.
path
.
dirname
(
input_dir
),
'Output'
)
img_output_dir
=
os
.
path
.
join
(
output_dir
,
'image'
)
wb_output_dir
=
os
.
path
.
join
(
output_dir
,
'excel'
)
pdf_output_dir
=
os
.
path
.
join
(
output_dir
,
'pdf'
)
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
img_output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
wb_output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
pdf_output_dir
,
exist_ok
=
True
)
while
self
.
switch
:
# 1. 从input dir获取pdf or image
list_dir
=
os
.
listdir
(
input_dir
)
if
not
list_dir
:
self
.
cronjob_log
.
error
(
'{0} [input dir empty] [input_dir={1}]'
.
format
(
self
.
log_base
,
input_dir
))
time
.
sleep
(
self
.
sleep_time
)
for
name
in
list_dir
:
path
=
os
.
path
.
join
(
input_dir
,
name
)
if
os
.
path
.
isfile
(
path
):
self
.
cronjob_log
.
info
(
'{0} [file start] [path={1}]'
.
format
(
self
.
log_base
,
path
))
if
name
.
endswith
(
'.pdf'
):
self
.
pdf_process
(
name
,
path
,
classify
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
)
else
:
self
.
img_process
(
name
,
path
,
classify
,
wb_output_dir
,
img_output_dir
)
self
.
cronjob_log
.
info
(
'{0} [file end] [path={1}]'
.
format
(
self
.
log_base
,
path
))
def
handle
(
self
,
*
args
,
**
kwargs
):
process_list
=
[]
for
classify_idx
,
input_dir
in
self
.
input_dirs
.
items
():
classify
=
int
(
classify_idx
.
split
(
'_'
)[
0
])
process
=
Process
(
target
=
self
.
folder_process
,
args
=
(
input_dir
,
classify
))
process_list
.
append
(
process
)
for
p
in
process_list
:
p
.
start
()
for
p
in
process_list
:
p
.
join
()
self
.
cronjob_log
.
info
(
'{0} [stop safely]'
.
format
(
self
.
log_base
))
src/apps/doc/ocr/wb.py
View file @
d9b0ae8
...
...
@@ -583,6 +583,29 @@ class BSWorkbook(Workbook):
count
+=
1
count_list
.
append
((
field_str
,
count
))
def simple_license_rebuild(self, license_summary, document_scheme):
    """Write the collected license data into worksheets, one per license type.

    Walks ``consts.LICENSE_ORDER``; for every classify key that has entries
    in ``license_summary`` a sheet named after that entry is created and
    each license dict is written as ``(label, value...)`` rows followed by
    one blank separator row.

    Args:
        license_summary: mapping of classify key -> list of license dicts.
            May be mutated: see the IC/RP re-filing below.
        document_scheme: compared against ``consts.DOC_SCHEME_LIST[1]`` to
            decide whether the scheme-specific classify is used.
    """
    for classify, (_, name, field_order, side_diff, scheme_diff, _) in consts.LICENSE_ORDER:
        license_list = license_summary.get(classify)
        if not license_list:
            continue
        # The sheet is created before the per-license filtering below, so it
        # exists even if every entry ends up being skipped.
        ws = self.create_sheet(name)
        # For scheme-dependent license types under the second document
        # scheme, look fields up under MVC_CLASSIFY_SE instead.
        if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]:
            classify = consts.MVC_CLASSIFY_SE
        for license_dict in license_list:
            # IC entries whose '类别' (category) is '1' are re-filed under
            # RP_CLASSIFY instead of being written here.
            # NOTE(review): presumably '1' marks a residence permit, and the
            # re-filed entries are only written if RP_CLASSIFY comes later
            # in LICENSE_ORDER -- confirm against consts.
            if classify == consts.IC_CLASSIFY and license_dict.get('类别') == '1':
                license_summary.setdefault(consts.RP_CLASSIFY, []).append(license_dict)
                continue
            if side_diff:
                # Pick the field order by whether the marker key is present
                # in this license dict.
                key, field_order_yes, field_order_no = consts.FIELD_ORDER_MAP.get(classify)
                field_order = field_order_yes if key in license_dict else field_order_no
            for search_field, write_field in field_order:
                field_value = license_dict.get(search_field, '')
                if isinstance(field_value, list):
                    # List values are spread across the cells after the label.
                    ws.append((write_field, *field_value))
                else:
                    ws.append((write_field, field_value))
            # Blank row between consecutive licenses.
            ws.append((None, ))
def
res_sheet
(
self
,
res_list
):
if
res_list
:
res_list
.
sort
(
key
=
lambda
x
:
(
x
[
0
],
x
[
1
],
x
[
2
]))
...
...
src/settings/conf/prd.ini
View file @
d9b0ae8
...
...
@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 2
SLEEP_SECOND_IMG_PUT
=
2
SLEEP_SECOND_IMG_GET
=
0.5
SLEEP_SECOND_TASK_GET
=
2
SLEEP_SECOND_FOLDER
=
2
IMG_QUEUE_SIZE
=
500
EDMS_DOWNLOAD_URL
=
https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx
...
...
src/settings/conf/sit.ini
View file @
d9b0ae8
...
...
@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 10
SLEEP_SECOND_IMG_PUT
=
2
SLEEP_SECOND_IMG_GET
=
0.5
SLEEP_SECOND_TASK_GET
=
2
SLEEP_SECOND_FOLDER
=
2
IMG_QUEUE_SIZE
=
500
EDMS_DOWNLOAD_URL
=
https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx
...
...
src/settings/conf/uat.ini
View file @
d9b0ae8
...
...
@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 2
SLEEP_SECOND_IMG_PUT
=
2
SLEEP_SECOND_IMG_GET
=
0.5
SLEEP_SECOND_TASK_GET
=
2
SLEEP_SECOND_FOLDER
=
2
IMG_QUEUE_SIZE
=
500
EDMS_DOWNLOAD_URL
=
https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment