Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
ec638e4f
authored
2020-11-16 18:16:14 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
PROD Version
1 parent
364772ed
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
25 additions
and
17 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/consts.py
View file @
ec638e4
...
...
@@ -8,7 +8,7 @@ PAGE_SIZE_DEFAULT = 10
FIXED_APPLICATION_ID_PREFIX
=
'CH-S'
DOC_SCHEME_LIST
=
[
'ACCEPTANCE'
,
'SETTLEMENT'
,
'CONTRACT
MANAGEMENT'
]
DOC_SCHEME_LIST
=
[
'ACCEPTANCE'
,
'SETTLEMENT'
,
'CONTRACTMANAGEMENT'
]
DATA_SOURCE_LIST
=
[
'POS'
,
'EAPP'
,
'ECONTRACT'
]
HIL_PREFIX
=
'HIL'
...
...
@@ -31,7 +31,7 @@ DOWNLOAD_ACTION_TYPE = 'Downloaded'
DOC_SCHEMA_ID_FILL
=
{
'ACCEPTANCE'
:
(
1
,
'DFE-AutoFilingScript'
),
'SETTLEMENT'
:
(
20
,
'DFE-AutoFilingScript'
),
'CONTRACT
MANAGEMENT'
:
(
86
,
'Schema-Based'
)
'CONTRACTMANAGEMENT'
:
(
86
,
'Schema-Based'
)
}
BUSINESS_TYPE_DICT
=
{
HIL_PREFIX
:
'CO00002'
,
...
...
src/apps/doc/management/commands/ocr_process.py
View file @
ec638e4
...
...
@@ -72,7 +72,12 @@ class Command(BaseCommand, LoggerMixin):
return
None
,
None
,
None
self
.
cronjob_log
.
info
(
'{0} [get_doc_info success] [task={1}] [is_priority={2}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
))
doc
,
business_type
=
self
.
get_doc_object
(
task_str
)
try
:
doc
,
business_type
=
self
.
get_doc_object
(
task_str
)
except
Exception
as
e
:
rh
.
enqueue
([
task_str
],
is_priority
)
self
.
cronjob_log
.
error
(
'{0} [process error (get doc info in)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
raise
e
if
doc
is
None
:
self
.
cronjob_log
.
warn
(
'{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'
.
format
(
...
...
@@ -364,7 +369,7 @@ class Command(BaseCommand, LoggerMixin):
# summary['confidence'] = max(summary['confidence'])
return
merged_bs_summary
def
pdf_2_img_2_queue
(
self
,
img_queue
,
todo_count_dict
,
lock
):
def
pdf_2_img_2_queue
(
self
,
img_queue
,
todo_count_dict
,
lock
,
error_list
):
while
self
.
switch
:
try
:
# 1. 从队列获取文件信息
...
...
@@ -374,8 +379,10 @@ class Command(BaseCommand, LoggerMixin):
time
.
sleep
(
self
.
sleep_time_doc_get
)
continue
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process
failed (get doc into
)] [error={1}]'
.
format
(
self
.
cronjob_log
.
error
(
'{0} [process
error (get doc info out
)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
error_list
.
append
(
1
)
return
else
:
try
:
# 2. 从EDMS获取PDF文件
...
...
@@ -413,8 +420,8 @@ class Command(BaseCommand, LoggerMixin):
self
.
cronjob_log
.
error
(
'{0} [process failed (pdf to img)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
def
img_2_ocr_1
(
self
,
img_queue
,
todo_count_dict
,
res_dict
,
finish_queue
,
lock
,
url
):
while
True
:
def
img_2_ocr_1
(
self
,
img_queue
,
todo_count_dict
,
res_dict
,
finish_queue
,
lock
,
url
,
error_list
):
while
len
(
error_list
)
==
0
or
not
img_queue
.
empty
()
:
try
:
img_path
=
img_queue
.
get
(
block
=
False
)
except
Exception
as
e
:
...
...
@@ -478,8 +485,8 @@ class Command(BaseCommand, LoggerMixin):
self
.
cronjob_log
.
error
(
'{0} [process error (store ocr res)] [img_path={1}] [error={2}]'
.
format
(
self
.
log_base
,
img_path
,
traceback
.
format_exc
()))
def
res_2_wb
(
self
,
res_dict
,
finish_queue
,
lock
):
while
True
:
def
res_2_wb
(
self
,
res_dict
,
img_queue
,
finish_queue
,
lock
,
error_list
):
while
len
(
error_list
)
==
0
or
not
img_queue
.
empty
()
or
not
finish_queue
.
empty
()
:
try
:
task_str
=
finish_queue
.
get
(
block
=
False
)
except
Exception
as
e
:
...
...
@@ -605,8 +612,8 @@ class Command(BaseCommand, LoggerMixin):
doc
,
business_type
=
self
.
get_doc_object
(
task_str
)
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
consts
.
TMP_DIR_NAME
,
str
(
doc
.
id
))
excel_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.xlsx'
.
format
(
doc
.
id
))
src_excel_path
=
os
.
path
.
join
(
doc_data_path
,
'src.xlsx'
)
wb
.
save
(
src_excel_path
)
#
src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
#
wb.save(src_excel_path)
count_list
=
wb
.
rebuild
(
merged_bs_summary
,
license_summary
,
res_list
,
doc
.
document_scheme
)
wb
.
save
(
excel_path
)
except
Exception
as
e
:
...
...
@@ -637,8 +644,8 @@ class Command(BaseCommand, LoggerMixin):
img_save_path
=
os
.
path
.
join
(
doc_data_path
,
'img'
)
write_zip_file
(
img_save_path
,
os
.
path
.
join
(
doc_data_path
,
'{0}_img.zip'
.
format
(
doc
.
id
)))
shutil
.
rmtree
(
img_save_path
,
ignore_errors
=
True
)
#
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
#
os.remove(pdf_path)
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
os
.
remove
(
pdf_path
)
# os.remove(src_excel_path)
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process error (file remove 2)] [task={1}] [error={2}]'
.
format
(
...
...
@@ -681,7 +688,7 @@ class Command(BaseCommand, LoggerMixin):
setattr
(
doc
,
field
,
count
)
doc
.
save
()
self
.
cronjob_log
.
info
(
'{0} [process complete] [task={1}]'
.
format
(
self
.
log_base
,
task_str
))
#
os.remove(excel_path)
os
.
remove
(
excel_path
)
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process error (completed)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
...
...
@@ -695,21 +702,22 @@ class Command(BaseCommand, LoggerMixin):
def
handle
(
self
,
*
args
,
**
kwargs
):
lock
=
Lock
()
with
Manager
()
as
manager
:
error_list
=
manager
.
list
()
todo_count_dict
=
manager
.
dict
()
res_dict
=
manager
.
dict
()
img_queue
=
Queue
(
self
.
img_queue_size
)
finish_queue
=
Queue
()
process_list
=
[]
pdf_process
=
Process
(
target
=
self
.
pdf_2_img_2_queue
,
args
=
(
img_queue
,
todo_count_dict
,
lock
))
pdf_process
=
Process
(
target
=
self
.
pdf_2_img_2_queue
,
args
=
(
img_queue
,
todo_count_dict
,
lock
,
error_list
))
process_list
.
append
(
pdf_process
)
for
url
in
self
.
ocr_1_urls
.
values
():
ocr_1_process
=
Process
(
target
=
self
.
img_2_ocr_1
,
args
=
(
img_queue
,
todo_count_dict
,
res_dict
,
finish_queue
,
lock
,
url
))
img_queue
,
todo_count_dict
,
res_dict
,
finish_queue
,
lock
,
url
,
error_list
))
process_list
.
append
(
ocr_1_process
)
wb_process
=
Process
(
target
=
self
.
res_2_wb
,
args
=
(
res_dict
,
finish_queue
,
lock
))
wb_process
=
Process
(
target
=
self
.
res_2_wb
,
args
=
(
res_dict
,
img_queue
,
finish_queue
,
lock
,
error_list
))
process_list
.
append
(
wb_process
)
for
p
in
process_list
:
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment