Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
91ff8153
authored
2025-08-08 12:26:46 +0800
by
冯轩
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
init reocr
1 parent
e08e5c00
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
68 additions
and
12 deletions
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/views.py
src/apps/doc/management/commands/ocr_process.py
View file @
91ff815
...
...
@@ -100,7 +100,7 @@ class Command(BaseCommand, LoggerMixin):
if
len
(
info_tuple
)
==
2
:
business_type
,
doc_id_str
=
info_tuple
else
:
business_type
,
doc_id_str
,
classify_1_str
=
info_tuple
business_type
,
doc_id_str
,
classify_1_str
,
re_ocr_flag
=
info_tuple
doc_id
=
int
(
doc_id_str
)
doc_class
=
HILDoc
if
business_type
==
consts
.
HIL_PREFIX
else
AFCDoc
zip_doc
=
doc_class
.
objects
.
filter
(
id
=
doc_id
)
.
first
()
...
...
@@ -124,7 +124,7 @@ class Command(BaseCommand, LoggerMixin):
else
:
self
.
online_log
.
info
(
'{0} [zip_2_pdfs] [db save end] [task_str={1}]'
.
format
(
self
.
log_base
,
task_str
))
return
zip_doc
,
business_type
return
zip_doc
,
business_type
,
re_ocr_flag
def
get_doc_info
(
self
,
task_str
,
is_priority
=
False
):
try
:
...
...
@@ -135,7 +135,7 @@ class Command(BaseCommand, LoggerMixin):
classify_1_str
=
'0'
rebuild_task_str
=
task_str
else
:
business_type
,
doc_id_str
,
classify_1_str
=
info_tuple
business_type
,
doc_id_str
,
classify_1_str
,
re_ocr_flag
=
info_tuple
rebuild_task_str
=
'{0}{1}{2}'
.
format
(
business_type
,
consts
.
SPLIT_STR
,
doc_id_str
)
doc_id
=
int
(
doc_id_str
)
doc_class
=
HILDoc
if
business_type
==
consts
.
HIL_PREFIX
else
AFCDoc
...
...
@@ -160,7 +160,7 @@ class Command(BaseCommand, LoggerMixin):
else
:
self
.
online_log
.
info
(
'{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'
.
format
(
self
.
log_base
,
task_str
,
is_priority
))
return
doc
,
business_type
,
rebuild_task_str
,
classify_1_str
return
doc
,
business_type
,
rebuild_task_str
,
classify_1_str
,
re_ocr_flag
# def pdf_download(self, doc, pdf_path):
# if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
...
...
@@ -1202,7 +1202,7 @@ class Command(BaseCommand, LoggerMixin):
self
.
online_log
.
info
(
'{0} [zip_2_pdfs] [task={1}]'
.
format
(
self
.
log_base
,
task_str
))
# 2. 修改doc状态: 识别中
zip_doc
,
business_type
=
self
.
get_zip_doc_info
(
task_str
)
zip_doc
,
business_type
,
re_ocr_flag
=
self
.
get_zip_doc_info
(
task_str
)
if
zip_doc
is
None
:
time
.
sleep
(
self
.
sleep_time_doc_get
)
continue
...
...
@@ -1287,7 +1287,7 @@ class Command(BaseCommand, LoggerMixin):
target_pdf_path
=
os
.
path
.
join
(
pdf_doc_data_path
,
'{0}.pdf'
.
format
(
pdf_doc
.
id
))
shutil
.
move
(
pdf_path
,
target_pdf_path
)
pdf_task_str
=
consts
.
SPLIT_STR
.
join
([
business_type
,
str
(
pdf_doc
.
id
),
'0'
])
pdf_task_str
=
consts
.
SPLIT_STR
.
join
([
business_type
,
str
(
pdf_doc
.
id
),
'0'
,
re_ocr_flag
])
pdf_task_str_list
.
append
(
pdf_task_str
)
except
Exception
as
e
:
self
.
online_log
.
warn
(
'{0} [zip_2_pdfs] [recreate pdf task failed] [task={1}] [pdf_path={2}]'
...
...
@@ -1336,7 +1336,7 @@ class Command(BaseCommand, LoggerMixin):
try
:
# 1. 从队列获取文件信息
doc
,
business_type
,
task_str
,
classify_1_str
=
self
.
get_doc_info
(
task_str
,
is_priority
)
doc
,
business_type
,
task_str
,
classify_1_str
,
re_ocr_flag
=
self
.
get_doc_info
(
task_str
,
is_priority
)
# 队列为空时的处理
if
doc
is
None
:
time
.
sleep
(
self
.
sleep_time_doc_get
)
...
...
@@ -1386,7 +1386,8 @@ class Command(BaseCommand, LoggerMixin):
self
.
online_log
.
info
(
'{0} [pdf to img start] [task={1}] [times={2}]'
.
format
(
self
.
log_base
,
task_str
,
times
))
start_time
=
time
.
time
()
pdf_handler
.
extract_image
(
max_img_count
)
max_img_count_or_none
=
None
if
re_ocr_flag
==
'Y'
else
max_img_count
pdf_handler
.
extract_image
(
max_img_count_or_none
)
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
online_log
.
info
(
'{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'
.
format
(
...
...
@@ -1404,7 +1405,7 @@ class Command(BaseCommand, LoggerMixin):
self
.
online_log
.
warn
(
'{0} [pdf to img failed (pdf img empty)] [task={1}]'
.
format
(
self
.
log_base
,
task_str
))
raise
Exception
(
'pdf img empty'
)
elif
pdf_handler
.
img_count
>=
max_img_count
:
elif
re_ocr_flag
==
'N'
and
pdf_handler
.
img_count
>=
max_img_count
:
self
.
online_log
.
info
(
'{0} [too many pdf image] [task={1}] [img_count={2}]'
.
format
(
self
.
log_base
,
task_str
,
pdf_handler
.
img_count
))
...
...
src/apps/doc/views.py
View file @
91ff815
...
...
@@ -589,6 +589,11 @@ invoice_download_args = {
'application_ids'
:
fields
.
Str
(
required
=
True
),
}
# Request schema consumed by ``DocReOcrView.post`` through ``@use_args``.
# Both values are mandatory integers.
doc_reocr_args = {
    # Primary key of the document to push through OCR again.
    'doc_id': fields.Int(required=True),
    # Passed to ``get_doc_class`` to choose the document model and task prefix.
    'application_entity': fields.Int(required=True),
}
class
UploadDocView
(
GenericView
,
DocHandler
):
# permission_classes = []
...
...
@@ -698,7 +703,7 @@ class UploadDocView(GenericView, DocHandler):
or
document_name
.
endswith
(
'.RAR'
):
is_zip
=
True
task
=
consts
.
SPLIT_STR
.
join
([
prefix
,
str
(
doc
.
id
),
str
(
classify_1
)])
task
=
consts
.
SPLIT_STR
.
join
([
prefix
,
str
(
doc
.
id
),
str
(
classify_1
)
,
'N'
])
enqueue_res
=
rh
.
enqueue
([
task
],
is_priority
,
is_zip
)
self
.
running_log
.
info
(
'[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
'[is_priority={3}] [enqueue_res={4}] [is_fsm={5} [classify_1={6}]]'
.
format
(
args
,
prefix
,
doc
.
id
,
...
...
@@ -1249,7 +1254,7 @@ class DocView(DocGenericView, DocHandler):
break
# tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
task
=
consts
.
SPLIT_STR
.
join
([
prefix
,
str
(
doc
.
id
),
str
(
classify_1
)])
task
=
consts
.
SPLIT_STR
.
join
([
prefix
,
str
(
doc
.
id
),
str
(
classify_1
)
,
'N'
])
enqueue_res
=
rh
.
enqueue
([
task
],
is_priority
)
self
.
running_log
.
info
(
'[mock doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
...
...
@@ -2068,4 +2073,54 @@ class InvoiceQueryInfoView(GenericView):
return
response2
.
ok
(
data
=
java_result
)
except
Exception
as
e
:
self
.
running_log
.
error
(
"invoice info request to java error, url:{0}, param:{1}, errorMsg:{2}"
.
format
(
url
,
json
.
dumps
(
body
),
traceback
.
format_exc
()))
\ No newline at end of file
url
,
json
.
dumps
(
body
),
traceback
.
format_exc
()))
class DocReOcrView(GenericView, DocHandler):
    """Endpoint that re-submits an already stored document to the OCR queue.

    The task string pushed to the queue has the form
    ``<prefix><SPLIT_STR><doc id><SPLIT_STR><classify_1><SPLIT_STR>Y`` where the
    trailing ``Y`` marks the task as a re-OCR request (``N`` is used for
    ordinary uploads).
    """

    permission_classes = [IsAuthenticated]
    authentication_classes = [OAuth2AuthenticationWithUser]
    # required_scopes = ['write']

    # Archive suffixes that route the task to the zip queue. The match is
    # case-sensitive and limited to exactly these four spellings.
    _ARCHIVE_SUFFIXES = ('.zip', '.rar', '.ZIP', '.RAR')

    @staticmethod
    def _classify_by_name(keyword_pairs, document_name):
        """Return the classify value of the first keyword found in the name.

        ``keyword_pairs`` is an iterable of ``(keyword, classify_1)`` pairs, or
        ``None`` when the keyword map has no entry for the current prefix.
        Returns ``0`` when nothing matches.
        """
        for keyword, classify_1_tmp in keyword_pairs or ():
            if keyword in document_name:
                return classify_1_tmp
        return 0

    # Re-recognition (re-OCR) endpoint for existing files.
    @use_args(doc_reocr_args, location='data')
    def post(self, request, args):
        """Queue an existing document for re-OCR.

        Args:
            request: The incoming HTTP request (not read directly here).
            args: Parsed payload containing ``doc_id`` and
                ``application_entity`` (see ``doc_reocr_args``).

        Returns:
            ``response.ok()`` once the task has been handed to ``rh.enqueue``.

        Raises:
            rest_framework.exceptions.NotFound: If no document with the given
                id exists for the selected document class.
        """
        # NOTE(review): assumes GenericView is Django REST framework based
        # (permission_classes / authentication_classes suggest so) - confirm.
        from rest_framework.exceptions import NotFound

        application_entity = args.get('application_entity')
        doc_id = args.get('doc_id')
        doc_class, prefix = self.get_doc_class(application_entity)
        doc = doc_class.objects.filter(id=doc_id).first()
        if doc is None:
            # ``first()`` yields None for an unknown id; fail explicitly
            # instead of hitting an AttributeError on ``doc.application_id``.
            self.running_log.info(
                '[doc reocr failed] [doc not found] [args={0}] [business_type={1}]'.format(
                    args, prefix))
            raise NotFound('doc not found: {0}'.format(doc_id))

        # 3. Choose which queue the task goes to.
        is_priority = PriorityApplication.objects.filter(
            application_id=doc.application_id, on_off=True).exists()

        classify_1 = 0
        # E-contract: Econtract or OVP (FSM)
        if doc.data_source in (consts.DATA_SOURCE_LIST[2], consts.DATA_SOURCE_LIST[3]):
            if doc.document_scheme == consts.DOC_SCHEME_LIST[1]:
                classify_1 = self._classify_by_name(
                    consts.ECONTRACT_KEYWORDS_MAP.get(prefix), doc.document_name)
        # FSM contracts: WEP/MSI/SC/SC2
        elif doc.data_source == consts.DATA_SOURCE_LIST[0] \
                and doc.document_scheme == consts.DOC_SCHEME_LIST[0]:
            classify_1 = self._classify_by_name(
                consts.FSM_ECONTRACT_KEYWORDS_MAP.get(prefix), doc.document_name)

        is_zip = doc.document_name.endswith(self._ARCHIVE_SUFFIXES)

        # task = 'AFC_11001_0_Y' / 'AFC_11001_0_N': the trailing flag says
        # whether this is a re-OCR task (Y) or not (N).
        task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'Y'])
        enqueue_res = rh.enqueue([task], is_priority, is_zip)
        # NOTE(review): the message still reads "doc upload success" although
        # this is the re-OCR path; kept unchanged in case log tooling matches it.
        self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
                              '[is_priority={3}] [enqueue_res={4}] [classify_1={5}]'.format(
                                  args, prefix, doc.id, is_priority, enqueue_res, classify_1))
        return response.ok()
\ No newline at end of file
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment