Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
adb37243
authored
2025-07-23 16:05:55 +0800
by
冯轩
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
init
1 parent
e08e5c00
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
136 additions
and
1 deletion
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/views.py
src/apps/doc/management/commands/ocr_process.py
View file @
adb3724
...
...
@@ -1504,6 +1504,134 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
traceback
.
format_exc
()))
# error_list.append(1)
# return
elif
classify_1_str
==
'29'
:
# e-invoice
try
:
max_img_count
=
500
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
if
doc
.
application_id
.
startswith
(
consts
.
FIXED_APPLICATION_ID_PREFIX
):
self
.
online_log
.
info
(
'{0} [mo ni xia dan] [task={1}] [times={2}] '
'[pdf_path={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
pdf_path
))
elif
os
.
path
.
exists
(
pdf_path
):
self
.
online_log
.
info
(
'{0} [pdf from zip file] [task={1}] [times={2}] '
'[pdf_path={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
pdf_path
))
else
:
# self.edms.download(pdf_path, doc.metadata_version_id)
self
.
edms
.
download
(
pdf_path
,
doc
.
metadata_version_id
,
doc
.
document_scheme
,
business_type
)
self
.
online_log
.
info
(
'{0} [ecm download success] [task={1}] [times={2}] '
'[pdf_path={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
pdf_path
))
# 3.PDF文件提取图片
self
.
online_log
.
info
(
'{0} [pdf to img start] [task={1}] [times={2}]'
.
format
(
self
.
log_base
,
task_str
,
times
))
start_time
=
time
.
time
()
pdf_handler
.
extract_image_for_weixin
(
max_img_count
)
#沿用微信流程
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
online_log
.
info
(
'{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
speed_time
,
pdf_handler
.
is_new_modify
))
except
Exception
as
e
:
self
.
online_log
.
warn
(
'{0} [download or pdf to img failed] [task={1}] [times={2}] '
'[error={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
traceback
.
format_exc
()))
else
:
break
else
:
raise
Exception
(
'download or pdf to img failed'
)
if
pdf_handler
.
img_count
==
0
:
self
.
online_log
.
warn
(
'{0} [pdf to img failed (pdf img empty)] [task={1}]'
.
format
(
self
.
log_base
,
task_str
))
raise
Exception
(
'pdf img empty'
)
elif
pdf_handler
.
img_count
>=
max_img_count
:
self
.
online_log
.
info
(
'{0} [too many pdf image] [task={1}] [img_count={2}]'
.
format
(
self
.
log_base
,
task_str
,
pdf_handler
.
img_count
))
try
:
report_table
=
HILOCRReport
if
business_type
==
consts
.
HIL_PREFIX
else
AFCOCRReport
report_table
.
objects
.
create
(
case_number
=
doc
.
application_id
,
request_team
=
RequestTeam
.
get_value
(
doc
.
document_scheme
,
0
),
request_trigger
=
RequestTrigger
.
get_value
(
doc
.
data_source
,
0
),
input_file
=
doc
.
document_name
,
transaction_start
=
doc
.
start_time
,
transaction_end
=
doc
.
start_time
,
successful_at_this_level
=
False
,
failure_reason
=
FailureReason
.
IMG_LIMIT
.
value
,
process_name
=
ProcessName
.
ALL
.
value
,
notes
=
'pdf page count: {0}'
.
format
(
str
(
pdf_handler
.
img_count
))
)
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (report db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
page_count
=
pdf_handler
.
page_count
doc
.
save
()
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
else
:
try
:
if
pdf_handler
.
is_e_pdf
:
doc
.
metadata
=
pdf_handler
.
metadata
if
pdf_handler
.
metadata
is
None
else
\
json
.
dumps
(
pdf_handler
.
metadata
)
doc
.
page_count
=
pdf_handler
.
page_count
doc
.
save
()
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
with
lock
:
todo_count_dict
[
task_str
]
=
pdf_handler
.
img_count
self
.
online_log
.
info
(
'{0} [pdf_2_img_2_queue] [{1}] [is_ebank={2}]'
.
format
(
self
.
log_base
,
task_str
,
pdf_handler
.
is_ebank
))
for
img_idx
,
img_path
in
enumerate
(
pdf_handler
.
img_path_list
):
while
img_queue
.
full
():
self
.
online_log
.
info
(
'{0} [pdf_2_img_2_queue] [img queue full]'
.
format
(
self
.
log_base
))
time
.
sleep
(
self
.
sleep_time_img_put
)
if
pdf_handler
.
is_e_weixin_bs
:
try
:
#self.online_log.info('{0} [pdf_2_img_2_queue] [img_idx={1}] [page_text_list={2}]'.format(self.log_base, img_idx, pdf_handler.page_text_list))
text_list
=
pdf_handler
.
page_text_list
[
img_idx
]
.
pop
(
'rebuild_text'
)
except
Exception
as
e
:
text_list
=
[]
else
:
text_list
=
[]
img_queue
.
put
((
business_type
,
img_path
,
text_list
))
except
Exception
as
e
:
try
:
end_time
=
timezone
.
now
()
report_table
=
HILOCRReport
if
business_type
==
consts
.
HIL_PREFIX
else
AFCOCRReport
report_table
.
objects
.
create
(
case_number
=
doc
.
application_id
,
request_team
=
RequestTeam
.
get_value
(
doc
.
document_scheme
,
0
),
request_trigger
=
RequestTrigger
.
get_value
(
doc
.
data_source
,
0
),
input_file
=
doc
.
document_name
,
transaction_start
=
doc
.
start_time
,
transaction_end
=
end_time
,
successful_at_this_level
=
False
,
failure_reason
=
FailureReason
.
PDF
.
value
,
process_name
=
ProcessName
.
ALL
.
value
,
)
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (report db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
page_count
=
pdf_handler
.
page_count
doc
.
save
()
self
.
online_log
.
warn
(
'{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
'[error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
else
:
# e-contract or e-fsm-contract or e-hmh
try
:
# pdf下载 处理 图片存储 识别
...
...
src/apps/doc/views.py
View file @
adb3724
...
...
@@ -692,7 +692,10 @@ class UploadDocView(GenericView, DocHandler):
if
keyword
in
document_name
:
classify_1
=
classify_1_tmp
break
if
classify_1
==
0
and
(
document_name
.
startswith
(
'dzfp_'
)):
classify_1
=
29
self
.
running_log
.
info
(
'[dzfp process] [doc_id={0}]'
.
format
(
doc
.
id
))
if
document_name
.
endswith
(
'.zip'
)
or
document_name
.
endswith
(
'.rar'
)
or
document_name
.
endswith
(
'.ZIP'
)
\
or
document_name
.
endswith
(
'.RAR'
):
...
...
@@ -1248,6 +1251,10 @@ class DocView(DocGenericView, DocHandler):
classify_1
=
classify_1_tmp
break
if
classify_1
==
0
and
(
document_name
.
startswith
(
'dzfp_'
)):
classify_1
=
29
self
.
running_log
.
info
(
'[dzfp process] [doc_id={0}]'
.
format
(
doc
.
id
))
# tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
task
=
consts
.
SPLIT_STR
.
join
([
prefix
,
str
(
doc
.
id
),
str
(
classify_1
)])
enqueue_res
=
rh
.
enqueue
([
task
],
is_priority
)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment