Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
2932c540
authored
2025-07-04 16:25:06 +0800
by
冯轩
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
init
1 parent
8ddb1d4c
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
245 additions
and
0 deletions
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/views.py
src/common/tools/pdf_to_img.py
src/apps/doc/management/commands/ocr_process.py
View file @
2932c54
...
...
@@ -1504,6 +1504,137 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
traceback
.
format_exc
()))
# error_list.append(1)
# return
elif
classify_1_str
==
'12'
:
# weixin e-bs
try
:
max_img_count
=
500
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
if
doc
.
application_id
.
startswith
(
consts
.
FIXED_APPLICATION_ID_PREFIX
):
self
.
online_log
.
info
(
'{0} [mo ni xia dan] [task={1}] [times={2}] '
'[pdf_path={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
pdf_path
))
elif
os
.
path
.
exists
(
pdf_path
):
self
.
online_log
.
info
(
'{0} [pdf from zip file] [task={1}] [times={2}] '
'[pdf_path={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
pdf_path
))
else
:
# self.edms.download(pdf_path, doc.metadata_version_id)
self
.
edms
.
download
(
pdf_path
,
doc
.
metadata_version_id
,
doc
.
document_scheme
,
business_type
)
self
.
online_log
.
info
(
'{0} [ecm download success] [task={1}] [times={2}] '
'[pdf_path={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
pdf_path
))
# 3.PDF文件提取图片
self
.
online_log
.
info
(
'{0} [pdf to img start] [task={1}] [times={2}]'
.
format
(
self
.
log_base
,
task_str
,
times
))
start_time
=
time
.
time
()
pdf_handler
.
extract_image_for_weixin
(
max_img_count
)
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
online_log
.
info
(
'{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
speed_time
,
pdf_handler
.
is_new_modify
))
except
Exception
as
e
:
self
.
online_log
.
warn
(
'{0} [download or pdf to img failed] [task={1}] [times={2}] '
'[error={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
traceback
.
format_exc
()))
else
:
break
else
:
raise
Exception
(
'download or pdf to img failed'
)
if
pdf_handler
.
img_count
==
0
:
self
.
online_log
.
warn
(
'{0} [pdf to img failed (pdf img empty)] [task={1}]'
.
format
(
self
.
log_base
,
task_str
))
raise
Exception
(
'pdf img empty'
)
elif
pdf_handler
.
img_count
>=
max_img_count
:
self
.
online_log
.
info
(
'{0} [too many pdf image] [task={1}] [img_count={2}]'
.
format
(
self
.
log_base
,
task_str
,
pdf_handler
.
img_count
))
try
:
report_table
=
HILOCRReport
if
business_type
==
consts
.
HIL_PREFIX
else
AFCOCRReport
report_table
.
objects
.
create
(
case_number
=
doc
.
application_id
,
request_team
=
RequestTeam
.
get_value
(
doc
.
document_scheme
,
0
),
request_trigger
=
RequestTrigger
.
get_value
(
doc
.
data_source
,
0
),
input_file
=
doc
.
document_name
,
transaction_start
=
doc
.
start_time
,
transaction_end
=
doc
.
start_time
,
successful_at_this_level
=
False
,
failure_reason
=
FailureReason
.
IMG_LIMIT
.
value
,
process_name
=
ProcessName
.
ALL
.
value
,
notes
=
'pdf page count: {0}'
.
format
(
str
(
pdf_handler
.
img_count
))
)
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (report db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
page_count
=
pdf_handler
.
page_count
doc
.
save
()
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
else
:
try
:
if
pdf_handler
.
is_e_pdf
:
doc
.
metadata
=
pdf_handler
.
metadata
if
pdf_handler
.
metadata
is
None
else
\
json
.
dumps
(
pdf_handler
.
metadata
)
doc
.
page_count
=
pdf_handler
.
page_count
doc
.
save
()
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
with
lock
:
todo_count_dict
[
task_str
]
=
pdf_handler
.
img_count
self
.
online_log
.
info
(
'{0} [pdf_2_img_2_queue] [{1}] [is_ebank={2}]'
.
format
(
self
.
log_base
,
task_str
,
pdf_handler
.
is_ebank
))
for
img_idx
,
img_path
in
enumerate
(
pdf_handler
.
img_path_list
):
while
img_queue
.
full
():
self
.
online_log
.
info
(
'{0} [pdf_2_img_2_queue] [img queue full]'
.
format
(
self
.
log_base
))
time
.
sleep
(
self
.
sleep_time_img_put
)
if
pdf_handler
.
is_e_weixin_bs
:
try
:
text_list
=
pdf_handler
.
page_text_list
except
Exception
as
e
:
text_list
=
[]
else
:
text_list
=
[]
img_queue
.
put
((
business_type
,
img_path
,
text_list
))
except
Exception
as
e
:
try
:
end_time
=
timezone
.
now
()
report_table
=
HILOCRReport
if
business_type
==
consts
.
HIL_PREFIX
else
AFCOCRReport
report_table
.
objects
.
create
(
case_number
=
doc
.
application_id
,
request_team
=
RequestTeam
.
get_value
(
doc
.
document_scheme
,
0
),
request_trigger
=
RequestTrigger
.
get_value
(
doc
.
data_source
,
0
),
input_file
=
doc
.
document_name
,
transaction_start
=
doc
.
start_time
,
transaction_end
=
end_time
,
successful_at_this_level
=
False
,
failure_reason
=
FailureReason
.
PDF
.
value
,
process_name
=
ProcessName
.
ALL
.
value
,
)
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (report db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
page_count
=
pdf_handler
.
page_count
doc
.
save
()
self
.
online_log
.
warn
(
'{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
'[error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
# error_list.append(1)
# return
else
:
# e-contract or e-fsm-contract or e-hmh
try
:
# pdf下载 处理 图片存储 识别
...
...
@@ -1674,6 +1805,7 @@ class Command(BaseCommand, LoggerMixin):
json_data_1
[
'text_list'
]
=
text_list
start_time
=
time
.
time
()
self
.
online_log
.
info
(
'{0} [ocr_1 api] [img={1}] [json_data_1={2}]'
.
format
(
self
.
log_base
,
img_path
,
json_data_1
))
ocr_1_response
=
requests
.
post
(
url
,
json
=
json_data_1
)
if
ocr_1_response
.
status_code
!=
200
:
raise
OCR1Exception
(
'ocr_1 status code: {0}'
.
format
(
ocr_1_response
.
status_code
))
...
...
src/apps/doc/views.py
View file @
2932c54
...
...
@@ -684,6 +684,10 @@ class UploadDocView(GenericView, DocHandler):
classify_1
=
classify_1_tmp
break
if
classify_1
==
0
and
(
'微信支付交易明细证明'
in
document_name
or
'微信流水'
in
document_name
):
classify_1
=
12
self
.
running_log
.
info
(
'[weixin bs process] [doc_id={0}]'
.
format
(
doc
.
id
))
if
document_name
.
endswith
(
'.zip'
)
or
document_name
.
endswith
(
'.rar'
)
or
document_name
.
endswith
(
'.ZIP'
)
\
or
document_name
.
endswith
(
'.RAR'
):
...
...
@@ -1239,6 +1243,10 @@ class DocView(DocGenericView, DocHandler):
classify_1
=
classify_1_tmp
break
if
classify_1
==
0
and
(
'微信支付交易明细证明'
in
document_name
or
'微信流水'
in
document_name
):
classify_1
=
12
self
.
running_log
.
info
(
'[weixin bs process] [doc_id={0}]'
.
format
(
doc
.
id
))
# tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
task
=
consts
.
SPLIT_STR
.
join
([
prefix
,
str
(
doc
.
id
),
str
(
classify_1
)])
enqueue_res
=
rh
.
enqueue
([
task
],
is_priority
)
...
...
src/common/tools/pdf_to_img.py
View file @
2932c54
...
...
@@ -69,6 +69,7 @@ class PDFHandler:
self
.
suffix
=
self
.
get_suffix
(
document_name
)
self
.
is_ebank
=
False
self
.
is_e_pdf
=
False
self
.
is_e_weixin_bs
=
False
self
.
page_text_list
=
[]
self
.
pdf_info
=
{}
self
.
img_path_pno_list
=
[]
...
...
@@ -407,6 +408,57 @@ class PDFHandler:
self
.
is_e_pdf
=
True
self
.
page_text_list
=
page_text_list
def put_text(self, pdf):
    """Collect the text layer of every page of ``pdf`` into ``self.page_text_list``.

    For each page the text spans returned by ``extractDICT()`` are walked and
    every non-blank span is stored as a ``(bbox, text)`` tuple.  One dict per
    page is produced with the keys ``width``, ``height``, ``rotation`` (number
    of quarter turns, 0-3) and ``text`` (the list of tuples).

    The method gives up silently (plain ``return``, no attribute is modified)
    as soon as one of these holds:

    * a page rotation is neither ``None`` nor an ``int`` multiple of 90;
    * the cumulative number of kept spans drops below five per processed
      page, i.e. the document does not carry enough embedded text.

    Only when every page passes are ``self.is_e_pdf`` and
    ``self.is_e_weixin_bs`` set to ``True`` and ``self.page_text_list``
    replaced with the collected data.

    Args:
        pdf: an opened document exposing ``pageCount`` and ``loadPage(pno)``;
            each page must expose ``rotation`` and ``getTextPage()``.

    Returns:
        None.
    """
    import sys  # local import: only needed for the encodability probe below

    # Spans that cannot be encoded for the process' stdout (e.g. exotic
    # emoji / lone surrogates) are dropped.  The probe encodes in memory
    # instead of printing, so nothing is written to stdout.
    probe_encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'

    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page = pdf.loadPage(pno)

        # Normalise the rotation to a number of quarter turns (0-3).
        if page.rotation is None:
            rotation = 0
        elif isinstance(page.rotation, int):
            divisor, remainder = divmod(page.rotation, 90)
            if remainder != 0:
                return
            rotation = divmod(divisor, 4)[1]
        else:
            return

        textpage = page.getTextPage()
        text = textpage.extractDICT()

        text_list = []
        # ``or ()`` guards against blocks without 'lines' (e.g. image
        # blocks) and against missing keys, which would otherwise raise
        # TypeError when iterating None.
        for block in text.get('blocks') or ():
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    char = span.get('text') or ''
                    if char.strip() == '':
                        continue
                    # Skip special emoji / unencodable text.
                    try:
                        char.encode(probe_encoding)
                    except UnicodeEncodeError:
                        continue
                    text_list.append((span.get('bbox'), char))

        # Require at least five kept spans per page on average so far;
        # otherwise the text layer is considered too thin to be trusted.
        text_item_sum += len(text_list)
        if text_item_sum < (pno + 1) * 5:
            return

        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'rotation': rotation,
                'text': text_list,
            }
        )

    self.is_e_pdf = True
    self.is_e_weixin_bs = True
    self.page_text_list = page_text_list
def
e_contract_process
(
self
):
os
.
makedirs
(
self
.
img_dir_path
,
exist_ok
=
True
)
with
fitz
.
Document
(
self
.
path
)
as
pdf
:
...
...
@@ -473,6 +525,59 @@ class PDFHandler:
self
.
merge_il
(
pdf
,
pno
,
il
)
self
.
img_count
=
len
(
self
.
img_path_list
)
def extract_image_for_weixin(self, max_img_count=None):
    """Turn the source file into a list of image files and harvest its text layer.

    ``self.img_path_list`` and ``self.xref_set`` are reset and the image
    directory is created.  A file whose suffix is listed in
    ``self.img_suffixs`` is simply copied as the single output image.  Any
    other file is opened with ``fitz``: it is decrypted with the known
    passwords, ``self.metadata`` / ``self.page_count`` are recorded,
    ``self.put_text`` is run, and each page is converted according to the
    number of image objects it contains.

    Args:
        max_img_count: optional ``int``; when the document has at least this
            many pages, ``self.img_count`` is set to the page count and the
            method returns without producing any image.

    Returns:
        None.  Results are exposed through ``self.img_path_list`` and
        ``self.img_count``.
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    if self.suffix not in self.img_suffixs:
        with fitz.Document(self.path) as pdf:
            # Decrypt: try each known password until the document opens.
            for pwd in self.pwd_list:
                if not pdf.isEncrypted:
                    break
                pdf.authenticate(pwd)

            self.metadata = pdf.metadata
            self.page_count = pdf.pageCount

            # Too many pages: report the page count and stop early.
            if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
                self.img_count = pdf.pageCount
                return

            self.put_text(pdf)

            for page_no in range(pdf.pageCount):
                # Image objects on the page; each entry is
                # (xref, smask, width, height, bpc, colorspace,
                #  alt.colorspace, name, filter[, invoker]).
                page_images = pdf.getPageImageList(page_no)

                # 1. Electronic PDF / e-bank document, or a page without
                #    any image object: save the whole page as a PNG.
                if self.is_e_pdf or self.is_ebank or len(page_images) == 0:
                    whole_page = pdf.loadPage(page_no)
                    self.page_to_png(whole_page)
                    continue

                # 3. More than one image object: handled by merge_il.
                if len(page_images) != 1:
                    self.merge_il(pdf, page_no, page_images)
                    continue

                # 2. Exactly one image object.
                (xref, smask, width, height, _bpc,
                 colorspace, _alt_cs, _name, _filter) = page_images[0]

                if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                    # Small image (e.g. the stamp on an electronic bill):
                    # save the whole page as a PNG.
                    whole_page = pdf.loadPage(page_no)
                    self.page_to_png(whole_page)
                elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
                    # Large image: render the page, passing a flag that
                    # guards against an oversized output.
                    self.is_new_modify = 1
                    within_limit = (width < WH_COUPLE_7[0]
                                    and height < WH_COUPLE_7[1])
                    whole_page = pdf.loadPage(page_no)
                    self.page_to_png(whole_page, is_big_img=within_limit)
                elif xref not in self.xref_set:
                    # Medium-sized image not handled yet: extract it directly.
                    self.extract_single_image(pdf, xref, smask, colorspace, page_no)
    else:
        # The input already is an image: copy it as the only output.
        copied_path = self.get_img_save_path(0, ext=self.suffix[1:])
        shutil.copy(self.path, copied_path)
        self.img_path_list.append(copied_path)

    self.img_count = len(self.img_path_list)
def
extract_page_image
(
self
):
self
.
img_path_list
=
[]
self
.
xref_set
=
set
()
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment