Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
bb0678cb
authored
2021-09-23 18:25:32 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/ebank' into feature/0918
2 parents
369697a8
4383b4d1
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
72 additions
and
4 deletions
src/apps/doc/management/commands/ocr_process.py
src/common/tools/pdf_to_img.py
src/apps/doc/management/commands/ocr_process.py
View file @
bb0678c
...
...
@@ -730,11 +730,18 @@ class Command(BaseCommand, LoggerMixin):
else
:
with
lock
:
todo_count_dict
[
task_str
]
=
pdf_handler
.
img_count
for
img_
path
in
pdf_handler
.
img_path_list
:
for
img_
idx
,
img_path
in
enumerate
(
pdf_handler
.
img_path_list
)
:
while
img_queue
.
full
():
self
.
online_log
.
info
(
'{0} [pdf_2_img_2_queue] [img queue full]'
.
format
(
self
.
log_base
))
time
.
sleep
(
self
.
sleep_time_img_put
)
img_queue
.
put
(
img_path
)
if
pdf_handler
.
is_ebank
:
try
:
text_list
=
pdf_handler
.
page_text_list
[
img_idx
]
.
pop
(
'rebuild_text'
)
except
Exception
as
e
:
text_list
=
[]
else
:
text_list
=
[]
img_queue
.
put
((
img_path
,
text_list
))
# except EDMSException as e:
# try:
# doc.status = DocStatus.PROCESS_FAILED.value
...
...
@@ -779,7 +786,7 @@ class Command(BaseCommand, LoggerMixin):
def
img_2_ocr_1
(
self
,
img_queue
,
todo_count_dict
,
res_dict
,
finish_queue
,
lock
,
url
,
error_list
):
while
len
(
error_list
)
==
0
or
not
img_queue
.
empty
():
try
:
img_path
=
img_queue
.
get
(
block
=
False
)
img_path
,
text_list
=
img_queue
.
get
(
block
=
False
)
except
Exception
as
e
:
# self.online_log.info('{0} [img_2_ocr_1] [queue empty]'.format(self.log_base))
time
.
sleep
(
self
.
sleep_time_img_get
)
...
...
@@ -797,6 +804,8 @@ class Command(BaseCommand, LoggerMixin):
json_data_1
=
{
"file"
:
file_data
}
if
len
(
text_list
)
>
0
:
json_data_1
[
'text_list'
]
=
text_list
start_time
=
time
.
time
()
ocr_1_response
=
requests
.
post
(
url
,
json
=
json_data_1
)
...
...
src/common/tools/pdf_to_img.py
View file @
bb0678c
...
...
@@ -31,6 +31,8 @@ class PDFHandler:
self
.
xref_set
=
set
()
self
.
img_suffixs
=
{
'.jpeg'
,
'.jpg'
,
'.png'
,
'.webp'
,
'.bmp'
}
self
.
suffix
=
self
.
get_suffix
(
document_name
)
self
.
is_ebank
=
False
self
.
page_text_list
=
[]
def
get_suffix
(
self
,
file_name
):
if
file_name
is
None
:
...
...
@@ -46,6 +48,30 @@ class PDFHandler:
def get_img_save_path(self, pno, img_index=0, ext='png'):
    """Build the path under ``self.img_dir_path`` for one extracted image.

    Args:
        pno: Page number used in the file name.
        img_index: Index of the image within the page (default ``0``).
        ext: File extension without the leading dot (default ``'png'``).

    Returns:
        ``self.img_dir_path`` joined with ``page_<pno>_img_<img_index>.<ext>``.
    """
    file_name = 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
    return os.path.join(self.img_dir_path, file_name)
def rebuild_bbox(self, src_width, src_height, pno):
    """Rescale one page's extracted text boxes to the rendered image size.

    Reads the ``'width'``, ``'height'`` and ``'text'`` entries of
    ``self.page_text_list[pno]``, multiplies every ``(x0, y0, x1, y1)`` box
    by ``src_width / width`` horizontally and ``src_height / height``
    vertically, and stores the result under ``'rebuild_text'`` as a list of
    ``((x0, y0, x1, y0, x1, y1, x0, y1), text)`` tuples, i.e. the corners in
    the order (x0, y0), (x1, y0), (x1, y1), (x0, y1).

    On success the three source keys are removed from the page entry.

    This is best-effort: if the page entry is missing or malformed (unknown
    page index, missing key, zero/None dimensions, a bbox that is not a
    4-tuple, ...) the method returns silently, writes no ``'rebuild_text'``
    and leaves the page entry exactly as it was.

    Args:
        src_width: Width of the rendered page image.
        src_height: Height of the rendered page image.
        pno: Index into ``self.page_text_list``.
    """
    try:
        page_info = self.page_text_list[pno]
        width_scale = src_width / page_info['width']
        height_scale = src_height / page_info['height']

        rebuild_text_list = []
        for (x0, y0, x1, y1), text in page_info['text']:
            left = x0 * width_scale
            top = y0 * height_scale
            right = x1 * width_scale
            bottom = y1 * height_scale
            rebuild_text_list.append(
                ((left, top, right, top, right, bottom, left, bottom), text)
            )
    except (AttributeError, IndexError, KeyError, TypeError,
            ValueError, ZeroDivisionError):
        # Only data-shape problems are tolerated; nothing has been mutated
        # yet, so the page entry stays intact for inspection or a retry.
        return

    # Commit only after every box was converted successfully.
    for consumed_key in ('width', 'height', 'text'):
        del page_info[consumed_key]
    page_info['rebuild_text'] = rebuild_text_list
def
page_to_png
(
self
,
page
):
if
page
.
MediaBoxSize
.
x
>
1500
or
page
.
MediaBoxSize
.
y
>
1500
:
pm
=
page
.
getPixmap
(
matrix
=
trans_1
,
alpha
=
False
)
...
...
@@ -54,6 +80,8 @@ class PDFHandler:
img_save_path
=
self
.
get_img_save_path
(
page
.
number
)
pm
.
writePNG
(
img_save_path
)
self
.
img_path_list
.
append
(
img_save_path
)
if
self
.
is_ebank
:
self
.
rebuild_bbox
(
pm
.
width
,
pm
.
height
,
page
.
number
)
@staticmethod
def
getimage
(
pix
):
...
...
@@ -207,6 +235,36 @@ class PDFHandler:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
def check_ebank(self, pdf, min_items_per_page=5):
    """Decide whether ``pdf`` carries an embedded text layer and cache it.

    Every page is loaded and its text extracted via
    ``page.getTextPage().extractDICT()``. All non-blank spans are collected
    as ``(bbox, text)`` tuples. After each page the running total of
    collected spans must be at least ``(pno + 1) * min_items_per_page``;
    as soon as it is not, the method returns without touching
    ``self.is_ebank`` or ``self.page_text_list``.

    If every page passes, ``self.is_ebank`` is set to ``True`` and
    ``self.page_text_list`` receives one dict per page with the keys
    ``'width'``, ``'height'`` (taken from the extracted dict) and ``'text'``
    (the span list). A document with zero pages passes vacuously.

    Blocks without ``'lines'``, lines without ``'spans'`` and spans whose
    text is missing are skipped instead of raising.

    Args:
        pdf: Opened document exposing ``pageCount`` and ``loadPage(pno)``.
        min_items_per_page: Required average number of text spans per page
            (cumulative over the pages seen so far). Defaults to ``5``.
    """
    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page_dict = pdf.loadPage(pno).getTextPage().extractDICT()

        text_list = []
        # ``or ()`` guards against absent keys, for which .get() returns None.
        for block in page_dict.get('blocks') or ():
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    span_text = span.get('text')
                    if not span_text or not span_text.strip():
                        continue
                    text_list.append((span.get('bbox'), span_text))

        text_item_sum += len(text_list)
        if text_item_sum < (pno + 1) * min_items_per_page:
            # Too little text so far: leave the handler state untouched.
            return

        page_text_list.append(
            {
                'width': page_dict.get('width'),
                'height': page_dict.get('height'),
                'text': text_list,
            }
        )

    self.is_ebank = True
    self.page_text_list = page_text_list
def
extract_image
(
self
,
max_img_count
=
None
):
self
.
img_path_list
=
[]
self
.
xref_set
=
set
()
...
...
@@ -221,12 +279,13 @@ class PDFHandler:
if
isinstance
(
max_img_count
,
int
)
and
pdf
.
pageCount
>=
max_img_count
:
self
.
img_count
=
pdf
.
pageCount
return
self
.
check_ebank
(
pdf
)
for
pno
in
range
(
pdf
.
pageCount
):
il
=
pdf
.
getPageImageList
(
pno
)
# 获取页面图片对象
# (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
# 1.页面图片对象数目为0时,保存整个页面为png图片
if
len
(
il
)
==
0
:
if
self
.
is_ebank
or
len
(
il
)
==
0
:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
# 2.页面图片对象数目为1时:
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment