Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
c1f24adf
authored
2021-08-09 10:21:56 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/0611' of gitlab.situdata.com:zhouweiqi/bmw-ocr into feature/0611
2 parents
b8745dc6
906f258d
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
48 additions
and
28 deletions
src/apps/doc/management/commands/ocr_process.py
src/celery_compare/tasks.py
src/common/tools/pdf_to_img.py
src/apps/doc/management/commands/ocr_process.py
View file @
c1f24ad
...
...
@@ -585,7 +585,7 @@ class Command(BaseCommand, LoggerMixin):
img_save_path
=
os
.
path
.
join
(
doc_data_path
,
'img'
)
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
pdf_handler
=
PDFHandler
(
pdf_path
,
img_save_path
)
pdf_handler
=
PDFHandler
(
pdf_path
,
img_save_path
,
doc
.
document_name
)
max_count_obj
=
Configs
.
objects
.
filter
(
id
=
2
)
.
first
()
try
:
max_img_count
=
int
(
max_count_obj
.
value
)
...
...
src/celery_compare/tasks.py
View file @
c1f24ad
...
...
@@ -27,7 +27,7 @@ from apps.doc.named_enum import RequestTeam, RequestTrigger, ProcessName
from
common.tools.comparison
import
cp
compare_log
=
logging
.
getLogger
(
'compare'
)
log_base
=
'[CA Compare]'
log_base
=
'[Compare]'
def
name_check
(
ocr_res_dict
,
second_ocr_field
,
second_compare_list
,
second_id_num
,
name
):
...
...
src/common/tools/pdf_to_img.py
View file @
c1f24ad
import
os
import
shutil
import
fitz
from
PIL
import
Image
from
io
import
BytesIO
...
...
@@ -22,12 +23,25 @@ WH_COUPLE_5 = (100, 200)
class
PDFHandler
:
def
__init__
(
self
,
path
,
img_dir_path
):
def
__init__
(
self
,
path
,
img_dir_path
,
document_name
=
None
):
self
.
path
=
path
self
.
img_dir_path
=
img_dir_path
self
.
img_path_list
=
[]
self
.
img_count
=
0
self
.
xref_set
=
set
()
self
.
img_suffixs
=
{
'.jpeg'
,
'.jpg'
,
'.png'
,
'.webp'
,
'.bmp'
}
self
.
suffix
=
self
.
get_suffix
(
document_name
)
def get_suffix(self, file_name):
    """Return the lower-cased image extension of ``file_name``, if any.

    Args:
        file_name: Name of the uploaded document (e.g. ``'scan.JPG'``),
            or ``None`` when no name is known.

    Returns:
        The extension including its leading dot (e.g. ``'.jpg'``) when it
        is a member of ``self.img_suffixs``; otherwise ``None``. ``None``
        is also returned when ``file_name`` is ``None`` or is not a value
        ``os.path.splitext`` accepts.
    """
    if file_name is None:
        return None
    try:
        # Only the split can fail; keep the guarded region to that call.
        _, src_suffix = os.path.splitext(file_name)
    except TypeError:
        # Not str/bytes/PathLike: best effort, report "no usable suffix".
        return None
    lower_suffix = src_suffix.lower()
    # Explicit None on a miss instead of silently falling off the end.
    return lower_suffix if lower_suffix in self.img_suffixs else None
def get_img_save_path(self, pno, img_index=0, ext='png'):
    """Build the output path for one image inside ``self.img_dir_path``.

    Args:
        pno: Page number placed in the file name.
        img_index: Index of the image on that page (default ``0``).
        ext: File extension without the leading dot (default ``'png'``).

    Returns:
        ``self.img_dir_path`` joined with ``page_<pno>_img_<img_index>.<ext>``.
    """
    img_file_name = 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
    save_path = os.path.join(self.img_dir_path, img_file_name)
    return save_path
...
...
@@ -197,33 +211,39 @@ class PDFHandler:
self
.
img_path_list
=
[]
self
.
xref_set
=
set
()
os
.
makedirs
(
self
.
img_dir_path
,
exist_ok
=
True
)
with
fitz
.
Document
(
self
.
path
)
as
pdf
:
if
isinstance
(
max_img_count
,
int
)
and
pdf
.
pageCount
>=
max_img_count
:
self
.
img_count
=
pdf
.
pageCount
return
for
pno
in
range
(
pdf
.
pageCount
):
il
=
pdf
.
getPageImageList
(
pno
)
# 获取页面图片对象
# (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
# 1.页面图片对象数目为0时,保存整个页面为png图片
if
len
(
il
)
==
0
:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
# 2.页面图片对象数目为1时:
# 小图(如电子账单的盖章):保存整个页面为png图片
# 大图:提取图片对象
elif
len
(
il
)
==
1
:
xref
,
smask
,
width
,
height
,
_
,
colorspace
,
_
,
_
,
_
=
il
[
0
]
# 小图
if
width
<
WH_COUPLE_1
[
0
]
and
height
<
WH_COUPLE_1
[
1
]:
if
self
.
suffix
in
self
.
img_suffixs
:
img_save_path
=
self
.
get_img_save_path
(
0
,
ext
=
self
.
suffix
[
1
:])
shutil
.
copy
(
self
.
path
,
img_save_path
)
self
.
img_path_list
.
append
(
img_save_path
)
else
:
with
fitz
.
Document
(
self
.
path
)
as
pdf
:
if
isinstance
(
max_img_count
,
int
)
and
pdf
.
pageCount
>=
max_img_count
:
self
.
img_count
=
pdf
.
pageCount
return
for
pno
in
range
(
pdf
.
pageCount
):
il
=
pdf
.
getPageImageList
(
pno
)
# 获取页面图片对象
# (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
# 1.页面图片对象数目为0时,保存整个页面为png图片
if
len
(
il
)
==
0
:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
# 大图
elif
xref
not
in
self
.
xref_set
:
self
.
extract_single_image
(
pdf
,
xref
,
smask
,
colorspace
,
pno
)
# 3.页面图片对象数目大于1时,特殊处理
else
:
self
.
merge_il
(
pdf
,
pno
,
il
)
# 2.页面图片对象数目为1时:
# 小图(如电子账单的盖章):保存整个页面为png图片
# 大图:提取图片对象
elif
len
(
il
)
==
1
:
xref
,
smask
,
width
,
height
,
_
,
colorspace
,
_
,
_
,
_
=
il
[
0
]
# 小图
if
width
<
WH_COUPLE_1
[
0
]
and
height
<
WH_COUPLE_1
[
1
]:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
# 大图
elif
xref
not
in
self
.
xref_set
:
self
.
extract_single_image
(
pdf
,
xref
,
smask
,
colorspace
,
pno
)
# 3.页面图片对象数目大于1时,特殊处理
else
:
self
.
merge_il
(
pdf
,
pno
,
il
)
self
.
img_count
=
len
(
self
.
img_path_list
)
def
extract_page_image
(
self
):
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment