Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
cb4acc51
authored
2021-11-15 17:43:31 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
ebank part 2
1 parent
d24fcf2c
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
38 additions
and
21 deletions
src/common/tools/pdf_to_img.py
src/common/tools/pdf_to_img.py
View file @
cb4acc5
...
...
@@ -39,6 +39,14 @@ class PDFHandler:
self
.
page_text_list
=
[]
self
.
pdf_info
=
{}
self
.
img_path_pno_list
=
[]
self
.
ebank_title_list
=
[
'微信支付交易明细证明'
,
'支付宝收支明细证明'
,
'招商银行交易流水'
,
'中国工商银行借记账户历史明细'
,
'中国建设银行个人活期账户全部交易明细'
,
'平安银行个人账户交易明细清单'
,
]
def
get_suffix
(
self
,
file_name
):
if
file_name
is
None
:
...
...
@@ -260,20 +268,27 @@ class PDFHandler:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
def title_is_ebank(self, char):
    """Return whether ``char`` matches one of the known e-bank titles.

    A match is a substring match in either direction against the entries
    of ``self.ebank_title_list``: the text may be a fragment of a title,
    or it may contain a complete title.

    Args:
        char: Text to test (presumably the text of one extracted PDF
            span -- confirm against the caller).

    Returns:
        True if the text overlaps any known title, otherwise False.
        Empty, whitespace-only, or non-string input never matches.
    """
    # ``'' in title`` is always true (``title.find('')`` is 0), so blank
    # text must be rejected up front; otherwise every empty span would be
    # reported as an e-bank title. Non-strings would raise in ``str.find``.
    if not isinstance(char, str) or not char.strip():
        return False
    return any(
        char in title or title in char
        for title in self.ebank_title_list
    )
def
check_ebank
(
self
,
pdf
):
#
page_text_list = []
page_text_list
=
[]
text_item_sum
=
0
in_ebank_set
=
False
for
pno
in
range
(
pdf
.
pageCount
):
page
=
pdf
.
loadPage
(
pno
)
#
if page.rotation is None:
#
rotation = 0
#
elif isinstance(page.rotation, int):
#
divisor, remainder = divmod(page.rotation, 90)
#
if remainder != 0:
#
return
#
rotation = divmod(divisor, 4)[1]
#
else:
#
return
if
page
.
rotation
is
None
:
rotation
=
0
elif
isinstance
(
page
.
rotation
,
int
):
divisor
,
remainder
=
divmod
(
page
.
rotation
,
90
)
if
remainder
!=
0
:
return
rotation
=
divmod
(
divisor
,
4
)[
1
]
else
:
return
textpage
=
page
.
getTextPage
()
text
=
textpage
.
extractDICT
()
text_list
=
[]
...
...
@@ -284,22 +299,24 @@ class PDFHandler:
bbox
=
span
.
get
(
'bbox'
)
if
char
.
strip
()
==
''
:
continue
if
pno
==
0
and
self
.
title_is_ebank
(
char
):
in_ebank_set
=
True
text_list
.
append
((
bbox
,
char
))
text_item_sum
+=
len
(
text_list
)
if
text_item_sum
<
(
pno
+
1
)
*
5
:
return
#
else:
#
page_text_list.append(
#
{
#
'width': text.get('width'),
#
'height': text.get('height'),
#
'rotation': rotation,
#
'text': text_list
#
}
#
)
# self.is_ebank = True
else
:
page_text_list
.
append
(
{
'width'
:
text
.
get
(
'width'
),
'height'
:
text
.
get
(
'height'
),
'rotation'
:
rotation
,
'text'
:
text_list
}
)
self
.
is_ebank
=
in_ebank_set
self
.
is_e_pdf
=
True
#
self.page_text_list = page_text_list
self
.
page_text_list
=
page_text_list
def
e_contract_process
(
self
):
os
.
makedirs
(
self
.
img_dir_path
,
exist_ok
=
True
)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment