Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
090b26b2
authored
2022-08-29 15:57:53 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
special zfb
1 parent
38da4b7e
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
26 additions
and
3 deletions
src/common/tools/pdf_to_img.py
src/common/tools/pdf_to_img.py
View file @
090b26b
...
...
@@ -63,8 +63,8 @@ class PDFHandler:
self
.
page_text_list
=
[]
self
.
pdf_info
=
{}
self
.
img_path_pno_list
=
[]
# 注意影响self.title_idx
self
.
ebank_title_list
=
[
# '微信支付交易明细证明',
'支付宝收支明细证明'
,
'招商银行交易流水'
,
'中国工商银行借记账户历史明细'
,
...
...
@@ -72,10 +72,13 @@ class PDFHandler:
'平安银行个人账户交易明细清单'
,
'中国农业银行账戶活期交易明细清单'
,
'支付宝(中国)网络技术有限公司 交易流水证明'
,
'支付宝(中国)网络技术有限公司 交易流水证明'
'支付宝(中国)网络技术有限公司 交易流水证明'
,
# '微信支付交易明细证明',
]
self
.
page_count
=
None
self
.
metadata
=
None
self
.
title_idx
=
None
self
.
date_pattern
=
re
.
compile
(
r'^\d+ \d{4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)$'
)
@staticmethod
def
get_pwd_list
(
doc_name
,
pwd_list
):
...
...
@@ -133,6 +136,25 @@ class PDFHandler:
y0
=
y0
*
height_scale
x1
=
x1
*
width_scale
y1
=
y1
*
height_scale
# 支付宝 交易流水证明 '46428471991912802930901 2022-01-22' 切分日期
if
self
.
title_idx
in
{
6
,
7
}
and
self
.
date_pattern
.
match
(
text
):
try
:
split_x
=
x0
+
((
x1
-
x0
)
*
(
10
/
len
(
text
)))
date_str
=
text
[
-
10
:]
other_str
=
text
[:
-
10
]
rebuild_text_list
.
append
(
((
split_x
,
y0
,
x1
,
y0
,
x1
,
y1
,
split_x
,
y1
),
normalize
(
'NFKC'
,
date_str
))
)
rebuild_text_list
.
append
(
((
x0
,
y0
,
split_x
,
y0
,
split_x
,
y1
,
x0
,
y1
),
normalize
(
'NFKC'
,
other_str
))
)
except
Exception
as
e
:
rebuild_text_list
.
append
(
((
x0
,
y0
,
x1
,
y0
,
x1
,
y1
,
x0
,
y1
),
normalize
(
'NFKC'
,
text
))
)
else
:
rebuild_text_list
.
append
(
((
x0
,
y0
,
x1
,
y0
,
x1
,
y1
,
x0
,
y1
),
normalize
(
'NFKC'
,
text
))
)
...
...
@@ -305,8 +327,9 @@ class PDFHandler:
def title_is_ebank(self, char):
    """Report whether ``char`` contains one of the known e-bank statement titles.

    The text is NFKC-normalized before matching, so full-width or other
    compatibility forms of a title still match the entries stored in
    ``self.ebank_title_list``.

    Side effect: on a match, ``self.title_idx`` is set to the index of the
    first matching entry in ``self.ebank_title_list``. When nothing matches,
    ``self.title_idx`` is left unchanged.

    Args:
        char: Text to inspect.

    Returns:
        True if any title in ``self.ebank_title_list`` is a substring of the
        normalized text, otherwise False.
    """
    new_char = normalize('NFKC', char)
    # enumerate() is required: the position in ebank_title_list is what gets
    # recorded in self.title_idx, so list order is significant.
    for title_idx, title in enumerate(self.ebank_title_list):
        if title in new_char:
            self.title_idx = title_idx
            return True
    return False
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment