Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
272692c8
authored
2025-08-08 12:27:25 +0800
by
冯轩
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/weixin-bs-2'
2 parents
e08e5c00
df94248b
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
128 additions
and
0 deletions
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/ocr/wb.py
src/apps/doc/views.py
src/common/tools/pdf_to_img.py
src/apps/doc/management/commands/ocr_process.py
View file @
272692c
This diff is collapsed.
Click to expand it.
src/apps/doc/ocr/wb.py
View file @
272692c
...
...
@@ -11,6 +11,8 @@ from openpyxl import Workbook
from
openpyxl.styles
import
PatternFill
,
numbers
from
openpyxl.utils
import
get_column_letter
from
apps.doc
import
consts
import
logging
online_log
=
logging
.
getLogger
(
'online'
)
class
BSWorkbook
(
Workbook
):
...
...
@@ -562,6 +564,8 @@ class BSWorkbook(Workbook):
borrow_cell
=
None
if
borrow_cell_idx
is
None
or
borrow_cell_idx
>=
length
else
rows
[
borrow_cell_idx
]
summary_cell_value
=
None
if
summary_cell
is
None
else
summary_cell
.
value
if
summary_cell
.
value
is
not
None
:
summary_cell_value
=
summary_cell_value
.
strip
()
date_cell_value
=
None
if
date_cell
is
None
else
date_cell
.
value
amount_cell_value
=
None
if
amount_cell
is
None
else
amount_cell
.
value
over_cell_value
=
None
if
over_cell
is
None
else
over_cell
.
value
...
...
@@ -638,6 +642,7 @@ class BSWorkbook(Workbook):
# 3.2.提取信息、高亮
# row = summary_cell.row
# online_log.info('[ti qu xin xi gao liang =========== >] [summary_cell_value={0}]'.format(summary_cell_value))
if
summary_cell
is
not
None
:
# 关键词1提取
if
summary_cell_value
in
self
.
interest_keyword
:
...
...
src/apps/doc/views.py
View file @
272692c
...
...
@@ -693,6 +693,14 @@ class UploadDocView(GenericView, DocHandler):
classify_1
=
classify_1_tmp
break
if
classify_1
==
0
and
(
'微信支付交易明细证明'
in
document_name
or
'微信流水'
in
document_name
):
classify_1
=
12
self
.
running_log
.
info
(
'[weixin bs process] [doc_id={0}]'
.
format
(
doc
.
id
))
if
classify_1
==
0
and
(
document_name
.
startswith
(
"dzfp_"
)):
classify_1
=
0
self
.
running_log
.
info
(
'[dzfp process] [doc_id={0}]'
.
format
(
doc
.
id
))
if
document_name
.
endswith
(
'.zip'
)
or
document_name
.
endswith
(
'.rar'
)
or
document_name
.
endswith
(
'.ZIP'
)
\
or
document_name
.
endswith
(
'.RAR'
):
...
...
@@ -1247,6 +1255,14 @@ class DocView(DocGenericView, DocHandler):
if
keyword
in
document_name
:
classify_1
=
classify_1_tmp
break
if
classify_1
==
0
and
(
'微信支付交易明细证明'
in
document_name
or
'微信流水'
in
document_name
):
classify_1
=
12
self
.
running_log
.
info
(
'[weixin bs process] [doc_id={0}]'
.
format
(
doc
.
id
))
if
classify_1
==
0
and
(
document_name
.
startswith
(
"dzfp_"
)):
classify_1
=
0
self
.
running_log
.
info
(
'[dzfp process] [doc_id={0}]'
.
format
(
doc
.
id
))
# tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
task
=
consts
.
SPLIT_STR
.
join
([
prefix
,
str
(
doc
.
id
),
str
(
classify_1
)])
...
...
src/common/tools/pdf_to_img.py
View file @
272692c
...
...
@@ -69,6 +69,7 @@ class PDFHandler:
self
.
suffix
=
self
.
get_suffix
(
document_name
)
self
.
is_ebank
=
False
self
.
is_e_pdf
=
False
self
.
is_e_weixin_bs
=
False
self
.
page_text_list
=
[]
self
.
pdf_info
=
{}
self
.
img_path_pno_list
=
[]
...
...
@@ -186,6 +187,8 @@ class PDFHandler:
self
.
img_path_list
.
append
(
img_save_path
)
if
self
.
is_ebank
:
self
.
rebuild_bbox
(
pm
.
width
,
pm
.
height
,
page
.
number
)
if
self
.
is_e_weixin_bs
:
self
.
rebuild_bbox
(
pm
.
width
,
pm
.
height
,
page
.
number
)
@staticmethod
def
getimage
(
pix
):
...
...
@@ -407,6 +410,57 @@ class PDFHandler:
self
.
is_e_pdf
=
True
self
.
page_text_list
=
page_text_list
def put_text(self, pdf):
    """Collect the text layer of every page and flag the PDF as a WeChat e-statement.

    Each page is read through ``page.getTextPage().extractDICT()``; every
    non-blank span is stored as a ``(bbox, text)`` tuple.  When all pages
    pass the checks below, the method sets ``self.is_e_pdf`` and
    ``self.is_e_weixin_bs`` to ``True`` and stores the per-page records in
    ``self.page_text_list``.  Each record is a dict with the keys
    ``'width'``, ``'height'``, ``'rotation'`` and ``'text'``.

    The method gives up (returns without touching any attribute) when:

    * a page rotation is neither ``None`` nor an ``int`` multiple of 90, or
    * the running number of collected spans drops below five per page
      processed so far.

    :param pdf: an opened document exposing ``pageCount`` and
        ``loadPage(pno)`` (a ``fitz.Document`` in the caller visible here).
    :return: ``None``; results are published through instance attributes.
    """
    # Function-scope import keeps this block self-contained.
    import sys

    # The previous implementation detected "unprintable" spans (e.g. special
    # emoji) by print()-ing every span and catching the failure.  Encode with
    # the same codec and error policy stdout uses, so the same spans are
    # skipped without writing statement contents to stdout.
    out_encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
    out_errors = getattr(sys.stdout, 'errors', None) or 'strict'

    page_text_list = []
    text_item_sum = 0
    for pno in range(pdf.pageCount):
        page = pdf.loadPage(pno)

        # Normalise the rotation to a number of quarter turns (0-3).
        if page.rotation is None:
            rotation = 0
        elif isinstance(page.rotation, int):
            quarter_turns, remainder = divmod(page.rotation, 90)
            if remainder != 0:
                return
            rotation = quarter_turns % 4
        else:
            return

        text = page.getTextPage().extractDICT()
        text_list = []
        # ``or ()`` guards against entries that lack the nested key
        # (for example a block that carries no 'lines').
        for block in text.get('blocks') or ():
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    char = span.get('text')
                    if not char or char.strip() == '':
                        continue
                    try:
                        char.encode(out_encoding, out_errors)
                    except UnicodeEncodeError:
                        # Span cannot be represented in the output codec.
                        continue
                    # NOTE(review): the previous code also evaluated
                    # self.title_is_ebank(char) for first-page spans but only
                    # stored the result in a local that was never read; the
                    # call was dropped - confirm that helper is side-effect
                    # free.
                    text_list.append((span.get('bbox'), char))

        text_item_sum += len(text_list)
        # Too little text so far: do not treat the file as an e-statement.
        if text_item_sum < (pno + 1) * 5:
            return
        page_text_list.append(
            {
                'width': text.get('width'),
                'height': text.get('height'),
                'rotation': rotation,
                'text': text_list
            }
        )

    self.is_e_pdf = True
    self.is_e_weixin_bs = True
    self.page_text_list = page_text_list
def
e_contract_process
(
self
):
os
.
makedirs
(
self
.
img_dir_path
,
exist_ok
=
True
)
with
fitz
.
Document
(
self
.
path
)
as
pdf
:
...
...
@@ -473,6 +527,59 @@ class PDFHandler:
self
.
merge_il
(
pdf
,
pno
,
il
)
self
.
img_count
=
len
(
self
.
img_path_list
)
def extract_image_for_weixin(self, max_img_count=None):
    """Produce the page images for an uploaded file on the WeChat-statement path.

    Resets ``self.img_path_list`` and ``self.xref_set`` and makes sure
    ``self.img_dir_path`` exists.  A file whose suffix is listed in
    ``self.img_suffixs`` is copied as image 0.  Anything else is opened
    with ``fitz.Document``: passwords from ``self.pwd_list`` are tried while
    the document reports itself encrypted, ``self.metadata`` and
    ``self.page_count`` are recorded, ``self.put_text`` is run, and each
    page is then rendered or has its embedded image extracted.

    :param max_img_count: optional ``int``; when the document has at least
        this many pages, ``self.img_count`` is set to the page count and
        nothing is rendered.
    :return: ``None``; ``self.img_count`` ends up as the number of saved
        image paths (or the page count on the early exit above).
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    # The upload is already an image: copy it and finish.
    if self.suffix in self.img_suffixs:
        copied_path = self.get_img_save_path(0, ext=self.suffix[1:])
        shutil.copy(self.path, copied_path)
        self.img_path_list.append(copied_path)
        self.img_count = len(self.img_path_list)
        return

    with fitz.Document(self.path) as doc:
        # Decrypt: keep trying passwords until the document opens.
        for password in self.pwd_list:
            if not doc.isEncrypted:
                break
            doc.authenticate(password)

        self.metadata = doc.metadata
        self.page_count = doc.pageCount

        too_many_pages = isinstance(max_img_count, int) and doc.pageCount >= max_img_count
        if too_many_pages:
            self.img_count = doc.pageCount
            return

        self.put_text(doc)

        for page_no in range(doc.pageCount):
            # Image objects referenced by the page; each entry unpacks as
            # (xref, smask, width, height, bpc, colorspace, alt. colorspace,
            #  name, filter).
            images = doc.getPageImageList(page_no)

            # 1. Electronic PDFs / e-bank files / pages without image
            #    objects: save the whole page as a PNG.
            if self.is_e_pdf or self.is_ebank or len(images) == 0:
                self.page_to_png(doc.loadPage(page_no))
                continue

            # 3. More than one image object on the page: special handling.
            if len(images) != 1:
                self.merge_il(doc, page_no, images)
                continue

            # 2. Exactly one image object:
            #    small image (e.g. the stamp on an e-bill) -> save the page,
            #    large image -> render the page / extract the object.
            xref, smask, width, height, _bpc, colorspace, _alt_cs, _name, _filter = images[0]
            if width < WH_COUPLE_1[0] and height < WH_COUPLE_1[1]:
                # Small image.
                self.page_to_png(doc.loadPage(page_no))
            elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
                # Large image: render the page instead, to keep the
                # resulting picture from getting too big.
                self.is_new_modify = 1
                is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1])
                self.page_to_png(doc.loadPage(page_no), is_big_img=is_big_img)
            elif xref not in self.xref_set:
                self.extract_single_image(doc, xref, smask, colorspace, page_no)

    self.img_count = len(self.img_path_list)
def
extract_page_image
(
self
):
self
.
img_path_list
=
[]
self
.
xref_set
=
set
()
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment