Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
c364c248
authored
2020-06-18 17:31:43 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
update pdf process
1 parent
7594db7e
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
226 additions
and
14 deletions
docs/main.yaml
src/apps/doc/management/commands/doc_process.py
src/apps/doc/views.py
src/common/api_doc.py
src/common/tools/pdf_tools.py
docs/main.yaml
View file @
c364c24
...
...
@@ -102,11 +102,18 @@ definitions:
documentScheme
:
description
:
文件格式?
type
:
string
example
:
CO00001
example
:
Acceptance
enum
:
-
Acceptance
-
Settlement
-
Contract Management
businessType
:
description
:
业务类型
type
:
string
example
:
HIL
example
:
CO00001
enum
:
-
CO00001
-
CO00002
uploadFinishTime
:
description
:
上传完成时间
type
:
string
...
...
@@ -115,6 +122,10 @@ definitions:
description
:
数据源
type
:
string
example
:
POS
enum
:
-
POS
-
EAPP
-
Econtract
metadataVersionId
:
description
:
元数据版本ID
type
:
string
...
...
src/apps/doc/management/commands/doc_process.py
View file @
c364c24
...
...
@@ -23,5 +23,5 @@ class Command(BaseCommand):
# PDF文件分页转化为图片
# 图片调用算法判断是否为银行流水
# 图片调用算法OCR为excel文件
# excel文件上传至EDMS
#
整合
excel文件上传至EDMS
pass
...
...
src/apps/doc/views.py
View file @
c364c24
...
...
@@ -25,10 +25,11 @@ applicant_data_args = {
# Request-argument schema for a document upload notification.
# Every field is required; string fields are only length-checked here.
# The value lists in the comments mirror the enums documented in the API
# spec and are NOT enforced by these validators.
document_args = {
    'documentName': fields.Str(required=True, validate=validate.Length(max=255)),
    # Documented values: Acceptance / Settlement / Contract Management
    'documentScheme': fields.Str(required=True, validate=validate.Length(max=64)),
    # Documented values: CO00001 / CO00002
    'businessType': fields.Str(required=True, validate=validate.Length(max=64)),
    'uploadFinishTime': fields.DateTime(required=True),
    # Documented values: POS / EAPP / Econtract
    'dataSource': fields.Str(required=True, validate=validate.Length(max=64)),
    'metadataVersionId': fields.Str(required=True, validate=validate.Length(max=64)),
}
...
...
src/common/api_doc.py
View file @
c364c24
...
...
@@ -78,11 +78,13 @@ Doc:
documentScheme:
description: 文件格式?
type: string
example: CO00001
example: Acceptance
enum: [Acceptance, Settlement, Contract Management]
businessType:
description: 业务类型
type: string
example: HIL
example: CO00001
enum: [CO00001, CO00002]
uploadFinishTime:
description: 上传完成时间
type: string
...
...
@@ -91,6 +93,7 @@ Doc:
description: 数据源
type: string
example: POS
enum: [POS, EAPP, Econtract]
metadataVersionId:
description: 元数据版本ID
type: string
...
...
src/common/tools/pdf_tools.py
View file @
c364c24
import
fitz
import
os
from
PIL
import
Image
,
ImageCms
from
io
import
BytesIO
class
PdfHandler
:
...
...
@@ -8,16 +10,24 @@ class PdfHandler:
self
.
pdf_path
=
pdf_path
self
.
pdf_name
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
pdf_path
))[
0
]
def page_to_pix_img(self, save_dir_path, zoom_x, zoom_y):
    """Render every page of the PDF to a raster image file.

    Each page is rendered once through a scaling matrix and written to
    ``<save_dir_path>/<pdf_name>_<page number>.png``.

    :param save_dir_path: existing directory that receives the images.
    :param zoom_x: horizontal scale factor passed to ``fitz.Matrix``.
    :param zoom_y: vertical scale factor passed to ``fitz.Matrix``.
    """
    # Pure scaling matrix; the former ``.preRotate(0)`` was a no-op rotation.
    trans = fitz.Matrix(zoom_x, zoom_y)
    with fitz.Document(self.pdf_path) as pdf:
        for page in pdf:
            # Rasterize the page without an alpha channel.
            pm = page.getPixmap(matrix=trans, alpha=False)
            save_path = os.path.join(
                save_dir_path,
                '{0}_{1}.png'.format(self.pdf_name, page.number),
            )
            # Written exactly once per page.
            pm.writeImage(save_path)
def
to_svg_img
(
self
,
save_dir_path
):
def
page_
to_svg_img
(
self
,
save_dir_path
):
with
fitz
.
Document
(
self
.
pdf_path
)
as
pdf
:
for
page
in
pdf
:
svg
=
page
.
getSVGimage
(
matrix
=
fitz
.
Identity
)
# UTF-8 string svg
...
...
@@ -25,8 +35,195 @@ class PdfHandler:
with
open
(
save_path
,
'w'
)
as
f
:
f
.
write
(
svg
)
@staticmethod
def
getimage
(
pix
):
if
pix
.
colorspace
.
n
!=
4
:
return
pix
tpix
=
fitz
.
Pixmap
(
fitz
.
csRGB
,
pix
)
return
tpix
def recoverpix(self, doc, item):
    """Rebuild one embedded PDF image, applying its soft mask if usable.

    :param doc: the open ``fitz.Document`` that owns the image.
    :param item: one entry of the page image list; index 0 is the image
        xref, index 1 the xref of its /SMask (0 when absent) and index 5
        the colorspace name.
    :return: the raw ``doc.extractImage`` dict for an unmasked DeviceRGB
        image, otherwise a pixmap passed through ``self.getimage``.
    """
    xref = item[0]
    smask_xref = item[1]
    colorspace_name = item[5]

    def mask_fits(base, mask):
        # The mask is only applied when it covers the same rectangle, neither
        # pixmap already has alpha, and the mask has a single channel.
        return (
            base.irect == mask.irect
            and base.alpha == mask.alpha == 0
            and mask.n == 1
        )

    if colorspace_name == 'DeviceRGB':
        if smask_xref == 0:
            # No mask: the stored image can be returned untouched.
            return doc.extractImage(xref)

        base = fitz.Pixmap(doc, xref)
        mask = fitz.Pixmap(doc, smask_xref)  # pixmap of the /SMask entry
        if not mask_fits(base, mask):
            mask = None
            return self.getimage(base)

        merged = fitz.Pixmap(base)  # copy of the base pixmap
        merged.setAlpha(mask.samples)  # mask samples become the alpha values
        base = mask = None  # release the temporary pixmaps
        return self.getimage(merged)

    # Any other colorspace (GRAY/CMYK): build the pixmap, then go to RGB.
    base = fitz.Pixmap(doc, xref)
    merged = fitz.Pixmap(base)  # copy of the base pixmap
    if smask_xref != 0:
        mask = fitz.Pixmap(doc, smask_xref)  # pixmap of the /SMask entry
        if not mask_fits(base, mask):
            mask = None
            return self.getimage(base)
        merged.setAlpha(mask.samples)  # mask samples become the alpha values

    base = mask = None  # release the temporary pixmaps
    rgb = fitz.Pixmap(fitz.csRGB, merged)
    return self.getimage(rgb)
def extract_images(self, save_dir_path):
    """Write every distinct embedded image of the PDF to ``save_dir_path``.

    Images are identified by their xref, so an image referenced from
    several pages is written only once. No size filtering is applied.
    Files are named ``img-<xref>.<ext>``; images recovered as pixmaps are
    stored as PNG.

    :param save_dir_path: existing directory that receives the images.
    """
    seen_xrefs = set()  # xrefs already written (O(1) membership test)
    with fitz.Document(self.pdf_path) as pdf:
        for pno in range(pdf.pageCount):
            for img in pdf.getPageImageList(pno):
                print(img)
                xref = img[0]
                if xref in seen_xrefs:
                    continue
                width = img[2]
                height = img[3]
                print(xref, width, height)

                pix = self.recoverpix(pdf, img)
                if isinstance(pix, dict):
                    # Raw image: keep its native bytes and extension.
                    ext = pix["ext"]
                    imgdata = pix["image"]
                else:
                    # Pixmap: serialize as PNG.
                    ext = 'png'
                    imgdata = pix.getPNGData()

                imgfile = os.path.join(save_dir_path, "img-%i.%s" % (xref, ext))
                # Context manager closes the handle even if the write fails.
                with open(imgfile, "wb") as fout:
                    fout.write(imgdata)
                seen_xrefs.add(xref)
def split_il(self, il):
    """Split a page image list into runs that belong to one picture.

    Entries are compared with their predecessor using index 2 (width) and
    index 3 (height):

    * a width change starts a new group at the current entry;
    * a height change (same width) closes the group *including* the
      current entry.

    Unlike the previous implementation, the final entry is subject to the
    same width check, so a trailing image of a different width gets its
    own group instead of being merged into the preceding one.

    :param il: sequence of image tuples (width at [2], height at [3]).
    :return: list of consecutive, non-empty slices of ``il``.
    """
    img_il_list = []
    start = 0
    length = len(il)
    for i in range(length):
        if i == start:
            # First entry of a group: nothing to compare against yet.
            continue
        if il[i][2] != il[i - 1][2]:
            img_il_list.append(il[start:i])
            start = i
        elif il[i][3] != il[i - 1][3]:
            img_il_list.append(il[start:i + 1])
            start = i + 1
    if start < length:
        # Flush whatever is still open once the list is exhausted.
        img_il_list.append(il[start:length])
    return img_il_list
def extract_images_pro(self, save_dir_path):
    """Extract embedded images page by page, stitching grouped images.

    For each page the image list is ordered by xref and partitioned with
    ``split_il``. A group holding one image is saved directly; a group
    holding several is pasted top-to-bottom onto a single canvas. Output
    files are named ``page_<pno>_img_<group index>.<ext>`` where the
    extension is taken from the first image of the group.

    :param save_dir_path: existing directory that receives the images.
    """
    with fitz.Document(self.pdf_path) as pdf:
        print('----------------------------')
        print(self.pdf_name)
        print(pdf.metadata)
        # TODO: de-duplicate images by xref
        for pno in range(pdf.pageCount):
            print('========================')
            # entry layout: (xref, smask, width, height, bpc, colorspace,
            #                alt.colorspace, name, filter, invoker)
            ordered = sorted(pdf.getPageImageList(pno), key=lambda entry: entry[0])
            img_il_list = self.split_il(ordered)
            print(img_il_list)
            print(len(img_il_list))
            # TODO: when a page holds too many images, render the page instead
            for img_count, img_il in enumerate(img_il_list):
                print(img_il)
                im_list = []
                for img in img_il:
                    pix = self.recoverpix(pdf, img)
                    if isinstance(pix, dict):
                        # raw image bytes with their native extension
                        ext, img_data = pix["ext"], pix["image"]
                    else:
                        # pixmap, serialized as PNG
                        ext, img_data = 'png', pix.getPNGData()
                    im_list.append((img[2], img[3], Image.open(BytesIO(img_data)), ext))
                print(im_list)

                first_width, _, first_im, first_ext = im_list[0]
                file_name = 'page_{0}_img_{1}.{2}'.format(pno, img_count, first_ext)
                save_path = os.path.join(save_dir_path, file_name)

                if len(im_list) == 1:
                    # Single image: save it as decoded.
                    first_im.save(save_path)
                    continue

                # Several images: stack them vertically on one canvas.
                height_sum = sum(entry[1] for entry in im_list)
                res = Image.new(first_im.mode, (first_width, height_sum))
                top = 0
                for _, tile_height, tile, _ in im_list:
                    res.paste(tile, box=(0, top))
                    top += tile_height
                res.save(save_path)
if __name__ == '__main__':
    # Manual smoke test: run grouped image extraction over every file in a
    # local sample directory, one output folder per input file.
    dir_path = '/Users/clay/Desktop/biz/pdf_test/银行流水/'
    out_root = os.path.join('/Users/clay/Desktop/biz/pdf_test/', 'test')

    for path in os.listdir(dir_path):
        if path == '.DS_Store':
            continue
        pdf_handler = PdfHandler(os.path.join(dir_path, path))
        save_path = os.path.join(out_root, os.path.splitext(os.path.basename(path))[0])
        # makedirs + exist_ok: creates the missing 'test' parent and lets the
        # script be re-run without failing on folders from a previous run.
        os.makedirs(save_path, exist_ok=True)
        pdf_handler.extract_images_pro(save_path)

    # Other entry points that can be exercised the same way:
    # pdf_handler.page_to_pix_img('/Users/clay/Desktop/biz/pdf_test/', 3.0, 3.0)
    # pdf_handler.page_to_svg_img('/Users/clay/Desktop/biz/pdf_test/')
    # pdf_handler.extract_images_pro('/Users/clay/Desktop/biz/pdf_test/test')
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment