周伟奇 / bmw-ocr
Commit a02a957e authored 2020-06-23 15:31:28 +0800 by 周伟奇
ocr process
1 parent f8904dcb
Showing 4 changed files with 113 additions and 15 deletions
requirements/base.txt
src/apps/doc/management/commands/doc_process.py
src/apps/doc/mixins.py
src/common/tools/file_tools.py
requirements/base.txt
aiohttp==3.6.2
async-timeout==3.0.1
attrs==19.3.0
certifi==2016.2.28
chardet==3.0.4
Django==2.1
# django-mysqlpool @ https://github.com/smartfile/django-mysqlpool/archive/master.zip
djangorestframework==3.9.0
djangorestframework-jwt==1.11.0
idna==2.9
idna-ssl==1.1.0
marshmallow==3.6.1
multidict==4.7.6
pdfminer3k==1.3.4
Pillow==7.1.2
ply==3.11
...
...
@@ -17,4 +24,7 @@ redis==3.4.1
# situlogger @ http://gitlab.situdata.com/zhouweiqi/situlogger/repository/archive.tar.gz?ref=master
six==1.14.0
SQLAlchemy==0.9.10
typing-extensions==3.7.4.2
webargs==6.1.0
xlwt==1.3.0
yarl==1.4.2
...
...
src/apps/doc/management/commands/doc_process.py
import time
import os
import signal
import fitz
import xlwt
import base64
import asyncio
import aiohttp
from PIL import Image
from io import BytesIO
from django.core.management import BaseCommand
from common.mixins import LoggerMixin
from common.redis_cache import redis_handler as rh
from common.tools.file_tools import write_zip_file
from apps.doc.models import UploadDocRecords, DocStatus
from settings import conf
...
...
@@ -25,6 +30,12 @@ class Command(BaseCommand, LoggerMixin):
        self.zoom_x = 2.0
        self.zoom_y = 2.0
        self.trans = fitz.Matrix(self.zoom_x, self.zoom_y).preRotate(0)  # zoom factor 2 in each dimension
        # OCR-related settings
        self.ocr_url = conf.OCR_URL
        self.ocr_header = {'X-Auth-Token': conf.OCR_TOKEN, 'Content-Type': 'application/json'}
        # graceful-exit signal: 15 (SIGTERM)
        signal.signal(signal.SIGTERM, self.signal_handler)
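The SIGTERM hook registered above points at self.signal_handler, which is outside this hunk. A minimal sketch of the graceful-exit pattern it presumably follows — setting a flag that the processing loop checks before picking up the next document. The flag name and the loop body below are assumptions for illustration, not code from this commit:

import signal
import time


class Worker:
    def __init__(self):
        self._exit = False
        # ask the OS to invoke signal_handler when the process receives SIGTERM (signal 15)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, sig, frame):
        # only record the request; the loop finishes the current item before exiting
        self._exit = True

    def handle(self):
        while not self._exit:
            # process one queued document here, then re-check the flag
            time.sleep(1)


if __name__ == '__main__':
    Worker().handle()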
...
...
@@ -47,16 +58,52 @@ class Command(BaseCommand, LoggerMixin):
    def pdf_download(self, doc_info):
        if doc_info is None:
-            return
+            return None, None, None, None
        # TODO download the pdf from EDMS
        # pdf_path = '/Users/clay/Desktop/biz/biz_logic/data/2/横版-表格-工商银行CH-B008802400.pdf'
        # doc_data_path = os.path.dirname(pdf_path)
        doc_id = doc_info['id']
        doc_data_path = os.path.join(self.data_dir, str(doc_id))
        pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc_id))
        excel_path = os.path.join(doc_data_path, '{0}.xls'.format(doc_id))
        self.cronjob_log.info('{0} [pdf download success] [doc_info={1}] [pdf_path={2}]'.format(self.log_base, doc_info, pdf_path))
-        return pdf_path, doc_data_path
+        return doc_data_path, excel_path, pdf_path, doc_id
    @staticmethod
    def append_sheet(wb, sheets_list, img_name):
        for i, sheet in enumerate(sheets_list):
            ws = wb.add_sheet('{0}_{1}'.format(img_name, i))
            cells = sheet.get('cells')
            for cell in cells:
                c1 = cell.get('start_column')
                c2 = cell.get('end_column')
                r1 = cell.get('start_row')
                r2 = cell.get('end_row')
                label = cell.get('words')
                ws.write_merge(r1, r2, c1, c2, label=label)
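append_sheet assumes every sheet dict in the OCR result carries a 'cells' list whose entries give merge bounds plus the recognised text under 'words'. A minimal sketch of that mapping with a hypothetical payload — the field names follow the code above, but the real response shape is defined by the OCR service, so treat the sample data as an assumption:

import xlwt

# hypothetical OCR output for one image: a single table sheet with two cells
sheets_list = [{
    'cells': [
        {'start_row': 0, 'end_row': 0, 'start_column': 0, 'end_column': 2, 'words': 'Account'},
        {'start_row': 1, 'end_row': 1, 'start_column': 0, 'end_column': 0, 'words': '2020-06-23'},
    ]
}]

wb = xlwt.Workbook()
for i, sheet in enumerate(sheets_list):
    ws = wb.add_sheet('page_0_img_0_{0}'.format(i))
    for cell in sheet.get('cells'):
        # write_merge spans rows r1..r2 and columns c1..c2 and puts the text in the merged range
        ws.write_merge(cell['start_row'], cell['end_row'],
                       cell['start_column'], cell['end_column'],
                       label=cell['words'])
wb.save('demo.xls')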
    @staticmethod
    def get_ocr_json(img_path):
        with open(img_path, "rb") as f:
            base64_data = base64.b64encode(f.read())
        return {'imgBase64': base64_data.decode('utf-8')}
    async def fetch_ocr_result(self, img_path):
        async with aiohttp.ClientSession(
                headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False)) as session:
            json_data = self.get_ocr_json(img_path)
            async with session.post(self.ocr_url, json=json_data) as response:
                return await response.json()
    async def img_ocr_excel(self, wb, img_path):
        res = await self.fetch_ocr_result(img_path)
        self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
        sheets_list = res.get('result').get('res')
        img_name = os.path.basename(img_path)
        self.append_sheet(wb, sheets_list, img_name)
    @staticmethod
    def getimage(pix):
...
...
@@ -143,16 +190,18 @@ class Command(BaseCommand, LoggerMixin):
            # get the file info from the queue
            doc_info = self.get_doc_info()
            # fetch the PDF file from EDMS
-            pdf_path, doc_data_path = self.pdf_download(doc_info)
+            doc_data_path, excel_path, pdf_path, doc_id = self.pdf_download(doc_info)
            # handling for an empty queue
            if pdf_path is None:
                time.sleep(10)
                continue
            try:
                # extract images from the PDF file
                img_save_path = os.path.join(doc_data_path, 'img')
                os.makedirs(img_save_path, exist_ok=True)
                img_path_list = []
                with fitz.Document(pdf_path) as pdf:
-                    self.cronjob_log.info('{0} [pdf_path={1}] [pdf_metadata={2}]'.format(
+                    self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format(
                        self.log_base, pdf_path, pdf.metadata))
                    # xref_list = []  # TODO image de-duplication
                    for pno in range(pdf.pageCount):
...
...
@@ -166,15 +215,22 @@ class Command(BaseCommand, LoggerMixin):
                            pm = page.getPixmap(matrix=self.trans, alpha=False)
                            save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
                            pm.writePNG(save_path)
                            img_path_list.append(save_path)
                            self.cronjob_log.info('{0} [page to img success] [pdf_path={1}] [page={2}]'.format(self.log_base, pdf_path, page.number))
                        else:
                            # extract the embedded images
-                            for img_count, img_il in enumerate(img_il_list):
+                            for img_index, img_il in enumerate(img_il_list):
                                if len(img_il) == 1:
                                    # only one image on the page: simplified handling
                                    pix = self.recoverpix(pdf, img_il[0])
                                    ext, img_data = self.get_img_data(pix)
                                    save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
-                                        pno, img_count, ext))
+                                        pno, img_index, ext))
                                    with open(save_path, "wb") as f:
                                        f.write(img_data)
                                    img_path_list.append(save_path)
                                    self.cronjob_log.info('{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format(self.log_base, pdf_path, pno, img_index))
                                else:
                                    # several images on the page: concatenate them vertically
                                    height_sum = 0
                                    im_list = []
...
...
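The rendering branch above uses PyMuPDF's older camelCase API (Document, pageCount, getPixmap, writePNG, preRotate), which matches the 2020-era fitz this project targets; newer PyMuPDF releases rename these methods. A minimal standalone sketch of that page-to-PNG step, with a hypothetical input path:

import os

import fitz  # PyMuPDF, pre-1.18 camelCase API as used in this commit

pdf_path = 'sample.pdf'   # hypothetical input file
img_save_path = 'img'
os.makedirs(img_save_path, exist_ok=True)

# zoom factor 2 in each dimension, the same matrix Command.__init__ builds
trans = fitz.Matrix(2.0, 2.0).preRotate(0)

with fitz.Document(pdf_path) as pdf:
    for pno in range(pdf.pageCount):
        page = pdf[pno]
        pm = page.getPixmap(matrix=trans, alpha=False)
        pm.writePNG(os.path.join(img_save_path, 'page_{0}_img_0.png'.format(pno)))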
@@ -194,14 +250,31 @@ class Command(BaseCommand, LoggerMixin):
                                        height_sum += height
                                    save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
-                                        pno, img_count, im_list[0][2]))
+                                        pno, img_index, im_list[0][2]))
                                    res = Image.new(im_list[0][1].mode, (width, height_sum))
                                    h_now = 0
                                    for h, m, _ in im_list:
                                        res.paste(m, box=(0, h_now))
                                        h_now += h
                                    res.save(save_path)
                                    img_path_list.append(save_path)
                                    self.cronjob_log.info('{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format(self.log_base, pdf_path, pno, img_index))
                self.cronjob_log.info('{0} [pdf to img success]'.format(self.log_base))
-                # call the algorithm on the images to check whether they are bank statements
-                # call the OCR algorithm on the images to produce an excel file
                write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc_id)))
+                # call the algorithm on the images to check whether they are bank statements, then OCR them into an excel file
                wb = xlwt.Workbook()
                loop = asyncio.get_event_loop()
                tasks = [self.img_ocr_excel(wb, img_path) for img_path in img_path_list]
                loop.run_until_complete(asyncio.wait(tasks))
                loop.close()
                wb.save(excel_path)
                # consolidate the excel file and upload it to EDMS
            except Exception as e:
                UploadDocRecords.objects.filter(id=doc_id).update(status=DocStatus.PROCESS_FAILED.value)
                self.cronjob_log.error('{0} [process failed] [err={1}]'.format(self.log_base, e))
            else:
                UploadDocRecords.objects.filter(id=doc_id).update(status=DocStatus.COMPLETE.value)
                self.cronjob_log.info('{0} [doc process complete] [doc_id={1}]'.format(self.log_base, doc_id))
...
...
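The OCR fan-out above builds one xlwt Workbook per document, schedules img_ocr_excel for every extracted image on a single event loop, and saves the workbook once all coroutines finish. A self-contained sketch of that pattern, with the aiohttp call replaced by asyncio.sleep so it runs without the OCR service, and asyncio.run/gather standing in for the commit's get_event_loop()/asyncio.wait() pairing:

import asyncio

import xlwt


async def fake_img_ocr_excel(wb, img_name):
    await asyncio.sleep(0.1)  # stands in for the aiohttp POST to conf.OCR_URL
    ws = wb.add_sheet('{0}_0'.format(img_name))
    ws.write_merge(0, 0, 0, 1, label='demo cell for {0}'.format(img_name))


async def main():
    wb = xlwt.Workbook()
    img_path_list = ['page_0_img_0', 'page_1_img_0']  # made-up image names
    tasks = [fake_img_ocr_excel(wb, name) for name in img_path_list]
    await asyncio.gather(*tasks)  # all sheets land in the one shared workbook
    wb.save('demo.xls')


if __name__ == '__main__':
    asyncio.run(main())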
src/apps/doc/mixins.py
...
...
@@ -7,13 +7,12 @@ class DocHandler:
    @staticmethod
    def get_link(doc_id, file='pdf'):
-        data_path = os.path.join(conf.DATA_DIR, str(doc_id))
        if file == 'pdf':
-            return os.path.join(data_path, '{0}.pdf'.format(str(doc_id)))
+            return '/data/{0}/{0}.pdf'.format(doc_id)
        elif file == 'img':
-            return os.path.join(data_path, '{0}_img.zip'.format(str(doc_id)))
+            return '/data/{0}/{0}_img.zip'.format(doc_id)
        else:
-            return os.path.join(data_path, '{0}.xlsx'.format(str(doc_id)))
+            return '/data/{0}/{0}.xls'.format(doc_id)
    def get_doc_list(self, doc_queryset):
        for doc_dict in doc_queryset:
...
...
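As a quick check of the URL shapes get_link now returns, here is a standalone re-sketch of just the new return values (DocHandler itself needs the Django settings module, so the class is not imported here; the mapping mirrors the diff above):

def get_link(doc_id, file='pdf'):
    # mirrors the new return values of DocHandler.get_link
    if file == 'pdf':
        return '/data/{0}/{0}.pdf'.format(doc_id)
    elif file == 'img':
        return '/data/{0}/{0}_img.zip'.format(doc_id)
    else:
        return '/data/{0}/{0}.xls'.format(doc_id)


print(get_link(42))           # /data/42/42.pdf
print(get_link(42, 'img'))    # /data/42/42_img.zip
print(get_link(42, 'excel'))  # /data/42/42.xls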
src/common/tools/file_tools.py
import os
from zipfile import ZipFile


def file_write(file, file_path):
    with open(file_path, 'wb+') as f:
        for chunk in file.chunks():
            f.write(chunk)


def write_zip_file(dir_name, zipfile_path):
    if not os.path.isdir(dir_name):
        return
    with ZipFile(zipfile_path, 'w') as z:
        for root, dirs, files in os.walk(dir_name):
            root_target_path = root.replace(dir_name, '')
            for single_file in files:
                src_file_path = os.path.join(root, single_file)
                file_target_path = os.path.join(root_target_path, single_file)
                z.write(src_file_path, file_target_path)
...
...
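A small usage sketch of write_zip_file as doc_process.py calls it — zipping a directory of extracted page images into <doc_id>_img.zip. The temporary directory and file names below are made up for the example, and it assumes the repo's src/ directory is on PYTHONPATH so the module imports:

import os
import tempfile
from zipfile import ZipFile

from common.tools.file_tools import write_zip_file  # assumes src/ is on PYTHONPATH

with tempfile.TemporaryDirectory() as workdir:
    # fake the img/ directory that the PDF-to-image step would produce
    img_dir = os.path.join(workdir, 'img')
    os.makedirs(img_dir)
    with open(os.path.join(img_dir, 'page_0_img_0.png'), 'wb') as f:
        f.write(b'not really a png')

    zip_path = os.path.join(workdir, '1_img.zip')
    write_zip_file(img_dir, zip_path)

    with ZipFile(zip_path) as z:
        print(z.namelist())  # entries are stored relative to img_dir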