Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
e570371a
authored
2020-10-17 23:23:37 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add edms retry
1 parent
04020101
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
165 additions
and
124 deletions
src/apps/doc/consts.py
src/apps/doc/exceptions.py
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/consts.py
View file @
e570371
...
...
@@ -35,6 +35,8 @@ DEALER_CODE_META_FIELD_id = 13
BUSINESS_TYPE_META_FIELD_id
=
93
DEALER_CODE
=
'ocr_situ_group'
RETRY_TIMES
=
3
# ---------银行流水模板相关--------------------------------------------------------------------------------------------
TRANS_MAP
=
{
...
...
src/apps/doc/exceptions.py
0 → 100644
View file @
e570371
class
EDMSException
(
Exception
):
pass
src/apps/doc/management/commands/doc_ocr_process.py
View file @
e570371
...
...
@@ -19,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
from
apps.doc.named_enum
import
KeywordsType
from
apps.doc
import
consts
from
apps.doc.ocr.edms
import
EDMS
,
rh
from
apps.doc.exceptions
import
EDMSException
class
Command
(
BaseCommand
,
LoggerMixin
):
...
...
@@ -72,11 +73,21 @@ class Command(BaseCommand, LoggerMixin):
os
.
makedirs
(
doc_data_path
,
exist_ok
=
True
)
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
if
not
doc
.
application_id
.
startswith
(
consts
.
FIXED_APPLICATION_ID_PREFIX
):
self
.
edms
.
download
(
pdf_path
,
doc
.
metadata_version_id
)
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
self
.
edms
.
download
(
pdf_path
,
doc
.
metadata_version_id
)
except
Exception
as
e
:
self
.
cronjob_log
.
warn
(
'{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] '
'[error={4}]'
.
format
(
self
.
log_base
,
times
,
business_type
,
doc
.
id
,
e
))
edms_exc
=
str
(
e
)
else
:
break
else
:
raise
EDMSException
(
edms_exc
)
excel_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.xlsx'
.
format
(
doc
.
id
))
src_excel_path
=
os
.
path
.
join
(
doc_data_path
,
'src.xlsx'
)
self
.
cronjob_log
.
info
(
'{0} [
pdf
download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'
.
format
(
self
.
cronjob_log
.
info
(
'{0} [
edms
download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
pdf_path
))
return
doc_data_path
,
excel_path
,
src_excel_path
,
pdf_path
...
...
@@ -177,105 +188,21 @@ class Command(BaseCommand, LoggerMixin):
else
:
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
# async def fetch_ocr_result(self, url, json_data):
# async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
# async with session.post(url, json=json_data) as response:
# if response.status == 200:
# return await response.json()
#
# async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary):
# with open(img_path, 'rb') as f:
# base64_data = base64.b64encode(f.read())
# # 获取解码后的base64值
# file_data = base64_data.decode()
# json_data_1 = {
# "file": file_data
# }
# ocr_res_1 = await self.fetch_ocr_result(self.ocr_url_1, json_data_1)
# if ocr_res_1 is None:
# raise Exception('ocr 1 error, img_path={0}'.format(img_path))
# else:
# self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
# self.log_base, img_path, ocr_res_1))
#
# if ocr_res_1.get('code') == 1:
# ocr_data = ocr_res_1.get('data', {})
# classify = ocr_data.get('classify')
# if classify is None:
# return
# elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
# return
# elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
# self.license1_process(ocr_data, license_summary, classify)
# elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
# pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
# json_data_2 = {
# "pid": str(pid),
# "key": conf.OCR_KEY,
# "secret": conf.OCR_SECRET,
# "file": file_data
# }
# ocr_res_2 = await self.fetch_ocr_result(self.ocr_url_2, json_data_2)
# if ocr_res_2 is None:
# raise Exception('ocr 2 error, img_path={0}'.format(img_path))
# else:
# # 识别结果
# self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
# self.log_base, img_path, ocr_res_2))
# self.license2_process(ocr_res_2, license_summary, pid, classify)
# else: # 流水处理
# self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify)
def
img_2_ocr_2_wb
(
self
,
wb
,
img_path
,
bs_summary
,
unknown_summary
,
license_summary
,
skip_img
):
# # 流水
# res = {
# 'code': 1,
# 'msg': 'success',
# 'data': {
# 'classify': 0,
# 'confidence': 0.999,
# 'data': [
# {
# 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# 'cells': []
# },
# {
# 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# 'cells': []
# }
# ]
# }
# }
#
# # 证件-1
# res = {
# 'code': 1,
# 'msg': 'success',
# 'data': {
# 'classify': 0,
# 'confidence': 0.999,
# 'data': [
# {
# 'cn_key': 'value',
# 'cn_key': 'value',
# },
# {
# 'cn_key': 'value',
# 'cn_key': 'value',
# },
# ]
# }
# }
#
# # 证件-2 or 其他类
# res = {
# 'code': 1,
# 'msg': 'success',
# 'data': {
# 'classify': 0,
# 'confidence': 0.999,
# }
# }
@staticmethod
async
def
fetch_ocr_1_result
(
url
,
json_data
):
async
with
aiohttp
.
ClientSession
(
connector
=
aiohttp
.
TCPConnector
(
ssl
=
False
))
as
session
:
async
with
session
.
post
(
url
,
json
=
json_data
)
as
response
:
if
response
.
status
==
200
:
return
await
response
.
json
()
@staticmethod
async
def
fetch_ocr_2_result
(
url
,
json_data
):
async
with
aiohttp
.
ClientSession
(
connector
=
aiohttp
.
TCPConnector
(
ssl
=
False
))
as
session
:
async
with
session
.
post
(
url
,
data
=
json_data
)
as
response
:
if
response
.
status
==
200
:
return
await
response
.
json
()
async
def
img_2_ocr_2_wb
(
self
,
wb
,
img_path
,
bs_summary
,
unknown_summary
,
license_summary
,
skip_img
):
with
open
(
img_path
,
'rb'
)
as
f
:
base64_data
=
base64
.
b64encode
(
f
.
read
())
# 获取解码后的base64值
...
...
@@ -283,9 +210,10 @@ class Command(BaseCommand, LoggerMixin):
json_data_1
=
{
"file"
:
file_data
}
response_1
=
requests
.
post
(
self
.
ocr_url_1
,
json
=
json_data_1
)
if
response_1
.
status_code
==
200
:
ocr_res_1
=
response_1
.
json
()
ocr_res_1
=
await
self
.
fetch_ocr_1_result
(
self
.
ocr_url_1
,
json_data_1
)
if
ocr_res_1
is
None
:
raise
Exception
(
'ocr 1 error, img_path={0}'
.
format
(
img_path
))
else
:
self
.
cronjob_log
.
info
(
'{0} [ocr_1 result] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_path
,
ocr_res_1
))
...
...
@@ -308,26 +236,121 @@ class Command(BaseCommand, LoggerMixin):
"secret"
:
conf
.
OCR_SECRET
,
"file"
:
file_data
}
response_2
=
requests
.
post
(
self
.
ocr_url_2
,
data
=
json_data_2
)
if
response_2
.
status_code
==
200
:
ocr_res_2
=
await
self
.
fetch_ocr_2_result
(
self
.
ocr_url_2
,
json_data_2
)
if
ocr_res_2
is
None
:
raise
Exception
(
'ocr 2 error, img_path={0}'
.
format
(
img_path
))
else
:
# 识别结果
ocr_res_2
=
response_2
.
json
()
self
.
cronjob_log
.
info
(
'{0} [ocr_2 result] [img={1}] [res={2}]'
.
format
(
self
.
log_base
,
img_path
,
ocr_res_2
))
self
.
license2_process
(
ocr_res_2
,
license_summary
,
pid
,
classify
,
skip_img
,
img_path
)
else
:
raise
Exception
(
'ocr 2 error, img_path={0}'
.
format
(
img_path
))
else
:
# 流水处理
self
.
bs_process
(
wb
,
ocr_data
,
bs_summary
,
unknown_summary
,
img_path
,
classify
,
skip_img
)
else
:
skip_img
.
append
(
self
.
parse_img_path
(
img_path
))
else
:
raise
Exception
(
'ocr 1 error, img_path={0}'
.
format
(
img_path
))
# def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
# # # 流水
# # res = {
# # 'code': 1,
# # 'msg': 'success',
# # 'data': {
# # 'classify': 0,
# # 'confidence': 0.999,
# # 'data': [
# # {
# # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# # 'cells': []
# # },
# # {
# # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# # 'cells': []
# # }
# # ]
# # }
# # }
# #
# # # 证件-1
# # res = {
# # 'code': 1,
# # 'msg': 'success',
# # 'data': {
# # 'classify': 0,
# # 'confidence': 0.999,
# # 'data': [
# # {
# # 'cn_key': 'value',
# # 'cn_key': 'value',
# # },
# # {
# # 'cn_key': 'value',
# # 'cn_key': 'value',
# # },
# # ]
# # }
# # }
# #
# # # 证件-2 or 其他类
# # res = {
# # 'code': 1,
# # 'msg': 'success',
# # 'data': {
# # 'classify': 0,
# # 'confidence': 0.999,
# # }
# # }
# with open(img_path, 'rb') as f:
# base64_data = base64.b64encode(f.read())
# # 获取解码后的base64值
# file_data = base64_data.decode()
# json_data_1 = {
# "file": file_data
# }
# response_1 = requests.post(self.ocr_url_1, json=json_data_1)
# if response_1.status_code == 200:
# ocr_res_1 = response_1.json()
# self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
# self.log_base, img_path, ocr_res_1))
#
# if ocr_res_1.get('code') == 1:
# ocr_data = ocr_res_1.get('data', {})
# classify = ocr_data.get('classify')
# if classify is None:
# skip_img.append(self.parse_img_path(img_path))
# return
# elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
# skip_img.append(self.parse_img_path(img_path))
# return
# elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
# self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
# elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
# pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
# json_data_2 = {
# "pid": str(pid),
# "key": conf.OCR_KEY,
# "secret": conf.OCR_SECRET,
# "file": file_data
# }
# response_2 = requests.post(self.ocr_url_2, data=json_data_2)
# if response_2.status_code == 200:
# # 识别结果
# ocr_res_2 = response_2.json()
# self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
# self.log_base, img_path, ocr_res_2))
# self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
# else:
# raise Exception('ocr 2 error, img_path={0}'.format(img_path))
# else: # 流水处理
# self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
# else:
# skip_img.append(self.parse_img_path(img_path))
# else:
# raise Exception('ocr 1 error, img_path={0}'.format(img_path))
@staticmethod
def
parse_img_path
(
img_path
):
img_name
,
_
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
img_path
))
return
i
mg_name
[
5
],
img_name
[
11
]
return
i
nt
(
img_name
[
5
])
+
1
,
int
(
img_name
[
11
])
+
1
@staticmethod
def
get_most
(
value_list
):
...
...
@@ -520,14 +543,14 @@ class Command(BaseCommand, LoggerMixin):
# wb = Workbook()
# 4.1 获取OCR结果
#
loop = asyncio.get_event_loop()
# tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary
)
#
for img_path in pdf_handler.img_path_list]
#
loop.run_until_complete(asyncio.wait(tasks))
loop
=
asyncio
.
get_event_loop
()
tasks
=
[
self
.
img_2_ocr_2_wb
(
wb
,
img_path
,
bs_summary
,
unknown_summary
,
license_summary
,
skip_img
)
for
img_path
in
pdf_handler
.
img_path_list
]
loop
.
run_until_complete
(
asyncio
.
wait
(
tasks
))
# loop.close()
for
img_path
in
pdf_handler
.
img_path_list
:
self
.
img_2_ocr_2_wb
(
wb
,
img_path
,
bs_summary
,
unknown_summary
,
license_summary
,
skip_img
)
#
for img_path in pdf_handler.img_path_list:
#
self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
self
.
cronjob_log
.
info
(
'{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
'[license_summary={5}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
bs_summary
,
...
...
@@ -545,23 +568,37 @@ class Command(BaseCommand, LoggerMixin):
wb
.
save
(
src_excel_path
)
wb
.
rebuild
(
merged_bs_summary
,
license_summary
,
skip_img
)
wb
.
save
(
excel_path
)
except
EDMSException
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] '
'[err={3}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
e
))
except
Exception
as
e
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
error
(
'{0} [process failed
] [business_type={1}] [doc_id={2}] [err={3}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
e
))
self
.
cronjob_log
.
error
(
'{0} [process failed
(program)] [business_type={1}] [doc_id={2}] '
'[err={3}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
e
))
else
:
try
:
# 5.上传至EDMS
self
.
edms
.
upload
(
excel_path
,
doc
,
business_type
)
# print('upload pass')
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
self
.
edms
.
upload
(
excel_path
,
doc
,
business_type
)
except
Exception
as
e
:
self
.
cronjob_log
.
warn
(
'{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] '
'[error={4}]'
.
format
(
self
.
log_base
,
times
,
business_type
,
doc
.
id
,
e
))
edms_exc
=
str
(
e
)
else
:
break
else
:
raise
EDMSException
(
edms_exc
)
except
Exception
as
e
:
doc
.
status
=
DocStatus
.
UPLOAD_FAILED
.
value
doc
.
save
()
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
cronjob_log
.
error
(
'{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] '
'[err={4}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
speed_time
,
e
))
self
.
cronjob_log
.
error
(
'{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] '
'[speed_time={3}] [err={4}]'
.
format
(
self
.
log_base
,
business_type
,
doc
.
id
,
speed_time
,
e
))
write_zip_file
(
img_save_path
,
os
.
path
.
join
(
doc_data_path
,
'{0}_img.zip'
.
format
(
doc
.
id
)))
else
:
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment