Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
7a3d093e
authored
2020-11-18 15:48:12 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
issue list 1117
1 parent
ec638e4f
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
62 additions
and
27 deletions
.gitignore
src/apps/doc/consts.py
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/ocr/edms.py
src/apps/doc/ocr/wb.py
src/apps/doc/views.py
src/settings/conf/prd.ini
src/settings/conf/uat.ini
.gitignore
View file @
7a3d093
...
...
@@ -29,9 +29,6 @@ sftp-config.json
*.sqlite3
conf/*
data/*
ocr/*
# 脚本
src/*.sh
test*
flow_test.py
\ No newline at end of file
...
...
src/apps/doc/consts.py
View file @
7a3d093
...
...
@@ -140,9 +140,9 @@ RESULT_IDX = FIXED_HEADERS.index('核对结果')
# '借贷': ('贷', '借'), # 竖版-无表格-广发银行
# '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行
# '收/支': ('收入', '支出'), # 横版-表格-北京银行
BORROW_HEADERS_SET
=
{
'借贷'
,
'借贷状态'
,
'收/支'
}
BORROW_INCOME_SET
=
{
'贷'
,
'收入'
}
BORROW_OUTLAY_SET
=
{
'借'
,
'支出'
}
BORROW_HEADERS_SET
=
{
'借贷'
,
'借贷状态'
,
'收/支'
,
'收支标志'
}
BORROW_INCOME_SET
=
{
'贷'
,
'收入'
,
'收'
}
BORROW_OUTLAY_SET
=
{
'借'
,
'支出'
,
'支'
}
INCOME_HEADERS_SET
=
{
'收入金额'
,
'收入'
,
'存入'
,
'存入金额(贷)'
,
'存入金额(贷)'
}
OUTLAY_HEADERS_SET
=
{
'支出金额'
,
'支出'
,
'支取金额(借)'
,
'支取金额(借)'
}
...
...
@@ -154,6 +154,7 @@ HEADERS_MAPPING.update(
{
'借贷'
:
BORROW_KEY
,
'借贷状态'
:
BORROW_KEY
,
'收支标志'
:
BORROW_KEY
,
'收/支'
:
BORROW_KEY
,
}
)
...
...
@@ -911,11 +912,11 @@ WECHART_HEADERS_MAPPING.update(
}
)
PATTERN_LIST
=
[
'收入/支出金额'
,
'收入'
,
'存入'
,
'支出'
,
'支取'
,
'金额'
,
'余额'
,
'发生额'
,
'借贷'
,
'借贷状态'
,
'收
/支'
,
'收入金额
'
,
'
存入金额(贷)'
,
'存入金额(贷)'
,
'支出金额'
,
'支取金额(借)'
,
'支取金额(借)'
,
'记账日期'
,
'附言'
,
'交易日期'
,
'摘要
'
,
'
业务摘要'
,
'工作日期'
,
'交易金额'
,
'账户余额'
,
'交易类型'
,
'金额(元)'
,
'金额(元)'
,
'时间'
,
'名称/备注
'
,
'
摘要/附言'
,
'交易发生额'
,
'交易摘要'
,
'借贷发生额(借:-贷:+)'
,
'借贷发生额(借:-贷:+)'
,
'联机余额'
,
'交易金额(元)
'
,
'交易金额
(元)'
,
'账户余额(元)'
,
'账户余额(元)'
,
'会计日期'
,
'摘要代码'
,
'摘要信息'
,
'日期'
,
'短摘要'
,
'本次余额
'
,
'交易后余额'
,
'交易说明'
,
'帐户余额'
,
'交易日期 记账日期'
]
PATTERN_LIST
=
[
'收入/支出金额'
,
'收入'
,
'存入'
,
'支出'
,
'支取'
,
'金额'
,
'余额'
,
'发生额'
,
'借贷'
,
'借贷状态'
,
'收
支标志'
,
'收/支
'
,
'
收入金额'
,
'存入金额(贷)'
,
'存入金额(贷)'
,
'支出金额'
,
'支取金额(借)'
,
'支取金额(借)'
,
'记账日期'
,
'附言
'
,
'
交易日期'
,
'摘要'
,
'业务摘要'
,
'工作日期'
,
'交易金额'
,
'账户余额'
,
'交易类型'
,
'金额(元)'
,
'金额(元)'
,
'时间
'
,
'
名称/备注'
,
'摘要/附言'
,
'交易发生额'
,
'交易摘要'
,
'借贷发生额(借:-贷:+)'
,
'借贷发生额(借:-贷:+)'
,
'联机余额
'
,
'交易金额
(元)'
,
'交易金额(元)'
,
'账户余额(元)'
,
'账户余额(元)'
,
'会计日期'
,
'摘要代码'
,
'摘要信息'
,
'日期
'
,
'
短摘要'
,
'本次余额'
,
'
交易后余额'
,
'交易说明'
,
'帐户余额'
,
'交易日期 记账日期'
]
CN_RE
=
re
.
compile
(
u'[
\u4e00
-
\u9fa5
]'
)
...
...
src/apps/doc/management/commands/folder_ocr_process.py
View file @
7a3d093
...
...
@@ -163,14 +163,19 @@ class Command(BaseCommand, LoggerMixin):
shutil
.
move
(
path
,
img_save_path
)
def
folder_process
(
self
,
input_dir
,
classify
):
while
not
os
.
path
.
isdir
(
input_dir
):
self
.
folder_log
.
info
(
'{0} [input dir is not dir] [input_dir={1}]'
.
format
(
self
.
log_base
,
input_dir
))
time
.
sleep
(
self
.
sleep_time
)
output_dir
=
os
.
path
.
join
(
os
.
path
.
dirname
(
input_dir
),
'Output'
)
img_output_dir
=
os
.
path
.
join
(
output_dir
,
'image'
)
wb_output_dir
=
os
.
path
.
join
(
output_dir
,
'excel'
)
pdf_output_dir
=
os
.
path
.
join
(
output_dir
,
'pdf'
)
failed_output_dir
=
os
.
path
.
join
(
output_dir
,
'failed'
)
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
img_output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
wb_output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
pdf_output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
failed_output_dir
,
exist_ok
=
True
)
while
self
.
switch
:
# 1. 从input dir获取pdf or image
list_dir
=
os
.
listdir
(
input_dir
)
...
...
@@ -178,6 +183,7 @@ class Command(BaseCommand, LoggerMixin):
self
.
folder_log
.
info
(
'{0} [input dir empty] [input_dir={1}]'
.
format
(
self
.
log_base
,
input_dir
))
time
.
sleep
(
self
.
sleep_time
)
for
name
in
list_dir
:
try
:
path
=
os
.
path
.
join
(
input_dir
,
name
)
if
os
.
path
.
isfile
(
path
):
self
.
folder_log
.
info
(
'{0} [file start] [path={1}]'
.
format
(
self
.
log_base
,
path
))
...
...
@@ -186,6 +192,17 @@ class Command(BaseCommand, LoggerMixin):
else
:
self
.
img_process
(
name
,
path
,
classify
,
wb_output_dir
,
img_output_dir
,
pdf_output_dir
)
self
.
folder_log
.
info
(
'{0} [file end] [path={1}]'
.
format
(
self
.
log_base
,
path
))
except
Exception
as
e
:
try
:
path
=
os
.
path
.
join
(
input_dir
,
name
)
self
.
folder_log
.
error
(
'{0} [file error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
shutil
.
move
(
path
,
failed_output_dir
)
continue
except
Exception
as
e
:
self
.
folder_log
.
error
(
'{0} [file error] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
continue
def
handle
(
self
,
*
args
,
**
kwargs
):
process_list
=
[]
...
...
src/apps/doc/ocr/edms.py
View file @
7a3d093
...
...
@@ -20,6 +20,7 @@ class EDMS:
self
.
user_name
=
conf
.
EDMS_USER
self
.
pwd
=
conf
.
EDMS_PWD
self
.
session_id
=
None
self
.
prefix
=
'OCR'
def
set_session_id
(
self
):
self
.
session_id
=
self
.
sm_client
.
service
.
StartSession
(
login
=
self
.
user_name
,
...
...
@@ -83,12 +84,15 @@ class EDMS:
else
:
raise
Exception
@staticmethod
def
get_doc_file_name
(
doc_name
):
if
doc_name
.
endswith
(
'pdf'
):
def
get_doc_file_name
(
self
,
doc_name
):
if
not
isinstance
(
doc_name
,
str
):
return
self
.
prefix
if
doc_name
.
endswith
(
'.pdf'
)
or
doc_name
.
endswith
(
'.PDF'
)
or
\
doc_name
.
endswith
(
'.pdF'
)
or
doc_name
.
endswith
(
'.pDF'
)
or
doc_name
.
endswith
(
'.pDf'
)
or
\
doc_name
.
endswith
(
'.Pdf'
)
or
doc_name
.
endswith
(
'.PdF'
)
or
doc_name
.
endswith
(
'.PDf'
):
name
,
_
=
os
.
path
.
splitext
(
doc_name
)
return
name
return
doc_name
return
'{0}{1}'
.
format
(
self
.
prefix
,
name
)
return
'{0}{1}'
.
format
(
self
.
prefix
,
doc_name
)
def
get_doc_info
(
self
,
token
,
doc
,
business_type
,
file_path
):
business_type
=
consts
.
BUSINESS_TYPE_DICT
.
get
(
business_type
)
...
...
@@ -140,5 +144,3 @@ class EDMS:
headers
.
pop
(
'Content-Type'
)
metadata_version_id
=
self
.
add_doc_info
(
headers
,
token
,
doc
,
business_type
,
file_path
)
return
metadata_version_id
...
...
src/apps/doc/ocr/wb.py
View file @
7a3d093
...
...
@@ -574,12 +574,25 @@ class BSWorkbook(Workbook):
license_list
=
license_summary
.
get
(
classify
)
if
not
license_list
:
continue
if
classify
==
consts
.
IC_CLASSIFY
:
# 身份证、居住证先正面,后反面
key
,
_
,
_
=
consts
.
FIELD_ORDER_MAP
.
get
(
classify
)
side1_list
=
[]
side2_list
=
[]
for
license_dict
in
license_list
:
if
key
in
license_dict
:
side2_list
.
append
(
license_dict
)
else
:
side1_list
.
append
(
license_dict
)
side1_list
.
extend
(
side2_list
)
license_list
=
side1_list
side2_list
=
None
side1_list
=
None
count
=
0
ws
=
self
.
create_sheet
(
name
)
if
scheme_diff
and
document_scheme
==
consts
.
DOC_SCHEME_LIST
[
1
]:
classify
=
consts
.
MVC_CLASSIFY_SE
for
license_dict
in
license_list
:
if
classify
==
consts
.
IC_CLASSIFY
and
license_dict
.
get
(
'类别'
)
==
'1'
:
if
classify
==
consts
.
IC_CLASSIFY
and
license_dict
.
get
(
'类别'
)
==
'1'
:
# 居住证处理
license_summary
.
setdefault
(
consts
.
RP_CLASSIFY
,
[])
.
append
(
license_dict
)
continue
if
side_diff
:
...
...
@@ -632,6 +645,10 @@ class BSWorkbook(Workbook):
def
rebuild
(
self
,
bs_summary
,
license_summary
,
res_list
,
document_scheme
):
count_list
=
[(
consts
.
MODEL_FIELD_BS
,
len
(
self
.
sheetnames
)
-
1
)]
if
document_scheme
==
consts
.
DOC_SCHEME_LIST
[
1
]:
self
.
license_rebuild
(
license_summary
,
document_scheme
,
count_list
)
self
.
bs_rebuild
(
bs_summary
)
else
:
self
.
bs_rebuild
(
bs_summary
)
self
.
license_rebuild
(
license_summary
,
document_scheme
,
count_list
)
self
.
res_sheet
(
res_list
)
...
...
src/apps/doc/views.py
View file @
7a3d093
...
...
@@ -293,7 +293,8 @@ class DocView(GenericView, DocHandler):
metadata_version_id
=
str
(
int
(
time
.
time
())
-
random_int
)
pdf_file
=
args
.
get
(
'pdf_file'
)
if
not
pdf_file
.
name
.
endswith
(
'pdf'
):
if
isinstance
(
pdf_file
.
name
,
str
):
if
not
pdf_file
.
name
.
endswith
(
'pdf'
)
or
not
pdf_file
.
name
.
endswith
(
'PDF'
):
self
.
invalid_params
(
msg
=
'invalid params: not a PDF file'
)
business_type
=
random
.
choice
(
consts
.
BUSINESS_TYPE_LIST
)
...
...
src/settings/conf/prd.ini
View file @
7a3d093
...
...
@@ -8,7 +8,7 @@ SLEEP_SECOND_FOLDER = 2
IMG_QUEUE_SIZE
=
500
EDMS_DOWNLOAD_URL
=
http
s://edms-test.bmw.com
/FH/FileHold/DocumentRepository/DownloadHandler.ashx
EDMS_UPLOAD_URL
=
http
s://edms-test.bmw.com
/FH/FileHold/DocumentRepository/UploadHandler.ashx
DEALER_CODE
=
ocr_
situ_
group
EDMS_DOWNLOAD_URL
=
http
://sccn0639.bmwgroup.net
/FH/FileHold/DocumentRepository/DownloadHandler.ashx
EDMS_UPLOAD_URL
=
http
://sccn0639.bmwgroup.net
/FH/FileHold/DocumentRepository/UploadHandler.ashx
DEALER_CODE
=
ocr_group
...
...
src/settings/conf/uat.ini
View file @
7a3d093
...
...
@@ -8,6 +8,6 @@ SLEEP_SECOND_FOLDER = 2
IMG_QUEUE_SIZE
=
500
EDMS_DOWNLOAD_URL
=
http
s://edms-test.bmw.com
/FH/FileHold/DocumentRepository/DownloadHandler.ashx
EDMS_UPLOAD_URL
=
http
s://edms-test.bmw.com
/FH/FileHold/DocumentRepository/UploadHandler.ashx
EDMS_DOWNLOAD_URL
=
http
://sccn0637.bmwgroup.net
/FH/FileHold/DocumentRepository/DownloadHandler.ashx
EDMS_UPLOAD_URL
=
http
://sccn0637.bmwgroup.net
/FH/FileHold/DocumentRepository/UploadHandler.ashx
DEALER_CODE
=
ocr_situ_group
\ No newline at end of file
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment