Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
37ca9589
authored
2020-11-09 20:05:38 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
change data folder
1 parent
fe7d3a71
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
45 additions
and
9 deletions
.gitignore
src/apps/doc/consts.py
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/ocr/wb.py
src/apps/doc/views.py
src/common/tools/mssql_script.py
.gitignore
View file @
37ca958
...
...
@@ -29,8 +29,10 @@ sftp-config.json
*.sqlite3
conf/*
data/*
ocr/*
# 脚本
src/*.sh
test*
\ No newline at end of file
test*
folder_ocr_process.py
\ No newline at end of file
...
...
src/apps/doc/consts.py
View file @
37ca958
...
...
@@ -16,6 +16,9 @@ SPLIT_STR = '_'
BUSINESS_TYPE_LIST
=
[
HIL_PREFIX
,
AFC_PREFIX
]
HIL_SET
=
{
'HIL'
,
'HIl'
,
'HiL'
,
'Hil'
,
'hIL'
,
'hIl'
,
'hiL'
,
'hil'
,
'CO00002'
}
PRIORITY_WORDS
=
{
'muw'
,
'MUW'
}
TMP_DIR_NAME
=
'tmp'
# -------EDMS相关---------------------------------------------------------------------------------------------------
SESSION_PREFIX
=
'FHLSID'
...
...
src/apps/doc/management/commands/ocr_process.py
View file @
37ca958
...
...
@@ -373,7 +373,7 @@ class Command(BaseCommand, LoggerMixin):
try
:
# 2. 从EDMS获取PDF文件
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
str
(
doc
.
id
))
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
consts
.
TMP_DIR_NAME
,
str
(
doc
.
id
))
os
.
makedirs
(
doc_data_path
,
exist_ok
=
True
)
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
self
.
pdf_download
(
doc
,
pdf_path
)
...
...
@@ -579,7 +579,7 @@ class Command(BaseCommand, LoggerMixin):
# 4.2 重构Excel文件
doc
,
business_type
=
self
.
get_doc_object
(
task_str
)
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
str
(
doc
.
id
))
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
consts
.
TMP_DIR_NAME
,
str
(
doc
.
id
))
excel_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.xlsx'
.
format
(
doc
.
id
))
img_save_path
=
os
.
path
.
join
(
doc_data_path
,
'img'
)
# wb.save(src_excel_path)
...
...
src/apps/doc/ocr/wb.py
View file @
37ca958
...
...
@@ -303,8 +303,8 @@ class BSWorkbook(Workbook):
period_idx
=
len
(
res_str
)
-
3
if
res_str
[
period_idx
]
==
'.'
and
res_str
[
period_idx
-
1
]
in
{
','
,
'.'
}:
# 364,.92 364..92
res_str
=
'{0}{1}'
.
format
(
res_str
[:
period_idx
-
1
],
res_str
[
period_idx
:])
elif
res_str
[
period_idx
]
==
','
:
if
res_str
[
period_idx
-
1
]
in
{
','
,
'.'
}:
# 364.,92 364,,92
elif
res_str
[
period_idx
]
in
{
','
,
':'
,
':'
}
:
if
res_str
[
period_idx
-
1
]
in
{
','
,
'.'
,
':'
,
':'
}:
# 364.,92 364,,92
pre_idx
=
period_idx
-
1
else
:
# 364,92
pre_idx
=
period_idx
...
...
src/apps/doc/views.py
View file @
37ca958
...
...
@@ -178,6 +178,7 @@ class PriorityDocView(GenericView, DocHandler):
application_info
=
args
.
get
(
'APPLICATION_INFORMATION'
)
application_id
=
application_info
.
get
(
'APPLICATION_ID'
)
submit_datetime
=
application_info
.
get
(
'SUBMIT_DATETIME'
)
intermediate_decision
=
application_info
.
get
(
'INTERMEDIATE_DECISION'
)
entity
=
application_info
.
get
(
'ENTITY'
)
if
submit_datetime
.
utcoffset
()
is
not
None
:
submit_datetime
=
timezone
.
make_naive
(
submit_datetime
,
timezone
.
get_current_timezone
())
...
...
@@ -187,9 +188,14 @@ class PriorityDocView(GenericView, DocHandler):
rating
=
application_info
.
get
(
'RATING'
),
application_id
=
application_id
,
application_version
=
application_info
.
get
(
'APPLICATION_VERSION'
),
intermediate_decision
=
application_info
.
get
(
'INTERMEDIATE_DECISION'
)
,
intermediate_decision
=
intermediate_decision
,
submit_datetime
=
submit_datetime
,
)
if
intermediate_decision
not
in
consts
.
PRIORITY_WORDS
:
self
.
running_log
.
info
(
'[priority doc skip] [args={0}]'
.
format
(
args
))
return
response
.
ok
()
_
,
created
=
PriorityApplication
.
objects
.
update_or_create
(
application_id
=
application_id
,
defaults
=
{
'on_off'
:
True
})
if
created
:
...
...
@@ -282,7 +288,8 @@ class DocView(GenericView, DocHandler):
if
not
pdf_file
.
name
.
endswith
(
'pdf'
):
self
.
invalid_params
(
msg
=
'invalid params: not a PDF file'
)
tmp_save_path
=
os
.
path
.
join
(
conf
.
DATA_DIR
,
'{0}.pdf'
.
format
(
metadata_version_id
))
business_type
=
random
.
choice
(
consts
.
BUSINESS_TYPE_LIST
)
tmp_save_path
=
os
.
path
.
join
(
conf
.
DATA_DIR
,
business_type
,
'{0}.pdf'
.
format
(
metadata_version_id
))
file_write
(
pdf_file
,
tmp_save_path
)
try
:
...
...
@@ -302,7 +309,6 @@ class DocView(GenericView, DocHandler):
upload_finish_time
=
timezone
.
now
()
document_scheme
=
random
.
choice
(
consts
.
DOC_SCHEME_LIST
)
data_source
=
random
.
choice
(
consts
.
DATA_SOURCE_LIST
)
business_type
=
random
.
choice
(
consts
.
BUSINESS_TYPE_LIST
)
UploadDocRecords
.
objects
.
create
(
metadata_version_id
=
metadata_version_id
,
application_id
=
application_id
,
...
...
@@ -333,7 +339,7 @@ class DocView(GenericView, DocHandler):
)
# 3.pdf文件移动
save_dir_path
=
os
.
path
.
join
(
conf
.
DATA_DIR
,
business_type
,
str
(
doc
.
id
))
save_dir_path
=
os
.
path
.
join
(
conf
.
DATA_DIR
,
business_type
,
consts
.
TMP_DIR_NAME
,
str
(
doc
.
id
))
save_file_path
=
os
.
path
.
join
(
save_dir_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
os
.
makedirs
(
save_dir_path
,
exist_ok
=
True
)
# file_write(pdf_file, save_file_path)
...
...
src/common/tools/mssql_script.py
View file @
37ca958
...
...
@@ -455,6 +455,30 @@ afc_sql = """
on afc_doc (start_time, end_time);
"""
keywords_sql
=
"""
INSERT INTO afc.dbo.keywords (keyword, type, update_time, create_time) VALUES
(N'利息', 0, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'结息', 0, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'工资', 1, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'代发', 1, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'养老保险', 1, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'奖金', 1, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'理财', 2, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'赎回', 2, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'微信', 2, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'支付宝', 2, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'财付通', 2, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'放款', 2, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'还款', 2, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'贷款', 2, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'银证转账', 2, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'银行卡户名(姓名)', 2, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'转账/转账', 3, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'商品/线下', 3, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'转账', 3, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000'),
(N'二维码收款', 3, N'2020-11-09 16:14:58.000', N'2020-11-09 16:14:59.000');
"""
hil_cnxn
=
pyodbc
.
connect
(
'DRIVER={ODBC Driver 17 for SQL Server};'
,
autocommit
=
True
)
hil_cursor
=
hil_cnxn
.
cursor
()
...
...
@@ -468,6 +492,7 @@ afc_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=
afc_cursor
=
afc_cnxn
.
cursor
()
afc_cursor
.
execute
(
afc_sql
)
afc_cursor
.
execute
(
keywords_sql
)
afc_cursor
.
close
()
afc_cnxn
.
close
()
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment