Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
97994674
authored
2020-07-22 17:14:09 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
ocr excel upload eDMS
1 parent
7aa0284c
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
87 additions
and
61 deletions
requirements/base.txt
src/apps/doc/consts.py
src/apps/doc/edms.py
src/apps/doc/management/commands/doc_process.py
src/apps/doc/management/commands/pdf_to_img.py
src/apps/doc/mixins.py
src/apps/doc/named_enum.py
src/apps/doc/views.py
requirements/base.txt
View file @
9799467
...
...
@@ -12,13 +12,16 @@ Django==2.1
django-oauth-toolkit==1.3.2
djangorestframework==3.9.0
djangorestframework-jwt==1.11.0
et-xmlfile==1.0.1
idna==2.9
idna-ssl==1.1.0
isodate==0.6.0
jdcal==1.4.1
lxml==4.5.1
marshmallow==3.6.1
multidict==4.7.6
oauthlib==3.1.0
openpyxl==3.0.4
pdfminer3k==1.3.4
Pillow==7.1.2
ply==3.11
...
...
src/apps/doc/consts.py
View file @
9799467
PAGE_DEFAULT
=
1
PAGE_SIZE_DEFAULT
=
10
DOC_SCHEME_LIST
=
[
'Acceptance'
,
'Settlement'
,
'Contract Management'
]
DATA_SOURCE_LIST
=
[
'POS'
,
'EAPP'
,
'Econtract'
]
BUSINESS_TYPE_LIST
=
[
'HIL'
,
'AFC'
]
HIL_SET
=
{
'HIL'
,
'HIl'
,
'HiL'
,
'Hil'
,
'hIL'
,
'hIl'
,
'hiL'
,
'hil'
,
'CO00002'
}
FIXED_APPLICATION_ID
=
'手工单'
DOC_SCHEME_LIST
=
[
'ACCEPTANCE'
,
'SETTLEMENT'
,
'CONTRACT MANAGEMENT'
]
DATA_SOURCE_LIST
=
[
'POS'
,
'EAPP'
,
'ECONTRACT'
]
HIL_PREFIX
=
'HIL'
AFC_PREFIX
=
'AFC'
SPLIT_STR
=
'_'
BUSINESS_TYPE_LIST
=
[
HIL_PREFIX
,
AFC_PREFIX
]
HIL_SET
=
{
'HIL'
,
'HIl'
,
'HiL'
,
'Hil'
,
'hIL'
,
'hIl'
,
'hiL'
,
'hil'
,
'CO00002'
}
SESSION_PREFIX
=
'FHLSID'
CUSTOM_CLIENT
=
'CustomClient'
...
...
@@ -15,12 +19,22 @@ FIXED_FILE_SIZE = 0
DOWNLOAD_ACTION_TYPE
=
'Downloaded'
DOC_SCHEMA_ID_FILL
=
{
'Acceptance'
:
(
1
,
'DFE-AutoFilingScript'
),
'Settlement'
:
(
20
,
'DFE-AutoFilingScript'
),
'Contract Management'
:
(
86
,
'Schema-Based'
)
'ACCEPTANCE'
:
(
1
,
'DFE-AutoFilingScript'
),
'SETTLEMENT'
:
(
20
,
'DFE-AutoFilingScript'
),
'CONTRACT MANAGEMENT'
:
(
86
,
'Schema-Based'
)
}
BUSINESS_TYPE_DICT
=
{
HIL_PREFIX
:
'CO00002'
,
AFC_PREFIX
:
'CO00001'
}
DOC_SCHEMA_TYPE
=
'ElectronicRecord'
APPLICATION_ID_META_FIELD_id
=
1
DEALER_CODE_META_FIELD_id
=
13
BUSINESS_TYPE_META_FIELD_id
=
93
DEALER_CODE
=
'ocr_situ_group'
AMOUNT_COL_TITLE_SET
=
{
"交易金额"
,
"金额"
,
"收入/支出金额"
,
"发生额"
}
OVERAGE_COL_TITLE_SET
=
{
"账户余额"
,
"余额"
}
PROOF_COL_TITLE
=
'核对结果'
PROOF_RES
=
(
'对'
,
'错'
)
META_SHEET_TITLE
=
'关键信息提取和展示'
...
...
src/apps/doc/edms.py
View file @
9799467
import
os
import
requests
from
zeep
import
Client
,
xsd
from
settings
import
conf
...
...
@@ -65,9 +66,9 @@ class EDMS:
params
=
{
'token'
:
token
}
self
.
download_handler
(
params
,
headers
,
save_path
)
def
create_upload_token
(
self
,
headers
,
file_size
):
def
create_upload_token
(
self
,
headers
):
with
self
.
rc_client
.
settings
(
extra_http_headers
=
headers
):
token
=
self
.
rc_client
.
service
.
CreateUploadToken
(
fileSize
=
file_size
)
token
=
self
.
rc_client
.
service
.
CreateUploadToken
(
fileSize
=
consts
.
FIXED_FILE_SIZE
)
return
token
def
upload_handler
(
self
,
file_path
,
params
,
headers
):
...
...
@@ -80,11 +81,19 @@ class EDMS:
else
:
raise
Exception
def
get_doc_info
(
self
,
token
,
doc_info
):
doc_schema_id
,
auto_filing
=
consts
.
DOC_SCHEMA_ID_FILL
.
get
(
doc_info
.
get
(
'document_scheme'
))
application_id
=
doc_info
.
get
(
'application_id'
)
doc_file_name
=
doc_info
.
get
(
'doc_file_name'
)
business_type
=
doc_info
.
get
(
'business_type'
)
@staticmethod
def
get_doc_file_name
(
doc_name
):
if
doc_name
.
endswith
(
'pdf'
):
name
,
_
=
os
.
path
.
splitext
(
doc_name
)
return
name
return
doc_name
def
get_doc_info
(
self
,
token
,
doc
,
business_type
,
file_path
):
business_type
=
consts
.
BUSINESS_TYPE_DICT
.
get
(
business_type
)
doc_schema_id
,
auto_filing
=
consts
.
DOC_SCHEMA_ID_FILL
.
get
(
doc
.
document_scheme
)
application_id
=
doc
.
application_id
doc_file_name
=
self
.
get_doc_file_name
(
doc
.
document_name
)
origin_file_name
=
os
.
path
.
basename
(
file_path
)
fields_with_value
=
[
{
'FieldId'
:
consts
.
APPLICATION_ID_META_FIELD_id
,
'FieldValue'
:
xsd
.
AnyObject
(
xsd
.
String
(),
application_id
)},
...
...
@@ -99,20 +108,20 @@ class EDMS:
'DocumentName'
:
doc_file_name
,
'FieldsWithValues'
:
fields_with_values
,
'UploadToken'
:
token
,
'OriginalFileName'
:
doc
_file_name
,
'OriginalFileName'
:
origin
_file_name
,
'SendEmailToMembers'
:
False
,
'AutoFilingScriptToUse'
:
auto_filing
,
'DocumentSchemaType'
:
consts
.
DOC_SCHEMA_TYPE
,
}
return
info
def
add_doc_info
(
self
,
headers
,
token
,
doc
_info
):
info
=
self
.
get_doc_info
(
token
,
doc
_info
)
def
add_doc_info
(
self
,
headers
,
token
,
doc
,
business_type
,
file_path
):
info
=
self
.
get_doc_info
(
token
,
doc
,
business_type
,
file_path
)
with
self
.
dm_client
.
settings
(
extra_http_headers
=
headers
):
metadata_version_id
=
self
.
dm_client
.
service
.
AddDocumentInfo
(
info
=
info
)
return
metadata_version_id
def
upload
(
self
,
file_path
,
file_size
,
doc_info
):
def
upload
(
self
,
file_path
,
doc
,
business_type
):
# file_path = '/Users/clay/Postman/files/OCRuploadTest4.txt'
# file_size = 16
# doc_info = {
...
...
@@ -122,12 +131,12 @@ class EDMS:
# 'business_type': 'CO00001',
# }
headers
=
self
.
get_headers
()
token
=
self
.
create_upload_token
(
headers
,
file_size
)
token
=
self
.
create_upload_token
(
headers
)
headers
.
update
({
'Content-Type'
:
'application/octet-stream'
})
params
=
{
'token'
:
token
}
self
.
upload_handler
(
file_path
,
params
,
headers
)
headers
.
pop
(
'Content-Type'
)
metadata_version_id
=
self
.
add_doc_info
(
headers
,
token
,
doc
_info
)
metadata_version_id
=
self
.
add_doc_info
(
headers
,
token
,
doc
,
business_type
,
file_path
)
return
metadata_version_id
...
...
src/apps/doc/management/commands/doc_process.py
View file @
9799467
This diff is collapsed.
Click to expand it.
src/apps/doc/management/commands/pdf_to_img.py
View file @
9799467
This diff is collapsed.
Click to expand it.
src/apps/doc/mixins.py
View file @
9799467
...
...
@@ -26,7 +26,21 @@ class DocHandler:
@staticmethod
def
get_doc_class
(
business_type
):
is_hil
=
business_type
in
consts
.
HIL_SET
doc_class
,
prefix
=
(
HILDoc
,
consts
.
HIL_PREFIX
)
if
is_hil
else
(
AFCDoc
,
consts
.
AFC_PREFIX
)
return
doc_class
,
prefix
return
(
HILDoc
,
consts
.
HIL_PREFIX
)
if
business_type
in
consts
.
HIL_SET
else
(
AFCDoc
,
consts
.
AFC_PREFIX
)
def
fix_scheme
(
self
,
scheme
):
if
scheme
in
consts
.
DOC_SCHEME_LIST
:
return
scheme
elif
scheme
.
upper
()
in
consts
.
DOC_SCHEME_LIST
:
return
scheme
.
upper
()
else
:
return
consts
.
DOC_SCHEME_LIST
[
0
]
def
fix_data_source
(
self
,
data_source
):
if
data_source
in
consts
.
DATA_SOURCE_LIST
:
return
data_source
elif
data_source
.
upper
()
in
consts
.
DATA_SOURCE_LIST
:
return
data_source
.
upper
()
else
:
return
consts
.
DATA_SOURCE_LIST
[
0
]
...
...
src/apps/doc/named_enum.py
View file @
9799467
...
...
@@ -7,20 +7,3 @@ class DocStatus(NamedEnum):
PROCESS_FAILED
=
(
2
,
'识别失败'
)
UPLOAD_FAILED
=
(
3
,
'同步失败'
)
COMPLETE
=
(
4
,
'已完成'
)
class
DocScheme
(
NamedEnum
):
ACCEPTANCE
=
(
0
,
"Acceptance"
)
SETTLEMENT
=
(
1
,
'Settlement'
)
CONTRACT_MANAGEMENT
=
(
2
,
'Contract Management'
)
class
BusinessType
(
NamedEnum
):
AFC
=
(
0
,
"CO00001"
)
HIL
=
(
1
,
'CO00002'
)
class
DataSource
(
NamedEnum
):
POS
=
(
0
,
"POS"
)
EAPP
=
(
1
,
'EAPP'
)
ECONTRACT
=
(
2
,
'Econtract'
)
...
...
src/apps/doc/views.py
View file @
9799467
...
...
@@ -60,7 +60,7 @@ doc_list_args = {
'status'
:
fields
.
Int
(
required
=
False
,
validate
=
validate
.
OneOf
(
DocStatus
.
get_value_lst
())),
'application_id'
:
fields
.
Str
(
required
=
False
,
validate
=
validate
.
Length
(
max
=
64
)),
'data_source'
:
fields
.
Str
(
required
=
False
,
validate
=
validate
.
Length
(
max
=
64
)),
'data_source'
:
fields
.
Str
(
required
=
False
,
validate
=
validate
.
OneOf
(
consts
.
DATA_SOURCE_LIST
)),
'business_type'
:
fields
.
Str
(
required
=
True
,
validate
=
validate
.
OneOf
(
consts
.
BUSINESS_TYPE_LIST
)),
'upload_time_start'
:
fields
.
Date
(
required
=
False
),
'upload_time_end'
:
fields
.
Date
(
required
=
False
),
...
...
@@ -100,6 +100,8 @@ class UploadDocView(GenericView, DocHandler):
document
=
args
.
get
(
'document'
)
business_type
=
document
.
get
(
'businessType'
)
application_id
=
application_data
.
get
(
'applicationId'
)
document_scheme
=
document
.
get
(
'documentScheme'
)
data_source
=
document
.
get
(
'dataSource'
)
try
:
# 1. 上传信息记录
record
=
UploadDocRecords
.
objects
.
create
(
...
...
@@ -110,9 +112,9 @@ class UploadDocView(GenericView, DocHandler):
guarantor_1
=
applicant_data
.
get
(
'guarantor1Name'
),
guarantor_2
=
applicant_data
.
get
(
'guarantor2Name'
),
document_name
=
document
.
get
(
'documentName'
),
document_scheme
=
document
.
get
(
'documentScheme'
)
,
document_scheme
=
document
_scheme
,
business_type
=
business_type
,
data_source
=
d
ocument
.
get
(
'dataSource'
)
,
data_source
=
d
ata_source
,
upload_finish_time
=
document
.
get
(
'uploadFinishTime'
),
)
except
IntegrityError
as
e
:
...
...
@@ -130,17 +132,17 @@ class UploadDocView(GenericView, DocHandler):
guarantor_1
=
applicant_data
.
get
(
'guarantor1Name'
),
guarantor_2
=
applicant_data
.
get
(
'guarantor2Name'
),
document_name
=
document
.
get
(
'documentName'
),
document_scheme
=
document
.
get
(
'documentScheme'
),
data_source
=
document
.
get
(
'dataSource'
),
document_scheme
=
self
.
fix_scheme
(
document_scheme
),
data_source
=
self
.
fix_data_source
(
data_source
),
upload_finish_time
=
document
.
get
(
'uploadFinishTime'
),
)
# 3. 选择队列进入
is_priority
=
PriorityApplication
.
objects
.
filter
(
application_id
=
application_id
,
on_off
=
True
)
.
exists
()
value
=
[
'{0}_{1}'
.
format
(
prefix
,
doc
.
id
)]
redis_res
=
rh
.
enqueue
(
value
,
is_priority
)
self
.
running_log
.
info
(
'[doc upload success] [args={0}] [record_id={1}] [
prefix
={2}] [doc_id={3}] '
tasks
=
[
'{0}{1}{2}'
.
format
(
prefix
,
consts
.
SPLIT_STR
,
doc
.
id
)]
enqueue_res
=
rh
.
enqueue
(
tasks
,
is_priority
)
self
.
running_log
.
info
(
'[doc upload success] [args={0}] [record_id={1}] [
business_type
={2}] [doc_id={3}] '
'[is_priority={4}] [enqueue_res={5}]'
.
format
(
args
,
record
.
id
,
prefix
,
doc
.
id
,
is_priority
,
redis
_res
))
is_priority
,
enqueue
_res
))
return
response
.
ok
()
post
.
openapi_doc
=
'''
...
...
@@ -174,7 +176,8 @@ class PriorityDocView(GenericView, DocHandler):
application_id
=
application_info
.
get
(
'APPLICATION_ID'
)
submit_datetime
=
application_info
.
get
(
'SUBMIT_DATETIME'
)
entity
=
application_info
.
get
(
'ENTITY'
)
submit_datetime
=
timezone
.
make_naive
(
submit_datetime
,
timezone
.
get_current_timezone
())
if
submit_datetime
.
utcoffset
()
is
not
None
:
submit_datetime
=
timezone
.
make_naive
(
submit_datetime
,
timezone
.
get_current_timezone
())
GCAPRecords
.
objects
.
create
(
entity
=
entity
,
status
=
application_info
.
get
(
'STATUS'
),
...
...
@@ -190,14 +193,14 @@ class PriorityDocView(GenericView, DocHandler):
doc_class
,
prefix
=
self
.
get_doc_class
(
entity
)
doc_ids
=
doc_class
.
objects
.
filter
(
application_id
=
application_id
,
status
=
DocStatus
.
INIT
.
value
)
.
values_list
(
'id'
,
flat
=
True
)
task
_str_list
=
[
'{0}_{1}'
.
format
(
prefix
,
doc_id
)
for
doc_id
in
doc_ids
]
if
not
task
_str
_list
:
task
s_list
=
[
'{0}{1}{2}'
.
format
(
prefix
,
consts
.
SPLIT_STR
,
doc_id
)
for
doc_id
in
doc_ids
]
if
not
task
s
_list
:
self
.
running_log
.
info
(
'[priority doc success] [args={0}]
[task_str_list={1}]'
.
format
(
args
,
task_str_list
))
'[priority doc success] [args={0}]
'
.
format
(
args
))
else
:
enqueue_res
=
rh
.
enqueue
(
task
_str
_list
,
is_priority
=
True
)
self
.
running_log
.
info
(
'[priority doc success] [args={0}] [task
_str
_list={1}] [enqueue_res={2}]'
.
format
(
args
,
task
_str
_list
,
enqueue_res
))
enqueue_res
=
rh
.
enqueue
(
task
s
_list
,
is_priority
=
True
)
self
.
running_log
.
info
(
'[priority doc success] [args={0}] [task
s
_list={1}] [enqueue_res={2}]'
.
format
(
args
,
task
s
_list
,
enqueue_res
))
return
response
.
ok
()
post
.
openapi_doc
=
'''
...
...
@@ -268,7 +271,7 @@ class DocView(GenericView, DocHandler):
@use_args
(
upload_pdf_args
,
location
=
'files'
)
def
post
(
self
,
request
,
args
):
# 1. 上传信息记录
const_str
=
'手工单'
const_str
=
consts
.
FIXED_APPLICATION_ID
metadata_version_id
=
str
(
int
(
time
.
time
()))
upload_finish_time
=
timezone
.
now
()
document_scheme
=
random
.
choice
(
consts
.
DOC_SCHEME_LIST
)
...
...
@@ -305,8 +308,8 @@ class DocView(GenericView, DocHandler):
)
# 3. 选择队列进入
is_priority
=
False
value
=
[
'{0}_{1}'
.
format
(
prefix
,
doc
.
id
)]
redis_res
=
rh
.
enqueue
(
value
,
is_priority
)
tasks
=
[
'{0}{1}{2}'
.
format
(
prefix
,
consts
.
SPLIT_STR
,
doc
.
id
)]
enqueue_res
=
rh
.
enqueue
(
tasks
,
is_priority
)
pdf_file
=
args
.
get
(
'pdf_file'
)
save_dir_path
=
os
.
path
.
join
(
conf
.
DATA_DIR
,
business_type
,
str
(
doc
.
id
))
...
...
@@ -314,7 +317,7 @@ class DocView(GenericView, DocHandler):
os
.
makedirs
(
save_dir_path
,
exist_ok
=
True
)
file_write
(
pdf_file
,
save_file_path
)
self
.
running_log
.
info
(
'[mock doc upload success] [args={0}] [record_id={1}] [
prefix
={2}] [doc_id={3}] '
self
.
running_log
.
info
(
'[mock doc upload success] [args={0}] [record_id={1}] [
business_type
={2}] [doc_id={3}] '
'[is_priority={4}] [enqueue_res={5}]'
.
format
(
args
,
record
.
id
,
prefix
,
doc
.
id
,
is_priority
,
redis
_res
))
is_priority
,
enqueue
_res
))
return
response
.
ok
()
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment