Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
7965c565
authored
2022-02-28 17:36:50 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add 0318
1 parent
ea79bc59
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
107 additions
and
17 deletions
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/models.py
src/apps/doc/ocr/wb.py
src/apps/doc/views.py
src/celery_compare/tasks.py
src/common/tools/mssql_script16.py
src/common/tools/pdf_to_img.py
src/apps/doc/management/commands/ocr_process.py
View file @
7965c56
...
...
@@ -1096,11 +1096,23 @@ class Command(BaseCommand, LoggerMixin):
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
page_count
=
pdf_handler
.
page_count
doc
.
save
()
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
else
:
try
:
if
pdf_handler
.
is_e_pdf
:
doc
.
metadata
=
pdf_handler
.
metadata
if
pdf_handler
.
metadata
is
None
else
\
json
.
dumps
(
pdf_handler
.
metadata
)
doc
.
page_count
=
pdf_handler
.
page_count
doc
.
save
()
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
with
lock
:
todo_count_dict
[
task_str
]
=
pdf_handler
.
img_count
for
img_idx
,
img_path
in
enumerate
(
pdf_handler
.
img_path_list
):
...
...
@@ -1147,6 +1159,7 @@ class Command(BaseCommand, LoggerMixin):
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
page_count
=
pdf_handler
.
page_count
doc
.
save
()
self
.
online_log
.
warn
(
'{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
'[error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
...
...
@@ -1178,6 +1191,13 @@ class Command(BaseCommand, LoggerMixin):
else
:
raise
Exception
(
'download or pdf to img failed'
)
try
:
doc
.
page_count
=
pdf_handler
.
page_count
doc
.
save
()
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
if
classify_1_str
==
str
(
consts
.
CONTRACT_CLASSIFY
):
ocr_result
=
afc_predict
(
pdf_handler
.
pdf_info
)
page_res
=
{}
...
...
@@ -1234,6 +1254,7 @@ class Command(BaseCommand, LoggerMixin):
except
Exception
as
e
:
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
page_count
=
pdf_handler
.
page_count
doc
.
save
()
self
.
online_log
.
warn
(
'{0} [process failed (e-contract)] [task={1}] '
'[error={2}]'
.
format
(
self
.
e_log_base
,
task_str
,
traceback
.
format_exc
()))
...
...
@@ -1560,7 +1581,7 @@ class Command(BaseCommand, LoggerMixin):
# 重构Excel文件
# src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
# wb.save(src_excel_path)
count_list
=
wb
.
rebuild
(
merged_bs_summary
,
license_summary
,
res_list
,
doc
.
document_scheme
,
contract_result
)
count_list
=
wb
.
rebuild
(
merged_bs_summary
,
license_summary
,
res_list
,
doc
.
document_scheme
,
contract_result
,
doc
.
metadata
)
wb
.
save
(
excel_path
)
except
Exception
as
e
:
...
...
src/apps/doc/models.py
View file @
7965c56
...
...
@@ -61,6 +61,9 @@ class HILDoc(models.Model):
mvc_count
=
models
.
IntegerField
(
default
=
0
,
verbose_name
=
'机动车登记证书处理数目'
)
vat_count
=
models
.
IntegerField
(
default
=
0
,
verbose_name
=
'增值税发票处理数目'
)
page_count
=
models
.
IntegerField
(
null
=
True
,
verbose_name
=
'文件page数目'
)
metadata
=
models
.
TextField
(
null
=
True
,
verbose_name
=
"电子PDF专属,PDF信息"
)
class
Meta
:
managed
=
False
db_table
=
'hil_doc'
...
...
@@ -100,6 +103,9 @@ class AFCDoc(models.Model):
mvc_count
=
models
.
IntegerField
(
default
=
0
,
verbose_name
=
'机动车登记证书处理数目'
)
vat_count
=
models
.
IntegerField
(
default
=
0
,
verbose_name
=
'增值税发票处理数目'
)
page_count
=
models
.
IntegerField
(
null
=
True
,
verbose_name
=
'文件page数目'
)
metadata
=
models
.
TextField
(
null
=
True
,
verbose_name
=
"电子PDF专属,PDF信息"
)
class
Meta
:
managed
=
False
situ_db_label
=
'afc'
...
...
src/apps/doc/ocr/wb.py
View file @
7965c56
import
re
import
json
import
random
import
locale
import
numpy
as
np
...
...
@@ -311,7 +312,8 @@ class BSWorkbook(Workbook):
month_mapping
.
setdefault
(
item
,
[])
.
append
(
(
ws
.
title
,
idx_list
[
i
]
+
min_row
,
idx_list
[
i
+
1
]
+
min_row
-
1
,
day_mean
))
def
build_metadata_rows
(
self
,
confidence
,
code
,
verify_list
,
print_time
,
start_date
,
end_date
,
res_count_tuple
,
is_verify_classify
):
def
build_metadata_rows
(
self
,
confidence
,
code
,
verify_list
,
print_time
,
start_date
,
end_date
,
res_count_tuple
,
is_verify_classify
,
metadata
):
metadata_rows
=
[(
'流水识别置信度'
,
confidence
)]
if
is_verify_classify
:
verify_res
=
'疑似伪造'
if
len
(
verify_list
)
>
0
else
'正常'
...
...
@@ -322,11 +324,26 @@ class BSWorkbook(Workbook):
metadata_rows
.
append
((
'识别成功'
,
res_count_tuple
[
1
]))
metadata_rows
.
append
(
self
.
blank_row
)
# PDF info
metadata_highlight_row
=
[]
if
isinstance
(
metadata
,
str
):
metadata_dict
=
json
.
loads
(
metadata
)
author
=
metadata_dict
.
pop
(
'author'
,
''
)
producer
=
metadata_dict
.
pop
(
'producer'
,
''
)
metadata_rows
.
append
((
'Author'
,
author
))
metadata_rows
.
append
((
'Producer'
,
producer
))
if
len
(
author
)
>
0
:
metadata_highlight_row
.
append
(
6
)
if
'iText'
not
in
producer
and
'Qt'
not
in
producer
and
'Haru Free'
not
in
producer
:
metadata_highlight_row
.
append
(
7
)
metadata_rows
.
append
(
self
.
blank_row
)
verify_highlight_row
=
[]
if
is_verify_classify
and
len
(
verify_list
)
>
0
:
metadata_rows
.
append
(
self
.
verify_header
)
verify_start
=
len
(
metadata_rows
)
metadata_rows
.
extend
(
verify_list
)
for
r
in
range
(
6
,
len
(
metadata_rows
)
+
1
):
for
r
in
range
(
verify_start
,
len
(
metadata_rows
)
+
1
):
verify_highlight_row
.
append
(
r
)
metadata_rows
.
append
(
self
.
blank_row
)
...
...
@@ -344,18 +361,23 @@ class BSWorkbook(Workbook):
self
.
blank_row
,
self
.
interest_keyword_header
]
)
return
metadata_rows
,
verify_highlight_row
,
timedelta
return
metadata_rows
,
verify_highlight_row
,
timedelta
,
metadata_highlight_row
def
build_meta_sheet
(
self
,
role_name
,
card
,
confidence
,
code
,
verify_list
,
print_time
,
start_date
,
end_date
,
res_count_tuple
,
is_verify_classify
):
metadata_rows
,
verify_highlight_row
,
timedelta
=
self
.
build_metadata_rows
(
confidence
,
code
,
verify_list
,
print_time
,
start_date
,
end_date
,
res_count_tuple
,
is_verify_classify
)
res_count_tuple
,
is_verify_classify
,
metadata
):
metadata_rows
,
verify_highlight_row
,
timedelta
,
metadata_highlight_row
=
\
self
.
build_metadata_rows
(
confidence
,
code
,
verify_list
,
print_time
,
start_date
,
end_date
,
res_count_tuple
,
is_verify_classify
,
metadata
)
if
not
isinstance
(
role_name
,
str
):
role_name
=
consts
.
UNKNOWN_ROLE
ms
=
self
.
create_sheet
(
'{0}{1}({2})'
.
format
(
self
.
meta_sheet_title
,
role_name
,
card
))
for
row
in
metadata_rows
:
ms
.
append
(
row
)
for
row
in
metadata_highlight_row
:
for
cell
in
ms
[
row
]:
cell
.
fill
=
self
.
amount_fill
if
len
(
verify_highlight_row
)
>
0
:
for
cell
in
ms
[
2
]:
cell
.
fill
=
self
.
amount_fill
...
...
@@ -625,7 +647,7 @@ class BSWorkbook(Workbook):
ms
.
append
(
row
)
self
.
remove
(
tmp2_ws
)
def
bs_rebuild
(
self
,
bs_summary
,
res_count_tuple
):
def
bs_rebuild
(
self
,
bs_summary
,
res_count_tuple
,
metadata
=
None
):
# bs_summary = {
# '卡号': {
# 'classify': 0,
...
...
@@ -691,7 +713,8 @@ class BSWorkbook(Workbook):
start_date
,
end_date
,
res_count_tuple
,
is_verify_classify
)
is_verify_classify
,
metadata
)
summary
[
'timedelta'
]
=
timedelta
...
...
@@ -846,16 +869,16 @@ class BSWorkbook(Workbook):
if
len
(
self
.
sheetnames
)
>
1
:
self
.
remove
(
self
.
get_sheet_by_name
(
'Sheet'
))
def
rebuild
(
self
,
bs_summary
,
license_summary
,
res_list
,
document_scheme
,
contract_result
):
def
rebuild
(
self
,
bs_summary
,
license_summary
,
res_list
,
document_scheme
,
contract_result
,
metadata
):
res_count_tuple
=
self
.
res_sheet
(
res_list
)
count_list
=
[(
consts
.
MODEL_FIELD_BS
,
len
(
bs_summary
))]
if
document_scheme
==
consts
.
DOC_SCHEME_LIST
[
1
]:
self
.
license_rebuild
(
license_summary
,
document_scheme
,
count_list
)
self
.
contract_rebuild
(
contract_result
)
self
.
bs_rebuild
(
bs_summary
,
res_count_tuple
)
self
.
bs_rebuild
(
bs_summary
,
res_count_tuple
,
metadata
)
else
:
self
.
bs_rebuild
(
bs_summary
,
res_count_tuple
)
self
.
bs_rebuild
(
bs_summary
,
res_count_tuple
,
metadata
)
self
.
license_rebuild
(
license_summary
,
document_scheme
,
count_list
)
self
.
move_res_sheet
()
self
.
remove_base_sheet
()
...
...
src/apps/doc/views.py
View file @
7965c56
...
...
@@ -889,7 +889,7 @@ class DocView(GenericView, DocHandler):
create_time__lt
=
create_time_end
+
datetime
.
timedelta
(
days
=
1
))
\
if
create_time_start
is
not
None
and
create_time_end
is
not
None
else
Q
()
query
=
application_id_query
&
status_query
&
data_source_query
&
upload_finish_time_query
&
create_time_query
val_tuple
=
(
'id'
,
'application_id'
,
'upload_finish_time'
,
'create_time'
,
'd
ata_source'
,
'status
'
)
val_tuple
=
(
'id'
,
'application_id'
,
'upload_finish_time'
,
'create_time'
,
'd
ocument_scheme'
,
'data_source'
,
'status'
,
'page_count
'
)
doc_class
,
prefix
=
self
.
get_doc_class
(
business_type
)
total
=
doc_class
.
objects
.
filter
(
query
)
.
count
()
start_index
=
page_size
*
(
page
-
1
)
...
...
@@ -898,14 +898,22 @@ class DocView(GenericView, DocHandler):
raise
self
.
invalid_params
(
'页数不存在'
)
doc_queryset
=
doc_class
.
objects
.
filter
(
query
)
.
values
(
*
val_tuple
)
.
order_by
(
'-create_time'
)[
start_index
:
end_index
]
doc_list
=
self
.
get_doc_list
(
doc_queryset
,
prefix
)
# doc_list = self.get_doc_list(doc_queryset, prefix)
for
doc_dict
in
doc_queryset
:
tmp_scheme
=
consts
.
COMPARE_DOC_SCHEME_LIST
[
0
]
if
doc_dict
[
'document_scheme'
]
==
consts
.
DOC_SCHEME_LIST
[
0
]
\
else
consts
.
COMPARE_DOC_SCHEME_LIST
[
1
]
application_link
=
'{0}/showList/showList?entity={1}&scheme={2}&case_id={3}'
.
format
(
conf
.
BASE_URL
,
prefix
,
tmp_scheme
,
doc_dict
[
'application_id'
])
doc_dict
[
'target_url'
]
=
application_link
# total = len(doc_list)
pagination
=
{
'current'
:
page
,
'total'
:
total
,
'page_size'
:
page_size
}
res
=
{
'pagination'
:
pagination
,
'doc_list'
:
doc_list
'doc_list'
:
list
(
doc_queryset
)
}
# 新增scheme、处理时长、文件页数,删除下载切图
# 新增链接跳转比对结果
self
.
running_log
.
info
(
'[get doc list] [args={0}] [res={1}]'
.
format
(
args
,
res
))
return
response
.
ok
(
data
=
res
)
...
...
src/celery_compare/tasks.py
View file @
7965c56
...
...
@@ -2082,7 +2082,7 @@ def se_bs_compare(license_en, ocr_res_dict, strip_list, is_auto):
dbr1_tmp_res_part
=
{}
for
idx
,
(
name
,
value
)
in
enumerate
(
dbr1_field_list
):
ocr_str_or_list
=
ocr_res
.
get
(
compare_logic
[
name
][
0
])
if
isinstance
(
ocr_str_or_list
,
str
)
or
isinstance
(
ocr_str_or_list
,
list
):
if
isinstance
(
ocr_str_or_list
,
str
)
or
isinstance
(
ocr_str_or_list
,
list
)
or
isinstance
(
ocr_str_or_list
,
int
)
:
result
=
getattr
(
cp
,
compare_logic
[
name
][
1
])(
value
,
ocr_str_or_list
,
**
compare_logic
[
name
][
2
])
if
isinstance
(
ocr_str_or_list
,
list
):
ocr_str
=
json
.
dumps
(
ocr_str_or_list
,
ensure_ascii
=
False
)
...
...
@@ -2114,7 +2114,7 @@ def se_bs_compare(license_en, ocr_res_dict, strip_list, is_auto):
dbr2_tmp_res_part
=
{}
for
idx
,
(
name
,
value
)
in
enumerate
(
dbr2_field_list
):
ocr_str_or_list
=
ocr_res
.
get
(
compare_logic
[
name
][
0
])
if
isinstance
(
ocr_str_or_list
,
str
)
or
isinstance
(
ocr_str_or_list
,
list
):
if
isinstance
(
ocr_str_or_list
,
str
)
or
isinstance
(
ocr_str_or_list
,
list
)
or
isinstance
(
ocr_str_or_list
,
int
)
:
result
=
getattr
(
cp
,
compare_logic
[
name
][
1
])(
value
,
ocr_str_or_list
,
**
compare_logic
[
name
][
2
])
if
isinstance
(
ocr_str_or_list
,
list
):
ocr_str
=
json
.
dumps
(
ocr_str_or_list
,
ensure_ascii
=
False
)
...
...
src/common/tools/mssql_script16.py
0 → 100644
View file @
7965c56
import
pyodbc
hil_sql
=
"""
ALTER TABLE hil_doc ADD page_count smallint;
ALTER TABLE hil_doc ADD metadata nvarchar(max);
"""
afc_sql
=
"""
ALTER TABLE afc_doc ADD page_count smallint;
ALTER TABLE afc_doc ADD metadata nvarchar(max);
"""
hil_cnxn
=
pyodbc
.
connect
(
'DRIVER={ODBC Driver 17 for SQL Server};'
,
autocommit
=
True
)
hil_cursor
=
hil_cnxn
.
cursor
()
hil_cursor
.
execute
(
hil_sql
)
hil_cursor
.
close
()
hil_cnxn
.
close
()
afc_cnxn
=
pyodbc
.
connect
(
'DRIVER={ODBC Driver 17 for SQL Server};'
,
autocommit
=
True
)
afc_cursor
=
afc_cnxn
.
cursor
()
afc_cursor
.
execute
(
afc_sql
)
afc_cursor
.
close
()
afc_cnxn
.
close
()
src/common/tools/pdf_to_img.py
View file @
7965c56
...
...
@@ -47,6 +47,8 @@ class PDFHandler:
'中国建设银行个人活期账户全部交易明细'
,
'平安银行个人账户交易明细清单'
,
]
self
.
page_count
=
None
self
.
metadata
=
None
def
get_suffix
(
self
,
file_name
):
if
file_name
is
None
:
...
...
@@ -321,6 +323,7 @@ class PDFHandler:
def
e_contract_process
(
self
):
os
.
makedirs
(
self
.
img_dir_path
,
exist_ok
=
True
)
with
fitz
.
Document
(
self
.
path
)
as
pdf
:
self
.
page_count
=
pdf
.
pageCount
for
pno
in
range
(
pdf
.
pageCount
):
page
=
pdf
.
loadPage
(
pno
)
self
.
pdf_info
[
str
(
pno
)]
=
json
.
loads
(
page
.
getText
(
'json'
))
...
...
@@ -341,6 +344,8 @@ class PDFHandler:
self
.
img_path_list
.
append
(
img_save_path
)
else
:
with
fitz
.
Document
(
self
.
path
)
as
pdf
:
self
.
metadata
=
pdf
.
metadata
self
.
page_count
=
pdf
.
pageCount
if
isinstance
(
max_img_count
,
int
)
and
pdf
.
pageCount
>=
max_img_count
:
self
.
img_count
=
pdf
.
pageCount
return
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment