Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
15dccc97
authored
2021-01-19 17:39:44 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/main' into feature/mssql
2 parents
236b64e0
5e463cbd
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
204 additions
and
9 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/bs_statistics.py
src/apps/doc/management/commands/doc_ocr_process.py
src/apps/doc/management/commands/idcard_daily.py
src/apps/doc/management/commands/idcard_monthly.py
src/apps/doc/management/commands/license_statistics.py
src/apps/doc/management/commands/ocr_process.py
src/common/mixins.py
src/common/tools/pdf_to_img.py
src/settings/conf/logging.conf
src/apps/doc/consts.py
View file @
15dccc9
...
...
@@ -152,7 +152,7 @@ RESULT_IDX = FIXED_HEADERS.index('核对结果')
# '借贷': ('贷', '借'), # 竖版-无表格-广发银行
# '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行
# '收/支': ('收入', '支出'), # 横版-表格-北京银行
BORROW_HEADERS_SET
=
{
'借贷'
,
'借贷状态'
,
'收/支'
,
'收支标志'
}
BORROW_HEADERS_SET
=
{
'借贷'
,
'借
\n
贷'
,
'借
贷状态'
,
'收/支'
,
'收支标志'
}
BORROW_INCOME_SET
=
{
'贷'
,
'收入'
,
'收'
,
'收(Cr)'
}
BORROW_OUTLAY_SET
=
{
'借'
,
'支出'
,
'支'
,
'付(Dr)'
}
INCOME_HEADERS_SET
=
{
'收入金额'
,
'收入'
,
'存入'
,
'存入金额(贷)'
,
'存入金额(贷)'
}
...
...
@@ -165,6 +165,7 @@ HEADERS_MAPPING = {}
HEADERS_MAPPING
.
update
(
{
'借贷'
:
BORROW_KEY
,
'借
\n
贷'
:
BORROW_KEY
,
'借贷状态'
:
BORROW_KEY
,
'收支标志'
:
BORROW_KEY
,
'收/支'
:
BORROW_KEY
,
...
...
src/apps/doc/management/commands/bs_statistics.py
View file @
15dccc9
...
...
@@ -40,7 +40,8 @@ class Command(BaseCommand, LoggerMixin):
print
(
'excel dir not exists'
)
return
excel_path
=
os
.
path
.
join
(
excel_dir
,
'bs_{0}.xlsx'
.
format
(
date_str
))
log_path
=
os
.
path
.
join
(
conf
.
LOG_DIR
,
'bs.log.{0}'
.
format
(
date_str
))
# log_path = os.path.join(conf.LOG_DIR, 'bs.log.{0}'.format(date_str))
log_path
=
os
.
path
.
join
(
conf
.
LOG_DIR
,
'bs_statistics.log.{0}'
.
format
(
date_str
))
if
not
os
.
path
.
exists
(
log_path
):
print
(
'log_path not exists'
)
return
...
...
@@ -48,7 +49,8 @@ class Command(BaseCommand, LoggerMixin):
summary_dict
=
{}
with
open
(
log_path
,
'r'
,
encoding
=
'utf-8'
)
as
fp
:
for
line
in
fp
:
search_obj
=
re
.
search
(
r'task=(.*) merged_bs_summary=(.*)'
,
line
)
# search_obj = re.search(r'task=(.*) merged_bs_summary=(.*)', line)
search_obj
=
re
.
search
(
r'\[task=(.*)] \[bs_summary=(.*)]'
,
line
)
task_str
=
search_obj
.
group
(
1
)
business_type
,
doc_id_str
=
task_str
.
split
(
consts
.
SPLIT_STR
)
doc_id
=
int
(
doc_id_str
)
...
...
src/apps/doc/management/commands/doc_ocr_process.py
deleted
100644 → 0
View file @
236b64e
This diff is collapsed.
Click to expand it.
src/apps/doc/management/commands/idcard_daily.py
0 → 100644
View file @
15dccc9
import
re
import
os
import
ast
import
datetime
from
openpyxl
import
Workbook
from
django.core.management
import
BaseCommand
from
settings
import
conf
from
common.mixins
import
LoggerMixin
from
apps.doc.models
import
HILDoc
,
AFCDoc
from
apps.doc
import
consts
class Command(BaseCommand, LoggerMixin):
    """Export one day's ID-card records from the idcard log into Excel files.

    The command reads ``idcard.log.<date>`` from ``conf.LOG_DIR`` and writes
    ``idcard_<date>.xlsx`` into both ``<DATA_DIR>/AFC/IdCard`` and
    ``<DATA_DIR>/HIL/IdCard``. Each workbook holds a single sheet with one
    row per ID card: application id, ID number, nationality, timestamp.
    """

    # A usable log line looks like:
    #   [<timestamp>] [task=<business type><SPLIT_STR><doc id>] [idcard=<list literal>]
    # Compiled once here instead of being re-parsed for every log line.
    LOG_LINE_PATTERN = re.compile(r'\[(.*)] \[task=(.*)] \[idcard=(.*)]')
    DATE_PATTERN = re.compile(r'\d{4}-\d{2}-\d{2}')

    def __init__(self):
        super().__init__()
        self.sheet_name = '身份证'
        self.header = ('申请号', '身份证号', '民族', '时间戳')

    def add_arguments(self, parser):
        """Register ``--date``; it defaults to yesterday (a ``datetime.date``)."""
        parser.add_argument(
            '--date',
            default=datetime.date.today() - datetime.timedelta(days=1),
            dest='date',
            help='将要计算的日期,格式: 2018-01-01'
        )

    def _new_workbook(self):
        """Return ``(workbook, sheet)`` with one sheet holding only the header row."""
        workbook = Workbook()
        # Reuse the default sheet rather than creating a second one and then
        # deleting 'Sheet' through the deprecated get_sheet_by_name().
        sheet = workbook.active
        sheet.title = self.sheet_name
        sheet.append(self.header)
        return workbook, sheet

    def _parse_line(self, line):
        """Parse one log line.

        Returns ``(time_str, business_type, doc_id, content_list)`` where
        ``content_list`` is a list of ``(id_number, nation)`` tuples, or
        ``None`` when the line is malformed or carries no exportable card.
        """
        matched = self.LOG_LINE_PATTERN.match(line)
        if matched is None:
            # Lines that do not follow the expected layout are ignored
            # instead of aborting the whole export.
            return None
        time_str, task_str, idcard_str = matched.groups()

        try:
            idcard_list = ast.literal_eval(idcard_str)
            business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
            doc_id = int(doc_id_str)
        except (ValueError, SyntaxError):
            print('log line format error: {0}'.format(line.rstrip()))
            return None

        content_list = []
        for idcard_dict in idcard_list:
            nation = idcard_dict.get('民族')
            if nation is None:
                continue
            # NOTE(review): entries whose '类别' equals '1' are excluded;
            # the meaning of that value is not visible here - confirm.
            if idcard_dict.get('类别') == '1':
                continue
            content_list.append((idcard_dict.get('公民身份号码'), nation))

        if not content_list:
            return None
        return time_str, business_type, doc_id, content_list

    def handle(self, *args, **kwargs):
        """Build the AFC and HIL workbooks for the requested date and save them.

        Prints a message and returns early when the date string is malformed,
        when either output directory is missing, or when the log file for the
        date does not exist.
        """
        date = kwargs.get('date')
        if isinstance(date, str):
            # fullmatch rejects trailing garbage such as '2021-01-19xyz',
            # which would otherwise leak into the file names below.
            if not self.DATE_PATTERN.fullmatch(date):
                print('date format error')
                return
            date_str = date
        else:
            date_str = date.strftime('%Y-%m-%d')

        afc_excel_dir = os.path.join(conf.DATA_DIR, 'AFC', 'IdCard')
        hil_excel_dir = os.path.join(conf.DATA_DIR, 'HIL', 'IdCard')
        if not (os.path.exists(afc_excel_dir) and os.path.exists(hil_excel_dir)):
            print('excel_dir not exist')
            return

        log_path = os.path.join(conf.LOG_DIR, 'idcard.log.{0}'.format(date_str))
        if not os.path.exists(log_path):
            print('log_path not exists')
            return

        wb_afc, ws_afc = self._new_workbook()
        wb_hil, ws_hil = self._new_workbook()

        with open(log_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                parsed = self._parse_line(line)
                if parsed is None:
                    continue
                time_str, business_type, doc_id, content_list = parsed

                is_hil = business_type == consts.HIL_PREFIX
                doc_class = HILDoc if is_hil else AFCDoc
                # Evaluate the queryset once; indexing the lazy queryset per
                # row would hit the database for every card and raise
                # IndexError when the document no longer exists.
                application_ids = list(
                    doc_class.objects.filter(id=doc_id)
                    .values_list('application_id', flat=True)[:1]
                )
                if not application_ids:
                    print('doc not exists: {0}'.format(line.rstrip()))
                    continue
                application_id = application_ids[0]

                target_sheet = ws_hil if is_hil else ws_afc
                for id_num, nation in content_list:
                    target_sheet.append((application_id, id_num, nation, time_str))

        excel_name = 'idcard_{0}.xlsx'.format(date_str)
        wb_afc.save(os.path.join(afc_excel_dir, excel_name))
        wb_hil.save(os.path.join(hil_excel_dir, excel_name))
src/apps/doc/management/commands/idcard_monthly.py
0 → 100644
View file @
15dccc9
import
os
import
datetime
from
calendar
import
monthrange
from
openpyxl
import
Workbook
,
load_workbook
from
django.core.management
import
BaseCommand
from
settings
import
conf
from
common.mixins
import
LoggerMixin
class Command(BaseCommand, LoggerMixin):
    """Merge the previous month's daily ID-card workbooks into a monthly one.

    For each business directory in ``self.dirs`` the command looks in
    ``<DATA_DIR>/<dir>/IdCard`` for ``idcard_<YYYY-MM-DD>.xlsx`` files of the
    previous calendar month and copies every row of each file's '身份证'
    sheet into a sheet named after that date in ``idcard_<YYYY-MM>.xlsx``.
    """

    def __init__(self):
        super().__init__()
        self.dirs = ('AFC', 'HIL')

    def handle(self, *args, **kwargs):
        """Write one monthly workbook per business directory.

        Missing directories and missing daily files are reported with
        ``print`` and skipped. No monthly file is written for a directory
        that has no daily data at all.
        """
        # The day before the first of the current month always falls in the
        # previous month, whatever that month's length.
        first_of_this_month = datetime.datetime.now().replace(day=1)
        prev_month = first_of_this_month - datetime.timedelta(days=1)
        days_in_prev_month = monthrange(prev_month.year, prev_month.month)[1]

        for target_dir in self.dirs:
            excel_dir = os.path.join(conf.DATA_DIR, target_dir, 'IdCard')
            if not os.path.exists(excel_dir):
                print('excel dir not exists: {0}'.format(excel_dir))
                # Skip only this directory so a missing one does not prevent
                # the remaining directories from being processed.
                continue

            monthly_wb = Workbook()
            # Workbook() starts with a placeholder sheet; keep a handle on it
            # so it can be dropped once real sheets have been added.
            placeholder_ws = monthly_wb.active
            merged_days = 0

            for day in range(1, days_in_prev_month + 1):
                date_str = '{:04d}-{:02d}-{:02d}'.format(
                    prev_month.year, prev_month.month, day)
                daily_excel_path = os.path.join(
                    excel_dir, 'idcard_{0}.xlsx'.format(date_str))
                if not os.path.exists(daily_excel_path):
                    print('daily excel path not exists: {0}'.format(daily_excel_path))
                    continue

                daily_wb = load_workbook(daily_excel_path)
                if '身份证' not in daily_wb.sheetnames:
                    print('daily sheet not exists: {0}'.format(daily_excel_path))
                    continue
                daily_ws = daily_wb['身份证']

                monthly_ws = monthly_wb.create_sheet(date_str)
                for row in daily_ws.iter_rows(min_row=1, values_only=True):
                    monthly_ws.append(row)
                merged_days += 1

            if merged_days == 0:
                # Removing the placeholder would leave a workbook without any
                # sheet, which openpyxl refuses to save.
                print('no daily excel found: {0}'.format(excel_dir))
                continue

            monthly_wb.remove(placeholder_ws)
            monthly_excel_path = os.path.join(
                excel_dir, 'idcard_{0}.xlsx'.format(prev_month.strftime('%Y-%m')))
            monthly_wb.save(monthly_excel_path)
src/apps/doc/management/commands/license_statistics.py
View file @
15dccc9
...
...
@@ -14,7 +14,6 @@ class Command(BaseCommand, LoggerMixin):
def
__init__
(
self
):
super
()
.
__init__
()
self
.
log_base
=
'[license statistics]'
self
.
header_map
=
{
consts
.
MVI_CLASSIFY
:
[(
'申请ID'
,
'发票代码'
,
'发票号码'
,
'开票日期'
,
'不含税价'
,
'发票联'
,
'购买方名称'
,
'购买方证件号码'
,
'纳税人识别号'
,
'车架号'
,
'价税合计小写'
,
'销货单位名称'
,
'增值税税额'
,
...
...
@@ -75,7 +74,8 @@ class Command(BaseCommand, LoggerMixin):
print
(
'excel dir not exists'
)
return
excel_path
=
os
.
path
.
join
(
excel_dir
,
'license_{0}.xlsx'
.
format
(
date_str
))
log_path
=
os
.
path
.
join
(
conf
.
LOG_DIR
,
'license.log.{0}'
.
format
(
date_str
))
# log_path = os.path.join(conf.LOG_DIR, 'license.log.{0}'.format(date_str))
log_path
=
os
.
path
.
join
(
conf
.
LOG_DIR
,
'license_statistics.log.{0}'
.
format
(
date_str
))
if
not
os
.
path
.
exists
(
log_path
):
print
(
'log_path not exists'
)
return
...
...
@@ -92,7 +92,8 @@ class Command(BaseCommand, LoggerMixin):
with
open
(
log_path
,
'r'
,
encoding
=
'utf-8'
)
as
fp
:
for
line
in
fp
:
search_obj
=
re
.
search
(
r'task=(.*) license_summary=(.*)'
,
line
)
# search_obj = re.search(r'task=(.*) license_summary=(.*)', line)
search_obj
=
re
.
search
(
r'\[task=(.*)] \[license_summary=(.*)]'
,
line
)
task_str
=
search_obj
.
group
(
1
)
license_summary
=
ast
.
literal_eval
(
search_obj
.
group
(
2
))
business_type
,
doc_id_str
=
task_str
.
split
(
consts
.
SPLIT_STR
)
...
...
src/apps/doc/management/commands/ocr_process.py
View file @
15dccc9
...
...
@@ -689,8 +689,15 @@ class Command(BaseCommand, LoggerMixin):
'[license_summary={4}]'
.
format
(
self
.
log_base
,
task_str
,
bs_summary
,
unknown_summary
,
license_summary
))
self
.
license_log
.
info
(
'[task={0}] [license_summary={1}]'
.
format
(
task_str
,
license_summary
))
idcard_list
=
license_summary
.
get
(
consts
.
IC_CLASSIFY
)
if
idcard_list
:
self
.
idcard_log
.
info
(
'[task={0}] [idcard={1}]'
.
format
(
task_str
,
idcard_list
))
merged_bs_summary
=
self
.
rebuild_bs_summary
(
bs_summary
,
unknown_summary
)
self
.
bs_log
.
info
(
'[task={0}] [bs_summary={1}]'
.
format
(
task_str
,
merged_bs_summary
))
self
.
cronjob_log
.
info
(
'{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] '
'[res_list={4}]'
.
format
(
self
.
log_base
,
task_str
,
merged_bs_summary
,
unknown_summary
,
res_list
))
...
...
src/common/mixins.py
View file @
15dccc9
...
...
@@ -40,6 +40,9 @@ class LoggerMixin:
exception_log
=
logging
.
getLogger
(
'exception'
)
cronjob_log
=
logging
.
getLogger
(
'cronjob'
)
folder_log
=
logging
.
getLogger
(
'folder'
)
bs_log
=
logging
.
getLogger
(
'bs'
)
license_log
=
logging
.
getLogger
(
'license'
)
idcard_log
=
logging
.
getLogger
(
'idcard'
)
class
GenericView
(
LoggerMixin
,
GenericExceptionMixin
,
GenericAPIView
):
...
...
src/common/tools/pdf_to_img.py
View file @
15dccc9
...
...
@@ -84,6 +84,12 @@ class PDFHandler:
def
extract_single_image
(
self
,
pdf
,
xref
,
smask
,
colorspace
,
pno
,
img_index
=
0
):
pix
=
self
.
recover_pix
(
pdf
,
xref
,
smask
,
colorspace
)
ext
,
img_data
=
self
.
get_img_data
(
pix
)
if
ext
==
'jpx'
:
img_save_path
=
self
.
get_img_save_path
(
pno
,
img_index
=
img_index
,
ext
=
'jpeg'
)
jpx_pix
=
fitz
.
Pixmap
(
img_data
)
jpx_pix
.
writeImage
(
img_save_path
)
jpx_pix
=
None
else
:
img_save_path
=
self
.
get_img_save_path
(
pno
,
img_index
=
img_index
,
ext
=
ext
)
with
open
(
img_save_path
,
"wb"
)
as
f
:
f
.
write
(
img_data
)
...
...
src/settings/conf/logging.conf
View file @
15dccc9
[
loggers
]
keys
=
root
,
running
,
exception
,
cronjob
,
folder
,
django
.
db
.
backends
keys
=
root
,
running
,
exception
,
cronjob
,
folder
,
bs
,
license
,
idcard
,
django
.
db
.
backends
[
handlers
]
keys
=
consoleHandler
,
django_rotateFileHandler
,
exceptionFileHandler
,
cronjobFileHandler
,
folderFileHandler
,
djangodbFileHandler
keys
=
consoleHandler
,
django_rotateFileHandler
,
exceptionFileHandler
,
cronjobFileHandler
,
folderFileHandler
,
bsFileHandler
,
licenseFileHandler
,
idcardFileHandler
,
djangodbFileHandler
[
formatters
]
keys
=
SituFormatter
,
dataLogFormatter
keys
=
SituFormatter
,
dataLogFormatter
,
SimpleFormatter
[
formatter_SituFormatter
]
format
=[%(
asctime
)
s
] [%(
process
)
d
] [%(
thread
)
d
] [%(
threadName
)
s
] [%(
filename
)
s
:%(
lineno
)
d
] %(
levelname
)
s
%(
message
)
s
...
...
@@ -15,6 +15,10 @@ datefmt=
class
=
situlogger
.
JsonFormatter
format
=%(
asctime
)
s
%(
levelname
)
s
%(
funcName
)
s
[
formatter_SimpleFormatter
]
format
=[%(
asctime
)
s
] %(
message
)
s
datefmt
=
[
handler_consoleHandler
]
class
=
StreamHandler
level
=
ERROR
...
...
@@ -45,6 +49,24 @@ level=DEBUG
formatter
=
SituFormatter
args
=(
'../logs/folder_ocr.log'
,)
[
handler_bsFileHandler
]
class
=
situlogger
.
SituRotatingFileHandler
level
=
DEBUG
formatter
=
SimpleFormatter
args
=(
'../logs/bs_statistics.log'
,)
[
handler_licenseFileHandler
]
class
=
situlogger
.
SituRotatingFileHandler
level
=
DEBUG
formatter
=
SimpleFormatter
args
=(
'../logs/license_statistics.log'
,)
[
handler_idcardFileHandler
]
class
=
situlogger
.
SituRotatingFileHandler
level
=
DEBUG
formatter
=
SimpleFormatter
args
=(
'../logs/idcard.log'
,)
[
handler_djangodbFileHandler
]
class
=
situlogger
.
SituRotatingFileHandler
level
=
DEBUG
...
...
@@ -79,6 +101,24 @@ handlers=folderFileHandler
qualname
=
folder
propagate
=
0
[
logger_bs
]
level
=
INFO
handlers
=
bsFileHandler
qualname
=
bs
propagate
=
0
[
logger_license
]
level
=
INFO
handlers
=
licenseFileHandler
qualname
=
license
propagate
=
0
[
logger_idcard
]
level
=
INFO
handlers
=
idcardFileHandler
qualname
=
idcard
propagate
=
0
[
logger_django
.
db
.
backends
]
level
=
DEBUG
handlers
=
djangodbFileHandler
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment