Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
1a51bc03
authored
2020-12-21 15:55:06 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/main' into feature/mssql
2 parents
05a27cd8
fe25f273
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
106 additions
and
48 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/ocr/wb.py
src/common/tools/pdf_to_img.py
src/apps/doc/consts.py
View file @
1a51bc0
...
...
@@ -642,7 +642,7 @@ RP_FIELD_ORDER_0 = (('姓名', '姓名'),
(
'住址'
,
'住址'
),
(
'性别'
,
'性别'
),)
RP_FIELD_ORDER_1
=
IC_FIELD_ORDER_1
# 增值税
发
票
# 增值税
普
票
VAT_CN_NAME
=
'VAT普票'
VAT_CLASSIFY
=
0
VAT_FIELD_ORDER
=
((
'发票代码'
,
'发票代码'
),
...
...
@@ -667,6 +667,32 @@ VAT_FIELD_ORDER = (('发票代码', '发票代码'),
(
'销方开户行及账号'
,
'销售方开户行及账号'
),
(
'下盖章'
,
'销售方:(章)'
),
(
'备注'
,
'备注'
),)
# 增值税专票
VATS_CN_NAME
=
'VAT专票'
VATS_CLASSIFY
=
10088
VATS_FIELD_ORDER
=
((
'发票代码'
,
'发票代码'
),
(
'发票代码_开具'
,
'发票代码(开具)'
),
(
'发票号码'
,
'发票号码'
),
(
'发票号码_开具'
,
'发票号码(开具)'
),
(
'开票日期'
,
'开票日期'
),
(
'校验码'
,
'校验码'
),
(
'货物或应税劳务、服务名称'
,
'货物或应税劳务、服务名称'
),
(
'金额合计'
,
'开具金额合计(不含税)'
),
(
'税率'
,
'税率'
),
(
'税额合计'
,
'税额合计'
),
(
'价税合计小写'
,
'价税合计(小写)'
),
(
'价税合计大写'
,
'价税合计(大写)'
),
(
'购方名称'
,
'购买方名称'
),
(
'购方纳税人识别号'
,
'购买方纳税人识别号'
),
(
'购方地址、电话'
,
'购买方地址、电话'
),
(
'购方开户行及账号'
,
'购买方开户行及账号'
),
(
'销方名称'
,
'销售方名称'
),
(
'销方纳税人识别号'
,
'销售方纳税人识别号'
),
(
'销方地址、电话'
,
'销售方地址、电话'
),
(
'销方开户行及账号'
,
'销售方开户行及账号'
),
(
'下盖章'
,
'销售方:(章)'
),
(
'车船税'
,
'车船税'
),
(
'备注'
,
'备注'
),)
# 机动车登记证书
MVC_CN_NAME
=
'机动车登记证书'
MVC_CLASSIFY
=
28
...
...
@@ -770,7 +796,7 @@ MVI_FIELD_ORDER = (('发票代码', '发票代码'),
(
'主管税务机关及代码'
,
'主管税务机关及代码'
),
(
'吨位'
,
'吨位'
),
(
'限乘人数'
,
'限乘人数'
),)
IC_PID
=
VAT_PID
=
MVC_PID
=
MVI_PID
=
None
IC_PID
=
VAT_PID
=
VATS_PID
=
MVC_PID
=
MVI_PID
=
None
# 营业执照
BL_CN_NAME
=
'营业执照'
...
...
@@ -909,6 +935,11 @@ LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, F
(
MVC_CLASSIFY
,
(
MVC_PID
,
MVC_CN_NAME
,
None
,
True
,
True
,
MODEL_FIELD_MVC
)),
(
VAT_CLASSIFY
,
(
VAT_PID
,
VAT_CN_NAME
,
VAT_FIELD_ORDER
,
False
,
False
,
MODEL_FIELD_VAT
)))
FOLDER_LICENSE_ORDER
=
((
MVI_CLASSIFY
,
(
MVI_PID
,
MVI_CN_NAME
,
MVI_FIELD_ORDER
,
False
,
False
,
MODEL_FIELD_MVI
)),
(
IC_CLASSIFY
,
(
IC_PID
,
IC_CN_NAME
,
None
,
True
,
False
,
MODEL_FIELD_IC
)),
(
VAT_CLASSIFY
,
(
VAT_PID
,
VAT_CN_NAME
,
VAT_FIELD_ORDER
,
False
,
False
,
MODEL_FIELD_VAT
)),
(
VATS_CLASSIFY
,
(
VATS_PID
,
VATS_CN_NAME
,
VATS_FIELD_ORDER
,
False
,
False
,
MODEL_FIELD_VAT
)))
LICENSE_CLASSIFY_MAPPING
=
dict
(
LICENSE_ORDER
)
OTHER_CLASSIFY_SET
=
{
OTHER_CLASSIFY
}
...
...
src/apps/doc/management/commands/folder_ocr_process.py
View file @
1a51bc0
...
...
@@ -165,7 +165,6 @@ class Command(BaseCommand, LoggerMixin):
def
folder_process
(
self
,
input_dir
,
classify
):
while
not
os
.
path
.
isdir
(
input_dir
):
self
.
folder_log
.
info
(
'{0} [input dir is not dir] [input_dir={1}]'
.
format
(
self
.
log_base
,
input_dir
))
print
(
self
.
switch
)
if
self
.
switch
:
time
.
sleep
(
self
.
sleep_time
)
continue
...
...
@@ -202,6 +201,9 @@ class Command(BaseCommand, LoggerMixin):
else
:
self
.
img_process
(
name
,
path
,
classify
,
wb_output_dir
,
img_output_dir
,
pdf_output_dir
)
self
.
folder_log
.
info
(
'{0} [file end] [path={1}]'
.
format
(
self
.
log_base
,
path
))
else
:
self
.
folder_log
.
info
(
'{0} [path is dir] [path={1}]'
.
format
(
self
.
log_base
,
input_dir
))
shutil
.
move
(
path
,
failed_output_dir
)
except
Exception
as
e
:
try
:
path
=
os
.
path
.
join
(
input_dir
,
name
)
...
...
src/apps/doc/management/commands/ocr_process.py
View file @
1a51bc0
...
...
@@ -102,20 +102,20 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
task_str
,
is_priority
))
return
doc
,
business_type
,
task_str
def
pdf_download
(
self
,
doc
,
pdf_path
):
if
not
doc
.
application_id
.
startswith
(
consts
.
FIXED_APPLICATION_ID_PREFIX
):
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
self
.
edms
.
download
(
pdf_path
,
doc
.
metadata_version_id
)
except
Exception
as
e
:
self
.
cronjob_log
.
warn
(
'{0} [edms download failed] [times={1}] [pdf_path={2}] '
'[error={3}]'
.
format
(
self
.
log_base
,
times
,
pdf_path
,
traceback
.
format_exc
()))
edms_exc
=
str
(
e
)
else
:
break
else
:
raise
EDMSException
(
edms_exc
)
self
.
cronjob_log
.
info
(
'{0} [edms download success] [pdf_path={1}]'
.
format
(
self
.
log_base
,
pdf_path
))
#
def pdf_download(self, doc, pdf_path):
#
if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
#
for times in range(consts.RETRY_TIMES):
#
try:
#
self.edms.download(pdf_path, doc.metadata_version_id)
#
except Exception as e:
#
self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] '
#
'[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc()))
#
edms_exc = str(e)
#
else:
#
break
#
else:
#
raise EDMSException(edms_exc)
#
self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path))
def
bs_process
(
self
,
wb
,
ocr_data
,
bs_summary
,
unknown_summary
,
classify
,
res_list
,
pno
,
ino
,
part_idx
):
sheets
=
ocr_data
.
get
(
'data'
,
[])
...
...
@@ -439,19 +439,35 @@ class Command(BaseCommand, LoggerMixin):
# 2. 从EDMS获取PDF文件
doc_data_path
=
os
.
path
.
join
(
self
.
data_dir
,
business_type
,
consts
.
TMP_DIR_NAME
,
str
(
doc
.
id
))
os
.
makedirs
(
doc_data_path
,
exist_ok
=
True
)
img_save_path
=
os
.
path
.
join
(
doc_data_path
,
'img'
)
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
self
.
pdf_download
(
doc
,
pdf_path
)
pdf_handler
=
PDFHandler
(
pdf_path
,
img_save_path
)
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
if
not
doc
.
application_id
.
startswith
(
consts
.
FIXED_APPLICATION_ID_PREFIX
):
self
.
edms
.
download
(
pdf_path
,
doc
.
metadata_version_id
)
self
.
cronjob_log
.
info
(
'{0} [edms download success] [task={1}] [times={2}] '
'[pdf_path={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
pdf_path
))
# 3.PDF文件提取图片
self
.
cronjob_log
.
info
(
'{0} [pdf to img start] [task={1}]'
.
format
(
self
.
log_base
,
task_str
))
self
.
cronjob_log
.
info
(
'{0} [pdf to img start] [task={1}] [times={2}]'
.
format
(
self
.
log_base
,
task_str
,
times
))
start_time
=
time
.
time
()
img_save_path
=
os
.
path
.
join
(
doc_data_path
,
'img'
)
pdf_handler
=
PDFHandler
(
pdf_path
,
img_save_path
)
pdf_handler
.
extract_image
()
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
cronjob_log
.
info
(
'{0} [pdf to img end] [task={1}] [spend_time={2}]'
.
format
(
self
.
log_base
,
task_str
,
speed_time
))
self
.
cronjob_log
.
info
(
'{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
speed_time
))
except
Exception
as
e
:
self
.
cronjob_log
.
warn
(
'{0} [download or pdf to img failed] [task={1}] [times={2}] '
'[error={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
traceback
.
format_exc
()))
else
:
break
else
:
raise
Exception
(
'download or pdf to img failed'
)
img_count
=
len
(
pdf_handler
.
img_path_list
)
if
img_count
==
0
:
...
...
@@ -466,25 +482,25 @@ class Command(BaseCommand, LoggerMixin):
self
.
cronjob_log
.
info
(
'{0} [pdf_2_img_2_queue] [img queue full]'
.
format
(
self
.
log_base
))
time
.
sleep
(
self
.
sleep_time_img_put
)
img_queue
.
put
(
img_path
)
except
EDMSException
as
e
:
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
warn
(
'{0} [process failed (edms download)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process error (db save 1)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
error_list
.
append
(
1
)
return
#
except EDMSException as e:
#
try:
#
doc.status = DocStatus.PROCESS_FAILED.value
#
doc.save()
#
self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
#
self.log_base, task_str, traceback.format_exc()))
#
except Exception as e:
#
self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
#
self.log_base, traceback.format_exc()))
#
error_list.append(1)
#
return
except
Exception
as
e
:
try
:
doc
.
status
=
DocStatus
.
PROCESS_FAILED
.
value
doc
.
save
()
self
.
cronjob_log
.
warn
(
'{0} [process failed (pdf
to img)] [task={1}] [error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
self
.
cronjob_log
.
warn
(
'{0} [process failed (pdf
_2_img_2_queue)] [task={1}] '
'[error={2}]'
.
format
(
self
.
log_base
,
task_str
,
traceback
.
format_exc
()))
except
Exception
as
e
:
self
.
cronjob_log
.
error
(
'{0} [process error (db save
2
)] [error={1}]'
.
format
(
self
.
cronjob_log
.
error
(
'{0} [process error (db save
1
)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
error_list
.
append
(
1
)
return
...
...
@@ -523,8 +539,8 @@ class Command(BaseCommand, LoggerMixin):
ocr_1_res
=
ocr_1_response
.
json
()
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
cronjob_log
.
info
(
'{0} [ocr_1 success] [img={1}] [
res
={2}] [speed_time={3}]'
.
format
(
self
.
log_base
,
img_path
,
ocr_1_res
,
speed_time
))
self
.
cronjob_log
.
info
(
'{0} [ocr_1 success] [img={1}] [
url
={2}] [speed_time={3}]'
.
format
(
self
.
log_base
,
img_path
,
url
,
speed_time
))
break
else
:
ocr_1_res
=
{}
...
...
@@ -636,8 +652,8 @@ class Command(BaseCommand, LoggerMixin):
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
cronjob_log
.
info
(
'{0} [ocr_2 success] [img={1}] [
res={2}] [speed_time={3
}]'
.
format
(
self
.
log_base
,
img_path
,
ocr_2_res
,
speed_time
))
'{0} [ocr_2 success] [img={1}] [
speed_time={2
}]'
.
format
(
self
.
log_base
,
img_path
,
speed_time
))
if
classify
==
consts
.
BC_CLASSIFY
:
name
=
'有'
...
...
src/apps/doc/ocr/wb.py
View file @
1a51bc0
...
...
@@ -520,7 +520,7 @@ class BSWorkbook(Workbook):
for
row
in
loan_fill_row
:
for
cell
in
new_ws
[
row
]:
cell
.
fill
=
self
.
loan
_fill
cell
.
fill
=
self
.
amount
_fill
# 3.6.同一天相同进出账高亮
del
amount_mapping
...
...
@@ -656,17 +656,24 @@ class BSWorkbook(Workbook):
count_list
.
append
((
field_str
,
count
))
def
simple_license_rebuild
(
self
,
license_summary
,
document_scheme
):
for
classify
,
(
_
,
name
,
field_order
,
side_diff
,
scheme_diff
,
_
)
in
consts
.
LICENSE_ORDER
:
for
ic_license_dict
in
license_summary
.
get
(
consts
.
IC_CLASSIFY
,
[]):
if
ic_license_dict
.
get
(
'类别'
)
==
'1'
:
license_summary
.
setdefault
(
consts
.
RP_CLASSIFY
,
[])
.
append
(
ic_license_dict
)
continue
for
vat_license_dict
in
license_summary
.
get
(
consts
.
VAT_CLASSIFY
,
[]):
if
vat_license_dict
.
get
(
'发票类型'
)
==
'special'
:
license_summary
.
setdefault
(
consts
.
VATS_CLASSIFY
,
[])
.
append
(
vat_license_dict
)
continue
for
classify
,
(
_
,
name
,
field_order
,
side_diff
,
scheme_diff
,
_
)
in
consts
.
FOLDER_LICENSE_ORDER
:
license_list
=
license_summary
.
get
(
classify
)
if
not
license_list
:
continue
ws
=
self
.
create_sheet
(
name
)
if
scheme_diff
and
document_scheme
==
consts
.
DOC_SCHEME_LIST
[
1
]:
classify
=
consts
.
MVC_CLASSIFY_SE
#
if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]:
#
classify = consts.MVC_CLASSIFY_SE
for
license_dict
in
license_list
:
if
classify
==
consts
.
IC_CLASSIFY
and
license_dict
.
get
(
'类别'
)
==
'1'
:
license_summary
.
setdefault
(
consts
.
RP_CLASSIFY
,
[])
.
append
(
license_dict
)
continue
if
side_diff
:
key
,
field_order_yes
,
field_order_no
=
consts
.
FIELD_ORDER_MAP
.
get
(
classify
)
field_order
=
field_order_yes
if
key
in
license_dict
else
field_order_no
...
...
src/common/tools/pdf_to_img.py
View file @
1a51bc0
...
...
@@ -187,6 +187,8 @@ class PDFHandler:
self
.
page_to_png
(
page
)
def
extract_image
(
self
):
self
.
img_path_list
=
[]
self
.
xref_set
=
set
()
os
.
makedirs
(
self
.
img_dir_path
,
exist_ok
=
True
)
with
fitz
.
Document
(
self
.
path
)
as
pdf
:
for
pno
in
range
(
pdf
.
pageCount
):
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment