Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
423427c0
authored
2021-07-26 16:18:58 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/ltgt' into feature/0611
2 parents
d78669c5
68d7dd98
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
57 additions
and
31 deletions
src/apps/doc/exceptions.py
src/apps/doc/management/commands/folder_ltgt_process.py
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/ocr/wb.py
src/common/tools/pdf_to_img.py
src/apps/doc/exceptions.py
View file @
423427c
...
...
@@ -13,6 +13,9 @@ class OCR2Exception(Exception):
class OCR4Exception(Exception):
    """Distinct exception type that adds nothing to ``Exception``.

    NOTE(review): presumably raised by the OCR4 processing path -- the raise
    sites are outside this view; confirm against callers.
    """
class LTGTException(Exception):
    """Distinct exception type that adds nothing to ``Exception``.

    NOTE(review): presumably raised by the LTGT folder-processing command --
    the raise sites are outside this view; confirm against callers.
    """
class GCAPException(Exception):
    """Distinct exception type that adds nothing to ``Exception``.

    NOTE(review): presumably raised when a GCAP call fails -- the raise sites
    are outside this view; confirm against callers.
    """
...
...
src/apps/doc/management/commands/folder_ltgt_process.py
0 → 100644
View file @
423427c
This diff is collapsed.
Click to expand it.
src/apps/doc/management/commands/folder_ocr_process.py
View file @
423427c
...
...
@@ -61,13 +61,11 @@ class Command(BaseCommand, LoggerMixin):
def signal_handler(self, sig, frame):
    """Signal callback that clears ``self.switch``.

    ``sig`` and ``frame`` are accepted so the method matches the handler
    call signature; neither is read.
    """
    # Stop processing files.
    self.switch = False
def
license1_process
(
self
,
ocr_data
,
license_summary
,
classify
,
res_list
,
pno
,
ino
,
part_idx
,
img_path
):
def
license1_process
(
self
,
ocr_data
,
license_summary
,
classify
,
img_path
):
# 类别:'0'身份证, '1'居住证
license_data
=
ocr_data
.
get
(
'data'
,
[])
if
not
license_data
:
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_SUCCESS_EMPTY
))
return
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_SUCCESS
))
if
classify
==
consts
.
MVC_CLASSIFY
:
# 车辆登记证 3/4页结果整合
for
mvc_dict
in
license_data
:
try
:
...
...
@@ -154,29 +152,21 @@ class Command(BaseCommand, LoggerMixin):
def
res_process
(
self
,
all_res
,
classify
,
excel_path
):
try
:
license_summary
=
{}
res_list
=
[]
if
not
all_res
:
return
else
:
for
img_path
,
ocr_res
in
all_res
.
items
():
img_name
,
pno
,
ino
=
self
.
parse_img_path
(
img_path
)
part_idx
=
1
#
img_name, pno, ino = self.parse_img_path(img_path)
#
part_idx = 1
if
isinstance
(
ocr_res
,
dict
):
if
ocr_res
.
get
(
'code'
)
==
1
:
data_list
=
ocr_res
.
get
(
'data'
,
[])
if
isinstance
(
data_list
,
list
):
for
part_idx
,
ocr_data
in
enumerate
(
data_list
):
part_idx
=
part_idx
+
1
self
.
license1_process
(
ocr_data
,
license_summary
,
classify
,
res_list
,
pno
,
ino
,
part_idx
,
img_path
)
else
:
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_FAILED_3
))
else
:
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_FAILED
))
else
:
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_FAILED
))
for
ocr_data
in
data_list
:
# part_idx = part_idx + 1
self
.
license1_process
(
ocr_data
,
license_summary
,
classify
,
img_path
)
wb
=
BSWorkbook
(
set
(),
set
(),
set
(),
set
(),
set
())
wb
.
simple_license_rebuild
(
license_summary
,
consts
.
DOC_SCHEME_LIST
[
0
])
...
...
@@ -216,6 +206,13 @@ class Command(BaseCommand, LoggerMixin):
return
ocr_res
else
:
self
.
folder_log
.
warn
(
'{0} [ocr failed] [img_path={1}]'
.
format
(
self
.
log_base
,
img_path
))
def images_process(self, img_path_list, classify, excel_path):
    """OCR each image path, then pass all results to ``res_process`` at once.

    ``self.ocr_process(img_path, classify)`` is called once per entry of
    ``img_path_list``, in order, and its return value is stored under the
    image path. The resulting mapping is handed to
    ``self.res_process(mapping, classify, excel_path)``.

    Returns ``None``.
    """
    results_by_path = {
        image: self.ocr_process(image, classify)
        for image in img_path_list
    }
    self.res_process(results_by_path, classify, excel_path)
def
pdf_process
(
self
,
name
,
path
,
classify
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
):
if
os
.
path
.
exists
(
path
):
...
...
@@ -230,11 +227,7 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
path
,
traceback
.
format_exc
()))
raise
e
else
:
all_res
=
{}
for
img_path
in
pdf_handler
.
img_path_list
:
ocr_res
=
self
.
ocr_process
(
img_path
,
classify
)
all_res
[
img_path
]
=
ocr_res
self
.
res_process
(
all_res
,
classify
,
excel_path
)
self
.
images_process
(
pdf_handler
.
img_path_list
,
classify
,
excel_path
)
shutil
.
move
(
path
,
pdf_save_path
)
def
tif_process
(
self
,
name
,
path
,
classify
,
img_output_dir
,
wb_output_dir
,
tiff_output_dir
):
...
...
@@ -250,23 +243,18 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
path
,
traceback
.
format_exc
()))
raise
e
else
:
all_res
=
{}
for
img_path
in
tiff_handler
.
img_path_list
:
ocr_res
=
self
.
ocr_process
(
img_path
,
classify
)
all_res
[
img_path
]
=
ocr_res
self
.
res_process
(
all_res
,
classify
,
excel_path
)
self
.
images_process
(
tiff_handler
.
img_path_list
,
classify
,
excel_path
)
shutil
.
move
(
path
,
tiff_save_path
)
def
img_process
(
self
,
name
,
path
,
classify
,
wb_output_dir
,
img_output_dir
,
pdf_output_dir
):
ocr_res
=
self
.
ocr_process
(
path
,
classify
)
all_res
=
{
path
:
ocr_res
}
try
:
img_save_path
,
excel_path
,
_
=
self
.
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
)
except
Exception
as
e
:
self
.
folder_log
.
error
(
'{0} [get path error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
else
:
ocr_res
=
self
.
ocr_process
(
path
,
classify
)
all_res
=
{
path
:
ocr_res
}
self
.
res_process
(
all_res
,
classify
,
excel_path
)
shutil
.
move
(
path
,
img_save_path
)
...
...
@@ -312,9 +300,9 @@ class Command(BaseCommand, LoggerMixin):
try
:
if
os
.
path
.
isfile
(
path
):
self
.
folder_log
.
info
(
'{0} [file start] [path={1}]'
.
format
(
self
.
log_base
,
path
))
if
name
.
endswith
(
'.pdf'
):
if
name
.
endswith
(
'.pdf'
)
or
name
.
endswith
(
'.PDF'
)
:
self
.
pdf_process
(
name
,
path
,
classify
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
)
elif
name
.
endswith
(
'.tif'
):
elif
name
.
endswith
(
'.tif'
)
or
name
.
endswith
(
'.TIF'
)
:
self
.
tif_process
(
name
,
path
,
classify
,
img_output_dir
,
wb_output_dir
,
tiff_output_dir
)
else
:
self
.
img_process
(
name
,
path
,
classify
,
wb_output_dir
,
img_output_dir
,
pdf_output_dir
)
...
...
src/apps/doc/ocr/wb.py
View file @
423427c
...
...
@@ -702,6 +702,31 @@ class BSWorkbook(Workbook):
if
field_str
is
not
None
:
count_list
.
append
((
field_str
,
count
))
def ltgt_build(self, label, result_dict):
    """Flatten ``result_dict`` into a new sheet and return the flattened dict.

    A sheet is created with ``self.create_sheet(label)``. Every flattened
    entry is appended to it as a ``(name, value)`` row and stored under the
    same name in the returned dict.

    Flattening rules, chosen by the type of each top-level value:

    * ``list`` -- the ``'words'`` field of each dict item, joined with '、'.
      Items without a ``'words'`` value are skipped and non-string values
      are converted with ``str`` so the join cannot raise; a non-dict item
      contributes itself.
    * ``dict`` that has a ``'words'`` key -- that ``'words'`` value.
    * any other ``dict`` -- one entry per sub-key, named
      ``'<key>: <sub_key>'``; a dict sub-value contributes its ``'words'``
      (``''`` when absent), any other sub-value is used unchanged.
    * anything else -- the value unchanged.

    Args:
        label: title passed to ``create_sheet``.
        result_dict: mapping of field name to raw result value.

    Returns:
        dict mapping each flattened name to the value written to the sheet.
    """
    ws = self.create_sheet(label)
    rebuild_res = {}

    def record(name, cell):
        # Keep the sheet row and the returned mapping in lock-step.
        ws.append((name, cell))
        rebuild_res[name] = cell

    for key, value in result_dict.items():
        if isinstance(value, list):
            words = []
            for item in value:
                word = item.get('words') if isinstance(item, dict) else item
                # str.join rejects None / non-str entries, so filter and
                # coerce instead of letting one odd item abort the sheet.
                if word is not None:
                    words.append(str(word))
            record(key, '、'.join(words))
        elif isinstance(value, dict):
            if 'words' in value:
                record(key, value['words'])
                continue
            for sub_key, sub_value in value.items():
                name = '{0}: {1}'.format(key, sub_key)
                if isinstance(sub_value, dict):
                    record(name, sub_value.get('words', ''))
                else:
                    record(name, sub_value)
        else:
            record(key, value)
    return rebuild_res
def
simple_license_rebuild
(
self
,
license_summary
,
document_scheme
):
# for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []):
# if ic_license_dict.get('类别') == '1':
...
...
src/common/tools/pdf_to_img.py
View file @
423427c
...
...
@@ -225,3 +225,13 @@ class PDFHandler:
else
:
self
.
merge_il
(
pdf
,
pno
,
il
)
self
.
img_count
=
len
(
self
.
img_path_list
)
def extract_page_image(self):
    """Pass every page of the document at ``self.path`` to ``page_to_png``.

    Resets ``self.img_path_list`` and ``self.xref_set``, makes sure
    ``self.img_dir_path`` exists, walks the pages in order, and finally
    stores ``len(self.img_path_list)`` in ``self.img_count``.

    NOTE(review): ``page_to_png`` is defined outside this view; it
    presumably writes the page image and appends its path to
    ``self.img_path_list`` -- confirm.
    NOTE(review): ``pageCount`` / ``loadPage`` are the legacy camelCase
    PyMuPDF names (newer releases spell them ``page_count`` /
    ``load_page``); confirm against the pinned PyMuPDF version before
    upgrading.
    """
    self.img_path_list = []
    self.xref_set = set()
    os.makedirs(self.img_dir_path, exist_ok=True)

    with fitz.Document(self.path) as document:
        page_total = document.pageCount
        for page_number in range(page_total):
            self.page_to_png(document.loadPage(page_number))

    self.img_count = len(self.img_path_list)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment