Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
c1ca6fa5
authored
2021-07-26 16:11:28 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add ltgt wb daily
1 parent
c39b3051
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
7 additions
and
78 deletions
src/apps/doc/management/commands/folder_ltgt_process.py
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/management/commands/folder_ltgt_process.py
0 → 100644
View file @
c1ca6fa
This diff is collapsed.
Click to expand it.
src/apps/doc/management/commands/folder_ocr_process.py
View file @
c1ca6fa
...
...
@@ -15,7 +15,7 @@ from settings import conf
from
common.mixins
import
LoggerMixin
from
common.tools.pdf_to_img
import
PDFHandler
from
apps.doc
import
consts
from
apps.doc.exceptions
import
OCR1Exception
,
OCR4Exception
,
LTGTException
from
apps.doc.exceptions
import
OCR1Exception
,
OCR4Exception
from
apps.doc.ocr.wb
import
BSWorkbook
...
...
@@ -48,11 +48,6 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
=
'[folder ocr process]'
# 处理文件开关
self
.
switch
=
True
self
.
ltgt_classify_mapping
=
{
128
:
'执行裁定书'
,
129
:
'民事判决书'
,
130
:
'民事调解书'
}
# 睡眠时间
self
.
sleep_time
=
float
(
conf
.
SLEEP_SECOND_FOLDER
)
# input folder
...
...
@@ -60,7 +55,6 @@ class Command(BaseCommand, LoggerMixin):
# ocr相关
self
.
ocr_url
=
conf
.
OCR_URL_FOLDER
self
.
ocr_url_4
=
conf
.
IC_URL
self
.
ltgt_ocr_url
=
conf
.
LTGT_URL
# 优雅退出信号:15
signal
.
signal
(
signal
.
SIGTERM
,
self
.
signal_handler
)
...
...
@@ -213,59 +207,6 @@ class Command(BaseCommand, LoggerMixin):
else
:
self
.
folder_log
.
warn
(
'{0} [ocr failed] [img_path={1}]'
.
format
(
self
.
log_base
,
img_path
))
def ltgt_ocr_process(self, img_path_list, label, path):
    """Post the images to the LTGT OCR endpoint and return its JSON reply.

    Every path in ``img_path_list`` that exists on disk is read,
    base64-encoded and sent together with ``label`` to
    ``self.ltgt_ocr_url``. The request is attempted up to
    ``consts.RETRY_TIMES`` times; each failed attempt is logged.

    Args:
        img_path_list: paths of the images to submit; paths that do not
            exist are silently skipped.
        label: value sent as the ``label`` field of the request body.
        path: path of the source document, used only in log messages.

    Returns:
        The decoded JSON body of the first successful attempt, or ``None``
        when every attempt failed.
    """
    img_data_list = []
    for img_path in img_path_list:
        if os.path.exists(img_path):
            with open(img_path, 'rb') as f:
                # base64-encode the raw bytes and decode to str so the
                # payload can be serialised as JSON
                img_data_list.append(base64.b64encode(f.read()).decode())

    json_data = {
        "label": label,
        "img_data_list": img_data_list,
    }

    for times in range(consts.RETRY_TIMES):
        try:
            start_time = time.time()
            ocr_response = requests.post(self.ltgt_ocr_url, json=json_data)
            if ocr_response.status_code != 200:
                raise LTGTException('{0} ltgt ocr status code: {1}'.format(
                    self.log_base, ocr_response.status_code))
            # Parse inside the try: a 200 response with a body that is not
            # valid JSON must count as a failed attempt (logged, retried)
            # rather than escape the retry loop as an unhandled exception.
            ocr_res = ocr_response.json()
        except Exception:
            self.folder_log.warn(
                '{0} [ltgt ocr failed] [times={1}] [path={2}] [error={3}]'.format(
                    self.log_base, times, path, traceback.format_exc()))
        else:
            speed_time = int(time.time() - start_time)
            self.folder_log.info(
                '{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format(
                    self.log_base, path, ocr_res, speed_time))
            return ocr_res

    # Reached only when no attempt returned a result.
    self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path))
    return None
def ltgt_res_process(self, ocr_res, label, excel_path):
    """Build the result workbook from an LTGT OCR reply and save it.

    Nothing is written unless ``ocr_res`` is a dict whose ``'code'`` entry
    equals 1. Any exception raised while building or saving the workbook is
    logged through ``self.folder_log`` and not re-raised.

    Args:
        ocr_res: reply returned by the OCR request (may be ``None``).
        label: label forwarded to ``BSWorkbook.ltgt_build``.
        excel_path: destination path passed to ``wb.save``.
    """
    try:
        # Guard clauses: ignore anything that is not a successful reply.
        if not isinstance(ocr_res, dict):
            return
        if ocr_res.get('code') != 1:
            return

        payload = ocr_res.get('data', {})
        workbook = BSWorkbook(set(), set(), set(), set(), set())
        workbook.ltgt_build(label, payload)
        workbook.remove_base_sheet()
        workbook.save(excel_path)
    except Exception:
        message = '{0} [wb build error] [path={1}] [error={2}]'.format(
            self.log_base, excel_path, traceback.format_exc())
        self.folder_log.error(message)
def ltgt_process(self, img_path_list, label, excel_path, path):
    """Run the LTGT OCR request and hand its reply to the workbook step.

    Args:
        img_path_list: image paths forwarded to ``ltgt_ocr_process``.
        label: label forwarded to both steps.
        excel_path: output path forwarded to ``ltgt_res_process``.
        path: source document path forwarded to ``ltgt_ocr_process``.
    """
    self.ltgt_res_process(
        self.ltgt_ocr_process(img_path_list, label, path),
        label,
        excel_path,
    )
def
images_process
(
self
,
img_path_list
,
classify
,
excel_path
):
all_res
=
{}
for
img_path
in
img_path_list
:
...
...
@@ -279,20 +220,14 @@ class Command(BaseCommand, LoggerMixin):
img_save_path
,
excel_path
,
pdf_save_path
=
self
.
get_path
(
name
,
img_output_dir
,
wb_output_dir
,
pdf_output_dir
)
self
.
folder_log
.
info
(
'{0} [pdf to img start] [path={1}]'
.
format
(
self
.
log_base
,
path
))
pdf_handler
=
PDFHandler
(
path
,
img_save_path
)
if
classify
in
self
.
ltgt_classify_mapping
:
pdf_handler
.
extract_page_image
()
else
:
pdf_handler
.
extract_image
()
pdf_handler
.
extract_image
()
self
.
folder_log
.
info
(
'{0} [pdf to img end] [path={1}]'
.
format
(
self
.
log_base
,
path
))
except
Exception
as
e
:
self
.
folder_log
.
error
(
'{0} [pdf to img error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
raise
e
else
:
if
classify
in
self
.
ltgt_classify_mapping
:
self
.
ltgt_process
(
pdf_handler
.
img_path_list
,
self
.
ltgt_classify_mapping
[
classify
],
excel_path
,
path
)
else
:
self
.
images_process
(
pdf_handler
.
img_path_list
,
classify
,
excel_path
)
self
.
images_process
(
pdf_handler
.
img_path_list
,
classify
,
excel_path
)
shutil
.
move
(
path
,
pdf_save_path
)
def
tif_process
(
self
,
name
,
path
,
classify
,
img_output_dir
,
wb_output_dir
,
tiff_output_dir
):
...
...
@@ -308,10 +243,7 @@ class Command(BaseCommand, LoggerMixin):
self
.
log_base
,
path
,
traceback
.
format_exc
()))
raise
e
else
:
if
classify
in
self
.
ltgt_classify_mapping
:
self
.
ltgt_process
(
tiff_handler
.
img_path_list
,
self
.
ltgt_classify_mapping
[
classify
],
excel_path
,
path
)
else
:
self
.
images_process
(
tiff_handler
.
img_path_list
,
classify
,
excel_path
)
self
.
images_process
(
tiff_handler
.
img_path_list
,
classify
,
excel_path
)
shutil
.
move
(
path
,
tiff_save_path
)
def
img_process
(
self
,
name
,
path
,
classify
,
wb_output_dir
,
img_output_dir
,
pdf_output_dir
):
...
...
@@ -321,12 +253,9 @@ class Command(BaseCommand, LoggerMixin):
self
.
folder_log
.
error
(
'{0} [get path error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
else
:
if
classify
in
self
.
ltgt_classify_mapping
:
self
.
ltgt_process
([
path
],
self
.
ltgt_classify_mapping
[
classify
],
excel_path
,
path
)
else
:
ocr_res
=
self
.
ocr_process
(
path
,
classify
)
all_res
=
{
path
:
ocr_res
}
self
.
res_process
(
all_res
,
classify
,
excel_path
)
ocr_res
=
self
.
ocr_process
(
path
,
classify
)
all_res
=
{
path
:
ocr_res
}
self
.
res_process
(
all_res
,
classify
,
excel_path
)
shutil
.
move
(
path
,
img_save_path
)
def
folder_process
(
self
,
input_dir
,
classify
):
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment