Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
d2b24497
authored
2022-09-13 20:14:20 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add offline report
1 parent
c7852512
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
150 additions
and
13 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/folder_dda_process.py
src/apps/doc/management/commands/folder_ltgt_process.py
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/management/commands/folder_wsc_process.py
src/apps/doc/models.py
src/apps/doc/named_enum.py
src/common/tools/mssql_script19.py
src/apps/doc/consts.py
View file @
d2b2449
...
...
@@ -2331,3 +2331,5 @@ MPOS_MAP = {
BC_CLASSIFY
,
BL_CLASSIFY
,
}
# doc_type code that the folder_wsc_process command stores on the
# OfflineReport rows it creates.
FOLDER_WSC_CLASSIFY = 199
...
...
src/apps/doc/management/commands/folder_dda_process.py
View file @
d2b2449
...
...
@@ -7,6 +7,7 @@ import base64
import
signal
import
requests
import
traceback
from
django
import
db
from
PIL
import
Image
from
datetime
import
datetime
from
django.core.management
import
BaseCommand
...
...
@@ -19,6 +20,8 @@ from common.tools.pdf_to_img import PDFHandler
from
apps.doc
import
consts
from
apps.doc.exceptions
import
OCR1Exception
,
OCR2Exception
,
LTGTException
from
apps.doc.ocr.wb
import
BSWorkbook
from
apps.doc.models
import
OfflineReport
from
apps.doc.named_enum
import
OfflineFailureReason
class
TIFFHandler
:
...
...
@@ -384,6 +387,9 @@ class Command(BaseCommand, LoggerMixin):
if
len
(
true_file_set
)
==
0
and
len
(
os_error_filename_set
)
>
0
:
true_file_set
.
add
(
os_error_filename_set
.
pop
())
for
name
in
true_file_set
:
is_success
=
True
failure_reason
=
OfflineFailureReason
.
OS_ERROR
.
value
start_time
=
time
.
time
()
path
=
os
.
path
.
join
(
input_dir
,
name
)
try
:
if
not
os
.
path
.
exists
(
path
):
...
...
@@ -408,16 +414,21 @@ class Command(BaseCommand, LoggerMixin):
failed_path
=
os
.
path
.
join
(
failed_output_dir
,
'{0}_{1}'
.
format
(
time
.
time
(),
name
))
shutil
.
move
(
path
,
failed_path
)
except
OSError
:
is_success
=
False
failure_reason
=
OfflineFailureReason
.
OS_ERROR
.
value
os_error_filename_set
.
add
(
name
)
self
.
folder_log
.
error
(
'{0} [os error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
except
Exception
as
e
:
is_success
=
False
failure_reason
=
OfflineFailureReason
.
PROCESS_ERROR
.
value
try
:
self
.
folder_log
.
error
(
'{0} [file error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
failed_path
=
os
.
path
.
join
(
failed_output_dir
,
'{0}_{1}'
.
format
(
time
.
time
(),
name
))
shutil
.
move
(
path
,
failed_path
)
except
Exception
as
e
:
failure_reason
=
OfflineFailureReason
.
OS_ERROR
.
value
os_error_filename_set
.
add
(
name
)
self
.
folder_log
.
error
(
'{0} [file move error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
...
...
@@ -433,8 +444,23 @@ class Command(BaseCommand, LoggerMixin):
self
.
DATE_KEY
:
date_str
}
)
finally
:
end_time
=
time
.
time
()
try
:
OfflineReport
.
objects
.
create
(
input_folder
=
input_dir
,
doc_type
=
consts
.
DDA_CLASSIFY
,
file_name
=
name
,
status
=
is_success
,
failure_reason
=
failure_reason
,
duration
=
int
(
end_time
-
start_time
)
)
except
Exception
as
e
:
self
.
folder_log
.
error
(
'{0} [db save failed] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
def
handle
(
self
,
*
args
,
**
kwargs
):
db
.
close_old_connections
()
if
len
(
self
.
input_dirs
)
==
0
:
return
result_queue
=
Queue
()
...
...
src/apps/doc/management/commands/folder_ltgt_process.py
View file @
d2b2449
...
...
@@ -7,6 +7,7 @@ import base64
import
signal
import
requests
import
traceback
from
django
import
db
from
PIL
import
Image
from
datetime
import
datetime
from
django.core.management
import
BaseCommand
...
...
@@ -20,6 +21,8 @@ from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predi
from
apps.doc
import
consts
from
apps.doc.exceptions
import
OCR1Exception
,
OCR2Exception
,
LTGTException
from
apps.doc.ocr.wb
import
BSWorkbook
from
apps.doc.models
import
OfflineReport
from
apps.doc.named_enum
import
OfflineFailureReason
class
TIFFHandler
:
...
...
@@ -688,6 +691,9 @@ class Command(BaseCommand, LoggerMixin):
for
name
in
true_file_set
:
time
.
sleep
(
5
)
path
=
os
.
path
.
join
(
input_dir
,
name
)
is_success
=
True
failure_reason
=
OfflineFailureReason
.
OS_ERROR
.
value
start_time
=
time
.
time
()
try
:
if
not
os
.
path
.
exists
(
path
):
...
...
@@ -716,16 +722,21 @@ class Command(BaseCommand, LoggerMixin):
failed_path
=
os
.
path
.
join
(
failed_output_dir
,
'{0}_{1}'
.
format
(
time
.
time
(),
name
))
shutil
.
move
(
path
,
failed_path
)
except
OSError
:
is_success
=
False
failure_reason
=
OfflineFailureReason
.
OS_ERROR
.
value
os_error_filename_set
.
add
(
name
)
self
.
folder_log
.
error
(
'{0} [os error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
except
Exception
as
e
:
is_success
=
False
failure_reason
=
OfflineFailureReason
.
PROCESS_ERROR
.
value
try
:
self
.
folder_log
.
error
(
'{0} [file error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
failed_path
=
os
.
path
.
join
(
failed_output_dir
,
'{0}_{1}'
.
format
(
time
.
time
(),
name
))
shutil
.
move
(
path
,
failed_path
)
except
Exception
as
e
:
failure_reason
=
OfflineFailureReason
.
OS_ERROR
.
value
os_error_filename_set
.
add
(
name
)
self
.
folder_log
.
error
(
'{0} [file move error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
...
...
@@ -755,8 +766,23 @@ class Command(BaseCommand, LoggerMixin):
self
.
FILE_KEY
:
name
,
}
)
finally
:
end_time
=
time
.
time
()
try
:
OfflineReport
.
objects
.
create
(
input_folder
=
input_dir
,
doc_type
=
classify
,
file_name
=
name
,
status
=
is_success
,
failure_reason
=
failure_reason
,
duration
=
int
(
end_time
-
start_time
)
)
except
Exception
as
e
:
self
.
folder_log
.
error
(
'{0} [db save failed] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
def
handle
(
self
,
*
args
,
**
kwargs
):
db
.
close_old_connections
()
if
len
(
self
.
input_dirs
)
==
0
:
return
result_queue
=
Queue
()
...
...
src/apps/doc/management/commands/folder_ocr_process.py
View file @
d2b2449
...
...
@@ -6,6 +6,7 @@ import base64
import
signal
import
requests
import
traceback
from
django
import
db
from
PIL
import
Image
from
datetime
import
datetime
from
django.core.management
import
BaseCommand
...
...
@@ -17,6 +18,8 @@ from common.tools.pdf_to_img import PDFHandler
from
apps.doc
import
consts
from
apps.doc.exceptions
import
OCR1Exception
,
OCR4Exception
from
apps.doc.ocr.wb
import
BSWorkbook
from
apps.doc.models
import
OfflineReport
from
apps.doc.named_enum
import
OfflineFailureReason
class
TIFFHandler
:
...
...
@@ -337,6 +340,9 @@ class Command(BaseCommand, LoggerMixin):
if
len
(
true_file_set
)
==
0
and
len
(
os_error_filename_set
)
>
0
:
true_file_set
.
add
(
os_error_filename_set
.
pop
())
for
name
in
true_file_set
:
is_success
=
True
failure_reason
=
OfflineFailureReason
.
OS_ERROR
.
value
start_time
=
time
.
time
()
path
=
os
.
path
.
join
(
input_dir
,
name
)
try
:
...
...
@@ -358,21 +364,41 @@ class Command(BaseCommand, LoggerMixin):
failed_path
=
os
.
path
.
join
(
failed_output_dir
,
'{0}_{1}'
.
format
(
time
.
time
(),
name
))
shutil
.
move
(
path
,
failed_path
)
except
OSError
:
is_success
=
False
failure_reason
=
OfflineFailureReason
.
OS_ERROR
.
value
os_error_filename_set
.
add
(
name
)
self
.
folder_log
.
error
(
'{0} [os error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
except
Exception
as
e
:
is_success
=
False
failure_reason
=
OfflineFailureReason
.
PROCESS_ERROR
.
value
try
:
self
.
folder_log
.
error
(
'{0} [file error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
failed_path
=
os
.
path
.
join
(
failed_output_dir
,
'{0}_{1}'
.
format
(
time
.
time
(),
name
))
shutil
.
move
(
path
,
failed_path
)
except
Exception
as
e
:
failure_reason
=
OfflineFailureReason
.
OS_ERROR
.
value
os_error_filename_set
.
add
(
name
)
self
.
folder_log
.
error
(
'{0} [file move error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
finally
:
end_time
=
time
.
time
()
try
:
OfflineReport
.
objects
.
create
(
input_folder
=
input_dir
,
doc_type
=
classify
,
file_name
=
name
,
status
=
is_success
,
failure_reason
=
failure_reason
,
duration
=
int
(
end_time
-
start_time
)
)
except
Exception
as
e
:
self
.
folder_log
.
error
(
'{0} [db save failed] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
def
handle
(
self
,
*
args
,
**
kwargs
):
db
.
close_old_connections
()
process_list
=
[]
for
classify_idx
,
input_dir
in
self
.
input_dirs
.
items
():
classify
=
int
(
classify_idx
.
split
(
'_'
)[
0
])
...
...
src/apps/doc/management/commands/folder_wsc_process.py
View file @
d2b2449
...
...
@@ -6,6 +6,7 @@ import base64
import
signal
import
requests
import
traceback
from
django
import
db
from
PIL
import
Image
from
datetime
import
datetime
from
django.core.management
import
BaseCommand
...
...
@@ -20,6 +21,8 @@ from common.tools.pdf_to_img import PDFHandler
from
apps.doc
import
consts
from
apps.doc.exceptions
import
OCR1Exception
,
OCR4Exception
from
apps.doc.ocr.wb
import
BSWorkbook
,
PatternFill
from
apps.doc.models
import
OfflineReport
from
apps.doc.named_enum
import
OfflineFailureReason
class
Finder
:
...
...
@@ -582,6 +585,9 @@ class Command(BaseCommand, LoggerMixin):
for
name
in
true_file_set
:
time
.
sleep
(
10
)
# 防止文件较大时,读取到不完整文件
path
=
os
.
path
.
join
(
input_dir
,
name
)
is_success
=
True
failure_reason
=
OfflineFailureReason
.
OS_ERROR
.
value
start_time
=
time
.
time
()
try
:
if
not
os
.
path
.
exists
(
path
):
...
...
@@ -605,20 +611,40 @@ class Command(BaseCommand, LoggerMixin):
failed_path
=
os
.
path
.
join
(
failed_output_dir
,
'{0}_{1}'
.
format
(
time
.
time
(),
name
))
shutil
.
move
(
path
,
failed_path
)
except
OSError
:
is_success
=
False
failure_reason
=
OfflineFailureReason
.
OS_ERROR
.
value
os_error_filename_set
.
add
(
name
)
self
.
folder_log
.
error
(
'{0} [os error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
except
Exception
as
e
:
is_success
=
False
failure_reason
=
OfflineFailureReason
.
PROCESS_ERROR
.
value
try
:
self
.
folder_log
.
error
(
'{0} [file error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
failed_path
=
os
.
path
.
join
(
failed_output_dir
,
'{0}_{1}'
.
format
(
time
.
time
(),
name
))
shutil
.
move
(
path
,
failed_path
)
except
Exception
as
e
:
failure_reason
=
OfflineFailureReason
.
OS_ERROR
.
value
os_error_filename_set
.
add
(
name
)
self
.
folder_log
.
error
(
'{0} [file move error] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
finally
:
end_time
=
time
.
time
()
try
:
OfflineReport
.
objects
.
create
(
input_folder
=
input_dir
,
doc_type
=
consts
.
FOLDER_WSC_CLASSIFY
,
file_name
=
name
,
status
=
is_success
,
failure_reason
=
failure_reason
,
duration
=
int
(
end_time
-
start_time
)
)
except
Exception
as
e
:
self
.
folder_log
.
error
(
'{0} [db save failed] [path={1}] [error={2}]'
.
format
(
self
.
log_base
,
path
,
traceback
.
format_exc
()))
def
handle
(
self
,
*
args
,
**
kwargs
):
db
.
close_old_connections
()
self
.
folder_process
(
self
.
input_dir
)
self
.
folder_log
.
info
(
'{0} [stop safely]'
.
format
(
self
.
log_base
))
...
...
src/apps/doc/models.py
View file @
d2b2449
...
...
@@ -933,23 +933,37 @@ class MposReport(models.Model):
db_table
=
'mpos_report'
# class HILOfflineReport(models.Model):
class OfflineReport(models.Model):
    """Per-file outcome record written by the offline folder commands.

    The folder_dda/ltgt/ocr/wsc management commands create one row in the
    ``finally`` clause of their per-file loop, so a row is attempted for
    every file whether processing succeeded or failed.

    The model maps onto the ``offline_report`` table and is unmanaged
    (``managed = False``), so Django migrations neither create nor alter
    that table.
    """

    id = models.AutoField(primary_key=True, verbose_name="id")  # primary key
    # Directory the file was picked up from (the command's input_dir).
    input_folder = models.CharField(max_length=512, verbose_name="文件夹路径")
    # Document classification code, e.g. consts.DDA_CLASSIFY or
    # consts.FOLDER_WSC_CLASSIFY.
    # NOTE(review): the DDL in mssql_script19 declares this column as tinyint
    # -- confirm every classify code fits that range.
    doc_type = models.SmallIntegerField(default=0, verbose_name="文件类型")
    # Name of the file inside input_folder.
    file_name = models.CharField(max_length=1024, verbose_name="文件名")
    # True when the file was processed without an exception being caught.
    status = models.BooleanField(default=True, verbose_name="是否成功")
    # An OfflineFailureReason value. The commands initialise it to
    # OS_ERROR (0) before processing, so rows with status=True also hold 0;
    # it is only meaningful when status is False.
    failure_reason = models.SmallIntegerField(default=0, verbose_name="失败原因")
    # Whole seconds spent on the file: int(end_time - start_time), both taken
    # from time.time().
    # NOTE(review): the DDL in mssql_script19 declares this column as smallint
    # while the model uses IntegerField -- confirm long runs cannot overflow.
    duration = models.IntegerField(verbose_name='处理时长')
    # Filled in automatically when the row is first inserted.
    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')

    class Meta:
        managed = False
        db_table = 'offline_report'
# class AFCOfflineReport(models.Model):
# id = models.AutoField(primary_key=True, verbose_name="id") # 主键
#
# update_time = models.DateTimeField(auto_now=True, verbose_name='修改时间')
# input_folder = models.CharField(max_length=512, verbose_name="文件夹路径")
# doc_type = models.SmallIntegerField(default=0, verbose_name="文件类型")
# file_name = models.CharField(max_length=1024, verbose_name="文件名")
# status = models.BooleanField(default=True, verbose_name="是否成功")
# failure_reason = models.SmallIntegerField(default=0, verbose_name="失败原因")
# duration = models.IntegerField(verbose_name='处理时长')
#
# create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
#
# class Meta:
# managed = False
# db_table = 'hil_offline_report'
# new table: hil/afc_offline_ocr_report
# 1. file_name string eg. 'CH-B2432.pdf'
# 2. doc_type int eg. 2(VAT Invoice)
# 3. successful_at_this_level boolean eg. 0
# 4. failure_reason int eg. 2(PDF)
# 5. duration int eg. 100
#
# id/input_folder/start_time/end_time/create_time
# db_table = 'afc_offline_report'
# situ_db_label = 'afc'
...
...
src/apps/doc/named_enum.py
View file @
d2b2449
...
...
@@ -90,3 +90,8 @@ class BSCheckResult(NamedEnum):
CHECK_TRUE
=
(
1
,
'CHECK_TRUE'
)
CHECK_FALSE
=
(
2
,
'CHECK_FALSE'
)
CHECK_FAILED
=
(
3
,
'CHECK_FAILED'
)
class OfflineFailureReason(NamedEnum):
    # Reason codes stored in OfflineReport.failure_reason by the offline
    # folder commands (they store the member's ``.value``).
    # Presumably each tuple is (stored integer, label) as NamedEnum expects
    # -- confirm against the NamedEnum base class.

    # Set when an OSError is caught, or when moving a failed file to the
    # failed-output directory itself raises. It is also the value the commands
    # start with before processing, so successful rows carry it as well.
    OS_ERROR = (0, 'OS_ERROR')
    # Set when any other exception is caught while processing the file.
    PROCESS_ERROR = (1, 'PROCESS_ERROR')
...
...
src/common/tools/mssql_script19.py
View file @
d2b2449
...
...
@@ -14,6 +14,18 @@ hil_sql = """
ALTER TABLE hil_ocr_report ADD bank_name nvarchar(2048);
ALTER TABLE hil_ocr_report ADD is_ebank bit default 0 not null;
ALTER TABLE hil_ocr_report ADD bs_check_result tinyint default 0 not null;
create table offline_report
(
id bigint identity primary key,
input_folder nvarchar(512) not null,
doc_type tinyint default 0 not null,
file_name nvarchar(1024) not null,
status bit default 1 not null,
failure_reason tinyint default 0 not null,
duration smallint not null,
create_time datetime not null
);
"""
afc_sql
=
"""
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment