Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
24c87e7e
authored
2021-06-25 16:49:34 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
pdf page limit
1 parent
96178db6
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
41 additions
and
8 deletions
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/models.py
src/common/tools/mssql_script5.py
src/common/tools/pdf_to_img.py
src/apps/doc/management/commands/ocr_process.py
View file @
24c87e7
...
...
@@ -23,7 +23,7 @@ from apps.doc.ocr.edms import EDMS, rh
from
apps.doc.named_enum
import
KeywordsType
,
FailureReason
,
WorkflowName
,
ProcessName
,
RequestTeam
,
RequestTrigger
from
apps.doc.exceptions
import
EDMSException
,
OCR1Exception
,
OCR2Exception
,
OCR4Exception
from
apps.doc.ocr.wb
import
BSWorkbook
from
apps.doc.models
import
DocStatus
,
HILDoc
,
AFCDoc
,
Keywords
,
HILOCRResult
,
AFCOCRResult
,
HILOCRReport
,
AFCOCRReport
,
DDARecords
,
IDBCRecords
from
apps.doc.models
import
DocStatus
,
HILDoc
,
AFCDoc
,
Keywords
,
HILOCRResult
,
AFCOCRResult
,
HILOCRReport
,
AFCOCRReport
,
DDARecords
,
IDBCRecords
,
Configs
from
celery_compare.tasks
import
compare
...
...
@@ -582,6 +582,11 @@ class Command(BaseCommand, LoggerMixin):
pdf_path
=
os
.
path
.
join
(
doc_data_path
,
'{0}.pdf'
.
format
(
doc
.
id
))
pdf_handler
=
PDFHandler
(
pdf_path
,
img_save_path
)
max_count_obj
=
Configs
.
objects
.
filter
(
id
=
2
)
.
first
()
try
:
max_img_count
=
int
(
max_count_obj
.
value
)
except
Exception
as
e
:
max_img_count
=
500
for
times
in
range
(
consts
.
RETRY_TIMES
):
try
:
...
...
@@ -594,7 +599,7 @@ class Command(BaseCommand, LoggerMixin):
self
.
online_log
.
info
(
'{0} [pdf to img start] [task={1}] [times={2}]'
.
format
(
self
.
log_base
,
task_str
,
times
))
start_time
=
time
.
time
()
pdf_handler
.
extract_image
()
pdf_handler
.
extract_image
(
max_img_count
)
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
online_log
.
info
(
'{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'
.
format
(
...
...
@@ -608,14 +613,13 @@ class Command(BaseCommand, LoggerMixin):
else
:
raise
Exception
(
'download or pdf to img failed'
)
img_count
=
len
(
pdf_handler
.
img_path_list
)
if
img_count
==
0
:
if
pdf_handler
.
img_count
==
0
:
self
.
online_log
.
warn
(
'{0} [pdf to img failed (pdf img empty)] [task={1}]'
.
format
(
self
.
log_base
,
task_str
))
raise
Exception
(
'pdf img empty'
)
elif
img_count
>=
max_img_count
:
elif
pdf_handler
.
img_count
>=
max_img_count
:
self
.
online_log
.
info
(
'{0} [too many pdf image] [task={1}] [img_count={2}]'
.
format
(
self
.
log_base
,
task_str
,
img_count
))
self
.
log_base
,
task_str
,
pdf_handler
.
img_count
))
try
:
report_table
=
HILOCRReport
if
business_type
==
consts
.
HIL_PREFIX
else
AFCOCRReport
...
...
@@ -629,13 +633,14 @@ class Command(BaseCommand, LoggerMixin):
successful_at_this_level
=
False
,
failure_reason
=
FailureReason
.
IMG_LIMIT
.
value
,
process_name
=
ProcessName
.
ALL
.
value
,
notes
=
'pdf page count: {0}'
.
format
(
str
(
pdf_handler
.
img_count
))
)
except
Exception
as
e
:
self
.
online_log
.
error
(
'{0} [process error (report db save)] [error={1}]'
.
format
(
self
.
log_base
,
traceback
.
format_exc
()))
else
:
with
lock
:
todo_count_dict
[
task_str
]
=
img_count
todo_count_dict
[
task_str
]
=
pdf_handler
.
img_count
for
img_path
in
pdf_handler
.
img_path_list
:
while
img_queue
.
full
():
self
.
online_log
.
info
(
'{0} [pdf_2_img_2_queue] [img queue full]'
.
format
(
self
.
log_base
))
...
...
src/apps/doc/models.py
View file @
24c87e7
...
...
@@ -282,6 +282,7 @@ class HILOCRReport(models.Model):
process_name
=
models
.
SmallIntegerField
(
default
=
ProcessName
.
ALL
.
value
,
verbose_name
=
"流程名称"
)
total_fields
=
models
.
IntegerField
(
null
=
True
,
verbose_name
=
'比对字段数目'
)
workflow_name
=
models
.
SmallIntegerField
(
null
=
True
,
verbose_name
=
"工作流程"
)
notes
=
models
.
CharField
(
null
=
True
,
max_length
=
2048
,
verbose_name
=
"备注"
)
class
Meta
:
managed
=
False
...
...
@@ -301,6 +302,7 @@ class AFCOCRReport(models.Model):
process_name
=
models
.
SmallIntegerField
(
default
=
ProcessName
.
ALL
.
value
,
verbose_name
=
"流程名称"
)
total_fields
=
models
.
IntegerField
(
null
=
True
,
verbose_name
=
'比对字段数目'
)
workflow_name
=
models
.
SmallIntegerField
(
null
=
True
,
verbose_name
=
"工作流程"
)
notes
=
models
.
CharField
(
null
=
True
,
max_length
=
2048
,
verbose_name
=
"备注"
)
class
Meta
:
managed
=
False
...
...
src/common/tools/mssql_script5.py
0 → 100644
View file @
24c87e7
import
pyodbc
# One-off schema migration: add a `notes` nvarchar(2048) column to the
# HIL and AFC OCR report tables.
#
# Each statement runs on its own autocommit connection. The cursor and the
# connection are released in `finally` blocks so that a failing ALTER TABLE
# (for example when the script is re-run and the column already exists) does
# not leave an open handle behind. The exception itself still propagates, so
# a failure on the HIL table stops the script before the AFC table is touched,
# exactly as before.
#
# NOTE(review): both connection strings contain only the DRIVER clause; the
# server/database/credential parts are presumably supplied elsewhere or were
# stripped before commit -- confirm before running.
hil_sql = "ALTER TABLE hil_ocr_report ADD notes nvarchar(2048)"
afc_sql = "ALTER TABLE afc_ocr_report ADD notes nvarchar(2048)"

# --- HIL report table ---
hil_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True)
try:
    hil_cursor = hil_cnxn.cursor()
    try:
        hil_cursor.execute(hil_sql)
    finally:
        hil_cursor.close()
finally:
    hil_cnxn.close()

# --- AFC report table ---
afc_cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};', autocommit=True)
try:
    afc_cursor = afc_cnxn.cursor()
    try:
        afc_cursor.execute(afc_sql)
    finally:
        afc_cursor.close()
finally:
    afc_cnxn.close()
src/common/tools/pdf_to_img.py
View file @
24c87e7
...
...
@@ -26,6 +26,7 @@ class PDFHandler:
self
.
path
=
path
self
.
img_dir_path
=
img_dir_path
self
.
img_path_list
=
[]
self
.
img_count
=
0
self
.
xref_set
=
set
()
def
get_img_save_path
(
self
,
pno
,
img_index
=
0
,
ext
=
'png'
):
...
...
@@ -192,11 +193,14 @@ class PDFHandler:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
def
extract_image
(
self
):
def
extract_image
(
self
,
max_img_count
=
None
):
self
.
img_path_list
=
[]
self
.
xref_set
=
set
()
os
.
makedirs
(
self
.
img_dir_path
,
exist_ok
=
True
)
with
fitz
.
Document
(
self
.
path
)
as
pdf
:
if
isinstance
(
max_img_count
,
int
)
and
pdf
.
pageCount
>=
max_img_count
:
self
.
img_count
=
pdf
.
pageCount
return
for
pno
in
range
(
pdf
.
pageCount
):
il
=
pdf
.
getPageImageList
(
pno
)
# 获取页面图片对象
# (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
...
...
@@ -220,3 +224,4 @@ class PDFHandler:
# 3.页面图片对象数目大于1时,特殊处理
else
:
self
.
merge_il
(
pdf
,
pno
,
il
)
self
.
img_count
=
len
(
self
.
img_path_list
)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment