Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
ba0dc000
authored
2023-11-15 16:29:58 +0800
by
冯轩
Browse Files
Options
Browse Files
Tag
Download
Plain Diff
Merge branch 'feature/uat-tmp' of gitlab.situdata.com:zhouweiqi/bmw-ocr into feature/uat-tmp
2 parents
f3671aab
c92067d8
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
33 additions
and
11 deletions
.gitignore
src/apps/doc/consts.py
src/apps/doc/management/commands/ocr_process.py
src/common/fsm_econtract/fsm_contract_ocr.py
src/common/fsm_econtract/tools.py
src/common/tools/pdf_to_img.py
.gitignore
View file @
ba0dc00
...
...
@@ -32,3 +32,4 @@ data/*
test*
flow_test.py
pdf_test.py
\ No newline at end of file
...
...
src/apps/doc/consts.py
View file @
ba0dc00
...
...
@@ -2434,14 +2434,14 @@ ECONTRACT_KEYWORDS_MAP = {
FSM_ECONTRACT_KEYWORDS_MAP
=
{
AFC_PREFIX
:
[
(
'延长保修
条款与条件
'
,
FSM_CONTRACT_WEP_CLASSIFY
),
(
'延长保修
服务合约
'
,
FSM_CONTRACT_WEP_CLASSIFY
),
(
'长悦保养套餐服务合约'
,
FSM_CONTRACT_MSI_CLASSIFY
),
(
'汽车销售合同补充合同'
,
FSM_CONTRACT_SC2_CLASSIFY
),
(
'汽车销售合同'
,
FSM_CONTRACT_SC_CLASSIFY
),
],
HIL_PREFIX
:
[
(
'延长保修
条款与条件
'
,
FSM_CONTRACT_WEP_CLASSIFY
),
(
'长悦保养套餐服务合
同
'
,
FSM_CONTRACT_MSI_CLASSIFY
),
(
'延长保修
服务合约
'
,
FSM_CONTRACT_WEP_CLASSIFY
),
(
'长悦保养套餐服务合
约
'
,
FSM_CONTRACT_MSI_CLASSIFY
),
(
'汽车销售合同补充合同'
,
FSM_CONTRACT_SC2_CLASSIFY
),
(
'汽车销售合同'
,
FSM_CONTRACT_SC_CLASSIFY
),
]
...
...
src/apps/doc/management/commands/ocr_process.py
View file @
ba0dc00
...
...
@@ -1339,8 +1339,8 @@ class Command(BaseCommand, LoggerMixin):
pdf_handler
.
extract_image
(
max_img_count
)
end_time
=
time
.
time
()
speed_time
=
int
(
end_time
-
start_time
)
self
.
online_log
.
info
(
'{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
speed_time
))
self
.
online_log
.
info
(
'{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]
[is_new_modify={4}]
'
.
format
(
self
.
log_base
,
task_str
,
times
,
speed_time
,
pdf_handler
.
is_new_modify
))
except
Exception
as
e
:
self
.
online_log
.
warn
(
'{0} [download or pdf to img failed] [task={1}] [times={2}] '
'[error={3}]'
.
format
(
self
.
log_base
,
task_str
,
times
,
...
...
src/common/fsm_econtract/fsm_contract_ocr.py
View file @
ba0dc00
...
...
@@ -6,7 +6,7 @@ retriever_list = [Retriever(WEP_FIELD), Retriever(MSI_FIELD), Retriever(SC_FIELD
def
predict
(
pdf_info
,
file_type
=
0
):
retriever
=
retriever_list
[
file_type
]
pdf_text_list
,
pdf_img_list
=
pdf_info_rebuild
(
pdf_info
)
pdf_text_list
,
pdf_img_list
=
pdf_info_rebuild
(
pdf_info
,
file_type
=
file_type
)
return
retriever
.
get_target_fields
(
pdf_text_list
,
pdf_img_list
)
...
...
src/common/fsm_econtract/tools.py
View file @
ba0dc00
def
pdf_info_rebuild
(
pdf_info
,
fix_bbox
=
True
):
def
pdf_info_rebuild
(
pdf_info
,
fix_bbox
=
True
,
file_type
=
0
):
pdf_text_info
=
dict
()
pdf_img_info
=
dict
()
for
pno_str
,
page_info
in
pdf_info
.
items
():
...
...
@@ -11,6 +11,7 @@ def pdf_info_rebuild(pdf_info, fix_bbox=True):
for
span
in
line
[
'spans'
]:
bbox
,
text
=
span
[
'bbox'
],
span
[
'text'
]
.
strip
()
if
len
(
text
)
!=
0
and
text
not
in
text_set
:
if
file_type
!=
3
:
# 汽车销售合同补充协议,相同的总价会被过滤,所以取消
text_set
.
add
(
text
)
# bbox的高,不准
if
fix_bbox
and
bbox
[
-
1
]
-
bbox
[
1
]
<
span
[
'size'
]:
...
...
src/common/tools/pdf_to_img.py
View file @
ba0dc00
...
...
@@ -12,8 +12,10 @@ from unicodedata import normalize
# 页面保存为png图片参数
ZOOM_X_1
=
ZOOM_Y_1
=
1.0
ZOOM_X_2
=
ZOOM_Y_2
=
2.0
ZOOM_X_3
=
ZOOM_Y_3
=
3.0
trans_1
=
fitz
.
Matrix
(
ZOOM_X_1
,
ZOOM_X_1
)
.
preRotate
(
0
)
# zoom factor 1 in each dimension
trans_2
=
fitz
.
Matrix
(
ZOOM_X_2
,
ZOOM_X_2
)
.
preRotate
(
0
)
# zoom factor 2 in each dimension
trans_3
=
fitz
.
Matrix
(
ZOOM_X_3
,
ZOOM_X_3
)
.
preRotate
(
0
)
# zoom factor 3 in each dimension
# 特殊filter处理
ADOBE_FILTER_SET
=
{
'FlateDecode'
,
'JPXDecode'
,
'JBIG2Decode'
}
...
...
@@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100)
WH_COUPLE_4
=
(
100
,
300
)
WH_COUPLE_5
=
(
100
,
200
)
# 碎图宽度阈值
TINY_IMG_MAX_WIDTH
=
1400
# 大图宽高阈值
WH_COUPLE_6
=
(
1800
,
1400
)
WH_COUPLE_7
=
(
2500
,
3000
)
class
PDFBuild
:
...
...
@@ -55,6 +63,7 @@ class PDFHandler:
self
.
img_dir_path
=
img_dir_path
self
.
img_path_list
=
[]
self
.
img_count
=
0
self
.
is_new_modify
=
0
# 用于记录受新改动影响的PDF
self
.
xref_set
=
set
()
self
.
img_suffixs
=
{
'.jpeg'
,
'.jpg'
,
'.png'
,
'.webp'
,
'.bmp'
}
self
.
suffix
=
self
.
get_suffix
(
document_name
)
...
...
@@ -165,8 +174,10 @@ class PDFHandler:
except
Exception
as
e
:
pass
def
page_to_png
(
self
,
page
):
if
page
.
MediaBoxSize
.
x
>
1500
or
page
.
MediaBoxSize
.
y
>
1500
:
def
page_to_png
(
self
,
page
,
is_big_img
=
False
):
if
is_big_img
:
pm
=
page
.
getPixmap
(
matrix
=
trans_3
,
alpha
=
False
)
elif
page
.
MediaBoxSize
.
x
>
1500
or
page
.
MediaBoxSize
.
y
>
1500
:
pm
=
page
.
getPixmap
(
matrix
=
trans_1
,
alpha
=
False
)
else
:
pm
=
page
.
getPixmap
(
matrix
=
trans_2
,
alpha
=
False
)
...
...
@@ -236,8 +247,8 @@ class PDFHandler:
self
.
xref_set
.
add
(
xref
)
self
.
img_path_list
.
append
(
img_save_path
)
@staticmethod
def
split_il
(
il
):
#
@staticmethod
def
split_il
(
self
,
il
):
broken_il
=
[]
start
=
0
length
=
len
(
il
)
...
...
@@ -247,6 +258,10 @@ class PDFHandler:
if
il
[
i
][
-
1
]
in
ADOBE_FILTER_SET
:
page_to_png
=
True
break
if
il
[
i
][
2
]
>=
TINY_IMG_MAX_WIDTH
:
self
.
is_new_modify
=
1
page_to_png
=
True
break
else
:
for
i
in
range
(
length
):
# 当图片对象够大时,不作碎图合并处理,而是单纯提取
...
...
@@ -446,6 +461,11 @@ class PDFHandler:
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
)
# 大图
elif
width
>=
WH_COUPLE_6
[
0
]
or
height
>=
WH_COUPLE_6
[
1
]:
self
.
is_new_modify
=
1
is_big_img
=
(
width
<
WH_COUPLE_7
[
0
]
and
height
<
WH_COUPLE_7
[
1
])
# 防止图片过大
page
=
pdf
.
loadPage
(
pno
)
self
.
page_to_png
(
page
,
is_big_img
=
is_big_img
)
elif
xref
not
in
self
.
xref_set
:
self
.
extract_single_image
(
pdf
,
xref
,
smask
,
colorspace
,
pno
)
# 3.页面图片对象数目大于1时,特殊处理
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment