Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
9bab1769
authored
2021-11-11 17:30:41 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
fix wsc
1 parent
8cf3f917
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
4 additions
and
40 deletions
src/apps/doc/management/commands/folder_wsc_process.py
src/apps/doc/management/commands/folder_wsc_process.py
View file @
9bab176
...
...
@@ -31,7 +31,6 @@ class Finder:
def
__init__
(
self
,
ocr_results
=
None
):
self
.
ocr_results
=
ocr_results
self
.
init_result
=
{
"合同编号列表"
:
[],
"经销商名称_Page3"
:
""
,
...
...
@@ -49,10 +48,8 @@ class Finder:
"其他约定与条件英文"
:
""
,
"其他约定与条件中文"
:
""
,
}
def
get_line
(
self
,
ocr_results
,
key_string
):
# 根据指定关键词, 找出与关键词同处一行的字符
top
,
bottom
=
-
1
,
-
1
# 首先找到这个关键词所在的 Bbox
for
key
in
ocr_results
:
...
...
@@ -60,21 +57,18 @@ class Finder:
if
key_string
in
text
:
top
,
bottom
=
min
(
bbox
[
1
::
2
]),
max
(
bbox
[
1
::
2
])
break
line_text
=
[]
# 然后找到一行
for
key
in
ocr_results
:
bbox
,
text
=
ocr_results
[
key
]
if
top
<
np
.
mean
(
bbox
[
1
::
2
])
<
bottom
:
line_text
.
append
([
bbox
,
text
])
# 从左到右排序
lines
=
''
if
len
(
line_text
)
>
0
:
line_text
=
sorted
(
line_text
,
key
=
lambda
x
:
x
[
0
][
0
],
reverse
=
False
)
lines
=
''
.
join
([
i
[
1
]
for
i
in
line_text
])
return
lines
def
page_predict
(
self
,
ocr_results
,
page_template
):
classes
=
[]
for
pno
in
ocr_results
:
...
...
@@ -84,12 +78,10 @@ class Finder:
ocr_texts
+=
text
pattern
=
re
.
compile
(
"[^
\u4e00
-
\u9fa5
]"
)
# 匹配不是中文的其他字符
ocr_texts
=
pattern
.
sub
(
''
,
ocr_texts
)
score
=
fuzz
.
ratio
(
page_template
,
ocr_texts
)
/
100.
classes
.
append
([
pno
,
score
])
pred
=
sorted
(
classes
,
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)[
0
]
return
pred
def
get_top_key
(
self
,
ocr_results
,
key_string
):
# 加入过滤词典
"""找到与 key_string 最匹配的字段的 key
"""
...
...
@@ -98,7 +90,6 @@ class Finder:
ratio_list
=
[[
fuzz
.
ratio
(
key_string
,
ocr_results
[
key
][
1
]),
key
]
for
key
in
ocr_results
]
top_key
=
sorted
(
ratio_list
,
key
=
lambda
x
:
x
[
0
])[
-
1
]
return
top_key
def
get_top_iou
(
self
,
ocr_results
,
poly
):
"""求最大IoU
"""
...
...
@@ -117,12 +108,10 @@ class Finder:
return
-
1
,
-
1
top_iou
=
sorted
(
iou_list
,
key
=
lambda
x
:
x
[
0
])[
-
1
]
return
top_iou
def
get_key_value
(
self
,
ocr_results
,
key_string
):
"""根据 key 查找 value
"""
value
=
''
tmp_ocr_results
=
{}
for
key
in
ocr_results
:
bbox
,
text
=
ocr_results
[
key
]
...
...
@@ -131,7 +120,6 @@ class Finder:
pattern
=
re
.
compile
(
"[^
\u4e00
-
\u9fa5
]"
)
# 匹配不是中文的其他字符
text
=
pattern
.
sub
(
''
,
text
)
tmp_ocr_results
[
key
]
=
[
bbox
,
text
]
# 先根据 key_string 找到 key 的位置所在, 再判断该位置是否包含 value
# 若不包含 value, 则往右边一个单位查找 value
ratio
,
key
=
self
.
get_top_key
(
tmp_ocr_results
,
key_string
)
...
...
@@ -151,7 +139,6 @@ class Finder:
else
:
value
=
words
return
value
def
get_contract_No
(
self
):
"""提取左上角的合同编号字段
"""
...
...
@@ -162,22 +149,21 @@ class Finder:
contract_No
=
self
.
get_key_value
(
self
.
ocr_results
[
pno
],
'合同编号'
)
else
:
contract_No
=
''
# 临时解决 S 识别成 8 的问题
# TODO!!!
contract_No_list
.
append
(
contract_No
)
return
contract_No_list
def
get_info_in_page_3
(
self
):
"""提取第三页上的经销商名称,和经销商统一社会信用代码或公司注册号
"""
dealer_name
=
''
dealer_No
=
''
template
=
r"""合同编号宝马汽车金融中国有限公司甲方宝马汽车金融中国有限公司地址中国北京市朝阳区东三环北路霞光里号佳程
广场座层乙方统一社会信用代码或公司注册号地址鉴于甲方是一家依照中国法律合法组建和存续的汽车金融公司愿意
为宝马中国汽车贸易有限公司以下简称宝马中国及华晨宝马汽车有限公司以下简称华晨宝马在中国大陆的宝马集团经
销商提供汽车批售融资服务乙方是一家依据中国法律合法组建和存续与宝马中国和或华晨宝马签署了授权销售合同具
有专营进口和或国产宝马集团产品合法资格的企业本着自愿平等互惠互利的原则甲乙双方经充分协商签署本综合授信
额度合同本合同达成如下条款综合授信额度合同版本"""
.
replace
(
" "
,
""
)
.
replace
(
"
\n
"
,
""
)
# 首先找到第三页纸, 我们阈值设为0.5
pno
,
score
=
self
.
page_predict
(
self
.
ocr_results
,
template
)
if
score
>
0.5
:
...
...
@@ -189,18 +175,15 @@ class Finder:
if
'乙方:'
in
text
:
words
=
text
.
split
(
':'
)[
-
1
]
.
replace
(
'【'
,
'['
)
.
replace
(
'{'
,
'['
)
.
replace
(
'】'
,
']'
)
dealer_name
=
words
words
=
self
.
get_key_value
(
self
.
ocr_results
[
pno
],
'统一社会信用代码或公司注册号'
)
dealer_No
=
words
.
replace
(
'O'
,
'0'
)
return
dealer_name
,
dealer_No
def
get_info_in_page_38
(
self
):
"""提取第38页上的经销商名称
"""
dealer_name
=
''
template
=
r"""宝马汽车金融中国有限公司合同编号签署页甲方宝马汽车金融中国有限公司盖章姓名姓名职务职务日期乙方汽车销售服务
有限公司盖章姓名姓名职务职务日期综合授信额度合同版本"""
.
replace
(
" "
,
""
)
.
replace
(
"
\n
"
,
""
)
# 首先找到第38页纸, 我们阈值设为0.5
pno
,
score
=
self
.
page_predict
(
self
.
ocr_results
,
template
)
if
score
>
0.5
:
...
...
@@ -212,7 +195,6 @@ class Finder:
words
=
re
.
sub
(
r'[(())盖章《]'
,
""
,
words
)
dealer_name
=
words
return
dealer_name
def
get_guarantor
(
self
):
"""提取第10页上保证人段落,所见即所得
"""
...
...
@@ -222,14 +204,12 @@ class Finder:
for
key
in
self
.
ocr_results
[
pno
]:
bbox
,
text
=
self
.
ocr_results
[
pno
][
key
]
all_texts
+=
text
searchObj
=
re
.
search
(
r'保证人\[(.*?)\]与甲方'
,
all_texts
)
if
searchObj
:
words
=
f
'[{searchObj.group(1)}]'
words
=
words
.
replace
(
'【'
,
'['
)
.
replace
(
'】'
,
']'
)
.
replace
(
','
,
','
)
.
replace
(
'('
,
'('
)
.
replace
(
')'
,
')'
)
guarantor
=
words
return
guarantor
def
get_info_in_page_39
(
self
):
"""提取综合授信合同上的一些字段
"""
...
...
@@ -242,7 +222,6 @@ class Finder:
term_end_chn
=
''
deposit_eng
=
''
deposit_chn
=
''
template
=
r"""合同编号中国有限公司宝马汽车金融综合授信额度合同附件确认函综合授信额度金额本合同项下的综合授信额度为人民币
大写综合授信额度下面各个业务的授信额度将由甲方以授信额度通知函的方式时不时的通知乙方本合同项下的综合授信额
度可以由甲方根据乙方的信用和财务状况自行决定随时调整本合同项下的综合授信额度应为在本确认函第条的期间内双方
...
...
@@ -250,7 +229,6 @@ class Finder:
综合授信额度期限从至或者由甲方向乙方通过书面形式在授信额度通知函中沟通的更短期间保证金甲方对乙方的最低保证
金要求为综合授信额度的实际执行的保证金比例以甲方不时另行书面通知根据最新的经销商融资或保证金相关政策或活动
为准综合授信额度合同版本"""
.
replace
(
" "
,
""
)
.
replace
(
"
\n
"
,
""
)
# 首先找到综合授信合同第一面, 我们阈值设为0.5
pno
,
score
=
self
.
page_predict
(
self
.
ocr_results
,
template
)
if
score
>
0.5
:
...
...
@@ -264,7 +242,6 @@ class Finder:
if
searchObj
:
words
=
searchObj
.
group
()
amount_eng
=
words
lines
=
self
.
get_line
(
self
.
ocr_results
[
pno
],
'人民币'
)
searchObj
=
re
.
search
(
r'大写(.*?)综合'
,
lines
)
if
searchObj
:
...
...
@@ -290,7 +267,6 @@ class Finder:
if
searchEnd
:
words
=
searchEnd
.
group
()
term_end_eng
=
words
lines
=
self
.
get_line
(
self
.
ocr_results
[
pno
],
'至'
)
if
len
(
lines
)
>
0
:
start
,
end
=
lines
.
split
(
'至'
)
...
...
@@ -308,55 +284,44 @@ class Finder:
if
searchObj
:
words
=
searchObj
.
group
(
1
)
deposit_eng
=
f
'{words}
%
'
lines
=
self
.
get_line
(
self
.
ocr_results
[
pno
],
'授信额度的'
)
searchObj
=
re
.
search
(
r'授信额度的([0-9]+)'
,
lines
.
replace
(
'O'
,
'0'
))
searchObj
=
re
.
search
(
r'授信额度的([0-9]+)'
,
lines
.
replace
(
'O'
,
'0'
)
.
replace
(
'_'
,
''
)
)
if
searchObj
:
words
=
searchObj
.
group
(
1
)
deposit_chn
=
f
'{words}
%
'
return
amount_eng
,
amount_chn
,
term_start_eng
,
term_end_eng
,
\
term_start_chn
,
term_end_chn
,
deposit_eng
,
deposit_chn
def get_other_arrangements_and_conditions(self):
    """Extract the "other arrangements and conditions" paragraphs.

    The text of every OCR entry on every page of ``self.ocr_results`` is
    concatenated in iteration order, then searched with two
    case-insensitive, non-greedy regular expressions.

    Returns:
        tuple[str, str]: ``(english, chinese)``.
        ``english`` is the text captured between ``Conditions:`` and the
        next ``其他``, with characters in the range U+4E00..U+9FA5 removed.
        ``chinese`` is the text captured between ``条件:`` and the next
        ``General``. Either element is ``''`` when its pattern is not found.
    """
    # Each OCR entry is indexed as entry[1] for its text; build the whole
    # document text in a single join instead of repeated `+=`.
    all_texts = ''.join(
        self.ocr_results[pno][key][1]
        for pno in self.ocr_results
        for key in self.ocr_results[pno]
    )

    other_arrangements_and_conditions_eng = ''
    # Only one search is needed here: an earlier search anchored on the
    # longer '其他约定与条件' was overwritten before its result was used.
    eng_match = re.search(r'Conditions:(.*?)其他', all_texts, re.I)
    if eng_match:
        # Drop Chinese characters that leaked into the English paragraph.
        chinese_chars = re.compile("[\u4e00-\u9fa5]")
        other_arrangements_and_conditions_eng = chinese_chars.sub(
            '', eng_match.group(1))

    other_arrangements_and_conditions_chn = ''
    chn_match = re.search(r'条件:(.*?)General', all_texts, re.I)
    if chn_match:
        other_arrangements_and_conditions_chn = chn_match.group(1)

    return (other_arrangements_and_conditions_eng,
            other_arrangements_and_conditions_chn)
def
get_info
(
self
):
# 按照文档页码返回一个合同编号列表,依次表示每一页上识别到的合同编号
contract_No_list
=
self
.
get_contract_No
()
self
.
init_result
[
"合同编号列表"
]
=
contract_No_list
dealer_name
,
dealer_No
=
self
.
get_info_in_page_3
()
self
.
init_result
[
"经销商名称_Page3"
]
=
dealer_name
self
.
init_result
[
"经销商统一社会信用代码或公司注册号"
]
=
dealer_No
dealer_name
=
self
.
get_info_in_page_38
()
self
.
init_result
[
"经销商名称_Page38"
]
=
dealer_name
guarantor
=
self
.
get_guarantor
()
self
.
init_result
[
"保证人"
]
=
guarantor
amount_eng
,
amount_chn
,
term_start_eng
,
term_end_eng
,
\
term_start_chn
,
term_end_chn
,
deposit_eng
,
deposit_chn
=
self
.
get_info_in_page_39
()
self
.
init_result
[
"综合授信额度金额英文"
]
=
amount_eng
...
...
@@ -367,7 +332,6 @@ class Finder:
self
.
init_result
[
"综合授信额度期限截止日期中文"
]
=
term_end_chn
self
.
init_result
[
"保证金比例英文"
]
=
deposit_eng
self
.
init_result
[
"保证金比例中文"
]
=
deposit_chn
words_eng
,
words_chn
=
self
.
get_other_arrangements_and_conditions
()
self
.
init_result
[
"其他约定与条件英文"
]
=
words_eng
self
.
init_result
[
"其他约定与条件中文"
]
=
words_chn
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment