Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
92b21d6a
authored
2021-11-03 17:56:59 +0800
by
周伟奇
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
fix bug
1 parent
6ba8d65d
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
78 additions
and
48 deletions
src/apps/doc/management/commands/folder_wsc_process.py
src/common/tools/pdf_to_img.py
src/apps/doc/management/commands/folder_wsc_process.py
View file @
92b21d6
...
...
@@ -19,16 +19,16 @@ from common.mixins import LoggerMixin
from
common.tools.pdf_to_img
import
PDFHandler
from
apps.doc
import
consts
from
apps.doc.exceptions
import
OCR1Exception
,
OCR4Exception
from
apps.doc.ocr.wb
import
BSWorkbook
from
apps.doc.ocr.wb
import
BSWorkbook
,
PatternFill
class
Finder
:
"""Summary
Attributes:
ocr_results (TYPE): Description
"""
def
__init__
(
self
,
ocr_results
=
None
):
self
.
ocr_results
=
ocr_results
...
...
@@ -82,15 +82,15 @@ class Finder:
for
key
in
ocr_results
[
pno
]:
bbox
,
text
=
ocr_results
[
pno
][
key
]
ocr_texts
+=
text
pattern
=
re
.
compile
(
"[^
\u4e00
-
\u9fa5
]"
)
# 匹配不是中文的其他字符
pattern
=
re
.
compile
(
"[^
\u4e00
-
\u9fa5
]"
)
# 匹配不是中文的其他字符
ocr_texts
=
pattern
.
sub
(
''
,
ocr_texts
)
score
=
fuzz
.
ratio
(
page_template
,
ocr_texts
)
/
100.
score
=
fuzz
.
ratio
(
page_template
,
ocr_texts
)
/
100.
classes
.
append
([
pno
,
score
])
pred
=
sorted
(
classes
,
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)[
0
]
return
pred
def
get_top_key
(
self
,
ocr_results
,
key_string
):
# 加入过滤词典
def
get_top_key
(
self
,
ocr_results
,
key_string
):
# 加入过滤词典
"""找到与 key_string 最匹配的字段的 key
"""
if
len
(
ocr_results
)
==
0
:
...
...
@@ -111,7 +111,7 @@ class Finder:
continue
inter
=
Polygon
(
g
)
.
intersection
(
Polygon
(
p
))
.
area
union
=
g
.
area
+
p
.
area
-
inter
iou
=
inter
/
union
iou
=
inter
/
union
iou_list
.
append
([
iou
,
key
])
if
len
(
iou_list
)
==
0
:
return
-
1
,
-
1
...
...
@@ -128,8 +128,8 @@ class Finder:
bbox
,
text
=
ocr_results
[
key
]
# 定制化规则, 比如过滤一些词呀什么的
# 该例中, 我们要去掉非中文字符
pattern
=
re
.
compile
(
"[^
\u4e00
-
\u9fa5
]"
)
# 匹配不是中文的其他字符
text
=
pattern
.
sub
(
''
,
text
)
pattern
=
re
.
compile
(
"[^
\u4e00
-
\u9fa5
]"
)
# 匹配不是中文的其他字符
text
=
pattern
.
sub
(
''
,
text
)
tmp_ocr_results
[
key
]
=
[
bbox
,
text
]
# 先根据 key_string 找到 key 的位置所在, 再判断该位置是否包含 value
...
...
@@ -141,8 +141,8 @@ class Finder:
if
len
(
words
)
==
0
:
# 将 bbox 右移一个单位
x0
,
y0
,
x1
,
y1
,
x2
,
y2
,
x3
,
y3
=
bbox
rw
=
abs
(
x0
-
x1
)
anchor
=
[
x0
+
rw
,
y0
,
x1
+
rw
,
y1
,
x2
+
rw
,
y2
,
x3
+
rw
,
y3
]
rw
=
abs
(
x0
-
x1
)
anchor
=
[
x0
+
rw
,
y0
,
x1
+
rw
,
y1
,
x2
+
rw
,
y2
,
x3
+
rw
,
y3
]
iou
,
key
=
self
.
get_top_iou
(
ocr_results
,
anchor
)
if
ratio
>
0.3
:
bbox
,
text
=
ocr_results
[
key
]
...
...
@@ -223,7 +223,7 @@ class Finder:
bbox
,
text
=
self
.
ocr_results
[
pno
][
key
]
all_texts
+=
text
searchObj
=
re
.
search
(
r'保证人\[(.*?)\]与甲方'
,
all_texts
)
searchObj
=
re
.
search
(
r'保证人\[(.*?)\]与甲方'
,
all_texts
)
if
searchObj
:
words
=
f
'[{searchObj.group(1)}]'
words
=
words
.
replace
(
'【'
,
'['
)
.
replace
(
'】'
,
']'
)
.
replace
(
','
,
','
)
.
replace
(
'('
,
'('
)
.
replace
(
')'
,
')'
)
...
...
@@ -256,7 +256,9 @@ class Finder:
if
score
>
0.5
:
if
len
(
self
.
ocr_results
[
pno
])
>
0
:
# 根据关键词,找这一行字符
lines
=
self
.
get_line
(
self
.
ocr_results
[
pno
],
'RMB'
)
lines
=
''
for
i
in
[
'RMB'
,
'CNY'
]:
lines
+=
self
.
get_line
(
self
.
ocr_results
[
pno
],
i
)
# searchObj = re.search( r'RMB(.*?)in', lines)
searchObj
=
re
.
search
(
r'[0-9,.]+'
,
lines
)
if
searchObj
:
...
...
@@ -264,10 +266,10 @@ class Finder:
amount_eng
=
words
lines
=
self
.
get_line
(
self
.
ocr_results
[
pno
],
'人民币'
)
searchObj
=
re
.
search
(
r'大写(.*?)综合'
,
lines
)
searchObj
=
re
.
search
(
r'大写(.*?)综合'
,
lines
)
if
searchObj
:
words
=
searchObj
.
group
(
1
)
pattern
=
re
.
compile
(
"[^
\u4e00
-
\u9fa5
]"
)
# 匹配不是中文的其他字符
pattern
=
re
.
compile
(
"[^
\u4e00
-
\u9fa5
]"
)
# 匹配不是中文的其他字符
words
=
pattern
.
sub
(
''
,
words
)
words
=
words
.
replace
(
"仔"
,
"仟"
)
.
replace
(
"任"
,
"仟"
)
words
=
words
.
replace
(
"值"
,
"佰"
)
...
...
@@ -276,15 +278,15 @@ class Finder:
words
=
words
.
replace
(
"政"
,
"玖"
)
words
=
words
.
replace
(
"垒"
,
"叁"
)
amount_chn
=
words
lines
=
self
.
get_line
(
self
.
ocr_results
[
pno
],
'ending'
)
if
len
(
lines
)
>
0
:
start
,
end
=
lines
.
split
(
'ending'
)
searchStart
=
re
.
search
(
r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}'
,
start
)
searchStart
=
re
.
search
(
r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}'
,
start
)
if
searchStart
:
words
=
searchStart
.
group
()
term_start_eng
=
words
searchEnd
=
re
.
search
(
r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}'
,
end
)
searchEnd
=
re
.
search
(
r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}'
,
end
)
if
searchEnd
:
words
=
searchEnd
.
group
()
term_end_eng
=
words
...
...
@@ -292,29 +294,29 @@ class Finder:
lines
=
self
.
get_line
(
self
.
ocr_results
[
pno
],
'至'
)
if
len
(
lines
)
>
0
:
start
,
end
=
lines
.
split
(
'至'
)
searchStart
=
re
.
search
(
r'[0-9]{4}-[0-9]+-[0-9]+'
,
start
)
searchStart
=
re
.
search
(
r'[0-9]{4}-[0-9]+-[0-9]+'
,
start
)
if
searchStart
:
words
=
searchStart
.
group
()
term_start_chn
=
words
searchEnd
=
re
.
search
(
r'[0-9]{4}-[0-9]+-[0-9]+'
,
end
)
searchEnd
=
re
.
search
(
r'[0-9]{4}-[0-9]+-[0-9]+'
,
end
)
if
searchEnd
:
words
=
searchEnd
.
group
()
term_end_chn
=
words
lines
=
self
.
get_line
(
self
.
ocr_results
[
pno
],
'above'
)
searchObj
=
re
.
search
(
r'aboveto([0-9]+)'
,
lines
.
replace
(
'O'
,
'
0'
))
searchObj
=
re
.
search
(
r'aboveto([0-9]+)'
,
lines
.
replace
(
'O'
,
'0'
)
.
replace
(
'too'
,
'to
0'
))
if
searchObj
:
words
=
searchObj
.
group
(
1
)
deposit_eng
=
f
'{words}
%
'
lines
=
self
.
get_line
(
self
.
ocr_results
[
pno
],
'授信额度的'
)
searchObj
=
re
.
search
(
r'授信额度的([0-9]+)'
,
lines
.
replace
(
'O'
,
'0'
))
searchObj
=
re
.
search
(
r'授信额度的([0-9]+)'
,
lines
.
replace
(
'O'
,
'0'
))
if
searchObj
:
words
=
searchObj
.
group
(
1
)
deposit_chn
=
f
'{words}
%
'
return
amount_eng
,
amount_chn
,
term_start_eng
,
term_end_eng
,
\
term_start_chn
,
term_end_chn
,
deposit_eng
,
deposit_chn
term_start_chn
,
term_end_chn
,
deposit_eng
,
deposit_chn
def
get_other_arrangements_and_conditions
(
self
):
"""获取其它约定与条件文本段落
...
...
@@ -330,7 +332,7 @@ class Finder:
searchObj
=
re
.
search
(
r'Conditions:(.*?)其他约定与条件'
,
all_texts
,
re
.
I
)
if
searchObj
:
words
=
searchObj
.
group
(
1
)
pattern
=
re
.
compile
(
"[
\u4e00
-
\u9fa5
]"
)
# 去除中文字符
pattern
=
re
.
compile
(
"[
\u4e00
-
\u9fa5
]"
)
# 去除中文字符
words
=
pattern
.
sub
(
''
,
words
)
other_arrangements_and_conditions_eng
=
words
...
...
@@ -356,7 +358,7 @@ class Finder:
self
.
init_result
[
"保证人"
]
=
guarantor
amount_eng
,
amount_chn
,
term_start_eng
,
term_end_eng
,
\
term_start_chn
,
term_end_chn
,
deposit_eng
,
deposit_chn
=
self
.
get_info_in_page_39
()
term_start_chn
,
term_end_chn
,
deposit_eng
,
deposit_chn
=
self
.
get_info_in_page_39
()
self
.
init_result
[
"综合授信额度金额英文"
]
=
amount_eng
self
.
init_result
[
"综合授信额度金额中文"
]
=
amount_chn
self
.
init_result
[
"综合授信额度期限开始日期英文"
]
=
term_start_eng
...
...
@@ -371,7 +373,6 @@ class Finder:
self
.
init_result
[
"其他约定与条件中文"
]
=
words_chn
return
self
.
init_result
class
TIFFHandler
:
def
__init__
(
self
,
path
,
img_save_path
):
...
...
@@ -409,6 +410,7 @@ class Command(BaseCommand, LoggerMixin):
self
.
input_dir
=
conf
.
WSC_DIR
# ocr相关
self
.
go_ocr_url
=
conf
.
WSC_GO_URL
self
.
amount_fill
=
PatternFill
(
"solid"
,
fgColor
=
"00FFFF00"
)
# 优雅退出信号:15
signal
.
signal
(
signal
.
SIGTERM
,
self
.
signal_handler
)
...
...
@@ -435,6 +437,19 @@ class Command(BaseCommand, LoggerMixin):
excel_path
=
os
.
path
.
join
(
wb_output_dir
,
excel_name
)
return
img_save_path
,
excel_path
,
pdf_save_path
@staticmethod
def
get_mode_code
(
code_list
):
result_dict
=
{}
for
code
in
code_list
:
if
code
in
result_dict
:
result_dict
[
code
]
+=
1
else
:
result_dict
[
code
]
=
1
if
len
(
result_dict
)
==
1
:
return
None
else
:
return
sorted
(
result_dict
.
items
(),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)[
0
][
0
]
def
res_process
(
self
,
all_res
,
excel_path
):
try
:
self
.
finder
.
ocr_results
=
all_res
...
...
@@ -442,11 +457,26 @@ class Command(BaseCommand, LoggerMixin):
wb
=
BSWorkbook
(
set
(),
set
(),
set
(),
set
(),
set
())
ws
=
wb
.
create_sheet
(
self
.
sheet_name
)
row_idx
=
1
code_idx
=
1
mode_code
=
None
for
write_field
,
field_value
in
results
.
items
():
row_idx
+=
1
if
isinstance
(
field_value
,
list
):
if
write_field
==
'合同编号列表'
:
code_idx
=
row_idx
mode_code
=
self
.
get_mode_code
(
field_value
)
ws
.
append
((
write_field
,
*
field_value
))
else
:
ws
.
append
((
write_field
,
field_value
))
if
isinstance
(
mode_code
,
str
):
for
cell
in
ws
[
code_idx
]:
if
cell
.
value
==
'合同编号列表'
:
continue
if
cell
.
value
!=
mode_code
:
cell
.
fill
=
self
.
amount_fill
wb
.
remove_base_sheet
()
wb
.
save
(
excel_path
)
except
Exception
as
e
:
...
...
src/common/tools/pdf_to_img.py
View file @
92b21d6
...
...
@@ -257,19 +257,19 @@ class PDFHandler:
self
.
page_to_png
(
page
)
def
check_ebank
(
self
,
pdf
):
page_text_list
=
[]
#
page_text_list = []
text_item_sum
=
0
for
pno
in
range
(
pdf
.
pageCount
):
page
=
pdf
.
loadPage
(
pno
)
if
page
.
rotation
is
None
:
rotation
=
0
elif
isinstance
(
page
.
rotation
,
int
):
divisor
,
remainder
=
divmod
(
page
.
rotation
,
90
)
if
remainder
!=
0
:
return
rotation
=
divmod
(
divisor
,
4
)[
1
]
else
:
return
#
if page.rotation is None:
#
rotation = 0
#
elif isinstance(page.rotation, int):
#
divisor, remainder = divmod(page.rotation, 90)
#
if remainder != 0:
#
return
#
rotation = divmod(divisor, 4)[1]
#
else:
#
return
textpage
=
page
.
getTextPage
()
text
=
textpage
.
extractDICT
()
text_list
=
[]
...
...
@@ -284,17 +284,17 @@ class PDFHandler:
text_item_sum
+=
len
(
text_list
)
if
text_item_sum
<
(
pno
+
1
)
*
5
:
return
else
:
page_text_list
.
append
(
{
'width'
:
text
.
get
(
'width'
),
'height'
:
text
.
get
(
'height'
),
'rotation'
:
rotation
,
'text'
:
text_list
}
)
#
else:
#
page_text_list.append(
#
{
#
'width': text.get('width'),
#
'height': text.get('height'),
#
'rotation': rotation,
#
'text': text_list
#
}
#
)
self
.
is_ebank
=
True
self
.
page_text_list
=
page_text_list
#
self.page_text_list = page_text_list
def
extract_image
(
self
,
max_img_count
=
None
):
self
.
img_path_list
=
[]
...
...
@@ -310,7 +310,7 @@ class PDFHandler:
if
isinstance
(
max_img_count
,
int
)
and
pdf
.
pageCount
>=
max_img_count
:
self
.
img_count
=
pdf
.
pageCount
return
#
self.check_ebank(pdf)
self
.
check_ebank
(
pdf
)
for
pno
in
range
(
pdf
.
pageCount
):
il
=
pdf
.
getPageImageList
(
pno
)
# 获取页面图片对象
# (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment