# File: hil_contract_ocr.py
# -*- coding: utf-8 -*-
# @Author : lk
# @Email : 9428.al@gmail.com
# @Created Date : 2021-06-29 17:43:46
# @Last Modified : 2021-11-03 16:07:36
# @Description :
from .get_char import Finder
from .get_char_fsm import Finder as FSMFinder
# Text fragments that identify which contract a page belongs to. These are
# matched against span text at runtime and must stay byte-identical.
_LEASEBACK_MARKER = '售后回租合同_'          # sale-and-leaseback contract
_DISPOSAL_MARKER = '售后回租合同附件一'      # vehicle disposal agreement (attachment 1)

# Number of marked pages that signals the input bundles the disposal agreement.
_DISPOSAL_PAGE_COUNT = 4

_SUPPORTED_FILE_CLS = (0, 1, 2)


def _page_has_marker(page_info, marker):
    """Return True if any span text in a type-0 block of the page contains ``marker``."""
    for block in page_info['blocks']:
        # Only type-0 blocks are inspected.
        # NOTE(review): type 0 is presumably a text block (PyMuPDF-style
        # page dict) -- confirm against the producer of ``pdf_info``.
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                if marker in span['text']:
                    return True
    return False


def predict(pdf_info: dict, file_cls: int, is_fsm: bool = False):
    """Run the field extractor that matches the contract type.

    Args:
        pdf_info: Mapping of page key to page info. Each page info is read
            as ``page['blocks'][i]['lines'][j]['spans'][k]['text']``, with a
            ``type`` entry on every block.
        file_cls: Contract type selector:
            0 = sale-and-leaseback contract,
            1 = vehicle disposal agreement,
            2 = vehicle lease mortgage contract.
        is_fsm: When true, use ``FSMFinder`` instead of ``Finder``.

    Returns:
        The result mapping produced by the finder, where every entry whose
        ``page`` value is not None has it rewritten to ``'page_<n + 1>'``.

    Raises:
        ValueError: If ``file_cls`` is not 0, 1 or 2. (Previously this case
            surfaced as an ``UnboundLocalError`` after the finder was built.)
    """
    if file_cls not in _SUPPORTED_FILE_CLS:
        raise ValueError(
            f'unsupported file_cls {file_cls!r}; expected 0, 1 or 2'
        )

    if file_cls == 1:
        # Each page is counted at most once, even if the marker shows up in
        # several spans, so the page-count check below really counts pages.
        disposal_pages = [
            page for page in pdf_info.values()
            if _page_has_marker(page, _DISPOSAL_MARKER)
        ]
        # Exactly four disposal-agreement pages alongside at least one
        # leaseback-contract page means the input bundles both documents:
        # keep only the disposal pages, re-keyed '0', '1', ...
        if len(disposal_pages) == _DISPOSAL_PAGE_COUNT and any(
            _page_has_marker(page, _LEASEBACK_MARKER)
            for page in pdf_info.values()
        ):
            # NOTE(review): after this narrowing, page indices reported by
            # the finder are presumably relative to the subset rather than
            # the full document -- confirm with callers.
            pdf_info = {
                str(index): page
                for index, page in enumerate(disposal_pages)
            }

    finder = FSMFinder(pdf_info) if is_fsm else Finder(pdf_info)

    if file_cls == 0:
        # Sale-and-leaseback contract.
        results = finder.get_info()
    elif file_cls == 1:
        # Vehicle disposal agreement.
        results = finder.get_info_1()
    else:
        # Vehicle lease mortgage contract.
        results = finder.get_info_2()

    # Convert page indices into 1-based 'page_<n>' labels.
    for key in results:
        field = results[key]
        page = field['page']
        if page is not None:
            field['page'] = f'page_{int(page) + 1}'
    return results