afc_contract_ocr.py
2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# -*- coding: utf-8 -*-
# @Author : lk
# @Email : 9428.al@gmail.com
# @Created Date : 2021-06-29 17:43:46
# @Last Modified : 2021-09-07 14:11:25
# @Description : Flatten parsed PDF text spans into OCR-style results and run the contract field finders on them.
from .get_char import Finder
from .get_char_fsm import Finder as FSMFinder
import numpy as np
def extract_info(ocr_results):
    """Locate the contract number on the first page.

    Looks at the entries stored under page key ``'0'`` of *ocr_results*
    (each entry is a ``(box, text)`` pair) and picks the first one, in
    dict order, whose text starts with ``'CH-B'``.

    Args:
        ocr_results: Mapping of page key -> mapping of index -> ``[box, text]``.

    Returns:
        ``{'page_1': {'合同编号': {'words': ..., 'position': ...}}}``.
        ``words`` is the matched text and ``position`` is
        ``[box[0], box[1], box[2], box[-1]]``; for the 8-value polygons
        built by ``predict`` in this file that is xmin, ymin, xmax, ymax.
        Both fields are ``None`` when page ``'0'`` is missing or has no match.
    """
    first_page = ocr_results.get('0', {})
    candidates = (
        (polygon, words)
        for polygon, words in first_page.values()
        if words.startswith('CH-B')
    )
    hit = next(candidates, None)

    if hit is None:
        number_field = {"words": None, "position": None}
    else:
        polygon, words = hit
        number_field = {
            "words": words,
            "position": [polygon[0], polygon[1], polygon[2], polygon[-1]],
        }
    return {'page_1': {'合同编号': number_field}}
def _spans_to_ocr_result(blocks):
    """Flatten one page's text spans into an index-keyed dict sorted by top edge.

    Args:
        blocks: The page's ``'blocks'`` list; each block has a ``'type'`` and,
            for the blocks that are kept, ``'lines'`` -> ``'spans'`` with
            ``'bbox'`` and ``'text'``.

    Returns:
        ``{0: [polygon, text], 1: ...}`` where ``polygon`` is the 8-value list
        ``[xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]`` of ints and the
        entries are ordered by ascending ``ymin``.
    """
    entries = []
    for block in blocks:
        # Only blocks of type 0 are used.
        # NOTE(review): presumably 0 marks text blocks, as in PyMuPDF's
        # dict output - confirm against the producer of pdf_info.
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                text = span['text']
                if not text:
                    continue
                xmin, ymin, xmax, ymax = span['bbox']
                # Expand the box to its four corners, clockwise from top-left,
                # and cast the coordinates to plain Python ints via int32.
                polygon = np.array(
                    [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
                    dtype=np.int32,
                ).tolist()
                # NOTE(review): the original call was `.replace(":", ":")`,
                # a no-op. Presumably the first argument was a full-width
                # colon (U+FF1A) lost in transcoding - confirm. It is spelled
                # as an escape here so it cannot be normalised away again.
                text = text.replace("\uff1a", ":").replace(" ", "")
                entries.append([polygon, text])
    # Sort by y0 (top edge), ascending; the sort is stable, so spans with the
    # same top edge keep their reading order.
    entries.sort(key=lambda entry: entry[0][1])
    return dict(enumerate(entries))


def predict(pdf_info, is_qrs=False, is_fsm=False):
    """Build OCR-style results from parsed PDF pages and extract contract fields.

    Args:
        pdf_info: Mapping of page key (``'0'``, ``'1'``, ...) -> page dict with
            a ``'blocks'`` list. It is modified in place in the 9-page case
            described below.
        is_qrs: When true, only the contract number is extracted via
            ``extract_info``.
        is_fsm: When true (and ``is_qrs`` is false), ``FSMFinder`` is used
            instead of ``Finder``.

    Returns:
        The dict from ``extract_info`` when ``is_qrs`` is set, otherwise
        whatever the selected finder's ``get_info()`` returns.

    Raises:
        KeyError: If a 9-page ``pdf_info`` lacks one of the keys ``'0'`` or
            ``'2'``..``'8'``, or a page lacks ``'blocks'``.
    """
    if not is_fsm and not is_qrs and len(pdf_info) == 9:
        # Fold page '1' into page '0', then shift pages '2'..'8' down by one
        # so the document ends up with pages '0'..'7'.
        # NOTE(review): presumably the default template is expected to have
        # 8 pages and the first page has spilled onto a second - confirm.
        second_page = pdf_info.pop('1', {})
        pdf_info['0']['blocks'].extend(second_page.get('blocks', []))
        for pno in range(1, 8):
            pdf_info[str(pno)] = pdf_info.pop(str(pno + 1))

    ocr_results = {
        pno: _spans_to_ocr_result(page['blocks'])
        for pno, page in pdf_info.items()
    }

    if is_qrs:
        return extract_info(ocr_results)

    # The finders take the information of the whole PDF as input.
    finder_cls = FSMFinder if is_fsm else Finder
    return finder_cls(pdf_info, ocr_results=ocr_results).get_info()