43758c7e by 周伟奇

add general extractor

1 parent d8cec4a0
...@@ -17,3 +17,10 @@ test* ...@@ -17,3 +17,10 @@ test*
17 *.out 17 *.out
18 18
19 *.log 19 *.log
20
21 sample/
22 go_res/
23 test.py
24 simhei.ttf
25 sign_res/
26 res_valid.json
......
# OCR confusion table for upper-case Chinese amount characters.
# Each row is (correct character, characters the OCR engine tends to emit
# instead of it).  Every misread is a single character because the resulting
# mapping is fed to ``str.maketrans`` by ``Retriever.replace_whole``.
# Multi-character candidates that were considered but are disabled:
# "零角" -> "零" and "@整" -> "叁".
_OCR_CONFUSIONS_BY_TARGET = (
    ("圆", "元"),
    ("壹", "查索"),
    ("贰", "武家就"),
    ("叁", "登鑫垂盒会"),
    ("肆", "樟单邮"),
    ("柒", "柴染荣渠类案集"),
    ("捌", "捆搁级测拥损摄报拐"),
    ("玖", "政"),
    ("拾", "抬给"),
    ("佰", "宿信情值"),
    ("仟", "任杆仔什付伴"),
    ("万", "方"),
)

# Flattened lookup: misread character -> intended character.
REPLACE_DICT_1 = {
    misread: intended
    for intended, misreads in _OCR_CONFUSIONS_BY_TARGET
    for misread in misreads
}
53
54
# Keyword-argument names under which a document config exposes its key list
# and its value-lookup dict (see INVOICE_CONST).
ARG_KEY_KEY_LIST = 'keys_list'
ARG_KEY_VALUE_DICT = 'values_dict'

# Label ("key") strings searched for in the OCR output.
# Each entry is a tuple: one or more spellings of the label (the first is the
# canonical one, the rest are known OCR misreadings) followed by a bool.
# Retriever.search_keys only attempts prefix ("starts with") matching for
# entries whose trailing bool is True; exact matching is tried for every entry.
# The trailing "# N" comments are the list positions that INVOICE_VALUE_DICT
# refers to, so entries must not be reordered.
INVOICE_KEY_LIST = [
    ('纳税人识别号', False),  # near-duplicate of another key 0
    ('增值税', False),  # near-duplicate of another key 1

    ('地', False),  # single-character key 2
    ('址', False),  # single-character key 3

    ('开票日期', '开票曰期', '开票日', True),  # 4
    ('发票代码', '发票代鸡', True),  # 5
    ('发票号码', '发票号瑞', '发要号瑞', True),  # 6
    ('机打代码', False),  # 7
    ('机打号码', '机打号玛', False),  # 8
    ('机器编号', False),  # 9
    ('购买方名称', '购买方名称及', False),  # 10
    ('纳税人识别号/', False),  # 11
    ('统一社会信用代码/', False),  # 12
    ('身份证号码', '身份证号码/', False),  # 13
    ('车辆类型', True),  # 14
    ('厂牌型号', '广牌型号', '厂胖型号', '广牌型考', True),  # 15
    ('产地', '严地', True),  # 16
    ('合格证号', False),  # 17
    ('进口证明书号', True),  # 18
    ('商检单号', True),  # 19
    ('发动机号码', False),  # 20
    ('车辆识别代号/车架号码', True),  # 21
    ('价税合计', '价现合计', '价“税合计', False),  # 22
    ('小写', True),  # 23 TODO: which value to take when several are found
    ('销货单位名称', False),  # 24
    ('电话', True),  # 25
    ('账号', '账考', '帐号', '帐考', '张号', '陈号', '昨号', True),  # 26
    ('开户银行', True),  # 27
    ('增值税税率', True),  # 28 value false
    ('或征收税', False),  # 29
    ('税额', False),  # 30
    ('主管税务', True),  # 31 value False
    ('机关及代码', True),  # 32
    ('不含税价', True),  # 33 value False
    ('完税凭证号码', False),  # 34
    ('开票人', True),  # 35
    ('吨位', True),  # 36
    ('限乘人数', '跟乘人数', True),  # 37 TODO: coordinate splitting for cases like '人数'
    ('备注', True)  # 38
]
101
# Per-field lookup rules consumed by Retriever.search_values.
#
# Modes noted by the original author:
#   split  - key and value are recognised as one OCR string
#   append - key followed by a value suffix; needs coordinate splitting
#   insert - key preceded by a value prefix; needs coordinate splitting
#
# Each field maps to a dict with:
#   'length'      - expected value length, or None when unknown
#   'str_type'    - nominal value type (not read by the Retriever code shown
#                   in this commit)
#   'location'    - ordered list of search rules; the first rule that yields
#                   a value wins.  Rule layout:
#                   (key index in INVOICE_KEY_LIST, search direction,
#                    top margin, bottom margin, horizontal offset,
#                    (bounding key indexes..., search width), choice method,
#                    'split' to accept the text glued to the key or None)
#                   Margins, offset and width are multiples of the key box
#                   height/width respectively.
#   'fix_methods' - optional list of (Retriever method name, kwargs) applied
#                   in order to the raw value
INVOICE_VALUE_DICT = {
    '开票日期': {
        'length': 10,
        'str_type': 'date',
        # idx, location, top, bottom, left, (idx, scope), choice, if_startswith
        'location': [(4, 'right', 0.3, 0.5, 0, (2, ), 'xmin', 'split')],
        'fix_methods': [('prune_first_char', {'char_set': {':', ':', ';', }})]
    },
    '发票代码': {
        'length': 12,
        'str_type': 'int',
        'location': [(5, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split')]
    },
    '发票号码': {
        'length': 8,
        'str_type': 'int',
        'location': [(6, 'right', 0.2, 0.5, 0, (2, ), 'length', 'split')],
        'fix_methods': [('prune_first_char', {'char_set': {'-',}})]
    },
    '机打代码': {
        'length': 12,
        'str_type': 'int',
        'location': [(7, 'right', 0.5, 1, 0, (2, ), 'ymin', None)]
    },
    '机器编号': {
        'length': 12,
        'str_type': 'int',
        'location': [(9, 'right', 0.5, 1, 0, (2, ), 'ymax', None)]
    },
    '机打号码': {
        'length': 8,
        'str_type': 'int',
        'location': [(8, 'right', 0.5, 0.5, 0, (2, ), 'length', None)]
    },
    '购买方名称': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [(10, 'right', 0.5, 0.5, 0, (11, 12, 13, 2), 'xmin', None)]
    },
    '纳税人识别号/统一社会信用代码/身份证号码': {
        'length': 18,
        'str_type': 'str',  # alphanumeric
        'location': [(11, 'right', 0, 2, 0, (2.5, ), 'length', None), (12, 'right', 1, 1, 0, (2, ), 'length', None), (13, 'right', 2, 0, 0.5, (3, ), 'length', None)]
    },
    '车辆类型': {
        'length': None,
        'str_type': 'str',
        'location': [(14, 'right', 0.2, 0.2, 0, (15, 1.5), 'xmin', 'split'), (15, 'left', 0.2, 0.2, 0, (14, 2.5), 'xmax', None)]
    },
    '厂牌型号': {
        'length': None,
        'str_type': 'str',
        'location': [(15, 'right', 0.2, 0.2, 0, (16, 3.5), 'xmin', 'split'), (16, 'left', 0.2, 0.2, 0, (15, 2.5), 'xmax', None)]
    },
    '产地': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [(16, 'right', 0.2, 0.2, 0, (2.5, ), 'xmin', 'split')]
    },
    '合格证号': {
        'length': None,  # 15
        'str_type': 'str',  # alphanumeric
        'location': [(17, 'right', 0.2, 0.2, 0, (18, 1.5), 'xmin', None), (18, 'left', 0.2, 0.2, 0, (17, 1.5), 'xmax', None)]
    },
    '进口证明书号': {
        'length': None,
        'str_type': 'str',  # alphanumeric
        'location': [(18, 'right', 0.3, 0.3, 0, (19, 1.5), 'xmin', 'split'), (19, 'left', 0.2, 0.2, 0, (18, 3), 'xmax', None)]
    },
    '商检单号': {
        'length': None,
        'str_type': 'str',
        'location': [(19, 'right', 0.2, 0.2, 0, (1.5, ), 'xmin', 'split')]
    },
    '发动机号码': {
        'length': None,
        'str_type': 'str',  # alphanumeric
        'location': [(20, 'right', 0.2, 0.2, 0, (21, 2), 'xmin', None), (21, 'left', 0.2, 0.2, 0, (20, 1.4), 'xmax', None)]
    },
    '车辆识别代号/车架号码': {
        'length': 17,
        'str_type': 'str',  # alphanumeric
        'location': [(21, 'right', 0.3, 0.3, 0, (1.2, ), 'xmin', 'split')]
    },
    '价税合计大写': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [(22, 'right', 0.2, 0.2, 0, (23, 3), 'xmin', None), (23, 'left', 0.2, 0.2, 0, (22, 15), 'xmax', None)],
        'fix_methods': [('prune_no_cn', {}), ('replace_whole', {'replace_map': REPLACE_DICT_1})]
    },
    '价税合计小写': {
        'length': None,
        'str_type': 'float',
        'location': [(23, 'right', 0.4, 0.4, 0, (4, ), 'xmin', 'split')],
        'fix_methods': [('prune_amount', {})]
    },
    '销货单位名称': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [(24, 'right', 0.2, 0.2, 0, (25, 3), 'xmin', None), (25, 'left', 0.3, 0.3, 0, (24, 15), 'xmax', None)]
    },
    '电话': {
        'length': None,
        'str_type': 'str',  # digits plus '-'
        'location': [(25, 'right', 0.3, 0.3, 0, (5, ), 'xmin', 'split')]
    },
    '纳税人识别号': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [(0, 'right', 0.3, 0.3, 0, (26, 2.5), 'xmin', None), (26, 'left', 0.3, 0.3, 0, (0, 15), 'xmax', None)]
    },
    '账号': {
        'length': None,
        'str_type': 'str',
        'location': [(26, 'right', 0.3, 0.3, 0, (6, ), 'xmin', 'split')]
    },
    '地址': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [(27, 'left', 0.3, 0.3, 0, (3, 4), 'merge', None), (3, 'right', 0.3, 0.3, 0, (27, 20), 'xmin', None)]
    },
    '开户银行': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [(27, 'right', 0.3, 0.3, 0, (3, ), 'xmin', 'split')]
    },
    '增值税税率或征收率': {
        'length': 3,
        'str_type': 'str',  # e.g. 13%
        'location': [(28, 'right', 0, 1, 0, (1, 30, 1), 'xmin', None), (29, 'right', 1, 0, 0, (1, 30, 1), 'xmin', None),
                     (1, 'left', 0, 1, 0, (28, 29, 2), 'xmax', None), (30, 'left', 1, 0, 0, (28, 29, 2), 'xmax', None)],
        'fix_methods': [('replace_last_char', {'char_set': {'8', '9', '号'}, 'target_char': '%'})]

    },
    '增值税税额': {
        'length': None,
        'str_type': 'float',
        'location': [(1, 'right', 0, 1, 0, (31, 32, 2.5), 'xmin', None), (30, 'right', 1, 0, 0, (31, 32, 2.5), 'xmin', None),
                     (31, 'left', 0, 1, 0, (1, 30, 2), 'xmax', None), (32, 'left', 1, 0, 0, (1, 30, 2), 'xmax', None)],
        'fix_methods': [('prune_amount', {})]
    },
    '主管税务机关及代码': {
        'length': None,
        'str_type': 'str',
        'location': [(31, 'right', 0, 1.5, 0, (2, ), 'merge', None), (32, 'right', 1, 0.5, 0, (2, ), 'merge', None)]
    },
    '不含税价-小写': {
        'length': None,
        'str_type': 'float',  # original note: "cn"
        'location': [(34, 'left', 0.3, 0.3, 0, (33, 1.5), 'xmax', None), (33, 'right', 0.2, 0.2, 0, (34, 1.5), 'xmin', None)],
        'fix_methods': [('prune_amount', {})]
    },
    '完税凭证号码': {
        'length': None,
        'str_type': 'str',
        'location': [(34, 'right', 0.2, 0.2, 0, (36, 1.5), 'xmin', None), (36, 'left', 0.2, 0.2, 0, (34, 6), 'xmax', None)]
    },
    '吨位': {
        'length': None,
        'str_type': 'str',
        'location': [(36, 'right', 0.2, 0.2, 0, (37, 1), 'xmin', 'split'), (37, 'left', 0.2, 0.2, 0, (36, 0.5), 'xmax', None)]
    },
    '限乘人数': {
        'length': None,
        'str_type': 'int',
        'location': [(37, 'right', 0.2, 0.2, 0, (0.5, ), 'xmin', 'split')]
    },
    '开票人': {
        'length': None,
        'str_type': 'str',
        'location': [(35, 'right', 0, 0.5, 0, (1.5, ), 'xmin', 'split')]
    },
    '备注': {
        'length': None,
        'str_type': 'str',
        'location': [(38, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split')],
        'fix_methods': [('prune_first_char', {'char_set': {';', ':', ':'}})]
    },

}
285
# Bundles the invoice key list and value rules under the ARG_KEY_* names,
# which match the keyword parameters of Retriever.__init__ (keys_list,
# values_dict).
INVOICE_CONST = {
    ARG_KEY_KEY_LIST: INVOICE_KEY_LIST,
    ARG_KEY_VALUE_DICT: INVOICE_VALUE_DICT
}
1 import re
2 import math
3
4
class Retriever:
    """Rule-based key/value field extractor working on OCR results.

    ``go_res`` (the OCR result) is a dict mapping an entry id to
    ``((x0, y0, x1, y1, x2, y2, x3, y3), text)``.  The code treats
    ``(x0, y0)`` and ``(x2, y2)`` as opposite corners of the text box and
    ``(x0, y0) -> (x1, y1)`` as its top edge (presumably the corners are
    listed clockwise from the top-left; confirm against the OCR service).

    ``keys_list`` is a list of tuples ``(spelling, ..., allow_prefix_match)``
    and ``values_dict`` maps an output field name to its lookup rules; see
    the ``location`` / ``fix_methods`` handling in :meth:`search_values`.

    After :meth:`search_keys`, ``find_keys_list[i]`` is either ``None`` or
    ``(canonical_key, matched_key, ocr_text, suffix_value_or_None,
    x0, y0, x1, y1, x2, y2, x3, y3)`` for ``keys_list[i]``.
    """

    def __init__(self, keys_list=None, values_dict=None):
        # Fresh containers per instance: mutable defaults would be shared
        # between every Retriever created without arguments.
        self.keys_list = [] if keys_list is None else keys_list
        self.values_dict = {} if values_dict is None else values_dict
        self.find_keys_list = []

    # ------------------------------------------------------------------
    # geometry helpers
    # ------------------------------------------------------------------
    @staticmethod
    def get_theta(x0, y0, x1, y1):
        """Return ``(cos, sin)`` of the tilt angle of the edge (x0,y0)->(x1,y1).

        The angle is ``atan((y0 - y1) / (x1 - x0))``.  A vertical edge maps to
        +/- pi/2 and a degenerate (zero-length) edge to no rotation instead of
        raising ``ZeroDivisionError``.
        """
        dx = x1 - x0
        dy = y0 - y1
        if dx == 0:
            theta = math.copysign(math.pi / 2, dy) if dy else 0.0
        else:
            theta = math.atan(dy / dx)
        return math.cos(theta), math.sin(theta)

    @staticmethod
    def rebuild_xy(x, y, cos, sin):
        """Rotate point (x, y) by the angle described by ``cos``/``sin``."""
        rebuild_x = x * cos - y * sin
        rebuild_y = y * cos + x * sin
        return rebuild_x, rebuild_y

    def rebuild_coord(self, coord_tuple, cos, sin):
        """Rotate a flat ``(x0, y0, x1, y1, ...)`` sequence; returns a flat list."""
        rebuild_list = []
        for idx in range(0, len(coord_tuple), 2):
            rebuild_list.extend(self.rebuild_xy(coord_tuple[idx], coord_tuple[idx + 1], cos, sin))
        return rebuild_list

    # ------------------------------------------------------------------
    # value fix-up methods (referenced by name from 'fix_methods')
    # ------------------------------------------------------------------
    @staticmethod
    def prune_no_cn(src_str):
        """Drop every character outside the range U+4E00..U+9FA5."""
        return re.sub(r'[^\u4e00-\u9fa5]+', '', src_str)

    @staticmethod
    def prune_first_char(src_str, char_set):
        """Strip the first character when it is in ``char_set``.

        An empty string is returned unchanged.
        """
        if src_str and src_str[0] in char_set:
            return src_str[1:]
        return src_str

    @staticmethod
    def prune_amount(src_str):
        """Keep only digits, ',' and '.' from ``src_str``."""
        return ''.join(ch for ch in src_str if ch in (',', '.') or ch.isdigit())

    @staticmethod
    def replace_whole(src_str, replace_map):
        """Translate characters through ``replace_map`` (single-char keys)."""
        return src_str.translate(str.maketrans(replace_map))

    @staticmethod
    def replace_last_char(src_str, char_set, target_char):
        """Replace the last character with ``target_char`` when it is in ``char_set``.

        An empty string is returned unchanged.
        """
        if src_str and src_str[-1] in char_set:
            return src_str[:-1] + target_char
        return src_str

    # ------------------------------------------------------------------
    # candidate selection (referenced by name as 'choice_<method>')
    # Candidates are tuples ``(text, x0, y0, ..., id_tuple)``.
    # ------------------------------------------------------------------
    @staticmethod
    def choice_xmin(value_list, value_length):
        """Pick the candidate with the smallest x0 (first one on ties)."""
        return min(value_list, key=lambda item: item[1])

    @staticmethod
    def choice_xmax(value_list, value_length):
        """Pick the candidate with the largest x0 (first one on ties)."""
        return max(value_list, key=lambda item: item[1])

    @staticmethod
    def choice_ymin(value_list, value_length):
        """Pick the candidate with the smallest y0 (first one on ties)."""
        return min(value_list, key=lambda item: item[2])

    @staticmethod
    def choice_ymax(value_list, value_length):
        """Pick the candidate with the largest y0 (first one on ties)."""
        return max(value_list, key=lambda item: item[2])

    @staticmethod
    def choice_merge(value_list, value_length):
        """Concatenate all candidates ordered by y0.

        Accepts both candidate layouts produced in this class (two-corner
        6-tuples and four-corner 10-tuples) and returns
        ``(joined_text, x_min, y_min, x_max, y_max, merged_id_tuple)``.
        """
        merged_text = []
        merged_ids = []
        xs = []
        ys = []
        for item in sorted(value_list, key=lambda item: item[2]):
            coords = item[1:-1]
            merged_text.append(item[0])
            merged_ids.extend(item[-1])
            xs.extend(coords[0::2])
            ys.extend(coords[1::2])
        return (''.join(merged_text), min(xs), min(ys), max(xs), max(ys), tuple(merged_ids))

    @staticmethod
    def choice_length(value_list, value_length):
        """Pick the candidate whose text length is closest to ``value_length``.

        Falls back to the first candidate when no expected length is given.
        """
        if value_length is None:
            return value_list[0]
        return min(value_list, key=lambda item: abs(len(item[0]) - value_length))

    def _choose(self, candidates, choice_method, length):
        """Return None / the only candidate / the 'choice_<method>' winner."""
        if not candidates:
            return None
        if len(candidates) == 1:
            return candidates[0]
        return getattr(self, 'choice_{0}'.format(choice_method))(candidates, length)

    def _first_found_scope_key(self, scope_tuple):
        """Return the first located key among ``scope_tuple[:-1]``, else None.

        The last element of ``scope_tuple`` is the fallback search width, not
        a key index.
        """
        for scope_key_idx in scope_tuple[:-1]:
            entry = self.find_keys_list[scope_key_idx]
            if entry is not None:
                return entry
        return None

    # ------------------------------------------------------------------
    # value search
    # ------------------------------------------------------------------
    def value_direction_left(self, go_res, key_idx, top_or_left, bottom_or_right, offset, scope_tuple, choice_method,
                             if_startswith, length):
        """Look for the value in the area to the left of key ``key_idx``.

        The search window is axis-aligned (no tilt correction).  Returns
        ``(text, x0, y0, x1, y1, id_tuple)`` where (x1, y1) is the corner
        opposite to (x0, y0), or None when nothing is found.
        """
        key_entry = self.find_keys_list[key_idx]
        if key_entry is None:
            return None

        # Entries hold four corners; this method only needs two opposite ones.
        suffix_key = key_entry[3]
        key_x0, key_y0 = key_entry[4], key_entry[5]
        key_x1, key_y1 = key_entry[8], key_entry[9]

        scope_entry = self._first_found_scope_key(scope_tuple)
        # x1 of the bounding key on the left limits how far left we search.
        key_scope = None if scope_entry is None else scope_entry[6]

        if isinstance(if_startswith, str) and isinstance(suffix_key, str):
            # TODO validate / repair suffix_key
            # TODO only the 'split' case is handled so far
            return suffix_key, key_x0, key_y0, key_x1, key_y1, ()

        height = key_y1 - key_y0
        y_min = key_y0 - (top_or_left * height)
        y_max = key_y1 + (bottom_or_right * height)

        width = key_x1 - key_x0
        x_max = key_x0 - (offset * width)
        x_min = x_max - (width * scope_tuple[-1]) if key_scope is None else key_scope

        candidates = []
        for go_key_idx, ((x0, y0, _, _, x1, y1, _, _), text) in go_res.items():
            cent_x = x0 + ((x1 - x0) / 2)
            cent_y = y0 + ((y1 - y0) / 2)
            if x_min < cent_x < x_max and y_min < cent_y < y_max:
                candidates.append((text, x0, y0, x1, y1, (go_key_idx, )))

        return self._choose(candidates, choice_method, length)

    def value_direction_right(self, go_res, key_idx, top_or_left, bottom_or_right, offset, scope_tuple, choice_method,
                              if_startswith, length):
        """Look for the value in the area to the right of key ``key_idx``.

        Coordinates are rotated by the tilt of the key's top edge before the
        window test.  Returns ``(text, x0, ..., y3, id_tuple)`` for an OCR
        entry, ``(suffix, key_box, ())`` when the text glued to the key is
        accepted, or None when nothing is found.
        """
        key_entry = self.find_keys_list[key_idx]
        if key_entry is None:
            return None

        suffix_key = key_entry[3]
        key_box_src = tuple(key_entry[4:12])

        scope_entry = self._first_found_scope_key(scope_tuple)
        # Top-left corner of the bounding key on the right.
        key_scope_tuple = None if scope_entry is None else (scope_entry[4], scope_entry[5])

        if isinstance(if_startswith, str) and isinstance(suffix_key, str):
            # TODO validate / repair suffix_key
            # TODO only the 'split' case is handled so far
            # With a known expected length the suffix is only trusted when it
            # is within 2 characters of it; otherwise fall through to the
            # geometric search.
            if not isinstance(length, int) or -3 < length - len(suffix_key) < 3:
                return suffix_key, key_box_src, ()

        # Coordinate system conversion: align the key's top edge with the x axis.
        cos, sin = self.get_theta(*key_box_src[:4])
        key_x0, key_y0, _, _, key_x2, key_y2, _, _ = self.rebuild_coord(key_box_src, cos, sin)

        height = key_y2 - key_y0
        y_min = key_y0 - (top_or_left * height)
        y_max = key_y2 + (bottom_or_right * height)

        width = key_x2 - key_x0
        x_min = key_x2 + (offset * width)
        if key_scope_tuple is None:
            x_max = x_min + (width * scope_tuple[-1])
        else:
            x_max = self.rebuild_xy(*key_scope_tuple, cos, sin)[0]

        candidates = []
        for go_key_idx, ((x0, y0, x1, y1, x2, y2, x3, y3), text) in go_res.items():
            cent_x, cent_y = self.rebuild_xy(x0 + ((x2 - x0) / 2), y0 + ((y2 - y0) / 2), cos, sin)
            if x_min < cent_x < x_max and y_min < cent_y < y_max:
                candidates.append((text, x0, y0, x1, y1, x2, y2, x3, y3, (go_key_idx, )))

        # TODO should the choice methods work on rotated coordinates?
        return self._choose(candidates, choice_method, length)

    @staticmethod
    def splitext(base_str, key_str, x0, y0, x1, y1, x2, y2, x3, y3):
        """Split ``base_str`` into the key and the text that follows it.

        Returns ``(key_str, suffix, x1, y1, x2, y2)``; the coordinates are
        passed through unchanged (TODO: split the box as well).
        """
        suffix_value = base_str[len(key_str):]
        return key_str, suffix_value, x1, y1, x2, y2

    # ------------------------------------------------------------------
    # key search
    # ------------------------------------------------------------------
    def search_keys(self, go_res):
        """Locate every entry of ``keys_list`` in ``go_res``.

        Pass 1 matches OCR texts that equal one of a key's spellings.  Pass 2,
        only for keys whose trailing flag is true and that are still missing,
        matches texts starting with the canonical spelling and keeps the
        remainder as the suffix value.  Matched and blank OCR entries are
        removed from ``go_res`` (the dict is modified in place).  The result
        is stored in ``self.find_keys_list``.
        """
        find_keys_list = [None for _ in range(len(self.keys_list))]
        rm_go_key_set = set()
        done_key_idx_set = set()

        for key_idx, key_tuple in enumerate(self.keys_list):
            for str_idx, ((x0, y0, x1, y1, x2, y2, x3, y3), text) in go_res.items():
                if len(text.strip()) == 0:  # drop whitespace-only entries
                    rm_go_key_set.add(str_idx)
                    continue
                matched_key = next((key_str for key_str in key_tuple[:-1] if text == key_str), None)
                if matched_key is None:
                    continue
                # exact match
                find_keys_list[key_idx] = (key_tuple[0], matched_key, text, None,
                                           x0, y0, x1, y1, x2, y2, x3, y3)
                done_key_idx_set.add(key_idx)
                rm_go_key_set.add(str_idx)
                break

        for go_key in rm_go_key_set:
            go_res.pop(go_key, None)
        rm_go_key_set.clear()

        for key_idx, key_tuple in enumerate(self.keys_list):
            if key_idx in done_key_idx_set or not key_tuple[-1]:
                continue

            for str_idx, ((x0, y0, x1, y1, x2, y2, x3, y3), text) in go_res.items():
                if text.startswith(key_tuple[0]):  # text begins with the key
                    prefix_key, suffix_value, new_x1, new_y1, new_x2, new_y2 = self.splitext(
                        text, key_tuple[0], x0, y0, x1, y1, x2, y2, x3, y3)
                    find_keys_list[key_idx] = (key_tuple[0], key_tuple[0], text, suffix_value,
                                               x0, y0, new_x1, new_y1, new_x2, new_y2, x3, y3)
                    done_key_idx_set.add(key_idx)
                    rm_go_key_set.add(str_idx)
                    break

        for go_key in rm_go_key_set:
            go_res.pop(go_key, None)
        rm_go_key_set.clear()

        self.find_keys_list = find_keys_list

    def search_values(self, go_res):
        """Resolve every field of ``values_dict``; returns ``{field: text}``.

        Location rule layout:
        ``(key_idx, direction, top, bottom, offset, scope_tuple, choice,
        if_startswith)``.  Rules are tried in order and the first one that
        yields a tuple wins; its text is passed through the field's
        ``fix_methods``.  Fields without a value map to ``''``.  OCR entries
        consumed by a value are removed from ``go_res`` so that later fields
        cannot reuse them.
        """
        find_value_dict = dict()
        for cn_key, search_dict in self.values_dict.items():
            value_tuple = None  # stays None when the field has no location rules
            for (key_idx, direction_str, top_or_left, bottom_or_right, offset, scope_tuple,
                 choice_method, if_startswith) in search_dict['location']:
                value_tuple = getattr(self, 'value_direction_{0}'.format(direction_str))(
                    go_res,
                    key_idx,
                    top_or_left,
                    bottom_or_right,
                    offset,
                    scope_tuple,
                    choice_method,
                    if_startswith,
                    search_dict['length'],
                )
                if isinstance(value_tuple, tuple):
                    break

            if not isinstance(value_tuple, tuple):
                find_value_dict[cn_key] = ''
                continue

            fixed_str = value_tuple[0]
            for fix_method, kwargs in search_dict.get('fix_methods', []):
                fixed_str = getattr(self, fix_method)(fixed_str, **kwargs)
            find_value_dict[cn_key] = fixed_str

            # TODO rebuild coordinates

            # The last element lists the OCR entry ids the value was built from.
            for go_key in value_tuple[-1]:
                go_res.pop(go_key, None)

        return find_value_dict

    def extract_fields(self, go_res):
        """Run key search then value search; ``go_res`` is modified in place."""
        self.search_keys(go_res)
        res = self.search_values(go_res)
        return res
337
1 import json
2 import os
3 import base64
4 import requests
5 import cv2
6 import time
7 import numpy as np
8 from PIL import Image, ImageDraw, ImageFont
9
10
# Batch helper: for every image in ``img_dir`` call the OCR and signature
# detection services, store both JSON responses, and write a copy of the
# image with the recognised boxes and texts drawn on it.
base_dir = os.path.dirname(os.path.abspath(__file__))
img_dir = '/home/zwq/data/gcfp/valid/image'
draw_dir = os.path.join(base_dir, 'draw', 'valid')
sign_dir = os.path.join(base_dir, 'sign_res', 'valid')
go_dir = os.path.join(base_dir, 'go_res', 'valid')

font_path = os.path.join(base_dir, 'simhei.ttf')
font = ImageFont.truetype(font_path, 10, encoding="utf-8")

# The output folders are git-ignored, so a fresh checkout does not have them.
for out_dir in (draw_dir, sign_dir, go_dir):
    os.makedirs(out_dir, exist_ok=True)


for image_name in os.listdir(img_dir):

    print('start: {0}'.format(image_name))
    base_image_name, _ = os.path.splitext(image_name)

    image_path = os.path.join(img_dir, image_name)
    output_path = os.path.join(draw_dir, image_name)
    go_res_path = os.path.join(go_dir, '{0}.json'.format(base_image_name))
    sign_res_path = os.path.join(sign_dir, '{0}.json'.format(base_image_name))

    # Close the upload handle as soon as the request is done.
    with open(image_path, 'rb') as image_fp:
        go_response = requests.post(url=r'http://139.196.149.46:9001/gen_ocr', files={'file': image_fp})
    go_res = go_response.json()['ocr_results']

    # ensure_ascii=False writes raw non-ASCII text, so pin the file encoding
    # instead of relying on the platform default.
    with open(go_res_path, 'w', encoding='utf-8') as fp:
        json.dump(go_res, fp, ensure_ascii=False)

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None for unreadable files; skip only the drawing.
        print('skip drawing, cannot read image: {0}'.format(image_path))
    else:
        for coordinates, text in go_res.values():
            # Box from point 0 to point 2 (indexes 0/1 and 4/5 of the 8 values).
            cv2.rectangle(img, (coordinates[0], coordinates[1]), (coordinates[4], coordinates[5]), (0, 255, 0), 2)
            # cv2 cannot render the CJK font, so the text is drawn with PIL.
            pil_img = Image.fromarray(img)
            draw = ImageDraw.Draw(pil_img)
            draw.text((coordinates[0], coordinates[1]), text, (255, 0, 0), font=font)
            img = np.array(pil_img)

        cv2.imwrite(output_path, img)

    with open(image_path, 'rb') as image_fp:
        sign_response = requests.post(url=r'http://139.196.149.46:9001/signature_detect', files={'file': image_fp})
    signature_res = sign_response.json()

    with open(sign_res_path, 'w', encoding='utf-8') as fp:
        json.dump(signature_res, fp, ensure_ascii=False)
67
68
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!