d29ec34f by 周伟奇

rm general_extractor

1 parent 32c61c54
# Single-character corrections for the upper-case (Chinese financial numeral)
# amount field: each key is a character seen in OCR output, each value is the
# numeral/unit character it is replaced with.
# Referenced below as the 'replace_map' argument of the 'replace_whole' fix
# method of the '价税合计大写' field.
# NOTE(review): the multi-character entries are commented out, presumably
# because a per-character translation table cannot express them - confirm.
REPLACE_DICT_1 = {
    "元": "圆",
    # "零角": "零",
    "柴": "柒",
    "染": "柒",
    "查": "壹",
    "武": "贰",
    "家": "贰",
    "就": "贰",
    "登": "叁",
    # "@整": "叁",
    "鑫": "叁",
    "垂": "叁",
    "捆": "捌",
    "搁": "捌",
    "级": "捌",
    "测": "捌",
    "拥": "捌",
    "损": "捌",
    "盒": "叁",
    "摄": "捌",
    "报": "捌",
    "会": "叁",
    "索": "壹",
    "任": "仟",
    "杆": "仟",
    "仔": "仟",
    "什": "仟",
    "付": "仟",
    "伴": "仟",
    "宿": "佰",
    "信": "佰",
    "情": "佰",
    "值": "佰",
    "荣": "柒",
    "渠": "柒",
    "类": "柒",
    "案": "柒",
    "集": "柒",
    "方": "万",
    "抬": "拾",
    "给": "拾",
    "樟": "肆",
    "单": "肆",
    "邮": "肆",
    "政": "玖",
    "拐": "捌",
    # The four entries below repeat keys already present above.
    # "柴": "柒",
    # "任": "仟",
    # "拥": "捌",
    # "会": "叁",
}
53
54
# Dictionary keys under which a document-type constant bundle (see
# INVOICE_CONST) stores its key list and its value-rule dictionary.
ARG_KEY_KEY_LIST = "keys_list"
ARG_KEY_VALUE_DICT = "values_dict"
57
# Anchor keys searched for in the OCR text boxes.
# Each tuple is (canonical_key, *accepted_variants, prefix_flag):
#   * every string is compared for an exact match against a text box;
#   * when prefix_flag is True and no exact match was found, a text box that
#     merely starts with the canonical key (first element) is accepted too and
#     the remainder of the text is kept as the value suffix.
# The position of a tuple in this list (the trailing number in each comment)
# is the key index used by the 'location' rules of INVOICE_VALUE_DICT, so
# entries must not be reordered.
INVOICE_KEY_LIST = [
    ('纳税人识别号', False),  # near-duplicate of another key  0
    ('增值税', False),  # near-duplicate of another key  1

    ('地', False),  # single-character key  2
    ('址', False),  # single-character key  3

    ('开票日期', '开票曰期', '开票日', True),  # 4
    ('发票代码', '发票代鸡', True),  # 5
    ('发票号码', '发票号瑞', '发要号瑞', True),  # 6
    ('机打代码', False),  # 7
    ('机打号码', '机打号玛', False),  # 8
    ('机器编号', False),  # 9
    ('购买方名称', '购买方名称及', False),  # 10
    ('纳税人识别号/', False),  # 11
    ('统一社会信用代码/', False),  # 12
    ('身份证号码', '身份证号码/', False),  # 13
    ('车辆类型', True),  # 14
    ('厂牌型号', '广牌型号', '厂胖型号', '广牌型考', True),  # 15
    ('产地', '严地', True),  # 16
    ('合格证号', False),  # 17
    ('进口证明书号', True),  # 18
    ('商检单号', True),  # 19
    ('发动机号码', False),  # 20
    ('车辆识别代号/车架号码', True),  # 21
    ('价税合计', '价现合计', '价“税合计', False),  # 22
    ('小写', True),  # 23 TODO: which value to take when there are several
    ('销货单位名称', False),  # 24
    ('电话', True),  # 25
    ('账号', '账考', '帐号', '帐考', '张号', '陈号', '昨号', True),  # 26
    ('开户银行', True),  # 27
    ('增值税税率', True),  # 28 value false
    ('或征收税', False),  # 29
    ('税额', False),  # 30
    ('主管税务', True),  # 31 value False
    ('机关及代码', True),  # 32
    ('不含税价', True),  # 33 value False
    ('完税凭证号码', False),  # 34
    ('开票人', True),  # 35
    ('吨位', True),  # 36
    ('限乘人数', '跟乘人数', True),  # 37 TODO: coordinate splitting for the '人数' case
    ('备注', True)  # 38
]
101
# Value-lookup rules, one entry per output field.
#
# Per-field keys:
#   'length'      expected text length (or None); passed to the lookup methods.
#   'str_type'    declared value type; not read by the Retriever code visible
#                 in this file - presumably informational, confirm.
#   'location'    ordered list of lookup rules; the first rule that yields a
#                 value wins.
#   'fix_methods' optional list of (Retriever method name, kwargs) applied in
#                 order to the found text.
#
# Each 'location' rule is the 8-tuple
#   (key_idx, direction, top, bottom, offset, scope, choice, if_startswith)
#   key_idx        index into INVOICE_KEY_LIST of the anchor key
#   direction      'left' or 'right' of the anchor key
#   top, bottom    vertical tolerance, in multiples of the key box height
#   offset         horizontal gap from the key box, in multiples of its width
#   scope          (*limiting_key_idxs, width_factor): the search window ends
#                  at the first limiting key that was found, otherwise it
#                  spans width_factor key widths
#   choice         suffix of the Retriever.choice_* method used when several
#                  candidates fall inside the window
#   if_startswith  a string (here 'split') when the value may sit in the same
#                  text box as the key, else None
#
# Original author notes on if_startswith modes:
#   split   key and value are in one text box
#   append  key + value suffix; needs coordinate splitting
#   insert  key + value prefix; needs coordinate splitting
INVOICE_VALUE_DICT = {
    '开票日期': {
        'length': 10,
        'str_type': 'date',
        # idx, location, top, bottom, left, (idx, scope), choice, if_startswith
        'location': [(4, 'right', 0.3, 0.5, 0, (2, ), 'xmin', 'split')],
        # NOTE(review): ':' appears twice in this set; one of them was
        # presumably a full-width colon that got normalised - confirm.
        'fix_methods': [('prune_first_char', {'char_set': {':', ':', ';', }})]
    },
    '发票代码': {
        'length': 12,
        'str_type': 'int',
        'location': [(5, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split')]
    },
    '发票号码': {
        'length': 8,
        'str_type': 'int',
        'location': [(6, 'right', 0.2, 0.5, 0, (2, ), 'length', 'split')],
        'fix_methods': [('prune_first_char', {'char_set': {'-',}})]
    },
    '机打代码': {
        'length': 12,
        'str_type': 'int',
        'location': [(7, 'right', 0.5, 1, 0, (2, ), 'ymin', None)]
    },
    '机器编号': {
        'length': 12,
        'str_type': 'int',
        'location': [(9, 'right', 0.5, 1, 0, (2, ), 'ymax', None)]
    },
    '机打号码': {
        'length': 8,
        'str_type': 'int',
        'location': [(8, 'right', 0.5, 0.5, 0, (2, ), 'length', None)]
    },
    '购买方名称': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(10, 'right', 0.5, 0.5, 0, (11, 12, 13, 2), 'xmin', None)]
    },
    '纳税人识别号/统一社会信用代码/身份证号码': {
        'length': 18,
        'str_type': 'str',  # alnum
        'location': [(11, 'right', 0, 2, 0, (2.5, ), 'length', None), (12, 'right', 1, 1, 0, (2, ), 'length', None), (13, 'right', 2, 0, 0.5, (3, ), 'length', None)]
    },
    '车辆类型': {
        'length': None,
        'str_type': 'str',
        'location': [(14, 'right', 0.2, 0.2, 0, (15, 1.5), 'xmin', 'split'), (15, 'left', 0.2, 0.2, 0, (14, 2.5), 'xmax', None)]
    },
    '厂牌型号': {
        'length': None,
        'str_type': 'str',
        'location': [(15, 'right', 0.2, 0.2, 0, (16, 3.5), 'xmin', 'split'), (16, 'left', 0.2, 0.2, 0, (15, 2.5), 'xmax', None)]
    },
    '产地': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(16, 'right', 0.2, 0.2, 0, (2.5, ), 'xmin', 'split')]
    },
    '合格证号': {
        'length': None,  # 15
        'str_type': 'str',  # alnum
        'location': [(17, 'right', 0.2, 0.2, 0, (18, 1.5), 'xmin', None), (18, 'left', 0.2, 0.2, 0, (17, 1.5), 'xmax', None)]
    },
    '进口证明书号': {
        'length': None,
        'str_type': 'str',  # alnum
        'location': [(18, 'right', 0.3, 0.3, 0, (19, 1.5), 'xmin', 'split'), (19, 'left', 0.2, 0.2, 0, (18, 3), 'xmax', None)]
    },
    '商检单号': {
        'length': None,
        'str_type': 'str',
        'location': [(19, 'right', 0.2, 0.2, 0, (1.5, ), 'xmin', 'split')]
    },
    '发动机号码': {
        'length': None,
        'str_type': 'str',  # alnum
        'location': [(20, 'right', 0.2, 0.2, 0, (21, 2), 'xmin', None), (21, 'left', 0.2, 0.2, 0, (20, 1.4), 'xmax', None)]
    },
    '车辆识别代号/车架号码': {
        'length': 17,
        'str_type': 'str',  # alnum
        'location': [(21, 'right', 0.3, 0.3, 0, (1.2, ), 'xmin', 'split')]
    },
    '价税合计大写': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(22, 'right', 0.2, 0.2, 0, (23, 3), 'xmin', None), (23, 'left', 0.2, 0.2, 0, (22, 15), 'xmax', None)],
        # Strip non-Chinese characters, then map misread characters to numerals.
        'fix_methods': [('prune_no_cn', {}), ('replace_whole', {'replace_map': REPLACE_DICT_1})]
    },
    '价税合计小写': {
        'length': None,
        'str_type': 'float',
        'location': [(23, 'right', 0.4, 0.4, 0, (4, ), 'xmin', 'split')],
        'fix_methods': [('prune_amount', {})]
    },
    '销货单位名称': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(24, 'right', 0.2, 0.2, 0, (25, 3), 'xmin', None), (25, 'left', 0.3, 0.3, 0, (24, 15), 'xmax', None)]
    },
    '电话': {
        'length': None,
        'str_type': 'str',  # int + -
        'location': [(25, 'right', 0.3, 0.3, 0, (5, ), 'xmin', 'split')]
    },
    '纳税人识别号': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(0, 'right', 0.3, 0.3, 0, (26, 2.5), 'xmin', None), (26, 'left', 0.3, 0.3, 0, (0, 15), 'xmax', None)]
    },
    '账号': {
        'length': None,
        'str_type': 'str',
        'location': [(26, 'right', 0.3, 0.3, 0, (6, ), 'xmin', 'split')]
    },
    '地址': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(27, 'left', 0.3, 0.3, 0, (3, 4), 'merge', None), (3, 'right', 0.3, 0.3, 0, (27, 20), 'xmin', None)]
    },
    '开户银行': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(27, 'right', 0.3, 0.3, 0, (3, ), 'xmin', 'split')]
    },
    '增值税税率或征收率': {
        'length': 3,
        'str_type': 'str',  # 13%
        'location': [(28, 'right', 0, 1, 0, (1, 30, 1), 'xmin', None), (29, 'right', 1, 0, 0, (1, 30, 1), 'xmin', None),
                     (1, 'left', 0, 1, 0, (28, 29, 2), 'xmax', None), (30, 'left', 1, 0, 0, (28, 29, 2), 'xmax', None)],
        # A trailing '8', '9' or '号' is rewritten to '%'.
        'fix_methods': [('replace_last_char', {'char_set': {'8', '9', '号'}, 'target_char': '%'})]

    },
    '增值税税额': {
        'length': None,
        'str_type': 'float',
        'location': [(1, 'right', 0, 1, 0, (31, 32, 2.5), 'xmin', None), (30, 'right', 1, 0, 0, (31, 32, 2.5), 'xmin', None),
                     (31, 'left', 0, 1, 0, (1, 30, 2), 'xmax', None), (32, 'left', 1, 0, 0, (1, 30, 2), 'xmax', None)],
        'fix_methods': [('prune_amount', {})]
    },
    '主管税务机关及代码': {
        'length': None,
        'str_type': 'str',
        'location': [(31, 'right', 0, 1.5, 0, (2, ), 'merge', None), (32, 'right', 1, 0.5, 0, (2, ), 'merge', None)]
    },
    '不含税价-小写': {
        'length': None,
        'str_type': 'float',  # cn
        'location': [(34, 'left', 0.3, 0.3, 0, (33, 1.5), 'xmax', None), (33, 'right', 0.2, 0.2, 0, (34, 1.5), 'xmin', None)],
        'fix_methods': [('prune_amount', {})]
    },
    '完税凭证号码': {
        'length': None,
        'str_type': 'str',
        'location': [(34, 'right', 0.2, 0.2, 0, (36, 1.5), 'xmin', None), (36, 'left', 0.2, 0.2, 0, (34, 6), 'xmax', None)]
    },
    '吨位': {
        'length': None,
        'str_type': 'str',
        'location': [(36, 'right', 0.2, 0.2, 0, (37, 1), 'xmin', 'split'), (37, 'left', 0.2, 0.2, 0, (36, 0.5), 'xmax', None)]
    },
    '限乘人数': {
        'length': None,
        'str_type': 'int',
        'location': [(37, 'right', 0.2, 0.2, 0, (0.5, ), 'xmin', 'split')]
    },
    '开票人': {
        'length': None,
        'str_type': 'str',
        'location': [(35, 'right', 0, 0.5, 0, (1.5, ), 'xmin', 'split')]
    },
    '备注': {
        'length': None,
        'str_type': 'str',
        'location': [(38, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split')],
        # NOTE(review): ':' appears twice in this set; one of them was
        # presumably a full-width colon that got normalised - confirm.
        'fix_methods': [('prune_first_char', {'char_set': {';', ':', ':'}})]
    },

}
285
# Bundle of the invoice key list and value rules, stored under the
# ARG_KEY_* names ('keys_list' / 'values_dict').
# NOTE(review): these names match Retriever.__init__'s parameters, so this is
# presumably meant to be passed as Retriever(**INVOICE_CONST) - confirm.
INVOICE_CONST = {
    ARG_KEY_KEY_LIST: INVOICE_KEY_LIST,
    ARG_KEY_VALUE_DICT: INVOICE_VALUE_DICT
}
1 import re
2 import math
3
4
class Retriever:
    """Extract named fields from OCR text boxes using key anchors.

    ``keys_list`` is a list of tuples ``(canonical_key, *variants, flag)``.
    ``values_dict`` maps an output field name to a rule dict with the keys
    ``'length'``, ``'location'`` and, optionally, ``'fix_methods'``.

    OCR results (``go_res``) are dicts of
    ``box_id -> ((x0, y0, x1, y1, x2, y2, x3, y3), text)``.  The code treats
    ``(x0, y0)`` and ``(x2, y2)`` as opposite corners of a box and
    ``(x0, y0) -> (x1, y1)`` as the text direction (presumably the corners
    are listed clockwise from the top-left - confirm with the OCR service).

    A found key is stored in ``find_keys_list`` as the 12-tuple
    ``(canonical_key, matched_key, text, suffix_or_None, x0, ..., y3)``.
    """

    def __init__(self, keys_list=None, values_dict=None):
        # None sentinels instead of mutable defaults, so instances created
        # without arguments never share the same list/dict object.
        self.keys_list = [] if keys_list is None else keys_list
        self.values_dict = {} if values_dict is None else values_dict
        self.find_keys_list = []

    @staticmethod
    def get_theta(x0, y0, x1, y1):
        """Return ``(cos, sin)`` of the angle ``atan((y0 - y1) / (x1 - x0))``.

        A degenerate edge with ``x0 == x1`` no longer raises
        ``ZeroDivisionError``: it yields the limiting angle (+/- 90 degrees by
        the sign of ``y0 - y1``, or 0 when both points coincide).
        """
        delta_x = x1 - x0
        delta_y = y0 - y1
        if delta_x == 0:
            if delta_y == 0:
                return 1.0, 0.0
            theta = math.copysign(math.pi / 2, delta_y)
        else:
            theta = math.atan(delta_y / delta_x)
        return math.cos(theta), math.sin(theta)

    @staticmethod
    def rebuild_xy(x, y, cos, sin):
        """Rotate the point ``(x, y)`` by the angle given as ``cos``/``sin``."""
        rebuild_x = x * cos - y * sin
        rebuild_y = y * cos + x * sin
        return rebuild_x, rebuild_y

    def rebuild_coord(self, coord_tuple, cos, sin):
        """Rotate a flat ``(x0, y0, x1, y1, ...)`` sequence; return a list."""
        rebuild_list = []
        for idx in range(0, len(coord_tuple), 2):
            rebuild_list.extend(self.rebuild_xy(coord_tuple[idx], coord_tuple[idx + 1], cos, sin))
        return rebuild_list

    @staticmethod
    def prune_no_cn(src_str):
        """Drop every character outside the range U+4E00..U+9FA5."""
        return re.sub(r'[^\u4e00-\u9fa5]+', '', src_str)

    @staticmethod
    def prune_first_char(src_str, char_set):
        """Remove the first character when it is in ``char_set``.

        An empty string is returned unchanged (previously an IndexError).
        """
        if src_str and src_str[0] in char_set:
            return src_str[1:]
        return src_str

    @staticmethod
    def prune_amount(src_str):
        """Keep only digits, ``','`` and ``'.'``."""
        return ''.join(ch for ch in src_str if ch in (',', '.') or ch.isdigit())

    @staticmethod
    def replace_whole(src_str, replace_map):
        """Translate characters of ``src_str`` through ``replace_map``."""
        return src_str.translate(str.maketrans(replace_map))

    @staticmethod
    def replace_last_char(src_str, char_set, target_char):
        """Replace the last character with ``target_char`` if in ``char_set``.

        An empty string is returned unchanged (previously an IndexError).
        """
        if src_str and src_str[-1] in char_set:
            return src_str[:-1] + target_char
        return src_str

    # ------------------------------------------------------------------
    # Candidate selection.  Every candidate is the 10-tuple
    # (text, x0, y0, x1, y1, x2, y2, x3, y3, (box_id, ...)).
    # On ties the candidate that comes first in ``value_list`` wins, which
    # is what the previous stable sort + [0] produced.
    # ------------------------------------------------------------------

    @staticmethod
    def choice_xmin(value_list, value_length):
        """Return the candidate with the smallest ``x0``."""
        return min(value_list, key=lambda item: item[1])

    @staticmethod
    def choice_xmax(value_list, value_length):
        """Return the candidate with the largest ``x0``."""
        return max(value_list, key=lambda item: item[1])

    @staticmethod
    def choice_ymin(value_list, value_length):
        """Return the candidate with the smallest ``y0``."""
        return min(value_list, key=lambda item: item[2])

    @staticmethod
    def choice_ymax(value_list, value_length):
        """Return the candidate with the largest ``y0``."""
        return max(value_list, key=lambda item: item[2])

    @staticmethod
    def choice_merge(value_list, value_length):
        """Concatenate all candidates, ordered by ``y0``, into one candidate.

        The returned box is the axis-aligned bounding box of all four corners
        of every candidate (the earlier version looked at the top edge only,
        so the merged "bottom" was too high).
        """
        ordered = sorted(value_list, key=lambda item: item[2])
        texts = []
        box_ids = []
        xs = []
        ys = []
        for text, x0, y0, x1, y1, x2, y2, x3, y3, idx_tuple in ordered:
            texts.append(text)
            box_ids.extend(idx_tuple)
            xs.extend((x0, x1, x2, x3))
            ys.extend((y0, y1, y2, y3))
        left, right = min(xs), max(xs)
        top, bottom = min(ys), max(ys)
        return (''.join(texts),
                left, top,
                right, top,
                right, bottom,
                left, bottom,
                tuple(box_ids))

    @staticmethod
    def choice_length(value_list, value_length):
        """Return the candidate whose text length is closest to ``value_length``.

        The previous implementation sorted the list and then returned
        ``None``, so a field resolved through this method was always reported
        as missing.  When no expected length is configured the first
        candidate is returned.
        """
        if not isinstance(value_length, int):
            return value_list[0]
        return min(value_list, key=lambda item: abs(len(item[0]) - value_length))

    # ------------------------------------------------------------------
    # Helpers shared by value_direction_left / value_direction_right.
    # ------------------------------------------------------------------

    def _scope_anchor(self, scope_tuple, x_pos, y_pos):
        """Return one corner of the first found limiting key, else ``None``.

        All but the last element of ``scope_tuple`` are key indexes; ``x_pos``
        and ``y_pos`` select the corner inside the stored key tuple.
        """
        for scope_key_idx in scope_tuple[:-1]:
            found = self.find_keys_list[scope_key_idx]
            if found is not None:
                return found[x_pos], found[y_pos]
        return None

    def _pick_value(self, go_res, cos, sin, x_min, x_max, y_min, y_max, choice_method, length):
        """Collect boxes whose rotated centre lies strictly inside the window.

        Returns ``None`` for no hit, the single hit, or the result of
        ``choice_<choice_method>`` when several boxes qualify.
        """
        candidates = []
        for go_key_idx, ((x0, y0, x1, y1, x2, y2, x3, y3), text) in go_res.items():
            cent_x, cent_y = self.rebuild_xy(x0 + ((x2 - x0) / 2), y0 + ((y2 - y0) / 2), cos, sin)
            if x_min < cent_x < x_max and y_min < cent_y < y_max:
                candidates.append((text, x0, y0, x1, y1, x2, y2, x3, y3, (go_key_idx,)))

        if not candidates:
            return None
        if len(candidates) == 1:
            return candidates[0]
        # TODO: should the candidates be rotated before choosing?
        return getattr(self, 'choice_{0}'.format(choice_method))(candidates, length)

    def value_direction_left(self, go_res, key_idx, top_or_left, bottom_or_right, offset, scope_tuple, choice_method,
                             if_startswith, length):
        """Look for the value of key ``key_idx`` to the LEFT of the key box.

        The window ends ``offset`` key widths left of the key and starts at
        the ``(x1, y1)`` corner of the first found limiting key, or
        ``scope_tuple[-1]`` key widths further left when none was found.
        ``if_startswith`` is accepted for signature parity with
        ``value_direction_right`` but is not used here.

        Returns a candidate tuple, or ``None`` when the key or value is
        missing.
        """
        found_key = self.find_keys_list[key_idx]
        if found_key is None:
            return None
        key_coords_src = found_key[4:12]

        anchor = self._scope_anchor(scope_tuple, 6, 7)  # x1, y1 of the limiting key

        # Rotate into a frame where the key's top edge is horizontal.
        cos, sin = self.get_theta(*key_coords_src[:4])
        key_x0, key_y0, _, _, key_x2, key_y2, _, _ = self.rebuild_coord(key_coords_src, cos, sin)

        height = key_y2 - key_y0
        y_min = key_y0 - (top_or_left * height)
        y_max = key_y2 + (bottom_or_right * height)

        width = key_x2 - key_x0
        x_max = key_x0 - (offset * width)
        if anchor is None:
            x_min = x_max - (width * scope_tuple[-1])
        else:
            x_min = self.rebuild_xy(*anchor, cos, sin)[0]

        return self._pick_value(go_res, cos, sin, x_min, x_max, y_min, y_max, choice_method, length)

    def value_direction_right(self, go_res, key_idx, top_or_left, bottom_or_right, offset, scope_tuple, choice_method,
                              if_startswith, length):
        """Look for the value of key ``key_idx`` to the RIGHT of the key box.

        When ``if_startswith`` is a string and the key was found as the
        prefix of a longer text, that text's suffix is returned directly as
        ``(suffix, key_coords, ())`` - unless an integer ``length`` is
        configured and the suffix length differs from it by 3 or more, in
        which case the positional search below is used instead.

        The window starts ``offset`` key widths right of the key and ends at
        the ``(x0, y0)`` corner of the first found limiting key, or
        ``scope_tuple[-1]`` key widths further right when none was found.

        Returns a candidate tuple, or ``None`` when the key or value is
        missing.
        """
        found_key = self.find_keys_list[key_idx]
        if found_key is None:
            return None
        suffix_key = found_key[3]
        key_coords_src = found_key[4:12]

        anchor = self._scope_anchor(scope_tuple, 4, 5)  # x0, y0 of the limiting key

        # TODO: validate / repair suffix_key; only the 'split' mode is handled.
        if isinstance(if_startswith, str) and isinstance(suffix_key, str):
            if not isinstance(length, int) or -3 < length - len(suffix_key) < 3:
                return suffix_key, key_coords_src, ()

        # Rotate into a frame where the key's top edge is horizontal.
        cos, sin = self.get_theta(*key_coords_src[:4])
        key_x0, key_y0, _, _, key_x2, key_y2, _, _ = self.rebuild_coord(key_coords_src, cos, sin)

        height = key_y2 - key_y0
        y_min = key_y0 - (top_or_left * height)
        y_max = key_y2 + (bottom_or_right * height)

        width = key_x2 - key_x0
        x_min = key_x2 + (offset * width)
        if anchor is None:
            x_max = x_min + (width * scope_tuple[-1])
        else:
            x_max = self.rebuild_xy(*anchor, cos, sin)[0]

        return self._pick_value(go_res, cos, sin, x_min, x_max, y_min, y_max, choice_method, length)

    @staticmethod
    def splitext(base_str, key_str, x0, y0, x1, y1, x2, y2, x3, y3):
        """Split ``base_str`` into ``key_str`` and the text following it.

        Returns ``(key_str, suffix, x1, y1, x2, y2)``; the coordinates are
        passed through unchanged (TODO: split the box coordinates as well).
        """
        suffix_value = base_str[len(key_str):]
        return key_str, suffix_value, x1, y1, x2, y2

    def search_keys(self, go_res):
        """Locate every configured key in ``go_res`` and store the result.

        Pass 1 accepts a text box that equals any spelling of a key.  Pass 2
        (only for keys whose trailing flag is true and that are still
        missing) accepts a box that starts with the canonical spelling and
        keeps the remainder as the value suffix.  Whitespace-only boxes and
        boxes used as keys are removed from ``go_res`` in place.  The result
        is stored in ``self.find_keys_list``.
        """
        find_keys_list = [None] * len(self.keys_list)
        rm_go_key_set = set()
        done_key_idx_set = set()

        # Pass 1: exact match against any spelling of the key.
        for key_idx, key_tuple in enumerate(self.keys_list):
            spellings = key_tuple[:-1]
            for str_idx, ((x0, y0, x1, y1, x2, y2, x3, y3), text) in go_res.items():
                if not text.strip():  # blank box: schedule for removal
                    rm_go_key_set.add(str_idx)
                    continue
                if text in spellings:
                    find_keys_list[key_idx] = (key_tuple[0], text, text, None, x0, y0, x1, y1, x2, y2, x3, y3)
                    done_key_idx_set.add(key_idx)
                    rm_go_key_set.add(str_idx)
                    break

        # Removal is deferred so the dict is never mutated while iterated.
        for go_key in rm_go_key_set:
            go_res.pop(go_key)
        rm_go_key_set.clear()

        # Pass 2: prefix match on the canonical spelling.
        for key_idx, key_tuple in enumerate(self.keys_list):
            if key_idx in done_key_idx_set or not key_tuple[-1]:
                continue

            canonical_key = key_tuple[0]
            for str_idx, ((x0, y0, x1, y1, x2, y2, x3, y3), text) in go_res.items():
                if text.startswith(canonical_key):
                    _, suffix_value, new_x1, new_y1, new_x2, new_y2 = self.splitext(
                        text, canonical_key, x0, y0, x1, y1, x2, y2, x3, y3)
                    find_keys_list[key_idx] = (canonical_key, canonical_key, text, suffix_value,
                                               x0, y0, new_x1, new_y1, new_x2, new_y2, x3, y3)
                    done_key_idx_set.add(key_idx)
                    rm_go_key_set.add(str_idx)
                    break

        for go_key in rm_go_key_set:
            go_res.pop(go_key)
        rm_go_key_set.clear()

        self.find_keys_list = find_keys_list

    def search_values(self, go_res):
        """Resolve every field of ``values_dict``; return ``{field: text}``.

        For each field the ``'location'`` rules are tried in order until one
        yields a value; the ``'fix_methods'`` are then applied to its text.
        Fields without a value map to ``''``.  Boxes consumed as values are
        removed from ``go_res`` in place so later fields cannot reuse them.
        """
        # rule layout: idx, location, top, bottom, left, (idx, scope), choice, if_startswith
        find_value_dict = dict()
        for cn_key, search_dict in self.values_dict.items():
            # Reset per field: with an empty 'location' list the variable was
            # previously unbound (or still held the previous field's value).
            value_tuple = None
            for key_idx, direction_str, top_or_left, bottom_or_right, offset, scope_tuple, choice_method, \
                    if_startswith in search_dict['location']:
                value_tuple = getattr(self, 'value_direction_{0}'.format(direction_str))(
                    go_res,
                    key_idx,
                    top_or_left,
                    bottom_or_right,
                    offset,
                    scope_tuple,
                    choice_method,
                    if_startswith,
                    search_dict['length'],
                )
                if isinstance(value_tuple, tuple):
                    break

            if not isinstance(value_tuple, tuple):
                find_value_dict[cn_key] = ''
                continue

            fixed_str = value_tuple[0]
            for fix_method, kwargs in search_dict.get('fix_methods', []):
                fixed_str = getattr(self, fix_method)(fixed_str, **kwargs)
            find_value_dict[cn_key] = fixed_str

            # TODO: rebuild coordinates of the returned value.

            # The last element lists the ids of the boxes that were consumed.
            for go_key in value_tuple[-1]:
                go_res.pop(go_key)

        return find_value_dict

    def extract_fields(self, go_res):
        """Run key search then value search; ``go_res`` is modified in place."""
        self.search_keys(go_res)
        return self.search_values(go_res)
358
1 import json
2 import os
3 import base64
4 import requests
5 import cv2
6 import time
7 import numpy as np
8 from PIL import Image, ImageDraw, ImageFont
9
10
# Batch driver: for every image in img_dir, call the OCR and signature
# detection services, store both JSON responses, and save a copy of the image
# with the recognised boxes and texts drawn on it.

base_dir = os.path.dirname(os.path.abspath(__file__))
img_dir = '/home/zwq/data/gcfp/valid/image'
draw_dir = os.path.join(base_dir, 'draw', 'valid')
sign_dir = os.path.join(base_dir, 'sign_res', 'valid')
go_dir = os.path.join(base_dir, 'go_res', 'valid')

font_path = os.path.join(base_dir, 'simhei.ttf')
font = ImageFont.truetype(font_path, 10, encoding="utf-8")

# Create the output folders up front so the writes below cannot fail on a
# fresh checkout.
for out_dir in (draw_dir, sign_dir, go_dir):
    os.makedirs(out_dir, exist_ok=True)


for image_name in os.listdir(img_dir):

    print('start: {0}'.format(image_name))
    base_image_name, _ = os.path.splitext(image_name)

    image_path = os.path.join(img_dir, image_name)
    output_path = os.path.join(draw_dir, image_name)
    go_res_path = os.path.join(go_dir, '{0}.json'.format(base_image_name))
    sign_res_path = os.path.join(sign_dir, '{0}.json'.format(base_image_name))

    # Upload inside a context manager so the file handle is closed after the
    # request instead of being leaked once per image.
    with open(image_path, 'rb') as image_fp:
        go_response = requests.post(url=r'http://139.196.149.46:9001/gen_ocr', files={'file': image_fp})
    go_res = go_response.json()['ocr_results']

    # ensure_ascii=False emits raw non-ASCII text, so the encoding is pinned
    # to UTF-8 rather than left to the platform default.
    with open(go_res_path, 'w', encoding='utf-8') as fp:
        json.dump(go_res, fp, ensure_ascii=False)

    img = cv2.imread(image_path)
    for coordinates, text in go_res.values():
        # Box from the first corner (x0, y0) to the third corner (x2, y2).
        cv2.rectangle(img, (coordinates[0], coordinates[1]), (coordinates[4], coordinates[5]), (0, 255, 0), 2)
        # Text is drawn through PIL with the TrueType font loaded above.
        pil_img = Image.fromarray(img)
        draw = ImageDraw.Draw(pil_img)
        draw.text((coordinates[0], coordinates[1]), text, (255, 0, 0), font=font)
        img = np.array(pil_img)

    cv2.imwrite(output_path, img)

    with open(image_path, 'rb') as image_fp:
        sign_response = requests.post(url=r'http://139.196.149.46:9001/signature_detect', files={'file': image_fp})
    signature_res = sign_response.json()

    with open(sign_res_path, 'w', encoding='utf-8') as fp:
        json.dump(signature_res, fp, ensure_ascii=False)

    # Disabled field-extraction timing run, kept for reference:
    # start_time = time.time()
    # res = retriever_individuals.get_target_fields(go_res, signature_res)
    # print(res)
    # end_time = time.time()
    # print('time: {0}'.format(end_time - start_time))
    # break
67
68
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!