add general extractor
Showing
4 changed files
with
701 additions
and
0 deletions
general_extractor/const.py
0 → 100644
# Single-character substitution table. It is passed as `replace_map` to the
# 'replace_whole' fix method of the '价税合计大写' field (see
# INVOICE_VALUE_DICT), which feeds it to str.maketrans, so every key must be
# exactly one character long.
# Every value is a character used in upper-case Chinese amounts; the keys are
# presumably look-alike characters the OCR engine produces instead -- confirm
# against real OCR output before adding new entries.
REPLACE_DICT_1 = {
    "元": "圆",
    # "零角": "零",
    "柴": "柒",
    "染": "柒",
    "查": "壹",
    "武": "贰",
    "家": "贰",
    "就": "贰",
    "登": "叁",
    # "@整": "叁",
    "鑫": "叁",
    "垂": "叁",
    "捆": "捌",
    "搁": "捌",
    "级": "捌",
    "测": "捌",
    "拥": "捌",
    "损": "捌",
    "盒": "叁",
    "摄": "捌",
    "报": "捌",
    "会": "叁",
    "索": "壹",
    "任": "仟",
    "杆": "仟",
    "仔": "仟",
    "什": "仟",
    "付": "仟",
    "伴": "仟",
    "宿": "佰",
    "信": "佰",
    "情": "佰",
    "值": "佰",
    "荣": "柒",
    "渠": "柒",
    "类": "柒",
    "案": "柒",
    "集": "柒",
    "方": "万",
    "抬": "拾",
    "给": "拾",
    "樟": "肆",
    "单": "肆",
    "邮": "肆",
    "政": "玖",
    "拐": "捌",
    # "柴": "柒",
    # "任": "仟",
    # "拥": "捌",
    # "会": "叁",
}
53 | |||
54 | |||
# Key names used to build INVOICE_CONST below. The string values equal the
# parameter names of Retriever.__init__ (keys_list, values_dict), presumably
# so the dict can be unpacked as keyword arguments -- confirm with the caller.
ARG_KEY_KEY_LIST = 'keys_list'
ARG_KEY_VALUE_DICT = 'values_dict'
57 | |||
# Field labels to look for in the OCR output.
# Each tuple is (canonical_label, *alternative_spellings, prefix_flag):
#   * every string is compared for exact equality with an OCR text box;
#   * the trailing bool tells Retriever.search_keys whether a text box that
#     merely STARTS WITH the canonical label may also match (label and value
#     recognised as one box).
# The position of a tuple in this list is its "key index"; INVOICE_VALUE_DICT
# refers to labels by that index, so do not reorder entries. The trailing
# number in each comment is that index.
INVOICE_KEY_LIST = [
    ('纳税人识别号', False),  # similar-looking key 0
    ('增值税', False),  # similar-looking key 1

    ('地', False),  # single-character key 2
    ('址', False),  # single-character key 3

    ('开票日期', '开票曰期', '开票日', True),  # 4
    ('发票代码', '发票代鸡', True),  # 5
    ('发票号码', '发票号瑞', '发要号瑞', True),  # 6
    ('机打代码', False),  # 7
    ('机打号码', '机打号玛', False),  # 8
    ('机器编号', False),  # 9
    ('购买方名称', '购买方名称及', False),  # 10
    ('纳税人识别号/', False),  # 11
    ('统一社会信用代码/', False),  # 12
    ('身份证号码', '身份证号码/', False),  # 13
    ('车辆类型', True),  # 14
    ('厂牌型号', '广牌型号', '厂胖型号', '广牌型考', True),  # 15
    ('产地', '严地', True),  # 16
    ('合格证号', False),  # 17
    ('进口证明书号', True),  # 18
    ('商检单号', True),  # 19
    ('发动机号码', False),  # 20
    ('车辆识别代号/车架号码', True),  # 21
    ('价税合计', '价现合计', '价“税合计', False),  # 22
    ('小写', True),  # 23 TODO: which value to take when there are several
    ('销货单位名称', False),  # 24
    ('电话', True),  # 25
    ('账号', '账考', '帐号', '帐考', '张号', '陈号', '昨号', True),  # 26
    ('开户银行', True),  # 27
    ('增值税税率', True),  # 28 value false
    ('或征收税', False),  # 29
    ('税额', False),  # 30
    ('主管税务', True),  # 31 value False
    ('机关及代码', True),  # 32
    ('不含税价', True),  # 33 value False
    ('完税凭证号码', False),  # 34
    ('开票人', True),  # 35
    ('吨位', True),  # 36
    ('限乘人数', '跟乘人数', True),  # 37 TODO: coordinate splitting for cases like '人数'
    ('备注', True)  # 38
]
101 | |||
# Extraction rules, one entry per output field (the dict key is the output
# field name returned by Retriever.search_values).
#
# Per-field settings:
#   'length'      expected value length, or None; used by the 'length' choice
#                 method and to sanity-check a value glued to its label.
#   'str_type'    declared value type; it does not appear to be read by the
#                 Retriever code -- TODO confirm whether anything consumes it.
#   'location'    list of search rules tried in order until one yields a value.
#   'fix_methods' optional list of (Retriever method name, kwargs) applied in
#                 order to the extracted text.
#
# Each 'location' rule is a tuple
#   (key_idx, direction, top, bottom, offset, scope, choice, if_startswith)
#   key_idx        index of the anchor label in INVOICE_KEY_LIST
#   direction      'left' / 'right': side of the label that is searched
#   top, bottom    vertical tolerance, in multiples of the label height
#   offset         horizontal gap, in multiples of the label width
#   scope          leading items are key indexes whose box bounds the search
#                  window; the last item is the window width (in label
#                  widths) used when none of those labels was found
#   choice         suffix of the Retriever.choice_* method used when several
#                  text boxes fall inside the window
#   if_startswith  a string (here always 'split') allows using the text that
#                  follows the label inside the same OCR box; None disables it
#
# Modes foreseen for if_startswith:
# split   key and value are in one text box
# append  key + value suffix, needs coordinate splitting
# insert  key + value prefix, needs coordinate splitting
INVOICE_VALUE_DICT = {
    '开票日期': {
        'length': 10,
        'str_type': 'date',
        # idx, location, top, bottom, left, (idx, scope), choice, if_startswith
        'location': [(4, 'right', 0.3, 0.5, 0, (2, ), 'xmin', 'split')],
        # NOTE(review): the two ':' entries below are identical as written;
        # one was presumably a full-width colon originally -- confirm.
        'fix_methods': [('prune_first_char', {'char_set': {':', ':', ';', }})]
    },
    '发票代码': {
        'length': 12,
        'str_type': 'int',
        'location': [(5, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split')]
    },
    '发票号码': {
        'length': 8,
        'str_type': 'int',
        'location': [(6, 'right', 0.2, 0.5, 0, (2, ), 'length', 'split')],
        'fix_methods': [('prune_first_char', {'char_set': {'-',}})]
    },
    '机打代码': {
        'length': 12,
        'str_type': 'int',
        'location': [(7, 'right', 0.5, 1, 0, (2, ), 'ymin', None)]
    },
    '机器编号': {
        'length': 12,
        'str_type': 'int',
        'location': [(9, 'right', 0.5, 1, 0, (2, ), 'ymax', None)]
    },
    '机打号码': {
        'length': 8,
        'str_type': 'int',
        'location': [(8, 'right', 0.5, 0.5, 0, (2, ), 'length', None)]
    },
    '购买方名称': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(10, 'right', 0.5, 0.5, 0, (11, 12, 13, 2), 'xmin', None)]
    },
    '纳税人识别号/统一社会信用代码/身份证号码': {
        'length': 18,
        'str_type': 'str',  # alnum
        'location': [(11, 'right', 0, 2, 0, (2.5, ), 'length', None), (12, 'right', 1, 1, 0, (2, ), 'length', None), (13, 'right', 2, 0, 0.5, (3, ), 'length', None)]
    },
    '车辆类型': {
        'length': None,
        'str_type': 'str',
        'location': [(14, 'right', 0.2, 0.2, 0, (15, 1.5), 'xmin', 'split'), (15, 'left', 0.2, 0.2, 0, (14, 2.5), 'xmax', None)]
    },
    '厂牌型号': {
        'length': None,
        'str_type': 'str',
        'location': [(15, 'right', 0.2, 0.2, 0, (16, 3.5), 'xmin', 'split'), (16, 'left', 0.2, 0.2, 0, (15, 2.5), 'xmax', None)]
    },
    '产地': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(16, 'right', 0.2, 0.2, 0, (2.5, ), 'xmin', 'split')]
    },
    '合格证号': {
        'length': None,  # 15
        'str_type': 'str',  # alnum
        'location': [(17, 'right', 0.2, 0.2, 0, (18, 1.5), 'xmin', None), (18, 'left', 0.2, 0.2, 0, (17, 1.5), 'xmax', None)]
    },
    '进口证明书号': {
        'length': None,
        'str_type': 'str',  # alnum
        'location': [(18, 'right', 0.3, 0.3, 0, (19, 1.5), 'xmin', 'split'), (19, 'left', 0.2, 0.2, 0, (18, 3), 'xmax', None)]
    },
    '商检单号': {
        'length': None,
        'str_type': 'str',
        'location': [(19, 'right', 0.2, 0.2, 0, (1.5, ), 'xmin', 'split')]
    },
    '发动机号码': {
        'length': None,
        'str_type': 'str',  # alnum
        'location': [(20, 'right', 0.2, 0.2, 0, (21, 2), 'xmin', None), (21, 'left', 0.2, 0.2, 0, (20, 1.4), 'xmax', None)]
    },
    '车辆识别代号/车架号码': {
        'length': 17,
        'str_type': 'str',  # alnum
        'location': [(21, 'right', 0.3, 0.3, 0, (1.2, ), 'xmin', 'split')]
    },
    '价税合计大写': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(22, 'right', 0.2, 0.2, 0, (23, 3), 'xmin', None), (23, 'left', 0.2, 0.2, 0, (22, 15), 'xmax', None)],
        # Keep only Chinese characters, then map look-alike characters back
        # to amount characters.
        'fix_methods': [('prune_no_cn', {}), ('replace_whole', {'replace_map': REPLACE_DICT_1})]
    },
    '价税合计小写': {
        'length': None,
        'str_type': 'float',
        'location': [(23, 'right', 0.4, 0.4, 0, (4, ), 'xmin', 'split')],
        'fix_methods': [('prune_amount', {})]
    },
    '销货单位名称': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(24, 'right', 0.2, 0.2, 0, (25, 3), 'xmin', None), (25, 'left', 0.3, 0.3, 0, (24, 15), 'xmax', None)]
    },
    '电话': {
        'length': None,
        'str_type': 'str',  # int + -
        'location': [(25, 'right', 0.3, 0.3, 0, (5, ), 'xmin', 'split')]
    },
    '纳税人识别号': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(0, 'right', 0.3, 0.3, 0, (26, 2.5), 'xmin', None), (26, 'left', 0.3, 0.3, 0, (0, 15), 'xmax', None)]
    },
    '账号': {
        'length': None,
        'str_type': 'str',
        'location': [(26, 'right', 0.3, 0.3, 0, (6, ), 'xmin', 'split')]
    },
    '地址': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(27, 'left', 0.3, 0.3, 0, (3, 4), 'merge', None), (3, 'right', 0.3, 0.3, 0, (27, 20), 'xmin', None)]
    },
    '开户银行': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [(27, 'right', 0.3, 0.3, 0, (3, ), 'xmin', 'split')]
    },
    '增值税税率或征收率': {
        'length': 3,
        'str_type': 'str',  # 13%
        'location': [(28, 'right', 0, 1, 0, (1, 30, 1), 'xmin', None), (29, 'right', 1, 0, 0, (1, 30, 1), 'xmin', None),
                     (1, 'left', 0, 1, 0, (28, 29, 2), 'xmax', None), (30, 'left', 1, 0, 0, (28, 29, 2), 'xmax', None)],
        # A trailing '8', '9' or '号' is rewritten to '%'.
        'fix_methods': [('replace_last_char', {'char_set': {'8', '9', '号'}, 'target_char': '%'})]

    },
    '增值税税额': {
        'length': None,
        'str_type': 'float',
        'location': [(1, 'right', 0, 1, 0, (31, 32, 2.5), 'xmin', None), (30, 'right', 1, 0, 0, (31, 32, 2.5), 'xmin', None),
                     (31, 'left', 0, 1, 0, (1, 30, 2), 'xmax', None), (32, 'left', 1, 0, 0, (1, 30, 2), 'xmax', None)],
        'fix_methods': [('prune_amount', {})]
    },
    '主管税务机关及代码': {
        'length': None,
        'str_type': 'str',
        'location': [(31, 'right', 0, 1.5, 0, (2, ), 'merge', None), (32, 'right', 1, 0.5, 0, (2, ), 'merge', None)]
    },
    '不含税价-小写': {
        'length': None,
        'str_type': 'float',  # cn
        'location': [(34, 'left', 0.3, 0.3, 0, (33, 1.5), 'xmax', None), (33, 'right', 0.2, 0.2, 0, (34, 1.5), 'xmin', None)],
        'fix_methods': [('prune_amount', {})]
    },
    '完税凭证号码': {
        'length': None,
        'str_type': 'str',
        'location': [(34, 'right', 0.2, 0.2, 0, (36, 1.5), 'xmin', None), (36, 'left', 0.2, 0.2, 0, (34, 6), 'xmax', None)]
    },
    '吨位': {
        'length': None,
        'str_type': 'str',
        'location': [(36, 'right', 0.2, 0.2, 0, (37, 1), 'xmin', 'split'), (37, 'left', 0.2, 0.2, 0, (36, 0.5), 'xmax', None)]
    },
    '限乘人数': {
        'length': None,
        'str_type': 'int',
        'location': [(37, 'right', 0.2, 0.2, 0, (0.5, ), 'xmin', 'split')]
    },
    '开票人': {
        'length': None,
        'str_type': 'str',
        'location': [(35, 'right', 0, 0.5, 0, (1.5, ), 'xmin', 'split')]
    },
    '备注': {
        'length': None,
        'str_type': 'str',
        'location': [(38, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split')],
        # NOTE(review): the two ':' entries below are identical as written;
        # one was presumably a full-width colon originally -- confirm.
        'fix_methods': [('prune_first_char', {'char_set': {';', ':', ':'}})]
    },

}
285 | |||
# Bundles the label list and the extraction rules under the keys
# 'keys_list' and 'values_dict' (the ARG_KEY_* constants above).
INVOICE_CONST = {
    ARG_KEY_KEY_LIST: INVOICE_KEY_LIST,
    ARG_KEY_VALUE_DICT: INVOICE_VALUE_DICT
}
general_extractor/retriever.py
0 → 100644
1 | import re | ||
2 | import math | ||
3 | |||
4 | |||
class Retriever:
    """Rule-based field extractor working on OCR text boxes.

    The OCR result (``go_res``) is a dict mapping a box id to
    ``((x0, y0, x1, y1, x2, y2, x3, y3), text)``: four corner points followed
    by the recognised text. Points 0 and 2 are used as opposite corners and
    points 0 -> 1 as the top edge of the box.

    ``keys_list`` holds the labels to look for; each entry is
    ``(canonical_label, *alternative_spellings, prefix_flag)``.
    ``values_dict`` maps an output field name to its extraction rules
    (``'length'``, ``'location'`` and optional ``'fix_methods'``).

    After :meth:`search_keys`, ``find_keys_list[i]`` is ``None`` when label
    ``i`` was not found, otherwise the 12-tuple
    ``(canonical_label, matched_label, box_text, value_suffix_or_None,
    x0, y0, x1, y1, x2, y2, x3, y3)``.
    """

    def __init__(self, keys_list=None, values_dict=None):
        # None sentinels instead of mutable defaults, so instances created
        # without arguments never share one list/dict object.
        self.keys_list = [] if keys_list is None else keys_list
        self.values_dict = {} if values_dict is None else values_dict
        self.find_keys_list = []

    @staticmethod
    def get_theta(x0, y0, x1, y1):
        """Return ``(cos, sin)`` of the tilt of the edge (x0, y0) -> (x1, y1).

        The angle is ``atan((y0 - y1) / (x1 - x0))``. A vertical edge
        (``x1 == x0``) maps to +/-90 degrees and a degenerate edge to 0
        degrees instead of raising ``ZeroDivisionError``.
        """
        dx = x1 - x0
        dy = y0 - y1
        if dx == 0:
            theta = math.copysign(math.pi / 2, dy) if dy else 0.0
        else:
            theta = math.atan(dy / dx)
        return math.cos(theta), math.sin(theta)

    @staticmethod
    def rebuild_xy(x, y, cos, sin):
        """Rotate the point (x, y) by the angle given as ``cos`` / ``sin``."""
        return x * cos - y * sin, y * cos + x * sin

    def rebuild_coord(self, coord_tuple, cos, sin):
        """Rotate a flat ``(x0, y0, x1, y1, ...)`` sequence; return a flat list."""
        rebuild_list = []
        for x, y in zip(coord_tuple[0::2], coord_tuple[1::2]):
            rebuild_list.extend(self.rebuild_xy(x, y, cos, sin))
        return rebuild_list

    @staticmethod
    def prune_no_cn(src_str):
        """Drop every character outside the range U+4E00..U+9FA5."""
        return re.sub(r'[^\u4e00-\u9fa5]+', '', src_str)

    @staticmethod
    def prune_first_char(src_str, char_set):
        """Remove the first character when it is in ``char_set``.

        An empty string is returned unchanged.
        """
        if src_str and src_str[0] in char_set:
            return src_str[1:]
        return src_str

    @staticmethod
    def prune_amount(src_str):
        """Keep only digits, ',' and '.'."""
        return ''.join(char for char in src_str if char in (',', '.') or char.isdigit())

    @staticmethod
    def replace_whole(src_str, replace_map):
        """Replace single characters according to ``replace_map``."""
        return src_str.translate(str.maketrans(replace_map))

    @staticmethod
    def replace_last_char(src_str, char_set, target_char):
        """Replace the last character by ``target_char`` when it is in ``char_set``.

        An empty string is returned unchanged.
        """
        if src_str and src_str[-1] in char_set:
            return src_str[:-1] + target_char
        return src_str

    # The choice_* methods pick one candidate out of several. A candidate is a
    # tuple whose first item is the text, whose last item is the tuple of OCR
    # box ids and whose items in between are coordinates (x0, y0, ...).
    # ``value_length`` is only used by choice_length; the other methods accept
    # it so that all of them can be called the same way.

    @staticmethod
    def choice_xmin(value_list, value_length):
        """Return the candidate with the smallest x0."""
        return min(value_list, key=lambda item: item[1])

    @staticmethod
    def choice_xmax(value_list, value_length):
        """Return the candidate with the largest x0."""
        return max(value_list, key=lambda item: item[1])

    @staticmethod
    def choice_ymin(value_list, value_length):
        """Return the candidate with the smallest y0."""
        return min(value_list, key=lambda item: item[2])

    @staticmethod
    def choice_ymax(value_list, value_length):
        """Return the candidate with the largest y0."""
        return max(value_list, key=lambda item: item[2])

    @staticmethod
    def choice_merge(value_list, value_length):
        """Concatenate all candidates from top to bottom.

        Works for 2-point and 4-point candidates alike. Returns
        ``(text, x_min, y_min, x_max, y_max, box_ids)`` where the box is the
        bounding box of all candidates.
        """
        texts, box_ids, xs, ys = [], [], [], []
        for item in sorted(value_list, key=lambda item: item[2]):
            coords = item[1:-1]
            texts.append(item[0])
            box_ids.extend(item[-1])
            xs.extend(coords[0::2])
            ys.extend(coords[1::2])
        return ''.join(texts), min(xs), min(ys), max(xs), max(ys), tuple(box_ids)

    @staticmethod
    def choice_length(value_list, value_length):
        """Return the candidate whose text length is closest to ``value_length``.

        Without an expected length the first candidate is returned.
        """
        if value_length is None:
            return value_list[0]
        return min(value_list, key=lambda item: abs(len(item[0]) - value_length))

    def value_direction_left(self, go_res, key_idx, top_or_left, bottom_or_right, offset, scope_tuple, choice_method,
                             if_startswith, length):
        """Search the value of a field to the LEFT of label ``key_idx``.

        Returns ``None`` when the label or a value was not found, otherwise a
        tuple ``(text, x0, y0, x1, y1, box_ids)`` where (x0, y0) / (x1, y1)
        are corner points 0 and 2 of the chosen box.
        """
        key_entry = self.find_keys_list[key_idx]
        if key_entry is None:
            return None

        # Key entries carry four corner points; only points 0 and 2 are used.
        suffix_key = key_entry[3]
        key_x0, key_y0 = key_entry[4], key_entry[5]
        key_x1, key_y1 = key_entry[8], key_entry[9]

        # Right edge (x of point 1) of the first bounding label that was found.
        key_scope = None
        for scope_key_idx in scope_tuple[:-1]:
            scope_entry = self.find_keys_list[scope_key_idx]
            if scope_entry is not None:
                key_scope = scope_entry[6]
                break

        if isinstance(if_startswith, str) and isinstance(suffix_key, str):
            # TODO: validate / correct suffix_key
            # TODO: only the 'split' case is handled so far
            return suffix_key, key_x0, key_y0, key_x1, key_y1, ()

        height = key_y1 - key_y0
        y_min = key_y0 - (top_or_left * height)
        y_max = key_y1 + (bottom_or_right * height)

        width = key_x1 - key_x0
        x_max = key_x0 - (offset * width)
        x_min = x_max - (width * scope_tuple[-1]) if key_scope is None else key_scope

        # A box is a candidate when its centre lies inside the search window.
        all_find_value_list = []
        for go_key_idx, ((x0, y0, _, _, x1, y1, _, _), text) in go_res.items():
            cent_x = x0 + ((x1 - x0) / 2)
            cent_y = y0 + ((y1 - y0) / 2)
            if x_min < cent_x < x_max and y_min < cent_y < y_max:
                all_find_value_list.append((text, x0, y0, x1, y1, (go_key_idx, )))

        if not all_find_value_list:
            return None
        if len(all_find_value_list) == 1:
            return all_find_value_list[0]
        return getattr(self, 'choice_{0}'.format(choice_method))(all_find_value_list, length)

    def value_direction_right(self, go_res, key_idx, top_or_left, bottom_or_right, offset, scope_tuple, choice_method,
                              if_startswith, length):
        """Search the value of a field to the RIGHT of label ``key_idx``.

        Returns ``None`` when the label or a value was not found. Otherwise
        the first item of the returned tuple is the text and the last item is
        the tuple of consumed OCR box ids (empty when the value was split off
        the label's own box).
        """
        key_entry = self.find_keys_list[key_idx]
        if key_entry is None:
            return None

        suffix_key = key_entry[3]
        key_coords = tuple(key_entry[4:12])

        # Point 0 of the first bounding label that was found.
        key_scope_tuple = None
        for scope_key_idx in scope_tuple[:-1]:
            scope_entry = self.find_keys_list[scope_key_idx]
            if scope_entry is not None:
                key_scope_tuple = (scope_entry[4], scope_entry[5])
                break

        if isinstance(if_startswith, str) and isinstance(suffix_key, str):
            # TODO: validate / correct suffix_key
            # TODO: only the 'split' case is handled so far
            # With an expected length, the suffix is only trusted when its
            # length is within 2 characters; otherwise fall through to the
            # spatial search.
            if not isinstance(length, int) or -3 < length - len(suffix_key) < 3:
                return suffix_key, key_coords, ()

        # Rotate into a frame in which the label's top edge is horizontal.
        cos, sin = self.get_theta(*key_coords[:4])
        key_x0, key_y0, _, _, key_x2, key_y2, _, _ = self.rebuild_coord(key_coords, cos, sin)

        height = key_y2 - key_y0
        y_min = key_y0 - (top_or_left * height)
        y_max = key_y2 + (bottom_or_right * height)

        width = key_x2 - key_x0
        x_min = key_x2 + (offset * width)
        if key_scope_tuple is None:
            x_max = x_min + (width * scope_tuple[-1])
        else:
            x_max = self.rebuild_xy(*key_scope_tuple, cos, sin)[0]

        # A box is a candidate when its rotated centre lies inside the window.
        all_find_value_list = []
        for go_key_idx, ((x0, y0, x1, y1, x2, y2, x3, y3), text) in go_res.items():
            cent_x, cent_y = self.rebuild_xy(x0 + ((x2 - x0) / 2), y0 + ((y2 - y0) / 2), cos, sin)
            if x_min < cent_x < x_max and y_min < cent_y < y_max:
                all_find_value_list.append((text, x0, y0, x1, y1, x2, y2, x3, y3, (go_key_idx, )))

        if not all_find_value_list:
            return None
        if len(all_find_value_list) == 1:
            return all_find_value_list[0]
        # TODO: should the choice use rotated coordinates?
        return getattr(self, 'choice_{0}'.format(choice_method))(all_find_value_list, length)

    @staticmethod
    def splitext(base_str, key_str, x0, y0, x1, y1, x2, y2, x3, y3):
        """Split ``base_str`` into the label ``key_str`` and the text after it.

        Returns ``(key_str, suffix, x1, y1, x2, y2)``; the coordinates are
        passed through unchanged.
        """
        suffix_value = base_str[len(key_str):]  # TODO: split the coordinates as well
        return key_str, suffix_value, x1, y1, x2, y2

    def search_keys(self, go_res):
        """Locate every label of ``keys_list`` and store it in ``find_keys_list``.

        ``go_res`` is modified in place: blank boxes and boxes recognised as
        labels are removed so that they cannot be picked up as values later.
        """
        find_keys_list = [None] * len(self.keys_list)
        rm_go_key_set = set()
        done_key_idx_set = set()

        # Pass 1: the box text equals one of the label's spellings.
        for key_idx, key_tuple in enumerate(self.keys_list):
            key_variants = key_tuple[:-1]
            for str_idx, ((x0, y0, x1, y1, x2, y2, x3, y3), text) in go_res.items():
                if not text.strip():  # blank box
                    rm_go_key_set.add(str_idx)
                    continue
                if text in key_variants:
                    find_keys_list[key_idx] = (key_tuple[0], text, text, None, x0, y0, x1, y1, x2, y2, x3, y3)
                    done_key_idx_set.add(key_idx)
                    rm_go_key_set.add(str_idx)
                    break

        for go_key in rm_go_key_set:
            go_res.pop(go_key)
        rm_go_key_set.clear()

        # Pass 2: the box text starts with the canonical label (label and
        # value in one box); only for labels whose prefix flag is set.
        for key_idx, key_tuple in enumerate(self.keys_list):
            if key_idx in done_key_idx_set or not key_tuple[-1]:
                continue

            for str_idx, ((x0, y0, x1, y1, x2, y2, x3, y3), text) in go_res.items():
                if text.startswith(key_tuple[0]):
                    _, suffix_value, new_x1, new_y1, new_x2, new_y2 = self.splitext(
                        text, key_tuple[0], x0, y0, x1, y1, x2, y2, x3, y3)
                    find_keys_list[key_idx] = (key_tuple[0], key_tuple[0], text, suffix_value,
                                               x0, y0, new_x1, new_y1, new_x2, new_y2, x3, y3)
                    done_key_idx_set.add(key_idx)
                    rm_go_key_set.add(str_idx)
                    break

        for go_key in rm_go_key_set:
            go_res.pop(go_key)

        self.find_keys_list = find_keys_list

    def search_values(self, go_res):
        """Extract every field of ``values_dict``.

        Returns a dict mapping each field name to its (fixed-up) text, or to
        ``''`` when nothing was found. Boxes used as values are removed from
        ``go_res`` so that a later field cannot reuse them.
        """
        # rule layout: idx, location, top, bottom, left, (idx, scope), choice, if_startswith
        find_value_dict = dict()
        for cn_key, search_dict in self.values_dict.items():
            # Reset per field so that an empty rule list can never reuse the
            # previous field's result.
            value_tuple = None
            for (key_idx, direction_str, top_or_left, bottom_or_right, offset, scope_tuple, choice_method,
                 if_startswith) in search_dict['location']:
                value_tuple = getattr(self, 'value_direction_{0}'.format(direction_str))(
                    go_res,
                    key_idx,
                    top_or_left,
                    bottom_or_right,
                    offset,
                    scope_tuple,
                    choice_method,
                    if_startswith,
                    search_dict['length'],
                )
                if isinstance(value_tuple, tuple):
                    break

            if not isinstance(value_tuple, tuple):
                find_value_dict[cn_key] = ''
                continue

            fixed_str = value_tuple[0]
            for fix_method, kwargs in search_dict.get('fix_methods', []):
                fixed_str = getattr(self, fix_method)(fixed_str, **kwargs)
            find_value_dict[cn_key] = fixed_str

            # TODO: rebuild coordinates

            # The last item holds the ids of the boxes that were consumed.
            for go_key in value_tuple[-1]:
                go_res.pop(go_key, None)

        return find_value_dict

    def extract_fields(self, go_res):
        """Run label search and value search; return the field dict.

        ``go_res`` is consumed (modified in place) by both steps.
        """
        self.search_keys(go_res)
        return self.search_values(go_res)
337 |
general_extractor/step1.py
0 → 100644
1 | import json | ||
2 | import os | ||
3 | import base64 | ||
4 | import requests | ||
5 | import cv2 | ||
6 | import time | ||
7 | import numpy as np | ||
8 | from PIL import Image, ImageDraw, ImageFont | ||
9 | |||
10 | |||
# Batch script: for every image in img_dir, call the OCR service and the
# signature-detection service, store both JSON answers and write a copy of the
# image with the recognised boxes and texts drawn on it.

base_dir = os.path.dirname(os.path.abspath(__file__))
img_dir = '/home/zwq/data/gcfp/valid/image'
draw_dir = os.path.join(base_dir, 'draw', 'valid')
sign_dir = os.path.join(base_dir, 'sign_res', 'valid')
go_dir = os.path.join(base_dir, 'go_res', 'valid')

font_path = os.path.join(base_dir, 'simhei.ttf')
font = ImageFont.truetype(font_path, 10, encoding="utf-8")

# The output directories are not part of the repository; create them up
# front so the first write does not fail.
for out_dir in (draw_dir, sign_dir, go_dir):
    os.makedirs(out_dir, exist_ok=True)


for image_name in os.listdir(img_dir):

    print('start: {0}'.format(image_name))
    base_image_name, _ = os.path.splitext(image_name)

    image_path = os.path.join(img_dir, image_name)
    output_path = os.path.join(draw_dir, image_name)
    go_res_path = os.path.join(go_dir, '{0}.json'.format(base_image_name))
    sign_res_path = os.path.join(sign_dir, '{0}.json'.format(base_image_name))

    # --- OCR -----------------------------------------------------------
    # The upload handle is closed as soon as the request has been sent.
    with open(image_path, 'rb') as image_file:
        go_response = requests.post(url=r'http://139.196.149.46:9001/gen_ocr', files={'file': image_file})
    # Fail with the HTTP status instead of an obscure JSON/KeyError later.
    go_response.raise_for_status()
    go_res = go_response.json()['ocr_results']

    # ensure_ascii=False writes raw Chinese characters, so the file encoding
    # must not depend on the locale.
    with open(go_res_path, 'w', encoding='utf-8') as fp:
        json.dump(go_res, fp, ensure_ascii=False)

    # --- debug drawing -------------------------------------------------
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None for files it cannot decode.
        print('skip drawing, unreadable image: {0}'.format(image_name))
    else:
        # Boxes first (OpenCV), then all texts in one PIL pass; PIL is needed
        # because the font is loaded through ImageFont. Converting once per
        # image avoids a full image copy per text box.
        for coordinates, _ in go_res.values():
            cv2.rectangle(img, (coordinates[0], coordinates[1]), (coordinates[4], coordinates[5]), (0, 255, 0), 2)
        pil_img = Image.fromarray(img)
        draw = ImageDraw.Draw(pil_img)
        for coordinates, text in go_res.values():
            draw.text((coordinates[0], coordinates[1]), text, (255, 0, 0), font=font)
        cv2.imwrite(output_path, np.array(pil_img))

    # --- signature detection --------------------------------------------
    with open(image_path, 'rb') as image_file:
        sign_response = requests.post(url=r'http://139.196.149.46:9001/signature_detect', files={'file': image_file})
    sign_response.raise_for_status()
    signature_res = sign_response.json()

    with open(sign_res_path, 'w', encoding='utf-8') as fp:
        json.dump(signature_res, fp, ensure_ascii=False)
67 | |||
68 |
-
Please register or sign in to post a comment