add general extractor
Showing
4 changed files
with
364 additions
and
0 deletions
general_extractor/const.py
0 → 100644
# Character substitution table passed as `replace_map` to the 'replace_whole'
# fix method of the upper-case amount field (see INVOICE_VALUE_DICT).
# Each pair is (character found in the text, financial numeral/unit to put in
# its place); the left-hand characters are presumably common OCR misreads.
# Pairs are listed in their original order so the dict iterates identically.
# Disabled candidates, kept for reference only: "零角" -> "零", "@整" -> "叁".
REPLACE_DICT_1 = dict((
    ("元", "圆"),
    ("柴", "柒"),
    ("染", "柒"),
    ("查", "壹"),
    ("武", "贰"),
    ("家", "贰"),
    ("就", "贰"),
    ("登", "叁"),
    ("鑫", "叁"),
    ("垂", "叁"),
    ("捆", "捌"),
    ("搁", "捌"),
    ("级", "捌"),
    ("测", "捌"),
    ("拥", "捌"),
    ("损", "捌"),
    ("盒", "叁"),
    ("摄", "捌"),
    ("报", "捌"),
    ("会", "叁"),
    ("索", "壹"),
    ("任", "仟"),
    ("杆", "仟"),
    ("仔", "仟"),
    ("什", "仟"),
    ("付", "仟"),
    ("伴", "仟"),
    ("宿", "佰"),
    ("信", "佰"),
    ("情", "佰"),
    ("值", "佰"),
    ("荣", "柒"),
    ("渠", "柒"),
    ("类", "柒"),
    ("案", "柒"),
    ("集", "柒"),
    ("方", "万"),
    ("抬", "拾"),
    ("给", "拾"),
    ("樟", "肆"),
    ("单", "肆"),
    ("邮", "肆"),
    ("政", "玖"),
    ("拐", "捌"),
))
53 | |||
54 | |||
# Names of the two entries of an extractor configuration dict such as
# INVOICE_CONST: the key-phrase list and the per-field rule dict.
ARG_KEY_KEY_LIST, ARG_KEY_VALUE_DICT = "keys_list", "values_dict"
57 | |||
# Key phrases looked up in the text of an invoice.
# The position of an entry in this list is the index that the 'location' rules
# in INVOICE_VALUE_DICT refer to (the trailing "# n" comments spell it out).
# Each tuple holds the canonical key text, then any alternative spellings
# (these look like typical OCR misreads), and finally a boolean flag.
# NOTE(review): the flag is interpreted by the consumer of this table
# (retriever.py, not visible here) — confirm its meaning there.
INVOICE_KEY_LIST = [
    ("纳税人识别号", False),  # 0  near-duplicate of another key
    ("增值税", False),  # 1  near-duplicate of another key

    ("地", False),  # 2  single-character key
    ("址", False),  # 3  single-character key

    ("开票日期", "开票曰期", "开票日", True),  # 4
    ("发票代码", "发票代鸡", True),  # 5
    ("发票号码", "发票号瑞", "发要号瑞", True),  # 6
    ("机打代码", False),  # 7
    ("机打号码", "机打号玛", False),  # 8
    ("机器编号", False),  # 9
    ("购买方名称", "购买方名称及", False),  # 10
    ("纳税人识别号/", False),  # 11
    ("统一社会信用代码/", False),  # 12
    ("身份证号码", "身份证号码/", False),  # 13
    ("车辆类型", True),  # 14
    ("厂牌型号", "广牌型号", "厂胖型号", "广牌型考", True),  # 15
    ("产地", "严地", True),  # 16
    ("合格证号", False),  # 17
    ("进口证明书号", True),  # 18
    ("商检单号", True),  # 19
    ("发动机号码", False),  # 20
    ("车辆识别代号/车架号码", True),  # 21
    ("价税合计", "价现合计", "价“税合计", False),  # 22
    ("小写", True),  # 23  TODO: decide which value to take when there are several
    ("销货单位名称", False),  # 24
    ("电话", True),  # 25
    ("账号", "账考", "帐号", "帐考", "张号", "陈号", "昨号", True),  # 26
    ("开户银行", True),  # 27
    ("增值税税率", True),  # 28  value false
    ("或征收税", False),  # 29
    ("税额", False),  # 30
    ("主管税务", True),  # 31  value False
    ("机关及代码", True),  # 32
    ("不含税价", True),  # 33  value False
    ("完税凭证号码", False),  # 34
    ("开票人", True),  # 35
    ("吨位", True),  # 36
    ("限乘人数", "跟乘人数", True),  # 37  TODO: coordinate-based splitting for cases like '人数'
    ("备注", True),  # 38
]
101 | |||
# Per-field extraction rules, keyed by the output field name.
#
# Every rule carries:
#   "length"      - presumably the expected length of the value (None = not
#                   constrained); confirm against the retriever
#   "str_type"    - value type tag ("date", "int", "str", "float")
#   "location"    - list of candidate anchor rules, each a tuple of
#                   idx, location, top, bottom, left, (idx, scope), choice, if_startswith
#                   where idx is a position in INVOICE_KEY_LIST
#   "fix_methods" - optional list of (method name, keyword arguments) pairs
#
# Notes carried over from the original author on the last tuple element:
#   split  - key and value come as one fused text item
#   append - key followed by a value suffix; needs coordinate-based splitting
#   insert - key preceded by a value prefix; needs coordinate-based splitting
INVOICE_VALUE_DICT = {
    "开票日期": {
        "length": 10,
        "str_type": "date",
        "location": [
            (4, "right", 0.3, 0.5, 0, (2,), "xmin", "split"),
        ],
        # NOTE(review): both colons below are identical as seen here; one was
        # presumably a full-width colon in the original file — confirm.
        "fix_methods": [
            ("prune_first_char", {"char_set": {':', ':', ';'}}),
        ],
    },
    "发票代码": {
        "length": 12,
        "str_type": "int",
        "location": [
            (5, "right", 0.2, 0.2, 0, (2,), "xmin", "split"),
        ],
    },
    "发票号码": {
        "length": 8,
        "str_type": "int",
        "location": [
            (6, "right", 0.2, 0.5, 0, (2,), "length", "split"),
        ],
        "fix_methods": [
            ("prune_first_char", {"char_set": {"-"}}),
        ],
    },
    "机打代码": {
        "length": 12,
        "str_type": "int",
        "location": [
            (7, "right", 0.5, 1, 0, (2,), "ymin", None),
        ],
    },
    "机器编号": {
        "length": 12,
        "str_type": "int",
        "location": [
            (9, "right", 0.5, 1, 0, (2,), "ymax", None),
        ],
    },
    "机打号码": {
        "length": 8,
        "str_type": "int",
        "location": [
            (8, "right", 0.5, 0.5, 0, (2,), "length", None),
        ],
    },
    "购买方名称": {
        "length": None,
        "str_type": "str",  # cn
        "location": [
            (10, "right", 0.5, 0.5, 0, (11, 12, 13, 2), "xmin", None),
        ],
    },
    "纳税人识别号/统一社会信用代码/身份证号码": {
        "length": 18,
        "str_type": "str",  # alnum
        "location": [
            (11, "right", 0, 2, 0, (2.5,), "length", None),
            (12, "right", 1, 1, 0, (2,), "length", None),
            (13, "right", 2, 0, 0.5, (3,), "length", None),
        ],
    },
    "车辆类型": {
        "length": None,
        "str_type": "str",
        "location": [
            (14, "right", 0.2, 0.2, 0, (15, 1.5), "xmin", "split"),
            (15, "left", 0.2, 0.2, 0, (14, 2.5), "xmax", None),
        ],
    },
    "厂牌型号": {
        "length": None,
        "str_type": "str",
        "location": [
            (15, "right", 0.2, 0.2, 0, (16, 3.5), "xmin", "split"),
            (16, "left", 0.2, 0.2, 0, (15, 2.5), "xmax", None),
        ],
    },
    "产地": {
        "length": None,
        "str_type": "str",  # cn
        "location": [
            (16, "right", 0.2, 0.2, 0, (2.5,), "xmin", "split"),
        ],
    },
    "合格证号": {
        "length": None,  # 15
        "str_type": "str",  # alnum
        "location": [
            (17, "right", 0.2, 0.2, 0, (18, 1.5), "xmin", None),
            (18, "left", 0.2, 0.2, 0, (17, 1.5), "xmax", None),
        ],
    },
    "进口证明书号": {
        "length": None,
        "str_type": "str",  # alnum
        "location": [
            (18, "right", 0.3, 0.3, 0, (19, 1.5), "xmin", "split"),
            (19, "left", 0.2, 0.2, 0, (18, 3), "xmax", None),
        ],
    },
    "商检单号": {
        "length": None,
        "str_type": "str",
        "location": [
            (19, "right", 0.2, 0.2, 0, (1.5,), "xmin", "split"),
        ],
    },
    "发动机号码": {
        "length": None,
        "str_type": "str",  # alnum
        "location": [
            (20, "right", 0.2, 0.2, 0, (21, 2), "xmin", None),
            (21, "left", 0.2, 0.2, 0, (20, 1.4), "xmax", None),
        ],
    },
    "车辆识别代号/车架号码": {
        "length": 17,
        "str_type": "str",  # alnum
        "location": [
            (21, "right", 0.3, 0.3, 0, (1.2,), "xmin", "split"),
        ],
    },
    "价税合计大写": {
        "length": None,
        "str_type": "str",  # cn
        "location": [
            (22, "right", 0.2, 0.2, 0, (23, 3), "xmin", None),
            (23, "left", 0.2, 0.2, 0, (22, 15), "xmax", None),
        ],
        "fix_methods": [
            ("prune_no_cn", {}),
            ("replace_whole", {"replace_map": REPLACE_DICT_1}),
        ],
    },
    "价税合计小写": {
        "length": None,
        "str_type": "float",
        "location": [
            (23, "right", 0.4, 0.4, 0, (4,), "xmin", "split"),
        ],
        "fix_methods": [
            ("prune_amount", {}),
        ],
    },
    "销货单位名称": {
        "length": None,
        "str_type": "str",  # cn
        "location": [
            (24, "right", 0.2, 0.2, 0, (25, 3), "xmin", None),
            (25, "left", 0.3, 0.3, 0, (24, 15), "xmax", None),
        ],
    },
    "电话": {
        "length": None,
        "str_type": "str",  # int + -
        "location": [
            (25, "right", 0.3, 0.3, 0, (5,), "xmin", "split"),
        ],
    },
    "纳税人识别号": {
        "length": None,
        "str_type": "str",  # cn
        "location": [
            (0, "right", 0.3, 0.3, 0, (26, 2.5), "xmin", None),
            (26, "left", 0.3, 0.3, 0, (0, 15), "xmax", None),
        ],
    },
    "账号": {
        "length": None,
        "str_type": "str",
        "location": [
            (26, "right", 0.3, 0.3, 0, (6,), "xmin", "split"),
        ],
    },
    "地址": {
        "length": None,
        "str_type": "str",  # cn
        "location": [
            (27, "left", 0.3, 0.3, 0, (3, 4), "merge", None),
            (3, "right", 0.3, 0.3, 0, (27, 20), "xmin", None),
        ],
    },
    "开户银行": {
        "length": None,
        "str_type": "str",  # cn
        "location": [
            (27, "right", 0.3, 0.3, 0, (3,), "xmin", "split"),
        ],
    },
    "增值税税率或征收率": {
        "length": 3,
        "str_type": "str",  # 13%
        "location": [
            (28, "right", 0, 1, 0, (1, 30, 1), "xmin", None),
            (29, "right", 1, 0, 0, (1, 30, 1), "xmin", None),
            (1, "left", 0, 1, 0, (28, 29, 2), "xmax", None),
            (30, "left", 1, 0, 0, (28, 29, 2), "xmax", None),
        ],
        "fix_methods": [
            ("replace_last_char", {"char_set": {"8", "9", "号"}, "target_char": "%"}),
        ],
    },
    "增值税税额": {
        "length": None,
        "str_type": "float",
        "location": [
            (1, "right", 0, 1, 0, (31, 32, 2.5), "xmin", None),
            (30, "right", 1, 0, 0, (31, 32, 2.5), "xmin", None),
            (31, "left", 0, 1, 0, (1, 30, 2), "xmax", None),
            (32, "left", 1, 0, 0, (1, 30, 2), "xmax", None),
        ],
        "fix_methods": [
            ("prune_amount", {}),
        ],
    },
    "主管税务机关及代码": {
        "length": None,
        "str_type": "str",
        "location": [
            (31, "right", 0, 1.5, 0, (2,), "merge", None),
            (32, "right", 1, 0.5, 0, (2,), "merge", None),
        ],
    },
    "不含税价-小写": {
        "length": None,
        "str_type": "float",  # cn
        "location": [
            (34, "left", 0.3, 0.3, 0, (33, 1.5), "xmax", None),
            (33, "right", 0.2, 0.2, 0, (34, 1.5), "xmin", None),
        ],
        "fix_methods": [
            ("prune_amount", {}),
        ],
    },
    "完税凭证号码": {
        "length": None,
        "str_type": "str",
        "location": [
            (34, "right", 0.2, 0.2, 0, (36, 1.5), "xmin", None),
            (36, "left", 0.2, 0.2, 0, (34, 6), "xmax", None),
        ],
    },
    "吨位": {
        "length": None,
        "str_type": "str",
        "location": [
            (36, "right", 0.2, 0.2, 0, (37, 1), "xmin", "split"),
            (37, "left", 0.2, 0.2, 0, (36, 0.5), "xmax", None),
        ],
    },
    "限乘人数": {
        "length": None,
        "str_type": "int",
        "location": [
            (37, "right", 0.2, 0.2, 0, (0.5,), "xmin", "split"),
        ],
    },
    "开票人": {
        "length": None,
        "str_type": "str",
        "location": [
            (35, "right", 0, 0.5, 0, (1.5,), "xmin", "split"),
        ],
    },
    "备注": {
        "length": None,
        "str_type": "str",
        "location": [
            (38, "right", 0.2, 0.2, 0, (2,), "xmin", "split"),
        ],
        # NOTE(review): the two colons are identical as seen here; one was
        # presumably a full-width colon in the original file — confirm.
        "fix_methods": [
            ("prune_first_char", {"char_set": {';', ':', ':'}}),
        ],
    },
}
285 | |||
# Complete invoice extractor configuration: the key-phrase list and the
# per-field rules, stored under the two ARG_KEY_* names.
INVOICE_CONST = dict(zip(
    (ARG_KEY_KEY_LIST, ARG_KEY_VALUE_DICT),
    (INVOICE_KEY_LIST, INVOICE_VALUE_DICT),
))
general_extractor/retriever.py
0 → 100644
This diff is collapsed.
Click to expand it.
general_extractor/step1.py
0 → 100644
1 | import json | ||
2 | import os | ||
3 | import base64 | ||
4 | import requests | ||
5 | import cv2 | ||
6 | import time | ||
7 | import numpy as np | ||
8 | from PIL import Image, ImageDraw, ImageFont | ||
9 | |||
10 | |||
# All outputs are written below the directory that contains this script.
base_dir = os.path.dirname(os.path.abspath(__file__))
# Input images are read from this hard-coded absolute path.
img_dir = "/home/zwq/data/gcfp/valid/image"
# Output folders: annotated images, signature-detection JSON, OCR JSON.
draw_dir, sign_dir, go_dir = (
    os.path.join(base_dir, folder, "valid")
    for folder in ("draw", "sign_res", "go_res")
)

# Size-10 font loaded from simhei.ttf next to the script; used to write the
# recognised text onto the annotated images.
font_path = os.path.join(base_dir, "simhei.ttf")
font = ImageFont.truetype(font_path, 10, encoding="utf-8")
19 | |||
20 | |||
# Endpoints of the OCR service and of the signature detector.
_GEN_OCR_URL = r'http://139.196.149.46:9001/gen_ocr'
_SIGNATURE_URL = r'http://139.196.149.46:9001/signature_detect'


def _post_image(url, image_path):
    """Upload the image at *image_path* as multipart field ``file``.

    Returns the decoded JSON body of the reply.
    """
    # The context manager closes the upload handle even when the request
    # raises; the original opened the file inline and never closed it.
    with open(image_path, 'rb') as image_file:
        response = requests.post(url=url, files={'file': image_file})
    return response.json()


def _annotate(img, ocr_items):
    """Draw every OCR box and its text on *img* and return the new array.

    *ocr_items* is a list of ``(coordinates, text)`` pairs; elements 0/1 and
    4/5 of ``coordinates`` are used as opposite corners of the rectangle.
    """
    for coordinates, _ in ocr_items:
        cv2.rectangle(img, (coordinates[0], coordinates[1]), (coordinates[4], coordinates[5]), (0, 255, 0), 2)

    # Convert to PIL once per image instead of once per box: the text is
    # drawn with PIL because the font is a PIL ImageFont.
    # Text is therefore drawn after all boxes rather than interleaved with them.
    # NOTE(review): img comes from cv2.imread (BGR order), so the (255, 0, 0)
    # text colour is saved as blue rather than red — confirm intent.
    pil_img = Image.fromarray(img)
    draw = ImageDraw.Draw(pil_img)
    for coordinates, text in ocr_items:
        draw.text((coordinates[0], coordinates[1]), text, (255, 0, 0), font=font)
    return np.array(pil_img)


# Create the output folders up front; open(..., 'w') and cv2.imwrite below
# cannot write into a directory that does not exist.
for _out_dir in (draw_dir, go_dir, sign_dir):
    os.makedirs(_out_dir, exist_ok=True)

for image_name in os.listdir(img_dir):

    print('start: {0}'.format(image_name))
    base_image_name, _ = os.path.splitext(image_name)

    image_path = os.path.join(img_dir, image_name)
    output_path = os.path.join(draw_dir, image_name)
    go_res_path = os.path.join(go_dir, '{0}.json'.format(base_image_name))
    sign_res_path = os.path.join(sign_dir, '{0}.json'.format(base_image_name))

    # 1) General OCR: keep the raw result as JSON.
    go_res = _post_image(_GEN_OCR_URL, image_path)['ocr_results']

    # ensure_ascii=False writes the Chinese text verbatim, so the file
    # encoding must be fixed rather than left to the locale default.
    with open(go_res_path, 'w', encoding='utf-8') as fp:
        json.dump(go_res, fp, ensure_ascii=False)

    # 2) Debug visualisation of the OCR boxes.
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None for files it cannot decode.
        print('skip drawing, cannot read image: {0}'.format(image_path))
    else:
        cv2.imwrite(output_path, _annotate(img, list(go_res.values())))

    # 3) Signature detection: keep the raw result as JSON.
    signature_res = _post_image(_SIGNATURE_URL, image_path)

    with open(sign_res_path, 'w', encoding='utf-8') as fp:
        json.dump(signature_res, fp, ensure_ascii=False)

    # The field-retrieval step that consumes go_res and signature_res
    # (retriever_individuals.get_target_fields in the removed draft code)
    # is not run from this script.
67 | |||
68 |
-
Please register or sign in to post a comment