d29ec34f by 周伟奇

rm general_extractor

1 parent 32c61c54
# Single-character substitution table whose values are all capital Chinese
# numerals / amount units (壹 贰 叁 肆 柒 捌 玖 拾 佰 仟 万 圆).  It is passed
# as ``replace_map`` to the ``replace_whole`` fix method of the
# '价税合计大写' (total amount in words) field.
# NOTE(review): the keys look like characters the OCR engine confuses with
# those numerals -- confirm against the OCR error logs before extending.
#
# Rules that were disabled in the original table and are kept for reference:
#   '零角' -> '零',  '@整' -> '叁'
REPLACE_DICT_1 = {
    '元': '圆',
    '柴': '柒',
    '染': '柒',
    '查': '壹',
    '武': '贰',
    '家': '贰',
    '就': '贰',
    '登': '叁',
    '鑫': '叁',
    '垂': '叁',
    '捆': '捌',
    '搁': '捌',
    '级': '捌',
    '测': '捌',
    '拥': '捌',
    '损': '捌',
    '盒': '叁',
    '摄': '捌',
    '报': '捌',
    '会': '叁',
    '索': '壹',
    '任': '仟',
    '杆': '仟',
    '仔': '仟',
    '什': '仟',
    '付': '仟',
    '伴': '仟',
    '宿': '佰',
    '信': '佰',
    '情': '佰',
    '值': '佰',
    '荣': '柒',
    '渠': '柒',
    '类': '柒',
    '案': '柒',
    '集': '柒',
    '方': '万',
    '抬': '拾',
    '给': '拾',
    '樟': '肆',
    '单': '肆',
    '邮': '肆',
    '政': '玖',
    '拐': '捌',
}
53
54
# Names of the two entries of a per-document configuration mapping
# (see INVOICE_CONST): the key list and the value-rule dictionary.
ARG_KEY_KEY_LIST = "keys_list"
ARG_KEY_VALUE_DICT = "values_dict"
57
58 INVOICE_KEY_LIST = [
59 ('纳税人识别号', False), # 相近的key 0
60 ('增值税', False), # 相近的key 1
61
62 ('地', False), # 单字的key 2
63 ('址', False), # 单字的key 3
64
65 ('开票日期', '开票曰期', '开票日', True), # 4
66 ('发票代码', '发票代鸡', True), # 5
67 ('发票号码', '发票号瑞', '发要号瑞', True), # 6
68 ('机打代码', False), # 7
69 ('机打号码', '机打号玛', False), # 8
70 ('机器编号', False), # 9
71 ('购买方名称', '购买方名称及', False), # 10
72 ('纳税人识别号/', False), # 11
73 ('统一社会信用代码/', False), # 12
74 ('身份证号码', '身份证号码/', False), # 13
75 ('车辆类型', True), # 14
76 ('厂牌型号', '广牌型号', '厂胖型号', '广牌型考', True), # 15
77 ('产地', '严地', True), # 16
78 ('合格证号', False), # 17
79 ('进口证明书号', True), # 18
80 ('商检单号', True), # 19
81 ('发动机号码', False), # 20
82 ('车辆识别代号/车架号码', True), # 21
83 ('价税合计', '价现合计', '价“税合计', False), # 22
84 ('小写', True), # 23 TODO 多个值时的取值
85 ('销货单位名称', False), # 24
86 ('电话', True), # 25
87 ('账号', '账考', '帐号', '帐考', '张号', '陈号', '昨号', True), # 26
88 ('开户银行', True), # 27
89 ('增值税税率', True), # 28 value false
90 ('或征收税', False), # 29
91 ('税额', False), # 30
92 ('主管税务', True), # 31 value False
93 ('机关及代码', True), # 32
94 ('不含税价', True), # 33 value False
95 ('完税凭证号码', False), # 34
96 ('开票人', True), # 35
97 ('吨位', True), # 36
98 ('限乘人数', '跟乘人数', True), # 37 TODO '人数'这种情况的坐标切分
99 ('备注', True) # 38
100 ]
101
# Extraction rules for every target field of the invoice, keyed by the
# output field name.  Each rule holds:
#   'length'      -- expected number of characters of the value, or None
#   'str_type'    -- expected value type: 'date', 'int', 'float' or 'str'
#   'location'    -- list of candidate search rules, each an 8-tuple laid out
#                    as (idx, location, top, bottom, left, (idx, scope),
#                    choice, if_startswith)
#   'fix_methods' -- optional list of (method_name, kwargs) post-processing
#                    steps
# NOTE(review): ``idx`` values appear to be positions in INVOICE_KEY_LIST and
# top/bottom/left appear to be search-window ratios; the exact semantics live
# in the extractor that consumes this table -- confirm there.
#
# Key/value layouts noted by the original author:
#   split  -- key and value are recognised as one text box
#   append -- key followed by a value suffix; needs coordinate-based splitting
#   insert -- key preceded by a value prefix; needs coordinate-based splitting
#
# The 'prune_first_char' sets for '开票日期' and '备注' listed ':' twice in
# the original; the redundant member was evidently a full-width variant lost
# in transcoding, so both widths of ':' and ';' are listed explicitly (as
# escapes, so they survive further transcoding).
INVOICE_VALUE_DICT = {
    '开票日期': {
        'length': 10,
        'str_type': 'date',
        'location': [
            (4, 'right', 0.3, 0.5, 0, (2, ), 'xmin', 'split'),
        ],
        'fix_methods': [
            # '\uff1a' / '\uff1b' are the full-width colon / semicolon.
            ('prune_first_char', {'char_set': {':', '\uff1a', ';', '\uff1b'}}),
        ],
    },
    '发票代码': {
        'length': 12,
        'str_type': 'int',
        'location': [
            (5, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split'),
        ],
    },
    '发票号码': {
        'length': 8,
        'str_type': 'int',
        'location': [
            (6, 'right', 0.2, 0.5, 0, (2, ), 'length', 'split'),
        ],
        'fix_methods': [
            ('prune_first_char', {'char_set': {'-'}}),
        ],
    },
    '机打代码': {
        'length': 12,
        'str_type': 'int',
        'location': [
            (7, 'right', 0.5, 1, 0, (2, ), 'ymin', None),
        ],
    },
    '机器编号': {
        'length': 12,
        'str_type': 'int',
        'location': [
            (9, 'right', 0.5, 1, 0, (2, ), 'ymax', None),
        ],
    },
    '机打号码': {
        'length': 8,
        'str_type': 'int',
        'location': [
            (8, 'right', 0.5, 0.5, 0, (2, ), 'length', None),
        ],
    },
    '购买方名称': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [
            (10, 'right', 0.5, 0.5, 0, (11, 12, 13, 2), 'xmin', None),
        ],
    },
    '纳税人识别号/统一社会信用代码/身份证号码': {
        'length': 18,
        'str_type': 'str',  # alphanumeric
        'location': [
            (11, 'right', 0, 2, 0, (2.5, ), 'length', None),
            (12, 'right', 1, 1, 0, (2, ), 'length', None),
            (13, 'right', 2, 0, 0.5, (3, ), 'length', None),
        ],
    },
    '车辆类型': {
        'length': None,
        'str_type': 'str',
        'location': [
            (14, 'right', 0.2, 0.2, 0, (15, 1.5), 'xmin', 'split'),
            (15, 'left', 0.2, 0.2, 0, (14, 2.5), 'xmax', None),
        ],
    },
    '厂牌型号': {
        'length': None,
        'str_type': 'str',
        'location': [
            (15, 'right', 0.2, 0.2, 0, (16, 3.5), 'xmin', 'split'),
            (16, 'left', 0.2, 0.2, 0, (15, 2.5), 'xmax', None),
        ],
    },
    '产地': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [
            (16, 'right', 0.2, 0.2, 0, (2.5, ), 'xmin', 'split'),
        ],
    },
    '合格证号': {
        'length': None,  # 15
        'str_type': 'str',  # alphanumeric
        'location': [
            (17, 'right', 0.2, 0.2, 0, (18, 1.5), 'xmin', None),
            (18, 'left', 0.2, 0.2, 0, (17, 1.5), 'xmax', None),
        ],
    },
    '进口证明书号': {
        'length': None,
        'str_type': 'str',  # alphanumeric
        'location': [
            (18, 'right', 0.3, 0.3, 0, (19, 1.5), 'xmin', 'split'),
            (19, 'left', 0.2, 0.2, 0, (18, 3), 'xmax', None),
        ],
    },
    '商检单号': {
        'length': None,
        'str_type': 'str',
        'location': [
            (19, 'right', 0.2, 0.2, 0, (1.5, ), 'xmin', 'split'),
        ],
    },
    '发动机号码': {
        'length': None,
        'str_type': 'str',  # alphanumeric
        'location': [
            (20, 'right', 0.2, 0.2, 0, (21, 2), 'xmin', None),
            (21, 'left', 0.2, 0.2, 0, (20, 1.4), 'xmax', None),
        ],
    },
    '车辆识别代号/车架号码': {
        'length': 17,
        'str_type': 'str',  # alphanumeric
        'location': [
            (21, 'right', 0.3, 0.3, 0, (1.2, ), 'xmin', 'split'),
        ],
    },
    '价税合计大写': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [
            (22, 'right', 0.2, 0.2, 0, (23, 3), 'xmin', None),
            (23, 'left', 0.2, 0.2, 0, (22, 15), 'xmax', None),
        ],
        'fix_methods': [
            ('prune_no_cn', {}),
            ('replace_whole', {'replace_map': REPLACE_DICT_1}),
        ],
    },
    '价税合计小写': {
        'length': None,
        'str_type': 'float',
        'location': [
            (23, 'right', 0.4, 0.4, 0, (4, ), 'xmin', 'split'),
        ],
        'fix_methods': [
            ('prune_amount', {}),
        ],
    },
    '销货单位名称': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [
            (24, 'right', 0.2, 0.2, 0, (25, 3), 'xmin', None),
            (25, 'left', 0.3, 0.3, 0, (24, 15), 'xmax', None),
        ],
    },
    '电话': {
        'length': None,
        'str_type': 'str',  # digits and '-'
        'location': [
            (25, 'right', 0.3, 0.3, 0, (5, ), 'xmin', 'split'),
        ],
    },
    '纳税人识别号': {
        'length': None,
        'str_type': 'str',
        'location': [
            (0, 'right', 0.3, 0.3, 0, (26, 2.5), 'xmin', None),
            (26, 'left', 0.3, 0.3, 0, (0, 15), 'xmax', None),
        ],
    },
    '账号': {
        'length': None,
        'str_type': 'str',
        'location': [
            (26, 'right', 0.3, 0.3, 0, (6, ), 'xmin', 'split'),
        ],
    },
    '地址': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [
            (27, 'left', 0.3, 0.3, 0, (3, 4), 'merge', None),
            (3, 'right', 0.3, 0.3, 0, (27, 20), 'xmin', None),
        ],
    },
    '开户银行': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [
            (27, 'right', 0.3, 0.3, 0, (3, ), 'xmin', 'split'),
        ],
    },
    '增值税税率或征收率': {
        'length': 3,
        'str_type': 'str',  # e.g. 13%
        'location': [
            (28, 'right', 0, 1, 0, (1, 30, 1), 'xmin', None),
            (29, 'right', 1, 0, 0, (1, 30, 1), 'xmin', None),
            (1, 'left', 0, 1, 0, (28, 29, 2), 'xmax', None),
            (30, 'left', 1, 0, 0, (28, 29, 2), 'xmax', None),
        ],
        'fix_methods': [
            ('replace_last_char', {'char_set': {'8', '9', '号'}, 'target_char': '%'}),
        ],
    },
    '增值税税额': {
        'length': None,
        'str_type': 'float',
        'location': [
            (1, 'right', 0, 1, 0, (31, 32, 2.5), 'xmin', None),
            (30, 'right', 1, 0, 0, (31, 32, 2.5), 'xmin', None),
            (31, 'left', 0, 1, 0, (1, 30, 2), 'xmax', None),
            (32, 'left', 1, 0, 0, (1, 30, 2), 'xmax', None),
        ],
        'fix_methods': [
            ('prune_amount', {}),
        ],
    },
    '主管税务机关及代码': {
        'length': None,
        'str_type': 'str',
        'location': [
            (31, 'right', 0, 1.5, 0, (2, ), 'merge', None),
            (32, 'right', 1, 0.5, 0, (2, ), 'merge', None),
        ],
    },
    '不含税价-小写': {
        'length': None,
        'str_type': 'float',
        'location': [
            (34, 'left', 0.3, 0.3, 0, (33, 1.5), 'xmax', None),
            (33, 'right', 0.2, 0.2, 0, (34, 1.5), 'xmin', None),
        ],
        'fix_methods': [
            ('prune_amount', {}),
        ],
    },
    '完税凭证号码': {
        'length': None,
        'str_type': 'str',
        'location': [
            (34, 'right', 0.2, 0.2, 0, (36, 1.5), 'xmin', None),
            (36, 'left', 0.2, 0.2, 0, (34, 6), 'xmax', None),
        ],
    },
    '吨位': {
        'length': None,
        'str_type': 'str',
        'location': [
            (36, 'right', 0.2, 0.2, 0, (37, 1), 'xmin', 'split'),
            (37, 'left', 0.2, 0.2, 0, (36, 0.5), 'xmax', None),
        ],
    },
    '限乘人数': {
        'length': None,
        'str_type': 'int',
        'location': [
            (37, 'right', 0.2, 0.2, 0, (0.5, ), 'xmin', 'split'),
        ],
    },
    '开票人': {
        'length': None,
        'str_type': 'str',
        'location': [
            (35, 'right', 0, 0.5, 0, (1.5, ), 'xmin', 'split'),
        ],
    },
    '备注': {
        'length': None,
        'str_type': 'str',
        'location': [
            (38, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split'),
        ],
        'fix_methods': [
            # '\uff1a' / '\uff1b' are the full-width colon / semicolon.
            ('prune_first_char', {'char_set': {':', '\uff1a', ';', '\uff1b'}}),
        ],
    },
}
285
# Bundle of the invoice configuration: the key list and the value rules,
# stored under the ARG_KEY_* names so a consumer can look both up by key.
INVOICE_CONST = {
    ARG_KEY_KEY_LIST: INVOICE_KEY_LIST,
    ARG_KEY_VALUE_DICT: INVOICE_VALUE_DICT,
}
1 import json
2 import os
3 import base64
4 import requests
5 import cv2
6 import time
7 import numpy as np
8 from PIL import Image, ImageDraw, ImageFont
9
10
# Batch script.  For every file in ``img_dir`` it
#   1. posts the image to the OCR service and stores its ``ocr_results`` JSON,
#   2. saves a copy of the image annotated with the OCR boxes and texts,
#   3. posts the image to the signature-detection service and stores its JSON.
base_dir = os.path.dirname(os.path.abspath(__file__))
img_dir = '/home/zwq/data/gcfp/valid/image'
draw_dir = os.path.join(base_dir, 'draw', 'valid')
sign_dir = os.path.join(base_dir, 'sign_res', 'valid')
go_dir = os.path.join(base_dir, 'go_res', 'valid')

font_path = os.path.join(base_dir, 'simhei.ttf')
font = ImageFont.truetype(font_path, 10, encoding="utf-8")

OCR_URL = r'http://139.196.149.46:9001/gen_ocr'
SIGNATURE_URL = r'http://139.196.149.46:9001/signature_detect'
# (connect, read) timeouts in seconds, so a hung service cannot block the
# whole batch forever.  The read timeout is generous on purpose.
REQUEST_TIMEOUT = (10, 300)

# cv2.imwrite only reports failure through its return value and open() raises
# when the directory is missing, so make sure the output directories exist.
for _out_dir in (draw_dir, sign_dir, go_dir):
    os.makedirs(_out_dir, exist_ok=True)


def _post_image(url, image_path):
    """POST the file at *image_path* to *url* and return the decoded JSON.

    The file handle is closed as soon as the request finishes, and an HTTP
    error status raises ``requests.HTTPError`` instead of surfacing later as
    an obscure JSON/KeyError failure.
    """
    with open(image_path, 'rb') as image_file:
        response = requests.post(url=url, files={'file': image_file}, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def _dump_json(obj, json_path):
    """Write *obj* to *json_path* as UTF-8 JSON without ASCII escaping."""
    # ensure_ascii=False emits raw non-ASCII text, so the encoding must not
    # depend on the machine's locale.
    with open(json_path, 'w', encoding='utf-8') as fp:
        json.dump(obj, fp, ensure_ascii=False)


def _annotate(img, ocr_results, text_font):
    """Return a copy of *img* with every OCR box outlined and labelled.

    *ocr_results* is a mapping whose values unpack to ``(coordinates, text)``.
    NOTE(review): coordinates presumably hold the four corner points as
    x1, y1, ..., x4, y4, so indices 0/1 and 4/5 are opposite corners --
    confirm against the OCR service.
    """
    entries = list(ocr_results.values())

    # Pass 1: all rectangles with OpenCV (draws in place on the array).
    for coordinates, _ in entries:
        top_left = (int(coordinates[0]), int(coordinates[1]))
        bottom_right = (int(coordinates[4]), int(coordinates[5]))
        cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)

    # Pass 2: all texts with PIL, which can render the CJK font.  Converting
    # once instead of once per box avoids two full-image copies per box; the
    # labels therefore always end up on top of the rectangles.
    # NOTE: cv2 arrays are BGR while PIL assumes RGB, so this (255, 0, 0)
    # ends up blue in the saved file -- kept exactly as in the original.
    pil_img = Image.fromarray(img)
    draw = ImageDraw.Draw(pil_img)
    for coordinates, text in entries:
        draw.text((coordinates[0], coordinates[1]), text, (255, 0, 0), font=text_font)
    return np.array(pil_img)


for image_name in os.listdir(img_dir):

    print('start: {0}'.format(image_name))
    base_image_name, _ = os.path.splitext(image_name)

    image_path = os.path.join(img_dir, image_name)
    output_path = os.path.join(draw_dir, image_name)
    go_res_path = os.path.join(go_dir, '{0}.json'.format(base_image_name))
    sign_res_path = os.path.join(sign_dir, '{0}.json'.format(base_image_name))

    # cv2.imread returns None instead of raising for unreadable files; skip
    # those before spending two service calls on them.
    img = cv2.imread(image_path)
    if img is None:
        print('skip (not a readable image): {0}'.format(image_name))
        continue

    go_res = _post_image(OCR_URL, image_path)['ocr_results']
    _dump_json(go_res, go_res_path)

    cv2.imwrite(output_path, _annotate(img, go_res, font))

    signature_res = _post_image(SIGNATURE_URL, image_path)
    _dump_json(signature_res, sign_res_path)

    # NOTE: the original script carried a disabled step here that called
    # retriever_individuals.get_target_fields(go_res, signature_res) and
    # printed its result and elapsed time.
67
68
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!