add general extractor
Showing
4 changed files
with
364 additions
and
0 deletions
general_extractor/const.py
0 → 100644
# Single-character substitution table used by the 'replace_whole' fix method
# of the upper-case (formal numeral) amount field.  Each key is replaced by
# its value; every value is a formal Chinese numeral/unit character
# (e.g. 壹, 贰, 仟, 佰, 圆).
# NOTE(review): the keys look like common OCR misreads of those characters;
# confirm against the consumer in retriever.py.
# Candidates that were tried and disabled: "零角" -> "零", "@整" -> "叁".
REPLACE_DICT_1 = {
    "元": "圆",
    "柴": "柒",
    "染": "柒",
    "查": "壹",
    "武": "贰",
    "家": "贰",
    "就": "贰",
    "登": "叁",
    "鑫": "叁",
    "垂": "叁",
    "捆": "捌",
    "搁": "捌",
    "级": "捌",
    "测": "捌",
    "拥": "捌",
    "损": "捌",
    "盒": "叁",
    "摄": "捌",
    "报": "捌",
    "会": "叁",
    "索": "壹",
    "任": "仟",
    "杆": "仟",
    "仔": "仟",
    "什": "仟",
    "付": "仟",
    "伴": "仟",
    "宿": "佰",
    "信": "佰",
    "情": "佰",
    "值": "佰",
    "荣": "柒",
    "渠": "柒",
    "类": "柒",
    "案": "柒",
    "集": "柒",
    "方": "万",
    "抬": "拾",
    "给": "拾",
    "樟": "肆",
    "单": "肆",
    "邮": "肆",
    "政": "玖",
    "拐": "捌",
}
| 53 | |||
| 54 | |||
# Names of the two entries every per-document constant bundle carries
# (see INVOICE_CONST): the anchor-key list and the per-field value rules.
ARG_KEY_KEY_LIST = 'keys_list'
ARG_KEY_VALUE_DICT = 'values_dict'
| 57 | |||
# Anchor keys searched for on the invoice.  The position of an entry in this
# list is its identifier: INVOICE_VALUE_DICT 'location' rules refer to keys by
# these indices, so entries must never be reordered or removed.
#
# Each entry is a tuple: the canonical key text first, then zero or more
# alternative spellings, then a trailing bool.
# NOTE(review): the alternative spellings look like known OCR misreads, and
# the meaning of the trailing bool is not visible in this file (the original
# "value false" remarks are kept below) -- confirm both against retriever.py.
INVOICE_KEY_LIST = [
    ('纳税人识别号', False),  # 0  near-duplicate of another key
    ('增值税', False),  # 1  near-duplicate of another key

    ('地', False),  # 2  single-character key
    ('址', False),  # 3  single-character key

    ('开票日期', '开票曰期', '开票日', True),  # 4
    ('发票代码', '发票代鸡', True),  # 5
    ('发票号码', '发票号瑞', '发要号瑞', True),  # 6
    ('机打代码', False),  # 7
    ('机打号码', '机打号玛', False),  # 8
    ('机器编号', False),  # 9
    ('购买方名称', '购买方名称及', False),  # 10
    ('纳税人识别号/', False),  # 11
    ('统一社会信用代码/', False),  # 12
    ('身份证号码', '身份证号码/', False),  # 13
    ('车辆类型', True),  # 14
    ('厂牌型号', '广牌型号', '厂胖型号', '广牌型考', True),  # 15
    ('产地', '严地', True),  # 16
    ('合格证号', False),  # 17
    ('进口证明书号', True),  # 18
    ('商检单号', True),  # 19
    ('发动机号码', False),  # 20
    ('车辆识别代号/车架号码', True),  # 21
    ('价税合计', '价现合计', '价“税合计', False),  # 22
    ('小写', True),  # 23  TODO: which value to take when there are several
    ('销货单位名称', False),  # 24
    ('电话', True),  # 25
    ('账号', '账考', '帐号', '帐考', '张号', '陈号', '昨号', True),  # 26
    ('开户银行', True),  # 27
    ('增值税税率', True),  # 28  value false
    ('或征收税', False),  # 29
    ('税额', False),  # 30
    ('主管税务', True),  # 31  value False
    ('机关及代码', True),  # 32
    ('不含税价', True),  # 33  value False
    ('完税凭证号码', False),  # 34
    ('开票人', True),  # 35
    ('吨位', True),  # 36
    ('限乘人数', '跟乘人数', True),  # 37  TODO: coordinate split when only '人数' is recognised
    ('备注', True),  # 38
]
| 101 | |||
# Per-field extraction rules, keyed by output field name.
#
# Every rule carries:
#   'length'      expected value length, or None when it is not fixed
#   'str_type'    expected value kind: 'date', 'int', 'float' or 'str'
#   'location'    list of search rules, each an 8-tuple
#                 (key idx, side, top, bottom, left, scope, choice, if_startswith)
#                 where "key idx" is an index into INVOICE_KEY_LIST and side is
#                 'left' or 'right' of that key
#   'fix_methods' optional list of (method name, kwargs) post-processing steps
#
# if_startswith modes noted by the original author:
#   split   key and value are in one piece
#   append  key + value suffix, needs coordinate-based splitting
#   insert  key + value prefix, needs coordinate-based splitting
#
# NOTE(review): the units of top/bottom/left, the exact layout of the scope
# tuple and the semantics of choice ('xmin', 'xmax', 'ymin', 'ymax', 'length',
# 'merge') are defined by the consumer (retriever.py), not by this file.
INVOICE_VALUE_DICT = {
    '开票日期': {
        'length': 10,
        'str_type': 'date',
        'location': [(4, 'right', 0.3, 0.5, 0, (2, ), 'xmin', 'split')],
        # NOTE(review): this set literal listed ':' twice; one of them was
        # possibly a full-width colon lost in transcoding -- confirm.
        'fix_methods': [('prune_first_char', {'char_set': {':', ';'}})],
    },
    '发票代码': {
        'length': 12,
        'str_type': 'int',
        'location': [(5, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split')],
    },
    '发票号码': {
        'length': 8,
        'str_type': 'int',
        'location': [(6, 'right', 0.2, 0.5, 0, (2, ), 'length', 'split')],
        'fix_methods': [('prune_first_char', {'char_set': {'-'}})],
    },
    '机打代码': {
        'length': 12,
        'str_type': 'int',
        'location': [(7, 'right', 0.5, 1, 0, (2, ), 'ymin', None)],
    },
    '机器编号': {
        'length': 12,
        'str_type': 'int',
        'location': [(9, 'right', 0.5, 1, 0, (2, ), 'ymax', None)],
    },
    '机打号码': {
        'length': 8,
        'str_type': 'int',
        'location': [(8, 'right', 0.5, 0.5, 0, (2, ), 'length', None)],
    },
    '购买方名称': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [(10, 'right', 0.5, 0.5, 0, (11, 12, 13, 2), 'xmin', None)],
    },
    '纳税人识别号/统一社会信用代码/身份证号码': {
        'length': 18,
        'str_type': 'str',  # alphanumeric
        'location': [
            (11, 'right', 0, 2, 0, (2.5, ), 'length', None),
            (12, 'right', 1, 1, 0, (2, ), 'length', None),
            (13, 'right', 2, 0, 0.5, (3, ), 'length', None),
        ],
    },
    '车辆类型': {
        'length': None,
        'str_type': 'str',
        'location': [
            (14, 'right', 0.2, 0.2, 0, (15, 1.5), 'xmin', 'split'),
            (15, 'left', 0.2, 0.2, 0, (14, 2.5), 'xmax', None),
        ],
    },
    '厂牌型号': {
        'length': None,
        'str_type': 'str',
        'location': [
            (15, 'right', 0.2, 0.2, 0, (16, 3.5), 'xmin', 'split'),
            (16, 'left', 0.2, 0.2, 0, (15, 2.5), 'xmax', None),
        ],
    },
    '产地': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [(16, 'right', 0.2, 0.2, 0, (2.5, ), 'xmin', 'split')],
    },
    '合格证号': {
        'length': None,  # 15
        'str_type': 'str',  # alphanumeric
        'location': [
            (17, 'right', 0.2, 0.2, 0, (18, 1.5), 'xmin', None),
            (18, 'left', 0.2, 0.2, 0, (17, 1.5), 'xmax', None),
        ],
    },
    '进口证明书号': {
        'length': None,
        'str_type': 'str',  # alphanumeric
        'location': [
            (18, 'right', 0.3, 0.3, 0, (19, 1.5), 'xmin', 'split'),
            (19, 'left', 0.2, 0.2, 0, (18, 3), 'xmax', None),
        ],
    },
    '商检单号': {
        'length': None,
        'str_type': 'str',
        'location': [(19, 'right', 0.2, 0.2, 0, (1.5, ), 'xmin', 'split')],
    },
    '发动机号码': {
        'length': None,
        'str_type': 'str',  # alphanumeric
        'location': [
            (20, 'right', 0.2, 0.2, 0, (21, 2), 'xmin', None),
            (21, 'left', 0.2, 0.2, 0, (20, 1.4), 'xmax', None),
        ],
    },
    '车辆识别代号/车架号码': {
        'length': 17,
        'str_type': 'str',  # alphanumeric
        'location': [(21, 'right', 0.3, 0.3, 0, (1.2, ), 'xmin', 'split')],
    },
    '价税合计大写': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [
            (22, 'right', 0.2, 0.2, 0, (23, 3), 'xmin', None),
            (23, 'left', 0.2, 0.2, 0, (22, 15), 'xmax', None),
        ],
        'fix_methods': [
            ('prune_no_cn', {}),
            ('replace_whole', {'replace_map': REPLACE_DICT_1}),
        ],
    },
    '价税合计小写': {
        'length': None,
        'str_type': 'float',
        'location': [(23, 'right', 0.4, 0.4, 0, (4, ), 'xmin', 'split')],
        'fix_methods': [('prune_amount', {})],
    },
    '销货单位名称': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [
            (24, 'right', 0.2, 0.2, 0, (25, 3), 'xmin', None),
            (25, 'left', 0.3, 0.3, 0, (24, 15), 'xmax', None),
        ],
    },
    '电话': {
        'length': None,
        'str_type': 'str',  # digits plus '+' and '-'
        'location': [(25, 'right', 0.3, 0.3, 0, (5, ), 'xmin', 'split')],
    },
    '纳税人识别号': {
        'length': None,
        # NOTE(review): the original remark here said "cn", which looks
        # copy-pasted; an identification number is presumably alphanumeric.
        'str_type': 'str',
        'location': [
            (0, 'right', 0.3, 0.3, 0, (26, 2.5), 'xmin', None),
            (26, 'left', 0.3, 0.3, 0, (0, 15), 'xmax', None),
        ],
    },
    '账号': {
        'length': None,
        'str_type': 'str',
        'location': [(26, 'right', 0.3, 0.3, 0, (6, ), 'xmin', 'split')],
    },
    '地址': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [
            (27, 'left', 0.3, 0.3, 0, (3, 4), 'merge', None),
            (3, 'right', 0.3, 0.3, 0, (27, 20), 'xmin', None),
        ],
    },
    '开户银行': {
        'length': None,
        'str_type': 'str',  # Chinese text
        'location': [(27, 'right', 0.3, 0.3, 0, (3, ), 'xmin', 'split')],
    },
    '增值税税率或征收率': {
        'length': 3,
        'str_type': 'str',  # e.g. 13%
        'location': [
            (28, 'right', 0, 1, 0, (1, 30, 1), 'xmin', None),
            (29, 'right', 1, 0, 0, (1, 30, 1), 'xmin', None),
            (1, 'left', 0, 1, 0, (28, 29, 2), 'xmax', None),
            (30, 'left', 1, 0, 0, (28, 29, 2), 'xmax', None),
        ],
        # A trailing '8', '9' or '号' is rewritten to '%'.
        'fix_methods': [
            ('replace_last_char', {'char_set': {'8', '9', '号'}, 'target_char': '%'}),
        ],
    },
    '增值税税额': {
        'length': None,
        'str_type': 'float',
        'location': [
            (1, 'right', 0, 1, 0, (31, 32, 2.5), 'xmin', None),
            (30, 'right', 1, 0, 0, (31, 32, 2.5), 'xmin', None),
            (31, 'left', 0, 1, 0, (1, 30, 2), 'xmax', None),
            (32, 'left', 1, 0, 0, (1, 30, 2), 'xmax', None),
        ],
        'fix_methods': [('prune_amount', {})],
    },
    '主管税务机关及代码': {
        'length': None,
        'str_type': 'str',
        'location': [
            (31, 'right', 0, 1.5, 0, (2, ), 'merge', None),
            (32, 'right', 1, 0.5, 0, (2, ), 'merge', None),
        ],
    },
    '不含税价-小写': {
        'length': None,
        'str_type': 'float',  # numeric amount (the original "cn" remark did not match the type)
        'location': [
            (34, 'left', 0.3, 0.3, 0, (33, 1.5), 'xmax', None),
            (33, 'right', 0.2, 0.2, 0, (34, 1.5), 'xmin', None),
        ],
        'fix_methods': [('prune_amount', {})],
    },
    '完税凭证号码': {
        'length': None,
        'str_type': 'str',
        'location': [
            (34, 'right', 0.2, 0.2, 0, (36, 1.5), 'xmin', None),
            (36, 'left', 0.2, 0.2, 0, (34, 6), 'xmax', None),
        ],
    },
    '吨位': {
        'length': None,
        'str_type': 'str',
        'location': [
            (36, 'right', 0.2, 0.2, 0, (37, 1), 'xmin', 'split'),
            (37, 'left', 0.2, 0.2, 0, (36, 0.5), 'xmax', None),
        ],
    },
    '限乘人数': {
        'length': None,
        'str_type': 'int',
        'location': [(37, 'right', 0.2, 0.2, 0, (0.5, ), 'xmin', 'split')],
    },
    '开票人': {
        'length': None,
        'str_type': 'str',
        'location': [(35, 'right', 0, 0.5, 0, (1.5, ), 'xmin', 'split')],
    },
    '备注': {
        'length': None,
        'str_type': 'str',
        'location': [(38, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split')],
        # NOTE(review): this set literal listed ':' twice; one of them was
        # possibly a full-width colon lost in transcoding -- confirm.
        'fix_methods': [('prune_first_char', {'char_set': {';', ':'}})],
    },
}
| 285 | |||
# Constant bundle for the invoice document type: the anchor-key list and the
# per-field value rules, stored under the shared ARG_KEY_* names.
INVOICE_CONST = {
    ARG_KEY_KEY_LIST: INVOICE_KEY_LIST,
    ARG_KEY_VALUE_DICT: INVOICE_VALUE_DICT,
}
general_extractor/retriever.py
0 → 100644
This diff is collapsed.
Click to expand it.
general_extractor/step1.py
0 → 100644
| 1 | import json | ||
| 2 | import os | ||
| 3 | import base64 | ||
| 4 | import requests | ||
| 5 | import cv2 | ||
| 6 | import time | ||
| 7 | import numpy as np | ||
| 8 | from PIL import Image, ImageDraw, ImageFont | ||
| 9 | |||
| 10 | |||
# Inputs are read from img_dir; every output is written below the directory
# that contains this script.
base_dir = os.path.dirname(os.path.abspath(__file__))
img_dir = '/home/zwq/data/gcfp/valid/image'
draw_dir = os.path.join(base_dir, 'draw', 'valid')
sign_dir = os.path.join(base_dir, 'sign_res', 'valid')
go_dir = os.path.join(base_dir, 'go_res', 'valid')

# Font used to label the recognised text on the debug images.
font_path = os.path.join(base_dir, 'simhei.ttf')
font = ImageFont.truetype(font_path, 10, encoding="utf-8")

GEN_OCR_URL = r'http://139.196.149.46:9001/gen_ocr'
SIGNATURE_DETECT_URL = r'http://139.196.149.46:9001/signature_detect'
# (connect, read) timeout in seconds, so a stalled service cannot hang the
# whole batch forever.
REQUEST_TIMEOUT = (10, 300)


def _post_image(url, image_path):
    """Upload the image at *image_path* to *url* and return the decoded JSON.

    The file handle is closed as soon as the upload finishes.  Raises
    ``requests.HTTPError`` when the service answers with an error status.
    """
    with open(image_path, 'rb') as image_file:
        response = requests.post(url=url, files={'file': image_file}, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def _dump_json(obj, path):
    """Write *obj* to *path* as JSON, keeping non-ASCII text readable."""
    # ensure_ascii=False emits raw CJK characters, so the file encoding must be
    # pinned; the platform default may not be able to encode them.
    with open(path, 'w', encoding='utf-8') as fp:
        json.dump(obj, fp, ensure_ascii=False)


def _draw_ocr_boxes(image_path, ocr_results, output_path):
    """Save a copy of the image with each OCR box outlined and labelled.

    *ocr_results* is a mapping whose values unpack as ``(coordinates, text)``;
    indices 0/1 and 4/5 of ``coordinates`` are used as opposite box corners.
    NOTE(review): presumably an 8-number quadrilateral -- confirm against the
    OCR service.

    Returns True when the annotated image was written, False otherwise.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals an unreadable/non-image file by returning None.
        print('skip drawing, cannot read image: {0}'.format(image_path))
        return False

    boxes = list(ocr_results.values())
    for coordinates, _ in boxes:
        cv2.rectangle(img, (coordinates[0], coordinates[1]), (coordinates[4], coordinates[5]), (0, 255, 0), 2)

    # cv2 cannot render CJK text, so labels are drawn through PIL.  Convert
    # once per image rather than once per box; labels are drawn after all
    # rectangles so a later box can no longer paint over an earlier label.
    # NOTE(review): the array is in OpenCV's BGR order while PIL assumes RGB,
    # so the (255, 0, 0) label colour ends up blue in the saved file (same as
    # before this change) -- confirm whether red was intended.
    pil_img = Image.fromarray(img)
    draw = ImageDraw.Draw(pil_img)
    for coordinates, text in boxes:
        draw.text((coordinates[0], coordinates[1]), text, (255, 0, 0), font=font)

    written = cv2.imwrite(output_path, np.array(pil_img))
    if not written:
        print('failed to write: {0}'.format(output_path))
    return written


def main():
    """Run general OCR and signature detection on every file in img_dir.

    For each image the OCR result and the signature-detection result are saved
    as JSON (go_dir / sign_dir) and an annotated debug image is written to
    draw_dir.
    """
    for out_dir in (draw_dir, sign_dir, go_dir):
        os.makedirs(out_dir, exist_ok=True)

    for image_name in sorted(os.listdir(img_dir)):
        print('start: {0}'.format(image_name))
        base_image_name, _ = os.path.splitext(image_name)

        image_path = os.path.join(img_dir, image_name)
        output_path = os.path.join(draw_dir, image_name)
        go_res_path = os.path.join(go_dir, '{0}.json'.format(base_image_name))
        sign_res_path = os.path.join(sign_dir, '{0}.json'.format(base_image_name))

        go_res = _post_image(GEN_OCR_URL, image_path)['ocr_results']
        _dump_json(go_res, go_res_path)

        _draw_ocr_boxes(image_path, go_res, output_path)

        signature_res = _post_image(SIGNATURE_DETECT_URL, image_path)
        _dump_json(signature_res, sign_res_path)

        # TODO: the field-retrieval step on (go_res, signature_res) and its
        # timing were disabled in the original script; re-enable once the
        # retriever is wired in.


if __name__ == '__main__':
    main()
| 67 | |||
| 68 |
-
Please register or sign in to post a comment