add general extractor
Showing 4 changed files with 702 additions and 1 deletion
general_extractor/const.py
0 → 100644
# Single-character substitution table: each key is replaced by its value.
# It is passed as ``replace_map`` to the ``replace_whole`` fix method for the
# capitalised-amount field, which feeds it to ``str.maketrans``; every key
# must therefore stay exactly one character long.
# Presumably the keys are common OCR misreadings of the capital numerals -- confirm.
# Disabled entries kept for reference:  "零角" -> "零",  "@整" -> "叁"
REPLACE_DICT_1 = {
    "元": "圆",
    "柴": "柒",
    "染": "柒",
    "查": "壹",
    "武": "贰",
    "家": "贰",
    "就": "贰",
    "登": "叁",
    "鑫": "叁",
    "垂": "叁",
    "捆": "捌",
    "搁": "捌",
    "级": "捌",
    "测": "捌",
    "拥": "捌",
    "损": "捌",
    "盒": "叁",
    "摄": "捌",
    "报": "捌",
    "会": "叁",
    "索": "壹",
    "任": "仟",
    "杆": "仟",
    "仔": "仟",
    "什": "仟",
    "付": "仟",
    "伴": "仟",
    "宿": "佰",
    "信": "佰",
    "情": "佰",
    "值": "佰",
    "荣": "柒",
    "渠": "柒",
    "类": "柒",
    "案": "柒",
    "集": "柒",
    "方": "万",
    "抬": "拾",
    "给": "拾",
    "樟": "肆",
    "单": "肆",
    "邮": "肆",
    "政": "玖",
    "拐": "捌",
}
| 53 | |||
| 54 | |||
# Dictionary keys under which a document-type constant bundle (see
# INVOICE_CONST) stores its key list and its value-rule dictionary.
ARG_KEY_KEY_LIST = 'keys_list'
ARG_KEY_VALUE_DICT = 'values_dict'
| 57 | |||
# Key labels searched for in the OCR output.
# Each entry is a tuple ``(spelling, [alternative spellings...], flag)``:
#   * the leading strings are accepted spellings of the label; the first one
#     is the canonical spelling,
#   * the trailing boolean tells the key search whether the label may also be
#     matched as a *prefix* of a text box (label and value in one box).
# The position in this list is the key index referenced by the ``location``
# rules in INVOICE_VALUE_DICT, so the order must not change.
INVOICE_KEY_LIST = [
    ('纳税人识别号', False),  # 0   similar-looking key
    ('增值税', False),  # 1   similar-looking key

    ('地', False),  # 2   single-character key
    ('址', False),  # 3   single-character key

    ('开票日期', '开票曰期', '开票日', True),  # 4
    ('发票代码', '发票代鸡', True),  # 5
    ('发票号码', '发票号瑞', '发要号瑞', True),  # 6
    ('机打代码', False),  # 7
    ('机打号码', '机打号玛', False),  # 8
    ('机器编号', False),  # 9
    ('购买方名称', '购买方名称及', False),  # 10
    ('纳税人识别号/', False),  # 11
    ('统一社会信用代码/', False),  # 12
    ('身份证号码', '身份证号码/', False),  # 13
    ('车辆类型', True),  # 14
    ('厂牌型号', '广牌型号', '厂胖型号', '广牌型考', True),  # 15
    ('产地', '严地', True),  # 16
    ('合格证号', False),  # 17
    ('进口证明书号', True),  # 18
    ('商检单号', True),  # 19
    ('发动机号码', False),  # 20
    ('车辆识别代号/车架号码', True),  # 21
    ('价税合计', '价现合计', '价“税合计', False),  # 22
    ('小写', True),  # 23  TODO: which value to take when several are found
    ('销货单位名称', False),  # 24
    ('电话', True),  # 25
    ('账号', '账考', '帐号', '帐考', '张号', '陈号', '昨号', True),  # 26
    ('开户银行', True),  # 27
    ('增值税税率', True),  # 28  value false
    ('或征收税', False),  # 29
    ('税额', False),  # 30
    ('主管税务', True),  # 31  value False
    ('机关及代码', True),  # 32
    ('不含税价', True),  # 33  value False
    ('完税凭证号码', False),  # 34
    ('开票人', True),  # 35
    ('吨位', True),  # 36
    ('限乘人数', '跟乘人数', True),  # 37  TODO: coordinate splitting for cases like '人数'
    ('备注', True),  # 38
]
| 101 | |||
# Extraction rules, one entry per output field.
#
# Layout notes carried over from the original author:
#   split  - key and value sit in one text box
#   append - key followed by a value suffix; needs coordinate splitting
#   insert - value prefix followed by the key; needs coordinate splitting
#
# Per-field settings:
#   'length'      expected value length, or None when not fixed
#   'str_type'    declared value type tag
#   'location'    ordered list of search rules; each rule is
#                 (idx, location, top, bottom, left, (idx, scope), choice, if_startswith)
#                 where ``idx`` is a position in INVOICE_KEY_LIST
#   'fix_methods' optional list of (method_name, kwargs) post-processing steps
#
# NOTE(review): in the 'char_set' values for '开票日期' and '备注' the two colon
# literals are identical as written; one of them was presumably meant to be the
# full-width colon -- confirm against the original file.
INVOICE_VALUE_DICT = {
    '开票日期': {
        'length': 10,
        'str_type': 'date',
        'location': [
            (4, 'right', 0.3, 0.5, 0, (2, ), 'xmin', 'split'),
        ],
        'fix_methods': [('prune_first_char', {'char_set': {':', ':', ';', }})],
    },
    '发票代码': {
        'length': 12,
        'str_type': 'int',
        'location': [
            (5, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split'),
        ],
    },
    '发票号码': {
        'length': 8,
        'str_type': 'int',
        'location': [
            (6, 'right', 0.2, 0.5, 0, (2, ), 'length', 'split'),
        ],
        'fix_methods': [('prune_first_char', {'char_set': {'-', }})],
    },
    '机打代码': {
        'length': 12,
        'str_type': 'int',
        'location': [
            (7, 'right', 0.5, 1, 0, (2, ), 'ymin', None),
        ],
    },
    '机器编号': {
        'length': 12,
        'str_type': 'int',
        'location': [
            (9, 'right', 0.5, 1, 0, (2, ), 'ymax', None),
        ],
    },
    '机打号码': {
        'length': 8,
        'str_type': 'int',
        'location': [
            (8, 'right', 0.5, 0.5, 0, (2, ), 'length', None),
        ],
    },
    '购买方名称': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (10, 'right', 0.5, 0.5, 0, (11, 12, 13, 2), 'xmin', None),
        ],
    },
    '纳税人识别号/统一社会信用代码/身份证号码': {
        'length': 18,
        'str_type': 'str',  # alnum
        'location': [
            (11, 'right', 0, 2, 0, (2.5, ), 'length', None),
            (12, 'right', 1, 1, 0, (2, ), 'length', None),
            (13, 'right', 2, 0, 0.5, (3, ), 'length', None),
        ],
    },
    '车辆类型': {
        'length': None,
        'str_type': 'str',
        'location': [
            (14, 'right', 0.2, 0.2, 0, (15, 1.5), 'xmin', 'split'),
            (15, 'left', 0.2, 0.2, 0, (14, 2.5), 'xmax', None),
        ],
    },
    '厂牌型号': {
        'length': None,
        'str_type': 'str',
        'location': [
            (15, 'right', 0.2, 0.2, 0, (16, 3.5), 'xmin', 'split'),
            (16, 'left', 0.2, 0.2, 0, (15, 2.5), 'xmax', None),
        ],
    },
    '产地': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (16, 'right', 0.2, 0.2, 0, (2.5, ), 'xmin', 'split'),
        ],
    },
    '合格证号': {
        'length': None,  # 15
        'str_type': 'str',  # alnum
        'location': [
            (17, 'right', 0.2, 0.2, 0, (18, 1.5), 'xmin', None),
            (18, 'left', 0.2, 0.2, 0, (17, 1.5), 'xmax', None),
        ],
    },
    '进口证明书号': {
        'length': None,
        'str_type': 'str',  # alnum
        'location': [
            (18, 'right', 0.3, 0.3, 0, (19, 1.5), 'xmin', 'split'),
            (19, 'left', 0.2, 0.2, 0, (18, 3), 'xmax', None),
        ],
    },
    '商检单号': {
        'length': None,
        'str_type': 'str',
        'location': [
            (19, 'right', 0.2, 0.2, 0, (1.5, ), 'xmin', 'split'),
        ],
    },
    '发动机号码': {
        'length': None,
        'str_type': 'str',  # alnum
        'location': [
            (20, 'right', 0.2, 0.2, 0, (21, 2), 'xmin', None),
            (21, 'left', 0.2, 0.2, 0, (20, 1.4), 'xmax', None),
        ],
    },
    '车辆识别代号/车架号码': {
        'length': 17,
        'str_type': 'str',  # alnum
        'location': [
            (21, 'right', 0.3, 0.3, 0, (1.2, ), 'xmin', 'split'),
        ],
    },
    '价税合计大写': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (22, 'right', 0.2, 0.2, 0, (23, 3), 'xmin', None),
            (23, 'left', 0.2, 0.2, 0, (22, 15), 'xmax', None),
        ],
        'fix_methods': [
            ('prune_no_cn', {}),
            ('replace_whole', {'replace_map': REPLACE_DICT_1}),
        ],
    },
    '价税合计小写': {
        'length': None,
        'str_type': 'float',
        'location': [
            (23, 'right', 0.4, 0.4, 0, (4, ), 'xmin', 'split'),
        ],
        'fix_methods': [('prune_amount', {})],
    },
    '销货单位名称': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (24, 'right', 0.2, 0.2, 0, (25, 3), 'xmin', None),
            (25, 'left', 0.3, 0.3, 0, (24, 15), 'xmax', None),
        ],
    },
    '电话': {
        'length': None,
        'str_type': 'str',  # int + -
        'location': [
            (25, 'right', 0.3, 0.3, 0, (5, ), 'xmin', 'split'),
        ],
    },
    '纳税人识别号': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (0, 'right', 0.3, 0.3, 0, (26, 2.5), 'xmin', None),
            (26, 'left', 0.3, 0.3, 0, (0, 15), 'xmax', None),
        ],
    },
    '账号': {
        'length': None,
        'str_type': 'str',
        'location': [
            (26, 'right', 0.3, 0.3, 0, (6, ), 'xmin', 'split'),
        ],
    },
    '地址': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (27, 'left', 0.3, 0.3, 0, (3, 4), 'merge', None),
            (3, 'right', 0.3, 0.3, 0, (27, 20), 'xmin', None),
        ],
    },
    '开户银行': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (27, 'right', 0.3, 0.3, 0, (3, ), 'xmin', 'split'),
        ],
    },
    '增值税税率或征收率': {
        'length': 3,
        'str_type': 'str',  # 13%
        'location': [
            (28, 'right', 0, 1, 0, (1, 30, 1), 'xmin', None),
            (29, 'right', 1, 0, 0, (1, 30, 1), 'xmin', None),
            (1, 'left', 0, 1, 0, (28, 29, 2), 'xmax', None),
            (30, 'left', 1, 0, 0, (28, 29, 2), 'xmax', None),
        ],
        'fix_methods': [
            ('replace_last_char', {'char_set': {'8', '9', '号'}, 'target_char': '%'}),
        ],
    },
    '增值税税额': {
        'length': None,
        'str_type': 'float',
        'location': [
            (1, 'right', 0, 1, 0, (31, 32, 2.5), 'xmin', None),
            (30, 'right', 1, 0, 0, (31, 32, 2.5), 'xmin', None),
            (31, 'left', 0, 1, 0, (1, 30, 2), 'xmax', None),
            (32, 'left', 1, 0, 0, (1, 30, 2), 'xmax', None),
        ],
        'fix_methods': [('prune_amount', {})],
    },
    '主管税务机关及代码': {
        'length': None,
        'str_type': 'str',
        'location': [
            (31, 'right', 0, 1.5, 0, (2, ), 'merge', None),
            (32, 'right', 1, 0.5, 0, (2, ), 'merge', None),
        ],
    },
    '不含税价-小写': {
        'length': None,
        'str_type': 'float',  # cn
        'location': [
            (34, 'left', 0.3, 0.3, 0, (33, 1.5), 'xmax', None),
            (33, 'right', 0.2, 0.2, 0, (34, 1.5), 'xmin', None),
        ],
        'fix_methods': [('prune_amount', {})],
    },
    '完税凭证号码': {
        'length': None,
        'str_type': 'str',
        'location': [
            (34, 'right', 0.2, 0.2, 0, (36, 1.5), 'xmin', None),
            (36, 'left', 0.2, 0.2, 0, (34, 6), 'xmax', None),
        ],
    },
    '吨位': {
        'length': None,
        'str_type': 'str',
        'location': [
            (36, 'right', 0.2, 0.2, 0, (37, 1), 'xmin', 'split'),
            (37, 'left', 0.2, 0.2, 0, (36, 0.5), 'xmax', None),
        ],
    },
    '限乘人数': {
        'length': None,
        'str_type': 'int',
        'location': [
            (37, 'right', 0.2, 0.2, 0, (0.5, ), 'xmin', 'split'),
        ],
    },
    '开票人': {
        'length': None,
        'str_type': 'str',
        'location': [
            (35, 'right', 0, 0.5, 0, (1.5, ), 'xmin', 'split'),
        ],
    },
    '备注': {
        'length': None,
        'str_type': 'str',
        'location': [
            (38, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split'),
        ],
        'fix_methods': [('prune_first_char', {'char_set': {';', ':', ':'}})],
    },
}
| 285 | |||
# Constant bundle for the invoice document type: the key list and the value
# rules, stored under the shared ARG_KEY_* names.
INVOICE_CONST = {
    ARG_KEY_KEY_LIST: INVOICE_KEY_LIST,
    ARG_KEY_VALUE_DICT: INVOICE_VALUE_DICT,
}
general_extractor/retriever.py
0 → 100644
| 1 | import re | ||
| 2 | import math | ||
| 3 | |||
| 4 | |||
class Retriever:
    """Rule-based key/value field extractor for OCR results.

    The OCR result (``go_res``) is a dict mapping a box id to
    ``((x0, y0, x1, y1, x2, y2, x3, y3), text)``: four corner points followed
    by the recognised text.  Corner 0 and corner 2 are treated as opposite
    corners of the box throughout this class.

    Extraction runs in two phases:

    1. ``search_keys`` locates the label boxes listed in ``keys_list``.
    2. ``search_values`` applies the ``location`` rules of ``values_dict`` to
       find the value box next to each label, then runs the configured
       ``fix_methods`` on the value text.

    Both phases remove consumed boxes from ``go_res`` in place.
    """

    def __init__(self, keys_list=None, values_dict=None):
        """Store the key list and value rules.

        Args:
            keys_list: sequence of ``(spelling, ..., allow_prefix)`` tuples.
            values_dict: mapping ``field name -> rule dict`` with the entries
                ``'length'``, ``'location'`` and optionally ``'fix_methods'``.
        """
        # ``None`` defaults avoid sharing one mutable default between instances.
        self.keys_list = [] if keys_list is None else keys_list
        self.values_dict = {} if values_dict is None else values_dict
        # One 12-field record (or None) per entry of ``keys_list``; filled by
        # ``search_keys``:
        # (canonical, matched spelling, box text, value suffix or None,
        #  x0, y0, x1, y1, x2, y2, x3, y3)
        self.find_keys_list = []

    # ------------------------------------------------------------------
    # geometry helpers
    # ------------------------------------------------------------------
    @staticmethod
    def get_theta(x0, y0, x1, y1):
        """Return ``(cos, sin)`` of the tilt of the segment (x0, y0)->(x1, y1).

        The angle is ``atan((y0 - y1) / (x1 - x0))``.  A vertical segment
        (``x1 == x0``) is mapped to +/- 90 degrees instead of dividing by
        zero; a degenerate zero-length segment yields an angle of 0.
        """
        delta_x = x1 - x0
        delta_y = y0 - y1
        if delta_x == 0:
            theta = math.copysign(math.pi / 2, delta_y) if delta_y != 0 else 0.0
        else:
            theta = math.atan(delta_y / delta_x)
        return math.cos(theta), math.sin(theta)

    @staticmethod
    def rebuild_xy(x, y, cos, sin):
        """Rotate the point ``(x, y)`` with the given ``cos``/``sin`` pair."""
        rebuild_x = x * cos - y * sin
        rebuild_y = y * cos + x * sin
        return rebuild_x, rebuild_y

    def rebuild_coord(self, coord_tuple, cos, sin):
        """Rotate a flat ``(x0, y0, x1, y1, ...)`` sequence; return a flat list."""
        rebuild_list = []
        for idx in range(0, len(coord_tuple), 2):
            rebuild_list.extend(self.rebuild_xy(coord_tuple[idx], coord_tuple[idx + 1], cos, sin))
        return rebuild_list

    # ------------------------------------------------------------------
    # fix methods (referenced by name from ``values_dict[...]['fix_methods']``)
    # ------------------------------------------------------------------
    @staticmethod
    def prune_no_cn(src_str):
        """Remove every character outside the range U+4E00..U+9FA5."""
        return re.sub(r'[^\u4e00-\u9fa5]+', '', src_str)

    @staticmethod
    def prune_first_char(src_str, char_set):
        """Drop the first character if it is in ``char_set``.

        An empty string is returned unchanged (previously an IndexError).
        """
        if src_str and src_str[0] in char_set:
            return src_str[1:]
        return src_str

    @staticmethod
    def prune_amount(src_str):
        """Keep only digits, commas and dots."""
        return ''.join(ch for ch in src_str if ch in (',', '.') or ch.isdigit())

    @staticmethod
    def replace_whole(src_str, replace_map):
        """Apply the single-character substitutions of ``replace_map``."""
        return src_str.translate(str.maketrans(replace_map))

    @staticmethod
    def replace_last_char(src_str, char_set, target_char):
        """Replace the last character by ``target_char`` if it is in ``char_set``.

        An empty string is returned unchanged (previously an IndexError).
        """
        if src_str and src_str[-1] in char_set:
            return src_str[:-1] + target_char
        return src_str

    # ------------------------------------------------------------------
    # choice methods (referenced by name from the ``location`` rules)
    #
    # Every candidate is a tuple ``(text, <coordinates...>, box_ids)``:
    # left-side search yields 4 coordinates, right-side search yields 8.
    # In both layouts index 1 is x0 and index 2 is y0.
    # ------------------------------------------------------------------
    @staticmethod
    def choice_xmin(value_list, value_length):
        """Return the candidate with the smallest x0."""
        return min(value_list, key=lambda item: item[1])

    @staticmethod
    def choice_xmax(value_list, value_length):
        """Return the candidate with the largest x0."""
        return max(value_list, key=lambda item: item[1])

    @staticmethod
    def choice_ymin(value_list, value_length):
        """Return the candidate with the smallest y0."""
        return min(value_list, key=lambda item: item[2])

    @staticmethod
    def choice_ymax(value_list, value_length):
        """Return the candidate with the largest y0."""
        return max(value_list, key=lambda item: item[2])

    @staticmethod
    def choice_merge(value_list, value_length):
        """Concatenate all candidates, ordered by y0, into one candidate.

        Works for both candidate layouts.  The result has the same number of
        fields as the inputs: for 4-coordinate candidates it carries
        ``(min_x, min_y, max_x, max_y)``; for 8-coordinate candidates it
        carries the four corners of the enclosing axis-aligned box.
        """
        texts, box_ids, xs, ys = [], [], [], []
        for item in sorted(value_list, key=lambda cand: cand[2]):
            coords = item[1:-1]
            texts.append(item[0])
            box_ids.extend(item[-1])
            xs.extend(coords[0::2])
            ys.extend(coords[1::2])

        min_x, max_x = min(xs), max(xs)
        min_y, max_y = min(ys), max(ys)
        if len(value_list[0]) == 6:
            bounds = (min_x, min_y, max_x, max_y)
        else:
            bounds = (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)
        return (''.join(texts),) + bounds + (tuple(box_ids),)

    @staticmethod
    def choice_length(value_list, value_length):
        """Return the candidate whose text length is closest to ``value_length``.

        Falls back to the first candidate when no length is configured.
        (The original sorted by the signed difference and returned nothing.)
        """
        if value_length is None:
            return value_list[0]
        return min(value_list, key=lambda item: abs(len(item[0]) - value_length))

    # ------------------------------------------------------------------
    # value search
    # ------------------------------------------------------------------
    def _first_found_key(self, key_indexes):
        """Return the record of the first located key in ``key_indexes``, or None."""
        for scope_key_idx in key_indexes:
            record = self.find_keys_list[scope_key_idx]
            if record is not None:
                return record
        return None

    def _choose(self, candidates, choice_method, length):
        """Pick one candidate: None if empty, the only one, or via ``choice_<method>``."""
        if not candidates:
            return None
        if len(candidates) == 1:
            return candidates[0]
        return getattr(self, 'choice_{0}'.format(choice_method))(candidates, length)

    def value_direction_left(self, go_res, key_idx, top_or_left, bottom_or_right, offset, scope_tuple, choice_method,
                             if_startswith, length):
        """Search for a value box to the LEFT of key ``key_idx``.

        Args:
            go_res: OCR result dict (not modified here).
            key_idx: index into ``find_keys_list`` of the anchoring key.
            top_or_left / bottom_or_right: vertical margins, in key heights.
            offset: horizontal gap between key and search window, in key widths.
            scope_tuple: ``(*scope_key_indexes, width_factor)``; the right edge
                of the first located scope key bounds the window on the left,
                otherwise the window is ``width_factor`` key widths wide.
            choice_method: suffix of the ``choice_*`` method used on ties.
            if_startswith: a string (e.g. ``'split'``) enables returning the
                value that was split off the key's own text box.
            length: expected value length, forwarded to the choice method.

        Returns:
            ``(text, x0, y0, x1, y1, box_ids)`` or None when nothing matches.
        """
        key_record = self.find_keys_list[key_idx]
        if key_record is None:
            return None

        # Key records carry 12 fields; only corner 0 and corner 2 are needed.
        (_, _, _, suffix_key,
         key_x0, key_y0, _, _, key_x1, key_y1, _, _) = key_record

        scope_record = self._first_found_key(scope_tuple[:-1])
        # Field 6 of a key record is x1, taken as the right edge of the scope key.
        key_scope = None if scope_record is None else scope_record[6]

        if isinstance(if_startswith, str) and isinstance(suffix_key, str):
            # TODO: validate / correct suffix_key
            # TODO: only the 'split' case is handled so far
            return suffix_key, key_x0, key_y0, key_x1, key_y1, ()

        height = key_y1 - key_y0
        y_min = key_y0 - (top_or_left * height)
        y_max = key_y1 + (bottom_or_right * height)

        width = key_x1 - key_x0
        x_max = key_x0 - (offset * width)
        x_min = x_max - (width * scope_tuple[-1]) if key_scope is None else key_scope

        candidates = []
        for go_key_idx, ((x0, y0, _, _, x1, y1, _, _), text) in go_res.items():
            cent_x = x0 + ((x1 - x0) / 2)
            cent_y = y0 + ((y1 - y0) / 2)
            if x_min < cent_x < x_max and y_min < cent_y < y_max:
                candidates.append((text, x0, y0, x1, y1, (go_key_idx, )))

        return self._choose(candidates, choice_method, length)

    def value_direction_right(self, go_res, key_idx, top_or_left, bottom_or_right, offset, scope_tuple, choice_method,
                              if_startswith, length):
        """Search for a value box to the RIGHT of key ``key_idx``.

        Arguments are the same as for ``value_direction_left``.  All points
        are first rotated by the tilt of the key's 0->1 edge so the window
        test follows the text line.  The first located scope key bounds the
        window on the right with its corner 0.

        Returns:
            ``(text, x0, y0, x1, y1, x2, y2, x3, y3, box_ids)`` for a
            neighbouring box, ``(text, coords_tuple, ())`` for a value split
            off the key's own box, or None when nothing matches.
        """
        key_record = self.find_keys_list[key_idx]
        if key_record is None:
            return None

        suffix_key = key_record[3]
        key_coords_src = tuple(key_record[4:12])

        scope_record = self._first_found_key(scope_tuple[:-1])
        # Corner 0 (x0, y0) of the scope key.
        key_scope_tuple = None if scope_record is None else (scope_record[4], scope_record[5])

        if isinstance(if_startswith, str) and isinstance(suffix_key, str):
            # TODO: validate / correct suffix_key
            # TODO: only the 'split' case is handled so far
            if isinstance(length, int):
                # Accept the split-off suffix only if its length is plausible;
                # otherwise fall through to the geometric search.
                if -3 < length - len(suffix_key) < 3:
                    return suffix_key, key_coords_src, ()
            else:
                return suffix_key, key_coords_src, ()

        # Change of coordinate system: rotate by the tilt of the key's own
        # 0->1 edge (the original referenced unbound names here).
        cos, sin = self.get_theta(*key_coords_src[:4])
        key_x0, key_y0, _, _, key_x2, key_y2, _, _ = self.rebuild_coord(key_coords_src, cos, sin)

        height = key_y2 - key_y0
        y_min = key_y0 - (top_or_left * height)
        y_max = key_y2 + (bottom_or_right * height)

        width = key_x2 - key_x0
        x_min = key_x2 + (offset * width)
        if key_scope_tuple is None:
            x_max = x_min + (width * scope_tuple[-1])
        else:
            x_max = self.rebuild_xy(key_scope_tuple[0], key_scope_tuple[1], cos, sin)[0]

        candidates = []
        for go_key_idx, ((x0, y0, x1, y1, x2, y2, x3, y3), text) in go_res.items():
            cent_x, cent_y = self.rebuild_xy(x0 + ((x2 - x0) / 2), y0 + ((y2 - y0) / 2), cos, sin)
            if x_min < cent_x < x_max and y_min < cent_y < y_max:
                candidates.append((text, x0, y0, x1, y1, x2, y2, x3, y3, (go_key_idx, )))

        # TODO: should the choice step use rotated coordinates as well?
        return self._choose(candidates, choice_method, length)

    @staticmethod
    def splitext(base_str, key_str, x0, y0, x1, y1, x2, y2, x3, y3):
        """Split ``base_str`` into the key and the text following it.

        Returns ``(key_str, suffix, x1, y1, x2, y2)``.  The coordinates are
        passed through unchanged.  TODO: split the coordinates as well.
        """
        suffix_value = base_str[len(key_str):]
        return key_str, suffix_value, x1, y1, x2, y2

    # ------------------------------------------------------------------
    # driver
    # ------------------------------------------------------------------
    def search_keys(self, go_res):
        """Locate the configured key labels and store them in ``find_keys_list``.

        Pass 1 matches a whole box text against every spelling of a key.
        Pass 2, for keys whose trailing flag is true and that are still
        missing, matches boxes that *start with* the canonical spelling and
        keeps the remainder as the value suffix.
        NOTE(review): pass 2 tries only the canonical spelling, not the
        alternatives -- confirm that this is intended.

        Matched boxes, and blank boxes seen during pass 1, are removed from
        ``go_res`` in place.
        """
        find_keys_list = [None for _ in range(len(self.keys_list))]
        rm_go_key_set = set()
        done_key_idx_set = set()

        # Pass 1: exact match on any spelling.
        for key_idx, key_tuple in enumerate(self.keys_list):
            spellings = key_tuple[:-1]
            for str_idx, ((x0, y0, x1, y1, x2, y2, x3, y3), text) in go_res.items():
                if len(text.strip()) == 0:  # drop blank boxes
                    rm_go_key_set.add(str_idx)
                    continue
                matched = next((key_str for key_str in spellings if text == key_str), None)
                if matched is None:
                    continue
                find_keys_list[key_idx] = (key_tuple[0], matched, text, None, x0, y0, x1, y1, x2, y2, x3, y3)
                done_key_idx_set.add(key_idx)
                rm_go_key_set.add(str_idx)
                break

        for go_key in rm_go_key_set:
            go_res.pop(go_key)
        rm_go_key_set.clear()

        # Pass 2: the box text starts with the key (key and value in one box).
        for key_idx, key_tuple in enumerate(self.keys_list):
            if key_idx in done_key_idx_set or not key_tuple[-1]:
                continue

            for str_idx, ((x0, y0, x1, y1, x2, y2, x3, y3), text) in go_res.items():
                if not text.startswith(key_tuple[0]):
                    continue
                _, suffix_value, new_x1, new_y1, new_x2, new_y2 = self.splitext(
                    text, key_tuple[0], x0, y0, x1, y1, x2, y2, x3, y3)
                find_keys_list[key_idx] = (key_tuple[0], key_tuple[0], text, suffix_value,
                                           x0, y0, new_x1, new_y1, new_x2, new_y2, x3, y3)
                done_key_idx_set.add(key_idx)
                rm_go_key_set.add(str_idx)
                break

        for go_key in rm_go_key_set:
            go_res.pop(go_key)
        rm_go_key_set.clear()

        self.find_keys_list = find_keys_list

    def search_values(self, go_res):
        """Apply every field's location rules and return ``{field: text}``.

        Rules are tried in order and the first one that yields a result wins.
        Fields with no match map to ``''``.  Boxes consumed as values are
        removed from ``go_res`` so later fields cannot reuse them.
        """
        # rule layout: idx, location, top, bottom, left, (idx, scope), choice, if_startswith
        find_value_dict = dict()
        for cn_key, search_dict in self.values_dict.items():
            value_tuple = None  # stays None when 'location' is empty
            for (key_idx, direction_str, top_or_left, bottom_or_right, offset,
                 scope_tuple, choice_method, if_startswith) in search_dict['location']:
                value_tuple = getattr(self, 'value_direction_{0}'.format(direction_str))(
                    go_res,
                    key_idx,
                    top_or_left,
                    bottom_or_right,
                    offset,
                    scope_tuple,
                    choice_method,
                    if_startswith,
                    search_dict['length'],
                )
                if isinstance(value_tuple, tuple):
                    break

            if not isinstance(value_tuple, tuple):
                find_value_dict[cn_key] = ''
                continue

            fixed_str = value_tuple[0]
            for fix_method, kwargs in search_dict.get('fix_methods', []):
                fixed_str = getattr(self, fix_method)(fixed_str, **kwargs)
            find_value_dict[cn_key] = fixed_str

            # TODO: rebuild coordinates

            # The last field of a result holds the ids of the consumed boxes.
            for go_key in value_tuple[-1]:
                go_res.pop(go_key, None)

        return find_value_dict

    def extract_fields(self, go_res):
        """Run key search, then value search; return ``{field: text}``.

        ``go_res`` is modified in place (consumed boxes are removed).
        """
        self.search_keys(go_res)
        return self.search_values(go_res)
| 337 |
general_extractor/step1.py
0 → 100644
| 1 | import json | ||
| 2 | import os | ||
| 3 | import base64 | ||
| 4 | import requests | ||
| 5 | import cv2 | ||
| 6 | import time | ||
| 7 | import numpy as np | ||
| 8 | from PIL import Image, ImageDraw, ImageFont | ||
| 9 | |||
| 10 | |||
# Batch script: for every image in ``img_dir`` it
#   1. sends the image to the general-OCR service and stores the JSON result,
#   2. writes a copy of the image with each OCR box and its text drawn on it,
#   3. sends the image to the signature-detection service and stores the JSON.
base_dir = os.path.dirname(os.path.abspath(__file__))
img_dir = '/home/zwq/data/gcfp/valid/image'
draw_dir = os.path.join(base_dir, 'draw', 'valid')
sign_dir = os.path.join(base_dir, 'sign_res', 'valid')
go_dir = os.path.join(base_dir, 'go_res', 'valid')

font_path = os.path.join(base_dir, 'simhei.ttf')
font = ImageFont.truetype(font_path, 10, encoding="utf-8")

ocr_url = r'http://139.196.149.46:9001/gen_ocr'
signature_url = r'http://139.196.149.46:9001/signature_detect'
# Without a timeout ``requests`` can wait forever on a stalled server.
# NOTE(review): 120 s is a guess at a generous upper bound -- tune as needed.
request_timeout = 120

# The output files are opened with mode 'w', which does not create directories.
for out_dir in (draw_dir, sign_dir, go_dir):
    os.makedirs(out_dir, exist_ok=True)


def _post_image(url, image_path):
    """Upload one image file to ``url`` and return the decoded JSON reply.

    The file handle is closed as soon as the request finishes; an HTTP error
    status raises instead of failing later while decoding the body.
    """
    with open(image_path, 'rb') as image_file:
        response = requests.post(url=url, files={'file': image_file}, timeout=request_timeout)
    response.raise_for_status()
    return response.json()


for image_name in os.listdir(img_dir):

    print('start: {0}'.format(image_name))
    base_image_name, _ = os.path.splitext(image_name)

    image_path = os.path.join(img_dir, image_name)
    output_path = os.path.join(draw_dir, image_name)
    go_res_path = os.path.join(go_dir, '{0}.json'.format(base_image_name))
    sign_res_path = os.path.join(sign_dir, '{0}.json'.format(base_image_name))

    go_res = _post_image(ocr_url, image_path)['ocr_results']

    # ensure_ascii=False emits raw non-ASCII text, so fix the file encoding.
    with open(go_res_path, 'w', encoding='utf-8') as fp:
        json.dump(go_res, fp, ensure_ascii=False)

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None for unreadable files instead of raising.
        print('skip drawing, cannot read image: {0}'.format(image_path))
    else:
        # Boxes first (OpenCV), using corner 0 and corner 2 of each box ...
        for coordinates, text in go_res.values():
            cv2.rectangle(img, (coordinates[0], coordinates[1]), (coordinates[4], coordinates[5]), (0, 255, 0), 2)

        # ... then all labels in a single PIL round trip (PIL is used because
        # it can render text with the TrueType font loaded above).  Text is
        # therefore drawn on top of every box.
        pil_img = Image.fromarray(img)
        draw = ImageDraw.Draw(pil_img)
        for coordinates, text in go_res.values():
            draw.text((coordinates[0], coordinates[1]), text, (255, 0, 0), font=font)
        img = np.array(pil_img)

        cv2.imwrite(output_path, img)

    signature_res = _post_image(signature_url, image_path)

    with open(sign_res_path, 'w', encoding='utf-8') as fp:
        json.dump(signature_res, fp, ensure_ascii=False)
| 67 | |||
| 68 |
-
Please register or sign in to post a comment