43758c7e by 周伟奇

add general extractor

1 parent d8cec4a0
......@@ -16,4 +16,11 @@ test*
*.jpg
*.out
*.log
\ No newline at end of file
*.log
sample/
go_res/
test.py
simhei.ttf
sign_res/
res_valid.json
......
# Single-character substitution table used as the ``replace_map`` of the
# 'replace_whole' fix method (see the '价税合计大写' entry of
# INVOICE_VALUE_DICT).  Every value is a Chinese financial (capital) numeral
# or unit; every key is presumably a character the OCR engine tends to emit
# in its place -- confirm against the code that implements 'replace_whole'.
# No value is itself a key, so replacements cannot chain.
REPLACE_DICT_1 = {
    '元': '圆',
    # '零角': '零',   # disabled mapping, kept for reference
    '柴': '柒',
    '染': '柒',
    '查': '壹',
    '武': '贰',
    '家': '贰',
    '就': '贰',
    '登': '叁',
    # '@整': '叁',    # disabled mapping, kept for reference
    '鑫': '叁',
    '垂': '叁',
    '捆': '捌',
    '搁': '捌',
    '级': '捌',
    '测': '捌',
    '拥': '捌',
    '损': '捌',
    '盒': '叁',
    '摄': '捌',
    '报': '捌',
    '会': '叁',
    '索': '壹',
    '任': '仟',
    '杆': '仟',
    '仔': '仟',
    '什': '仟',
    '付': '仟',
    '伴': '仟',
    '宿': '佰',
    '信': '佰',
    '情': '佰',
    '值': '佰',
    '荣': '柒',
    '渠': '柒',
    '类': '柒',
    '案': '柒',
    '集': '柒',
    '方': '万',
    '抬': '拾',
    '给': '拾',
    '樟': '肆',
    '单': '肆',
    '邮': '肆',
    '政': '玖',
    '拐': '捌',
}
# Dictionary keys under which a document type's key list and value-rule
# dictionary are stored (see INVOICE_CONST).
ARG_KEY_KEY_LIST = 'keys_list'      # -> list of key-label tuples
ARG_KEY_VALUE_DICT = 'values_dict'  # -> dict of per-field extraction rules
# Key labels that may appear on the invoice.  Each entry is a tuple of one or
# more spellings of a label (the extra spellings look like OCR misreadings of
# the first one) followed by a trailing bool flag.  The flag's meaning is
# defined by the consuming code, which is not visible here -- TODO confirm.
# The number in each trailing comment is the entry's list index; the
# 'location' rules in INVOICE_VALUE_DICT appear to refer to entries by it.
INVOICE_KEY_LIST = [
    ('纳税人识别号', False),  # 0  near-duplicate of another key
    ('增值税', False),  # 1  near-duplicate of another key
    ('地', False),  # 2  single-character key
    ('址', False),  # 3  single-character key
    ('开票日期', '开票曰期', '开票日', True),  # 4
    ('发票代码', '发票代鸡', True),  # 5
    ('发票号码', '发票号瑞', '发要号瑞', True),  # 6
    ('机打代码', False),  # 7
    ('机打号码', '机打号玛', False),  # 8
    ('机器编号', False),  # 9
    ('购买方名称', '购买方名称及', False),  # 10
    ('纳税人识别号/', False),  # 11
    ('统一社会信用代码/', False),  # 12
    ('身份证号码', '身份证号码/', False),  # 13
    ('车辆类型', True),  # 14
    ('厂牌型号', '广牌型号', '厂胖型号', '广牌型考', True),  # 15
    ('产地', '严地', True),  # 16
    ('合格证号', False),  # 17
    ('进口证明书号', True),  # 18
    ('商检单号', True),  # 19
    ('发动机号码', False),  # 20
    ('车辆识别代号/车架号码', True),  # 21
    ('价税合计', '价现合计', '价“税合计', False),  # 22
    ('小写', True),  # 23  TODO: which value to take when there are several
    ('销货单位名称', False),  # 24
    ('电话', True),  # 25
    ('账号', '账考', '帐号', '帐考', '张号', '陈号', '昨号', True),  # 26
    ('开户银行', True),  # 27
    ('增值税税率', True),  # 28  value false
    ('或征收税', False),  # 29
    ('税额', False),  # 30
    ('主管税务', True),  # 31  value False
    ('机关及代码', True),  # 32
    ('不含税价', True),  # 33  value False
    ('完税凭证号码', False),  # 34
    ('开票人', True),  # 35
    ('吨位', True),  # 36
    ('限乘人数', '跟乘人数', True),  # 37  TODO: coordinate splitting for cases like '人数'
    ('备注', True),  # 38
]
# Per-field extraction rules, keyed by output field name.
#
# Each rule holds:
#   'length'      -- an int, or None when no length is given
#   'str_type'    -- one of 'date', 'int', 'float', 'str'
#   'location'    -- list of candidate location tuples, laid out as
#                    (idx, location, top, bottom, left, (idx, scope), choice, if_startswith)
#   'fix_methods' -- optional list of (method_name, kwargs) pairs
#
# The first ``idx`` presumably indexes INVOICE_KEY_LIST (the values match the
# numbered comments there); how the remaining numbers are interpreted is
# decided by the consuming code, which is not visible here -- TODO confirm.
#
# Notes carried over from the original author:
#   split  -- key and value sit in one text box
#   append -- key followed by a value suffix; needs coordinate splitting
#   insert -- key preceded by a value prefix; needs coordinate splitting
INVOICE_VALUE_DICT = {
    '开票日期': {
        'length': 10,
        'str_type': 'date',
        'location': [
            (4, 'right', 0.3, 0.5, 0, (2, ), 'xmin', 'split'),
        ],
        # NOTE(review): ':' is listed twice in this set; one of them was
        # possibly meant to be the full-width colon -- confirm.
        'fix_methods': [
            ('prune_first_char', {'char_set': {':', ':', ';'}}),
        ],
    },
    '发票代码': {
        'length': 12,
        'str_type': 'int',
        'location': [
            (5, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split'),
        ],
    },
    '发票号码': {
        'length': 8,
        'str_type': 'int',
        'location': [
            (6, 'right', 0.2, 0.5, 0, (2, ), 'length', 'split'),
        ],
        'fix_methods': [
            ('prune_first_char', {'char_set': {'-'}}),
        ],
    },
    '机打代码': {
        'length': 12,
        'str_type': 'int',
        'location': [
            (7, 'right', 0.5, 1, 0, (2, ), 'ymin', None),
        ],
    },
    '机器编号': {
        'length': 12,
        'str_type': 'int',
        'location': [
            (9, 'right', 0.5, 1, 0, (2, ), 'ymax', None),
        ],
    },
    '机打号码': {
        'length': 8,
        'str_type': 'int',
        'location': [
            (8, 'right', 0.5, 0.5, 0, (2, ), 'length', None),
        ],
    },
    '购买方名称': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (10, 'right', 0.5, 0.5, 0, (11, 12, 13, 2), 'xmin', None),
        ],
    },
    '纳税人识别号/统一社会信用代码/身份证号码': {
        'length': 18,
        'str_type': 'str',  # alnum
        'location': [
            (11, 'right', 0, 2, 0, (2.5, ), 'length', None),
            (12, 'right', 1, 1, 0, (2, ), 'length', None),
            (13, 'right', 2, 0, 0.5, (3, ), 'length', None),
        ],
    },
    '车辆类型': {
        'length': None,
        'str_type': 'str',
        'location': [
            (14, 'right', 0.2, 0.2, 0, (15, 1.5), 'xmin', 'split'),
            (15, 'left', 0.2, 0.2, 0, (14, 2.5), 'xmax', None),
        ],
    },
    '厂牌型号': {
        'length': None,
        'str_type': 'str',
        'location': [
            (15, 'right', 0.2, 0.2, 0, (16, 3.5), 'xmin', 'split'),
            (16, 'left', 0.2, 0.2, 0, (15, 2.5), 'xmax', None),
        ],
    },
    '产地': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (16, 'right', 0.2, 0.2, 0, (2.5, ), 'xmin', 'split'),
        ],
    },
    '合格证号': {
        'length': None,  # 15
        'str_type': 'str',  # alnum
        'location': [
            (17, 'right', 0.2, 0.2, 0, (18, 1.5), 'xmin', None),
            (18, 'left', 0.2, 0.2, 0, (17, 1.5), 'xmax', None),
        ],
    },
    '进口证明书号': {
        'length': None,
        'str_type': 'str',  # alnum
        'location': [
            (18, 'right', 0.3, 0.3, 0, (19, 1.5), 'xmin', 'split'),
            (19, 'left', 0.2, 0.2, 0, (18, 3), 'xmax', None),
        ],
    },
    '商检单号': {
        'length': None,
        'str_type': 'str',
        'location': [
            (19, 'right', 0.2, 0.2, 0, (1.5, ), 'xmin', 'split'),
        ],
    },
    '发动机号码': {
        'length': None,
        'str_type': 'str',  # alnum
        'location': [
            (20, 'right', 0.2, 0.2, 0, (21, 2), 'xmin', None),
            (21, 'left', 0.2, 0.2, 0, (20, 1.4), 'xmax', None),
        ],
    },
    '车辆识别代号/车架号码': {
        'length': 17,
        'str_type': 'str',  # alnum
        'location': [
            (21, 'right', 0.3, 0.3, 0, (1.2, ), 'xmin', 'split'),
        ],
    },
    '价税合计大写': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (22, 'right', 0.2, 0.2, 0, (23, 3), 'xmin', None),
            (23, 'left', 0.2, 0.2, 0, (22, 15), 'xmax', None),
        ],
        'fix_methods': [
            ('prune_no_cn', {}),
            ('replace_whole', {'replace_map': REPLACE_DICT_1}),
        ],
    },
    '价税合计小写': {
        'length': None,
        'str_type': 'float',
        'location': [
            (23, 'right', 0.4, 0.4, 0, (4, ), 'xmin', 'split'),
        ],
        'fix_methods': [
            ('prune_amount', {}),
        ],
    },
    '销货单位名称': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (24, 'right', 0.2, 0.2, 0, (25, 3), 'xmin', None),
            (25, 'left', 0.3, 0.3, 0, (24, 15), 'xmax', None),
        ],
    },
    '电话': {
        'length': None,
        'str_type': 'str',  # int + -
        'location': [
            (25, 'right', 0.3, 0.3, 0, (5, ), 'xmin', 'split'),
        ],
    },
    '纳税人识别号': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (0, 'right', 0.3, 0.3, 0, (26, 2.5), 'xmin', None),
            (26, 'left', 0.3, 0.3, 0, (0, 15), 'xmax', None),
        ],
    },
    '账号': {
        'length': None,
        'str_type': 'str',
        'location': [
            (26, 'right', 0.3, 0.3, 0, (6, ), 'xmin', 'split'),
        ],
    },
    '地址': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (27, 'left', 0.3, 0.3, 0, (3, 4), 'merge', None),
            (3, 'right', 0.3, 0.3, 0, (27, 20), 'xmin', None),
        ],
    },
    '开户银行': {
        'length': None,
        'str_type': 'str',  # cn
        'location': [
            (27, 'right', 0.3, 0.3, 0, (3, ), 'xmin', 'split'),
        ],
    },
    '增值税税率或征收率': {
        'length': 3,
        'str_type': 'str',  # 13%
        'location': [
            (28, 'right', 0, 1, 0, (1, 30, 1), 'xmin', None),
            (29, 'right', 1, 0, 0, (1, 30, 1), 'xmin', None),
            (1, 'left', 0, 1, 0, (28, 29, 2), 'xmax', None),
            (30, 'left', 1, 0, 0, (28, 29, 2), 'xmax', None),
        ],
        'fix_methods': [
            ('replace_last_char', {'char_set': {'8', '9', '号'}, 'target_char': '%'}),
        ],
    },
    '增值税税额': {
        'length': None,
        'str_type': 'float',
        'location': [
            (1, 'right', 0, 1, 0, (31, 32, 2.5), 'xmin', None),
            (30, 'right', 1, 0, 0, (31, 32, 2.5), 'xmin', None),
            (31, 'left', 0, 1, 0, (1, 30, 2), 'xmax', None),
            (32, 'left', 1, 0, 0, (1, 30, 2), 'xmax', None),
        ],
        'fix_methods': [
            ('prune_amount', {}),
        ],
    },
    '主管税务机关及代码': {
        'length': None,
        'str_type': 'str',
        'location': [
            (31, 'right', 0, 1.5, 0, (2, ), 'merge', None),
            (32, 'right', 1, 0.5, 0, (2, ), 'merge', None),
        ],
    },
    '不含税价-小写': {
        'length': None,
        'str_type': 'float',  # cn
        'location': [
            (34, 'left', 0.3, 0.3, 0, (33, 1.5), 'xmax', None),
            (33, 'right', 0.2, 0.2, 0, (34, 1.5), 'xmin', None),
        ],
        'fix_methods': [
            ('prune_amount', {}),
        ],
    },
    '完税凭证号码': {
        'length': None,
        'str_type': 'str',
        'location': [
            (34, 'right', 0.2, 0.2, 0, (36, 1.5), 'xmin', None),
            (36, 'left', 0.2, 0.2, 0, (34, 6), 'xmax', None),
        ],
    },
    '吨位': {
        'length': None,
        'str_type': 'str',
        'location': [
            (36, 'right', 0.2, 0.2, 0, (37, 1), 'xmin', 'split'),
            (37, 'left', 0.2, 0.2, 0, (36, 0.5), 'xmax', None),
        ],
    },
    '限乘人数': {
        'length': None,
        'str_type': 'int',
        'location': [
            (37, 'right', 0.2, 0.2, 0, (0.5, ), 'xmin', 'split'),
        ],
    },
    '开票人': {
        'length': None,
        'str_type': 'str',
        'location': [
            (35, 'right', 0, 0.5, 0, (1.5, ), 'xmin', 'split'),
        ],
    },
    '备注': {
        'length': None,
        'str_type': 'str',
        'location': [
            (38, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split'),
        ],
        # NOTE(review): ':' is listed twice in this set; one of them was
        # possibly meant to be the full-width colon -- confirm.
        'fix_methods': [
            ('prune_first_char', {'char_set': {';', ':', ':'}}),
        ],
    },
}
# Bundle of everything defined for the invoice document type: its key-label
# list and its per-field extraction rules, stored under the shared ARG_KEY_*
# dictionary keys.
INVOICE_CONST = {
    ARG_KEY_KEY_LIST: INVOICE_KEY_LIST,
    ARG_KEY_VALUE_DICT: INVOICE_VALUE_DICT,
}
import json
import os
import base64
import requests
import cv2
import time
import numpy as np
from PIL import Image, ImageDraw, ImageFont
# Batch script: every file in ``img_dir`` is sent to the remote general-OCR
# and signature-detection services; both JSON responses are saved, and a copy
# of the image annotated with the OCR boxes and recognised text is written to
# ``draw_dir``.
base_dir = os.path.dirname(os.path.abspath(__file__))
img_dir = '/home/zwq/data/gcfp/valid/image'
draw_dir = os.path.join(base_dir, 'draw', 'valid')
sign_dir = os.path.join(base_dir, 'sign_res', 'valid')
go_dir = os.path.join(base_dir, 'go_res', 'valid')
font_path = os.path.join(base_dir, 'simhei.ttf')
font = ImageFont.truetype(font_path, 10, encoding="utf-8")

_OCR_URL = r'http://139.196.149.46:9001/gen_ocr'
_SIGNATURE_URL = r'http://139.196.149.46:9001/signature_detect'
# Seconds to wait on a service before giving up; without a timeout a stalled
# service would hang the whole batch indefinitely.
_REQUEST_TIMEOUT = 120


def _post_image(url, image_path):
    """Upload the file at *image_path* as form field 'file' and return the decoded JSON reply.

    The file handle is closed as soon as the request finishes, and an HTTP
    error status raises ``requests.HTTPError`` instead of surfacing later as
    a confusing JSON/Key error.
    """
    with open(image_path, 'rb') as image_file:
        response = requests.post(url=url, files={'file': image_file}, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def _save_json(payload, path):
    """Write *payload* to *path* as UTF-8 JSON, keeping non-ASCII text readable."""
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be explicit rather than inherited from the platform locale.
    with open(path, 'w', encoding='utf-8') as fp:
        json.dump(payload, fp, ensure_ascii=False)


def _draw_ocr_results(img, ocr_results):
    """Return *img* with a box and the recognised text drawn for every OCR entry.

    Each value of *ocr_results* is unpacked as ``(coordinates, text)``;
    indexes 0/1 and 4/5 of ``coordinates`` are used as opposite box corners
    (presumably an 8-value quadrilateral -- confirm against the OCR service).
    """
    # All boxes first (OpenCV), then a single NumPy -> PIL -> NumPy round trip
    # for the text, instead of converting the whole image once per entry.
    # Text is therefore always drawn on top of the boxes.
    for coordinates, _ in ocr_results.values():
        cv2.rectangle(img, (coordinates[0], coordinates[1]), (coordinates[4], coordinates[5]), (0, 255, 0), 2)
    pil_img = Image.fromarray(img)
    draw = ImageDraw.Draw(pil_img)
    for coordinates, text in ocr_results.values():
        # NOTE: the array is in OpenCV's BGR order, so this (255, 0, 0) fill
        # lands in the blue channel of the file written by cv2.imwrite.
        draw.text((coordinates[0], coordinates[1]), text, (255, 0, 0), font=font)
    return np.array(pil_img)


# cv2.imwrite fails silently and open(..., 'w') raises when the target
# directory is missing, so create the output directories up front.
for out_dir in (draw_dir, sign_dir, go_dir):
    os.makedirs(out_dir, exist_ok=True)

for image_name in os.listdir(img_dir):
    image_path = os.path.join(img_dir, image_name)
    if not os.path.isfile(image_path):
        # Sub-directories cannot be uploaded as images.
        continue
    print('start: {0}'.format(image_name))
    base_image_name, _ = os.path.splitext(image_name)
    output_path = os.path.join(draw_dir, image_name)
    go_res_path = os.path.join(go_dir, '{0}.json'.format(base_image_name))
    sign_res_path = os.path.join(sign_dir, '{0}.json'.format(base_image_name))

    go_res = _post_image(_OCR_URL, image_path)['ocr_results']
    _save_json(go_res, go_res_path)

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None for files it cannot decode.
        print('skip drawing, unreadable image: {0}'.format(image_name))
    else:
        cv2.imwrite(output_path, _draw_ocr_results(img, go_res))

    signature_res = _post_image(_SIGNATURE_URL, image_path)
    _save_json(signature_res, sign_res_path)

    # TODO: run retriever_individuals.get_target_fields(go_res, signature_res)
    # here and time it once the retriever is wired into this script.
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!