add general extractor

周伟奇
Showing 4 changed files with 365 additions and 1 deletions
.gitignore
general_extractor/const.py
general_extractor/retriever.py
general_extractor/step1.py
--- a/.gitignore
View file @43758c7
+++ b/.gitignore
View file @43758c7
@@ -16,4 +16,11 @@ test*
 *.jpg
 *.out

-*.log
\ No newline at end of file
+*.log
+
+sample/
+go_res/
+test.py
+simhei.ttf
+sign_res/
+res_valid.json
--- a/general_extractor/const.py 0 → 100644
View file @43758c7
+++ b/general_extractor/const.py 0 → 100644
View file @43758c7
+# Single-character replacement table.  It is passed as 'replace_map' to the
+# 'replace_whole' fix method of the '价税合计大写' entry in INVOICE_VALUE_DICT.
+# Every value is a Chinese financial numeral or unit character
+# (壹 贰 叁 肆 柒 捌 玖 拾 佰 仟 万 圆).
+# NOTE(review): the keys look like characters that resemble those numerals,
+# presumably typical OCR misreads -- confirm against the recognition output.
+REPLACE_DICT_1 = {
+    "元": "圆",
+    # "零角": "零",
+    "柴": "柒",
+    "染": "柒",
+    "查": "壹",
+    "武": "贰",
+    "家": "贰",
+    "就": "贰",
+    "登": "叁",
+    # "@整": "叁",
+    "鑫": "叁",
+    "垂": "叁",
+    "捆": "捌",
+    "搁": "捌",
+    "级": "捌",
+    "测": "捌",
+    "拥": "捌",
+    "损": "捌",
+    "盒": "叁",
+    "摄": "捌",
+    "报": "捌",
+    "会": "叁",
+    "索": "壹",
+    "任": "仟",
+    "杆": "仟",
+    "仔": "仟",
+    "什": "仟",
+    "付": "仟",
+    "伴": "仟",
+    "宿": "佰",
+    "信": "佰",
+    "情": "佰",
+    "值": "佰",
+    "荣": "柒",
+    "渠": "柒",
+    "类": "柒",
+    "案": "柒",
+    "集": "柒",
+    "方": "万",
+    "抬": "拾",
+    "给": "拾",
+    "樟": "肆",
+    "单": "肆",
+    "邮": "肆",
+    "政": "玖",
+    "拐": "捌",
+    # The four commented-out entries below duplicate mappings that are
+    # already active above.
+    # "柴": "柒",
+    # "任": "仟",
+    # "拥": "捌",
+    # "会": "叁",
+}
+
+
+# Dictionary keys under which a per-document config (see INVOICE_CONST) stores
+# its key list and its value-rule dict respectively.
+ARG_KEY_KEY_LIST = 'keys_list'
+ARG_KEY_VALUE_DICT = 'values_dict'
+
+# Key texts of the invoice, one tuple per key: one or more spellings of the
+# key text followed by a trailing bool flag.
+# The number in each line's comment is the tuple's position in this list.  The
+# 'location' rules in INVOICE_VALUE_DICT appear to refer to keys by that
+# position, so keep both tables in sync when inserting or reordering entries.
+# NOTE(review): the additional spellings look like OCR misreads of the first
+# spelling, and the meaning of the trailing bool is not visible in this file
+# -- confirm both against the code that consumes this list.
+INVOICE_KEY_LIST = [
+    ('纳税人识别号', False),  # similar-looking key 0
+    ('增值税', False),   # similar-looking key 1
+
+    ('地', False),   # single-character key 2
+    ('址', False),   # single-character key 3
+
+    ('开票日期', '开票曰期', '开票日', True),  # 4
+    ('发票代码', '发票代鸡', True),  # 5
+    ('发票号码', '发票号瑞', '发要号瑞', True),  # 6
+    ('机打代码', False),  # 7
+    ('机打号码', '机打号玛', False),  # 8
+    ('机器编号', False),   # 9
+    ('购买方名称', '购买方名称及', False),  # 10
+    ('纳税人识别号/', False),   # 11
+    ('统一社会信用代码/', False),  # 12
+    ('身份证号码', '身份证号码/', False),  # 13
+    ('车辆类型', True),   # 14
+    ('厂牌型号', '广牌型号', '厂胖型号', '广牌型考', True),  # 15
+    ('产地', '严地', True),   # 16
+    ('合格证号', False),   # 17
+    ('进口证明书号', True),   # 18
+    ('商检单号', True),   # 19
+    ('发动机号码', False),   # 20
+    ('车辆识别代号/车架号码', True),  # 21
+    ('价税合计', '价现合计', '价“税合计', False),  # 22
+    ('小写', True),   # 23 TODO: decide which value to take when there are several
+    ('销货单位名称', False),  # 24
+    ('电话', True),   # 25
+    ('账号', '账考', '帐号', '帐考', '张号', '陈号', '昨号', True),  # 26
+    ('开户银行', True),   # 27
+    ('增值税税率', True),   # 28 value false
+    ('或征收税', False),   # 29
+    ('税额', False),   # 30
+    ('主管税务', True),   # 31 value False
+    ('机关及代码', True),   # 32
+    ('不含税价', True),   # 33 value False
+    ('完税凭证号码', False),   # 34
+    ('开票人', True),   # 35
+    ('吨位', True),   # 36
+    ('限乘人数', '跟乘人数', True),  # 37 TODO: coordinate-based splitting for cases such as '人数'
+    ('备注', True)  # 38
+]
+
+# Extraction rules for each output field of the invoice.
+#
+# Each entry maps an output field name to a dict with:
+#   'length'      - an int, or None
+#   'str_type'    - one of 'date', 'int', 'str', 'float'
+#   'location'    - a list of 8-field tuples (field order is given in the
+#                   comment inside the first entry below)
+#   'fix_methods' - optional list of (method_name, kwargs) pairs
+# NOTE(review): how 'length', 'str_type' and the location fields are
+# interpreted is not visible in this file -- confirm against the code that
+# consumes these rules.
+#
+# Values for the last location field (only 'split' and None are used below):
+#   split  - key and value are in one piece
+#   append - key-value_suffix; needs coordinate-based splitting
+#   insert - key-value_prefix; needs coordinate-based splitting
+INVOICE_VALUE_DICT = {
+    '开票日期': {
+        'length': 10,
+        'str_type': 'date',
+        # Field order of every location tuple.  The first idx appears to be a
+        # position in INVOICE_KEY_LIST.  In the (idx, scope) tuple the last
+        # element is presumably the scope and any preceding elements are
+        # further key positions -- TODO confirm.
+        #             idx, location, top, bottom, left, (idx, scope), choice, if_startswith
+        'location': [(4, 'right', 0.3, 0.5, 0, (2, ), 'xmin', 'split')],
+        'fix_methods': [('prune_first_char', {'char_set': {':', '：', ';', }})]
+    },
+    '发票代码': {
+        'length': 12,
+        'str_type': 'int',
+        'location': [(5, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split')]
+    },
+    '发票号码': {
+        'length': 8,
+        'str_type': 'int',
+        'location': [(6, 'right', 0.2, 0.5, 0, (2, ), 'length', 'split')],
+        'fix_methods': [('prune_first_char', {'char_set': {'-',}})]
+    },
+    '机打代码': {
+        'length': 12,
+        'str_type': 'int',
+        'location': [(7, 'right', 0.5, 1, 0, (2, ), 'ymin', None)]
+    },
+    '机器编号': {
+        'length': 12,
+        'str_type': 'int',
+        'location': [(9, 'right', 0.5, 1, 0, (2, ), 'ymax', None)]
+    },
+    '机打号码': {
+        'length': 8,
+        'str_type': 'int',
+        'location': [(8, 'right', 0.5, 0.5, 0, (2, ), 'length', None)]
+    },
+    '购买方名称': {
+        'length': None,
+        'str_type': 'str',  # cn
+        'location': [(10, 'right', 0.5, 0.5, 0, (11, 12, 13, 2), 'xmin', None)]
+    },
+    '纳税人识别号/统一社会信用代码/身份证号码': {
+        'length': 18,
+        'str_type': 'str',  # alnum
+        'location': [(11, 'right', 0, 2, 0, (2.5, ), 'length', None), (12, 'right', 1, 1, 0, (2, ), 'length', None), (13, 'right', 2, 0, 0.5, (3, ), 'length', None)]
+    },
+    '车辆类型': {
+        'length': None,
+        'str_type': 'str',
+        'location': [(14, 'right', 0.2, 0.2, 0, (15, 1.5), 'xmin', 'split'), (15, 'left', 0.2, 0.2, 0, (14, 2.5), 'xmax', None)]
+    },
+    '厂牌型号': {
+        'length': None,
+        'str_type': 'str',
+        'location': [(15, 'right', 0.2, 0.2, 0, (16, 3.5), 'xmin', 'split'), (16, 'left', 0.2, 0.2, 0, (15, 2.5), 'xmax', None)]
+    },
+    '产地': {
+        'length': None,
+        'str_type': 'str',  # cn
+        'location': [(16, 'right', 0.2, 0.2, 0, (2.5, ), 'xmin', 'split')]
+    },
+    '合格证号': {
+        'length': None,  # 15
+        'str_type': 'str',  # alnum
+        'location': [(17, 'right', 0.2, 0.2, 0, (18, 1.5), 'xmin', None), (18, 'left', 0.2, 0.2, 0, (17, 1.5), 'xmax', None)]
+    },
+    '进口证明书号': {
+        'length': None,
+        'str_type': 'str',  # alnum
+        'location': [(18, 'right', 0.3, 0.3, 0, (19, 1.5), 'xmin', 'split'), (19, 'left', 0.2, 0.2, 0, (18, 3), 'xmax', None)]
+    },
+    '商检单号': {
+        'length': None,
+        'str_type': 'str',  
+        'location': [(19, 'right', 0.2, 0.2, 0, (1.5, ), 'xmin', 'split')]
+    },
+    '发动机号码': {
+        'length': None,
+        'str_type': 'str',  # alnum
+        'location': [(20, 'right', 0.2, 0.2, 0, (21, 2), 'xmin', None), (21, 'left', 0.2, 0.2, 0, (20, 1.4), 'xmax', None)]
+    },
+    '车辆识别代号/车架号码': {
+        'length': 17,
+        'str_type': 'str',  # alnum
+        'location': [(21, 'right', 0.3, 0.3, 0, (1.2, ), 'xmin', 'split')]
+    },
+    '价税合计大写': {
+        'length': None,
+        'str_type': 'str',  # cn
+        'location': [(22, 'right', 0.2, 0.2, 0, (23, 3), 'xmin', None), (23, 'left', 0.2, 0.2, 0, (22, 15), 'xmax', None)],
+        'fix_methods': [('prune_no_cn', {}), ('replace_whole', {'replace_map': REPLACE_DICT_1})]
+    },
+    '价税合计小写': {
+        'length': None,
+        'str_type': 'float', 
+        'location': [(23, 'right', 0.4, 0.4, 0, (4, ), 'xmin', 'split')],
+        'fix_methods': [('prune_amount', {})]
+    },
+    '销货单位名称': {
+        'length': None,
+        'str_type': 'str',  # cn
+        'location': [(24, 'right', 0.2, 0.2, 0, (25, 3), 'xmin', None), (25, 'left', 0.3, 0.3, 0, (24, 15), 'xmax', None)]
+    },
+    '电话': {
+        'length': None,
+        'str_type': 'str',  # int + -
+        'location': [(25, 'right', 0.3, 0.3, 0, (5, ), 'xmin', 'split')]
+    },
+    '纳税人识别号': {
+        'length': None,
+        'str_type': 'str',  # cn  NOTE(review): the combined ID field above is marked alnum; this "cn" may be a copy-paste leftover -- confirm
+        'location': [(0, 'right', 0.3, 0.3, 0, (26, 2.5), 'xmin', None), (26, 'left', 0.3, 0.3, 0, (0, 15), 'xmax', None)]
+    },
+    '账号': {
+        'length': None,
+        'str_type': 'str',
+        'location': [(26, 'right', 0.3, 0.3, 0, (6, ), 'xmin', 'split')]
+    },
+    '地址': {
+        'length': None,
+        'str_type': 'str',  # cn
+        'location': [(27, 'left', 0.3, 0.3, 0, (3, 4), 'merge', None), (3, 'right', 0.3, 0.3, 0, (27, 20), 'xmin', None)]
+    },
+    '开户银行': {
+        'length': None,
+        'str_type': 'str', # cn
+        'location': [(27, 'right', 0.3, 0.3, 0, (3, ), 'xmin', 'split')]
+    },
+    '增值税税率或征收率': {
+        'length': 3,
+        'str_type': 'str',  # 13%
+        'location': [(28, 'right', 0, 1, 0, (1, 30, 1), 'xmin', None), (29, 'right', 1, 0, 0, (1, 30, 1), 'xmin', None), 
+                     (1, 'left', 0, 1, 0, (28, 29, 2), 'xmax', None), (30, 'left', 1, 0, 0, (28, 29, 2), 'xmax', None)],
+        'fix_methods': [('replace_last_char', {'char_set': {'8', '9', '号'}, 'target_char': '%'})]
+
+    },
+    '增值税税额': {
+        'length': None,
+        'str_type': 'float',
+        'location': [(1, 'right', 0, 1, 0, (31, 32, 2.5), 'xmin', None), (30, 'right', 1, 0, 0, (31, 32, 2.5), 'xmin', None), 
+                     (31, 'left', 0, 1, 0, (1, 30, 2), 'xmax', None), (32, 'left', 1, 0, 0, (1, 30, 2), 'xmax', None)],
+        'fix_methods': [('prune_amount', {})]
+    },
+    '主管税务机关及代码': {
+        'length': None,
+        'str_type': 'str', 
+        'location': [(31, 'right', 0, 1.5, 0, (2, ), 'merge', None), (32, 'right', 1, 0.5, 0, (2, ), 'merge', None)]
+    },
+    '不含税价-小写': {
+        'length': None,
+        'str_type': 'float', # NOTE(review): the original comment here read "cn", which contradicts 'float' -- confirm
+        'location': [(34, 'left', 0.3, 0.3, 0, (33, 1.5), 'xmax', None), (33, 'right', 0.2, 0.2, 0, (34, 1.5), 'xmin', None)],
+        'fix_methods': [('prune_amount', {})]
+    },
+    '完税凭证号码': {
+        'length': None,
+        'str_type': 'str', 
+        'location': [(34, 'right', 0.2, 0.2, 0, (36, 1.5), 'xmin', None), (36, 'left', 0.2, 0.2, 0, (34, 6), 'xmax', None)]
+    },
+    '吨位': {
+        'length': None,
+        'str_type': 'str', 
+        'location': [(36, 'right', 0.2, 0.2, 0, (37, 1), 'xmin', 'split'), (37, 'left', 0.2, 0.2, 0, (36, 0.5), 'xmax', None)]
+    },
+    '限乘人数': {
+        'length': None,
+        'str_type': 'int', 
+        'location': [(37, 'right', 0.2, 0.2, 0, (0.5, ), 'xmin', 'split')]
+    },
+    '开票人': {
+        'length': None,
+        'str_type': 'str', 
+        'location': [(35, 'right', 0, 0.5, 0, (1.5, ), 'xmin', 'split')]
+    },
+    '备注': {
+        'length': None,
+        'str_type': 'str', 
+        'location': [(38, 'right', 0.2, 0.2, 0, (2, ), 'xmin', 'split')],
+        'fix_methods': [('prune_first_char', {'char_set': {';', ':', '：'}})]
+    },
+
+}
+
+# Complete invoice configuration: the key list and the value rules, stored
+# under the ARG_KEY_* names defined above.
+INVOICE_CONST = {
+    ARG_KEY_KEY_LIST: INVOICE_KEY_LIST,    
+    ARG_KEY_VALUE_DICT: INVOICE_VALUE_DICT    
+}
--- a/general_extractor/retriever.py 0 → 100644
View file @43758c7
+++ b/general_extractor/retriever.py 0 → 100644
View file @43758c7
--- a/general_extractor/step1.py 0 → 100644
View file @43758c7
+++ b/general_extractor/step1.py 0 → 100644
View file @43758c7
+import json
+import os
+import base64
+import requests
+import cv2
+import time
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+
+
+# Batch pre-processing script.  For every image in img_dir it:
+#   1. posts the image to the OCR service and saves the 'ocr_results' JSON,
+#   2. writes a copy of the image with the OCR boxes and texts drawn on it,
+#   3. posts the image to the signature-detection service and saves the JSON.
+base_dir = os.path.dirname(os.path.abspath(__file__))
+img_dir = '/home/zwq/data/gcfp/valid/image'
+draw_dir = os.path.join(base_dir, 'draw', 'valid')
+sign_dir = os.path.join(base_dir, 'sign_res', 'valid')
+go_dir = os.path.join(base_dir, 'go_res', 'valid')
+
+# Create the output directories up front: open() raises on a missing
+# directory and cv2.imwrite() silently writes nothing.
+for out_dir in (draw_dir, sign_dir, go_dir):
+    os.makedirs(out_dir, exist_ok=True)
+
+font_path = os.path.join(base_dir, 'simhei.ttf')
+font = ImageFont.truetype(font_path, 10, encoding="utf-8")
+
+
+for image_name in os.listdir(img_dir):
+
+    print('start: {0}'.format(image_name))
+    base_image_name, _ = os.path.splitext(image_name)
+
+    image_path = os.path.join(img_dir, image_name)
+    output_path = os.path.join(draw_dir, image_name)
+    go_res_path = os.path.join(go_dir, '{0}.json'.format(base_image_name))
+    sign_res_path = os.path.join(sign_dir, '{0}.json'.format(base_image_name))
+
+    # Upload through a context manager so the file handle is closed as soon
+    # as the request has been sent instead of leaking one handle per image.
+    with open(image_path, 'rb') as image_fp:
+        go_response = requests.post(url=r'http://139.196.149.46:9001/gen_ocr', files={'file': image_fp})
+    go_res = go_response.json()['ocr_results']
+
+    # ensure_ascii=False emits raw non-ASCII text, so pin the encoding rather
+    # than depend on the platform default.
+    with open(go_res_path, 'w', encoding='utf-8') as fp:
+        json.dump(go_res, fp, ensure_ascii=False)
+
+    # Each OCR value unpacks into (coordinates, text).  coordinates[0:2] and
+    # coordinates[4:6] are used as opposite corners of the box -- presumably
+    # an 8-value quadrilateral; confirm against the OCR service.
+    img = cv2.imread(image_path)
+    for coordinates, _ in go_res.values():
+        cv2.rectangle(img, (coordinates[0], coordinates[1]), (coordinates[4], coordinates[5]), (0, 255, 0), 2)
+
+    # cv2 cannot render the CJK font, so the text is drawn with PIL.  Convert
+    # once per image rather than once per text box.
+    pil_img = Image.fromarray(img)
+    draw = ImageDraw.Draw(pil_img)
+    for coordinates, text in go_res.values():
+        draw.text((coordinates[0], coordinates[1]), text, (255, 0, 0), font=font)
+    img = np.array(pil_img)
+
+    cv2.imwrite(output_path, img)
+
+    with open(image_path, 'rb') as image_fp:
+        sign_response = requests.post(url=r'http://139.196.149.46:9001/signature_detect', files={'file': image_fp})
+    signature_res = sign_response.json()
+
+    with open(sign_res_path, 'w', encoding='utf-8') as fp:
+        json.dump(signature_res, fp, ensure_ascii=False)
+
+    # NOTE: the field-retrieval step is not wired in yet; the disabled call was
+    # retriever_individuals.get_target_fields(go_res, signature_res).
+
+