d7847808 by 周伟奇

Merge branch 'feature/fsm-contract' into fix/report_ca

2 parents 30509ad4 784ff18a
......@@ -1057,7 +1057,25 @@ HIL_CONTRACT_2_CLASSIFY = 44
HIL_CONTRACT_3_CN_NAME = '车辆处置协议'
HIL_CONTRACT_3_CLASSIFY = 45
CONTRACT_SET = {CONTRACT_QRS_CLASSIFY, CONTRACT_CLASSIFY, HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_2_CLASSIFY, HIL_CONTRACT_3_CLASSIFY}
# FSM contract document types: Chinese display name and classify code for each.
# WEP: '延长保修合同' (extended-warranty contract).
FSM_CONTRACT_WEP_CN_NAME = '延长保修合同'
FSM_CONTRACT_WEP_CLASSIFY = 51
# MSI: '长悦保养合同' (maintenance contract).
FSM_CONTRACT_MSI_CN_NAME = '长悦保养合同'
FSM_CONTRACT_MSI_CLASSIFY = 52
# SC: '汽车销售合同' (vehicle sales contract).
FSM_CONTRACT_SC_CN_NAME = '汽车销售合同'
FSM_CONTRACT_SC_CLASSIFY = 53
# Classify codes of all contract document types, now including the three FSM ones.
CONTRACT_SET = {
    CONTRACT_QRS_CLASSIFY,
    CONTRACT_CLASSIFY,
    HIL_CONTRACT_1_CLASSIFY,
    HIL_CONTRACT_2_CLASSIFY,
    HIL_CONTRACT_3_CLASSIFY,
    FSM_CONTRACT_WEP_CLASSIFY,
    FSM_CONTRACT_MSI_CLASSIFY,
    FSM_CONTRACT_SC_CLASSIFY,
}
CONTRACT_MAP = {
HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_CN_NAME,
......@@ -1065,6 +1083,10 @@ CONTRACT_MAP = {
HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_CN_NAME,
CONTRACT_CLASSIFY: CONTRACT_CN_NAME,
CONTRACT_QRS_CLASSIFY: CONTRACT_QRS_CN_NAME,
FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_CN_NAME,
FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_CN_NAME,
FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_CN_NAME,
}
# 保单
......@@ -1214,6 +1236,11 @@ BS_FIELD = 'bss_ocr'
# Field names that RESULT_MAPPING associates with each contract classify code.
HIL_CONTRACT_1_FIELD = 'hil_contract_1_ocr'
HIL_CONTRACT_2_FIELD = 'hil_contract_2_ocr'
HIL_CONTRACT_3_FIELD = 'hil_contract_3_ocr'
# These must be plain strings like the HIL ones above: a trailing comma after
# the literal would silently turn each value into a 1-tuple.
FSM_CONTRACT_WEP_FIELD = 'fsm_wep_ocr'
FSM_CONTRACT_MSI_FIELD = 'fsm_msi_ocr'
FSM_CONTRACT_SC_FIELD = 'fsm_sc_ocr'
BS_CLASSIFY = 10089
RESULT_MAPPING = {
......@@ -1238,6 +1265,9 @@ RESULT_MAPPING = {
HIL_CONTRACT_1_CLASSIFY: HIL_CONTRACT_1_FIELD,
HIL_CONTRACT_2_CLASSIFY: HIL_CONTRACT_2_FIELD,
HIL_CONTRACT_3_CLASSIFY: HIL_CONTRACT_3_FIELD,
FSM_CONTRACT_WEP_CLASSIFY: FSM_CONTRACT_WEP_FIELD,
FSM_CONTRACT_MSI_CLASSIFY: FSM_CONTRACT_MSI_FIELD,
FSM_CONTRACT_SC_CLASSIFY: FSM_CONTRACT_SC_FIELD,
}
CA_ADD_COMPARE_FIELDS = (IC_OCR_FIELD, BL_OCR_FIELD, BS_FIELD)
......@@ -2313,29 +2343,35 @@ APPLICANT_TYPE_MAP = {
APPLICANT_TYPE_ORDER = ['Borrower', 'Co-Borrower', 'Guarantor', 'Mortgager']
FILE_NAME_PREFIX_MAP = {
AFC_PREFIX: [
((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'),
((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
],
HIL_PREFIX: [
((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'),
((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'),
((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
]
}
# FILE_NAME_PREFIX_MAP = {
# AFC_PREFIX: [
# ((CONTRACT_CLASSIFY, 0), '{0}_电子签署-汽车抵押贷款合同'),
# ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
# ],
# HIL_PREFIX: [
# ((HIL_CONTRACT_1_CLASSIFY, HIL_CONTRACT_3_CLASSIFY), '{0}_电子签署-售后回租合同'),
# ((HIL_CONTRACT_2_CLASSIFY, 0), '{0}_电子签署-汽车租赁抵押合同'),
# ((HMH_CLASSIFY, 0), '{0}_电子签署-抵押登记豁免函'),
# ]
# }
# Per business prefix (AFC / HIL): list of (keyword, classify code) pairs.
# NOTE(review): presumably a document whose name contains the keyword is given
# that classify code — confirm against the code that reads this map.
# The three FSM keywords are listed identically under both prefixes.
ECONTRACT_KEYWORDS_MAP = {
    AFC_PREFIX: [
        ('抵押贷款合同', CONTRACT_CLASSIFY),
        ('送达地址确认书', CONTRACT_QRS_CLASSIFY),
        # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0),
        ('抵押登记豁免函', HMH_CLASSIFY),
        ('延长保修', FSM_CONTRACT_WEP_CLASSIFY),
        ('长悦保养', FSM_CONTRACT_MSI_CLASSIFY),
        ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
    ],
    HIL_PREFIX: [
        ('售后回租合同', HIL_CONTRACT_1_CLASSIFY),
        ('租赁抵押合同', HIL_CONTRACT_2_CLASSIFY),
        ('车辆处置协议', HIL_CONTRACT_3_CLASSIFY),
        # ('电子签署-抵押登记豁免函', HMH_CLASSIFY, 0),
        ('抵押登记豁免函', HMH_CLASSIFY),
        ('延长保修', FSM_CONTRACT_WEP_CLASSIFY),
        ('长悦保养', FSM_CONTRACT_MSI_CLASSIFY),
        ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
    ]
}
......@@ -2345,6 +2381,12 @@ HIL_CONTRACT_TYPE_MAP = {
str(HIL_CONTRACT_3_CLASSIFY): 1,
}
# Maps str(classify code) of an FSM contract to the `file_type` passed to
# fsm_contract_ocr.predict, which uses it as an index into its retriever list
# (0 = WEP, 1 = MSI, 2 = SC).
FSM_CONTRACT_TYPE_MAP = {
    str(FSM_CONTRACT_WEP_CLASSIFY): 0,
    str(FSM_CONTRACT_MSI_CLASSIFY): 1,
    str(FSM_CONTRACT_SC_CLASSIFY): 2,
}
RESULT_MAP = {
0: None,
1: True,
......
......@@ -20,6 +20,8 @@ from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, g
from common.tools.pdf_to_img import PDFHandler
from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict
from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict
from common.fsm_econtract.fsm_contract_ocr import predict as fsm_predict
from common.fsm_econtract.hmh_ocr import predict as hmh_predict
from apps.doc import consts
# from apps.doc.ocr.edms import EDMS, rh
from apps.doc.ocr.ecm import ECM, rh
......@@ -996,7 +998,7 @@ class Command(BaseCommand, LoggerMixin):
res.setdefault(consts.ALL_POSITION_KEY, dict())[key] = page_info_dict.get(str(pno), {}).get(
consts.ALL_POSITION_KEY, {}).get(key1, [])
license_summary[classify] = [res]
else:
elif classify in consts.SE_HIL_CON_MAP: # TODO FSM新合同写入数据库用于比对
res = {}
for key, (pno1, pno2, end_idx, key1, key2) in consts.SE_HIL_CON_MAP[classify].items():
if pno1 is None:
......@@ -1442,7 +1444,7 @@ class Command(BaseCommand, LoggerMixin):
self.log_base, traceback.format_exc()))
error_list.append(1)
return
else: # e-contract
else: # e-contract or e-fsm-contract or e-hmh
try:
# pdf下载 处理 图片存储 识别
for times in range(consts.RETRY_TIMES):
......@@ -1472,6 +1474,7 @@ class Command(BaseCommand, LoggerMixin):
self.online_log.error('{0} [process error (db save)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
# AFC合同
if classify_1_str == str(consts.CONTRACT_CLASSIFY):
ocr_result = afc_predict(pdf_handler.pdf_info)
page_res = {}
......@@ -1483,6 +1486,7 @@ class Command(BaseCommand, LoggerMixin):
'page_num': page_num,
'page_info': page_info
}
# 送达地址确认书
elif classify_1_str == str(consts.CONTRACT_QRS_CLASSIFY):
ocr_result = afc_predict(pdf_handler.pdf_info, is_qrs=True)
page_num = 'page_1'
......@@ -1493,7 +1497,8 @@ class Command(BaseCommand, LoggerMixin):
'page_info': ocr_result.pop(page_num, {})
}
}
else:
# HIL合同
elif classify_1_str in consts.HIL_CONTRACT_TYPE_MAP:
file_type_1 = consts.HIL_CONTRACT_TYPE_MAP.get(classify_1_str)
ocr_result_1 = hil_predict(pdf_handler.pdf_info, file_type_1)
rebuild_res_1 = {}
......@@ -1508,9 +1513,35 @@ class Command(BaseCommand, LoggerMixin):
'page_num': page_num,
'page_info': page_info
}
# FSM合同 WEP MSI SC
elif classify_1_str in consts.FSM_CONTRACT_TYPE_MAP:
file_type = consts.FSM_CONTRACT_TYPE_MAP.get(classify_1_str)
ocr_result = fsm_predict(pdf_handler.pdf_info, file_type)
for page_num, page_info in ocr_result.items():
if isinstance(page_num, str) and page_num.startswith('page_'):
page_res[page_num] = {
'classify': int(classify_1_str),
'page_num': page_num,
'page_info': page_info
}
# hmh
else:
pass
contract_res = {}
for img_path_tmp, page_key in pdf_handler.img_path_pno_list:
if classify_1_str == str(consts.HMH_CLASSIFY):
img_contract_res = {
'code': 1,
'data': [
{
'classify': consts.HMH_CLASSIFY,
'data': hmh_predict(pdf_handler.pdf_info)
}
]
}
else:
if page_key in page_res:
img_contract_res = {
'code': 1,
......
# Field-extraction layout for the WEP contract, consumed by Retriever.
# Outer key: 0-based page number as a string.
# 'keys'  : field -> [(key_text, regex_tuple, direction, kwargs), ...]. A text
#           span whose text matches one of the regexes (re.match) becomes an
#           anchor candidate; Retriever.key_<direction> picks one candidate.
# 'value' : field -> (source, direction, kwargs, default). Source 'text'
#           searches the page's text spans, any other source searches its image
#           blocks; Retriever.value_<direction> is called with kwargs, and
#           `default` is reported when no anchor or no value is found.
# offset_tuple = (xmin, xmax, ymin, ymax), in multiples of the anchor's width
#           (x) or height (y): the search box is
#           [x0 - w*xmin, x1 + w*xmax] x [y0 - h*ymin, y1 + h*ymax].
WEP_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '合同价格(小写)': [('人民币', (r'^人民币¥.?$', ), 'top1', {})],
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.1, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
            # Signature and signing date are looked up among image blocks.
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }
}
# Field-extraction layout for the MSI contract, consumed by Retriever.
# Outer key: 0-based page number as a string; customer data is read from page
# "0", signature and signing date from page "1".
# 'keys'  : field -> [(key_text, regex_tuple, direction, kwargs), ...]; spans
#           matching a regex (re.match) are anchor candidates.
# 'value' : field -> (source, direction, kwargs, default); source 'text'
#           searches text spans, any other source searches image blocks;
#           `default` is reported when nothing is found.
# offset_tuple = (xmin, xmax, ymin, ymax) in multiples of the anchor's
#           width/height, defining the search box around the anchor.
MSI_FIELD = {
    "0": {
        'keys': {
            '客户姓名': [('客户姓名', (r'^客户姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '合同价格(小写)': [('人民币', (r'^人民币¥.?$', ), 'top1', {})],
        },
        'value': {
            '客户姓名': ('text', 'right', {'offset_tuple': (-1.2, 1, 0.3, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-1, 1, 0, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-1, 2, 0.3, 0)}, ''),
            '合同价格(小写)': ('text', 'right', {'offset_tuple': (-1, 1, 0.3, 0)}, ''),
        },
    },
    "1": {
        'keys': {
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户签名': ('img', 'under', {'offset_tuple': (0, 0, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }
}
# Field-extraction layout for the SC (vehicle sales) contract, consumed by
# Retriever.
# Outer key: 0-based page number as a string; "-1" is resolved by Retriever to
# the highest page number that has text.
# 'keys'  : field -> [(key_text, regex_tuple, direction, kwargs), ...]; spans
#           matching a regex (re.match) are anchor candidates.
# 'value' : field -> (source, direction, kwargs, default); source 'text'
#           searches text spans, any other source searches image blocks;
#           `default` is reported when nothing is found.
# offset_tuple = (xmin, xmax, ymin, ymax) in multiples of the anchor's
#           width/height, defining the search box around the anchor.
SC_FIELD = {
    "0": {
        'keys': {
            '姓名': [('姓名', (r'^姓名.?$', r'^企业名称.?$'), 'top1', {})],
            '证件类型': [('证件类型', (r'^证件类型.?$', ), 'top1', {})],
            '证件号码': [('证件号码', (r'^证件号码.?$', r'^统一社会信用代码.?$'), 'top1', {})],
            '总价': [('总价', (r'^总价.?$', ), 'top1', {})],
        },
        'value': {
            '姓名': ('text', 'right', {'offset_tuple': (-2, 8, 0.5, 0)}, ''),
            '证件类型': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '证件号码': ('text', 'right', {'offset_tuple': (-2, 6, 0.5, 0)}, ''),
            '总价': ('text', 'right', {'offset_tuple': (-2, 12, 0.5, 0)}, ''),
        },
    },
    "-1": {
        'keys': {
            # The pattern is listed once: repeating it would register the same
            # span twice as an anchor candidate.
            '客户签名': [('客户签名/盖章', (r'^客户签名/盖章.*$', ), 'top1', {})],
            '签单日期': [('签单日期', (r'^签单日期.*签单日期.?$', ), 'top1', {})],
        },
        'value': {
            '客户签名': ('img', 'under', {'offset_tuple': (1.5, 1, 0, 4), 'rigorous': True}, '无'),
            '签单日期': ('img', 'right', {'offset_tuple': (0, 0, 1.1, 0), 'rigorous': True}, '无'),
        },
    }
}
from .retriever import Retriever
from .const import WEP_FIELD, MSI_FIELD, SC_FIELD
from .tools import pdf_info_rebuild
# One retriever per FSM contract layout, indexed by file_type: 0 WEP, 1 MSI, 2 SC.
retriever_list = [Retriever(layout) for layout in (WEP_FIELD, MSI_FIELD, SC_FIELD)]


def predict(pdf_info, file_type=0):
    """Extract the configured fields of an FSM contract.

    :param pdf_info: per-page PDF structure accepted by ``pdf_info_rebuild``.
    :param file_type: index into ``retriever_list`` selecting the layout.
    :return: whatever ``Retriever.get_target_fields`` returns for the document.
    """
    # Select the retriever first so a bad file_type fails before any parsing.
    selected = retriever_list[file_type]
    text_pages, img_pages = pdf_info_rebuild(pdf_info)
    return selected.get_target_fields(text_pages, img_pages)
from .retriever import HMHRetriever
from .tools import pdf_info_rebuild
# Single shared retriever instance for HMH documents.
hmh_retriever = HMHRetriever()


def predict(pdf_info):
    """Extract the HMH fields from ``pdf_info``.

    Text spans are rebuilt with ``fix_bbox=False`` (bounding boxes are passed
    through unchanged); image blocks are not used.
    """
    rebuilt = pdf_info_rebuild(pdf_info, fix_bbox=False)
    text_pages = rebuilt[0]
    return hmh_retriever.get_target_fields(text_pages)
import re
class HMHRetriever:
    """Extract the key fields from the first page of an HMH document.

    Each text span of page "0" is matched against a few regular expressions;
    the first span that matches a given expression supplies that field.
    Fields that are never matched are reported with their default value and
    ``default_position``.
    """

    # Tried in this order on every span; the first pattern that matches wins.
    _PARTY_PATTERNS = (
        re.compile(r'承租人\(姓名(.*)证件号码(.*)与(.*公司)'),
        re.compile(r'借款人\(姓名(.*)证件号码(.*)与(.*公司)'),
    )
    _CONTRACT_NO_PATTERN = re.compile(r'合同编号.*(CH-B\d*-\d*).*')
    _SIGN_DATE_PATTERN = re.compile(r'(.*).*签署日期.*(\d{4}-\d{2}-\d{2})')

    def __init__(self):
        self.words_str = 'words'
        self.position_str = 'location'
        # Position reported for fields that were not found.
        self.default_position = [0, 0, 0, 0]
        # (field name, default value); the order is relied on by index below.
        self.search_fields_list = [
            ('借款/承租人姓名', ''),
            ('证件号码', ''),
            ('渠道', ''),
            ('合同编号', ''),
            ('借款人签字/盖章', '无'),
        ]

    def _entry(self, words, bbox):
        # Build one result record.
        return {self.words_str: words, self.position_str: bbox}

    def get_target_fields(self, pdf_text_list):
        """Return ``{"words_result": {field: {'words': ..., 'location': ...}}}``.

        :param pdf_text_list: mapping of page-number string to a list of
            ``(bbox, text)`` pairs; only page ``"0"`` is read. The mapping is
            not modified.
        """
        name_key, id_key, channel_key, contract_key, sign_key = (
            field for field, _ in self.search_fields_list)
        result = dict()
        found_party = found_contract_no = found_sign = False

        # .get instead of .pop: the caller's mapping must stay intact.
        for bbox, text in pdf_text_list.get(str(0), []):
            if not found_party:
                for pattern in self._PARTY_PATTERNS:
                    match = pattern.search(text)
                    if match is None:
                        continue
                    name, id_no, company = match.groups()
                    result[name_key] = self._entry(name.replace('\u3000', '').strip(), bbox)
                    result[id_key] = self._entry(id_no.replace('\u3000', '').strip(), bbox)
                    result[channel_key] = self._entry(company, bbox)
                    found_party = True
                    break

            if not found_contract_no:
                numbers = self._CONTRACT_NO_PATTERN.findall(text)
                # Only an unambiguous (single) match in the span is accepted.
                if len(numbers) == 1:
                    result[contract_key] = self._entry(numbers[0], bbox)
                    found_contract_no = True

            if not found_sign:
                match = self._SIGN_DATE_PATTERN.search(text)
                if match is not None:
                    signer, sign_date = match.groups()
                    result[sign_key] = self._entry(
                        '{0} {1}'.format(signer.replace('\u3000', '').strip(), sign_date), bbox)
                    found_sign = True

        # Fill in whatever was not found.
        for find_key, default_value in self.search_fields_list:
            if find_key not in result:
                result[find_key] = self._entry(default_value, self.default_position)
        return {"words_result": result}
class Retriever:
    """Locate configured fields on PDF pages via anchor keywords and offsets.

    ``target_fields`` maps a page-number string (0-based; ``'-1'`` means the
    highest page number that has text) to a dict with two entries:

    * ``'keys'``: field -> ``[(key_text, regex_tuple, direction, kwargs), ...]``.
      Text spans matching a regex are anchor candidates and
      ``key_<direction>`` chooses one of them.
    * ``'value'``: field -> ``(source, direction, kwargs, default)``.
      ``value_<direction>`` searches text spans (source ``'text'``) or image
      blocks (any other source) relative to the chosen anchor.
    """

    def __init__(self, target_fields):
        self.keys_str = 'keys'
        self.value_str = 'value'
        self.text_str = 'text'
        self.words_str = 'words'
        self.position_str = 'position'
        # Position reported for fields that could not be located.
        self.default_position = [-1, -1, -1, -1]
        self.target_fields = target_fields
        # Per value_type character substitutions applied to extracted text.
        self.replace_map = {
            'int': {
                '(': '0'
            }
        }

    @staticmethod
    def _in_region(box, region, rigorous):
        # rigorous: the whole box must lie strictly inside the region;
        # otherwise only the box centre has to.
        x0, y0, x1, y1 = box
        x_min, y_min, x_max, y_max = region
        if rigorous:
            return x_min < x0 and x1 < x_max and y_min < y0 and y1 < y_max
        cent_x = x0 + ((x1 - x0) / 2)
        cent_y = y0 + ((y1 - y0) / 2)
        return x_min < cent_x < x_max and y_min < cent_y < y_max

    def _replace_chars(self, value, value_type):
        # Apply the replace_map substitutions for value_type, if any.
        if isinstance(value_type, str) and value_type in self.replace_map and isinstance(value, str):
            return value.translate(str.maketrans(self.replace_map.get(value_type, {})))
        return value

    @staticmethod
    def key_top1(coordinates_list, key_coordinates):
        """Anchor direction 'top1': return the candidate with the smallest y0.

        ``min`` returns the first of equally-high candidates, exactly like a
        stable sort followed by ``[0]``, but leaves the caller's list untouched.
        """
        return min(coordinates_list, key=lambda box: box[1])

    def key_right(self, coordinates_list, key_coordinates, offset_tuple, rigorous=False):
        """Anchor direction 'right': leftmost candidate inside the offset box.

        Falls back to ``key_top1`` when there is no previous anchor
        (``key_coordinates`` is None) or no candidate lies inside the box.
        """
        if len(coordinates_list) == 1:
            return coordinates_list[0]
        if key_coordinates is None:
            return self.key_top1(coordinates_list, key_coordinates)

        region = self.get_target_bbox(key_coordinates, offset_tuple)
        leftmost_x, found = None, None
        for x0, y0, x1, y1 in coordinates_list:
            if not self._in_region((x0, y0, x1, y1), region, rigorous):
                continue
            if leftmost_x is None or x0 < leftmost_x:
                leftmost_x = x0
                found = (x0, y0, x1, y1)
        if found is None:
            return self.key_top1(coordinates_list, key_coordinates)
        return found

    def value_right(self, search_list, key_coordinates, offset_tuple, value_type=None, rigorous=False):
        """Value direction 'right': leftmost non-blank item inside the offset box.

        :return: ``(text, (x0, y0, x1, y1))`` or ``(None, None)``.
        """
        region = self.get_target_bbox(key_coordinates, offset_tuple)
        leftmost_x, value, coordinates = None, None, None
        for (x0, y0, x1, y1), text in search_list:
            if not self._in_region((x0, y0, x1, y1), region, rigorous):
                continue
            if leftmost_x is not None and x0 >= leftmost_x:
                continue
            if len(text.strip()) > 0:
                leftmost_x = x0
                value = text
                coordinates = (x0, y0, x1, y1)
        return self._replace_chars(value, value_type), coordinates

    def value_under(self, search_list, key_coordinates, offset_tuple, value_type=None, append=False, rigorous=False):
        """Value direction 'under': topmost (then leftmost) item inside the box.

        With ``append=True`` the texts of all items inside the box are
        concatenated in (y0, x0) order; the reported position is still that of
        the first item.

        :return: ``(text, (x0, y0, x1, y1))`` or ``(None, None)``.
        """
        region = self.get_target_bbox(key_coordinates, offset_tuple)
        find_list = [
            (x0, y0, x1, y1, text)
            for (x0, y0, x1, y1), text in search_list
            if self._in_region((x0, y0, x1, y1), region, rigorous) and len(text.strip()) > 0
        ]
        if len(find_list) == 0:
            return None, None

        find_list.sort(key=lambda item: (item[1], item[0]))
        coordinates = find_list[0][:-1]
        if append:
            value = ''.join([text for _, _, _, _, text in find_list])
        else:
            value = find_list[0][-1]
        return self._replace_chars(value, value_type), coordinates

    @staticmethod
    def get_target_bbox(key_coordinates, offset_tuple):
        """Return the search box ``(x_min, y_min, x_max, y_max)`` around an anchor.

        ``offset_tuple`` is ``(xmin, xmax, ymin, ymax)`` in multiples of the
        anchor's width (x) or height (y); a negative xmin/ymin moves the
        left/top edge inwards (to the right / downwards).
        """
        offset_xmin, offset_xmax, offset_ymin, offset_ymax = offset_tuple
        width = key_coordinates[2] - key_coordinates[0]
        height = key_coordinates[-1] - key_coordinates[1]
        x_min = key_coordinates[0] - (width * offset_xmin)
        x_max = key_coordinates[2] + (width * offset_xmax)
        y_min = key_coordinates[1] - (height * offset_ymin)
        y_max = key_coordinates[-1] + (height * offset_ymax)
        return x_min, y_min, x_max, y_max

    def _collect_key_candidates(self, keys_config, page_text):
        # Map key_text -> boxes of the spans whose text matches one of its regexes.
        key_text_info = dict()
        for key_text_list in keys_config.values():
            for key_text, key_re_tuple, _, _ in key_text_list:
                for (x0, y0, x1, y1), text in page_text:
                    # any(): a span matching several regexes is registered once.
                    if any(re.match(key_re, text) for key_re in key_re_tuple):
                        key_text_info.setdefault(key_text, list()).append((x0, y0, x1, y1))
        return key_text_info

    def _select_key_coordinates(self, keys_config, key_text_info):
        # Walk each field's keyword chain and keep the anchor of its last keyword.
        key_coordinates_info = dict()
        for field, key_text_list in keys_config.items():
            last_key_coordinates = None
            for key_text, _, direction, kwargs in key_text_list:
                if key_text not in key_text_info:
                    last_key_coordinates = None
                    continue
                last_key_coordinates = getattr(self, 'key_{0}'.format(direction))(
                    key_text_info[key_text],
                    last_key_coordinates,
                    **kwargs)
            key_coordinates_info[field] = last_key_coordinates
        return key_coordinates_info

    def get_target_fields(self, pdf_text_list, pdf_img_list):
        """Extract every configured field.

        :param pdf_text_list: page-number string -> list of ``(bbox, text)``.
        :param pdf_img_list: page-number string -> list of ``(bbox, text)``
            for image blocks.
        :return: ``{'page_<n>': {field: {'words': ..., 'position': [...]}}}``
            with 1-based ``n``. Fields that cannot be located carry their
            configured default and ``default_position``.
        """
        pdf_result = dict()
        for pno_str, fields_dict in self.target_fields.items():
            if pno_str == '-1':
                # Last page that has text; page 0 when no page has any text, so
                # the fields below fall back to their defaults instead of
                # raising on max() of an empty sequence.
                pno_str = str(max((int(pno) for pno in pdf_text_list), default=0))
            page_text = pdf_text_list.get(pno_str, [])

            keys_config = fields_dict[self.keys_str]
            key_text_info = self._collect_key_candidates(keys_config, page_text)
            key_coordinates_info = self._select_key_coordinates(keys_config, key_text_info)

            page_result = dict()
            for field, (source, direction, kwargs, default_value) in fields_dict[self.value_str].items():
                key_coordinates = key_coordinates_info.get(field)
                value, coordinates = None, None
                if isinstance(key_coordinates, tuple):
                    value, coordinates = getattr(self, 'value_{0}'.format(direction))(
                        page_text if source == self.text_str else pdf_img_list.get(pno_str, []),
                        key_coordinates,
                        **kwargs
                    )
                if isinstance(value, str):
                    page_result[field] = {
                        self.words_str: value,
                        self.position_str: list(coordinates),
                    }
                else:
                    page_result[field] = {
                        self.words_str: default_value,
                        self.position_str: self.default_position,
                    }

            # Merge rather than assign: '-1' may resolve to a page that another
            # entry already produced (e.g. a one-page document), and its fields
            # must not wipe out the earlier ones.
            pdf_result.setdefault('page_{0}'.format(int(pno_str) + 1), dict()).update(page_result)
        return pdf_result
def pdf_info_rebuild(pdf_info, fix_bbox=True):
    """Flatten per-page PDF structure into text spans and image blocks.

    :param pdf_info: page-number string -> ``{'blocks': [...]}``. Blocks with
        ``type == 0`` carry ``lines`` -> ``spans`` (each with ``bbox``,
        ``text`` and ``size``); blocks with ``type == 1`` carry a ``bbox``.
        Other block types are ignored. ``pdf_info`` is not modified.
    :param fix_bbox: when true, a span whose box is lower than its ``size``
        gets its bottom edge moved down by ``size``.
    :return: ``(pdf_text_info, pdf_img_info)``; the first maps a page to a list
        of ``[bbox, text]``, the second to a list of ``(bbox, '有')``. Pages
        without text (or without images) have no entry.
    """
    pdf_text_info = dict()
    pdf_img_info = dict()
    for pno_str, page_info in pdf_info.items():
        for block in page_info['blocks']:
            block_type = block['type']
            if block_type == 0:
                # The same text can be repeated inside one block; keep the first.
                seen_texts = set()
                for line in block['lines']:
                    for span in line['spans']:
                        bbox, text = span['bbox'], span['text'].strip()
                        if len(text) == 0 or text in seen_texts:
                            continue
                        seen_texts.add(text)
                        # The reported box height is unreliable.
                        if fix_bbox and bbox[-1] - bbox[1] < span['size']:
                            # Work on a copy: the caller's structure is left
                            # intact and tuple boxes are supported too.
                            bbox = list(bbox)
                            bbox[-1] = bbox[-1] + span['size']
                        pdf_text_info.setdefault(pno_str, list()).append([bbox, text])
            elif block_type == 1:
                pdf_img_info.setdefault(pno_str, list()).append((block['bbox'], '有'))
    return pdf_text_info, pdf_img_info
\ No newline at end of file
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!