88211099 by 乔峰昇

add youchu OCR

1 parent 64510cdb
......@@ -29,7 +29,6 @@ def bill_ocr(image):
# 提取民生银行信息
def extract_minsheng_info(ocr_results):
name_prefix = '客户姓名:'
account_prefix = '客户账号:'
results = []
......@@ -74,6 +73,7 @@ def extract_minsheng_info(ocr_results):
results.append([value[1], value[0]])
return results
# 提取工商银行信息
def extract_gongshang_info(ocr_results):
name_prefix = '户名:'
......@@ -120,6 +120,7 @@ def extract_gongshang_info(ocr_results):
results.append([value[1], value[0]])
return results
# 提取中国银行信息
def extract_zhongguo_info(ocr_results):
name_prefix = '客户姓名:'
......@@ -166,6 +167,7 @@ def extract_zhongguo_info(ocr_results):
results.append([value[1], value[0]])
return results
# 提取建设银行信息
def extract_jianshe_info(ocr_results):
name_prefixes = ['客户名称:', '户名:']
......@@ -218,6 +220,7 @@ def extract_jianshe_info(ocr_results):
break
return results
# 提取农业银行信息(比较复杂,目前训练的版式都支持)
def extract_nongye_info(ocr_results):
name_prefixes = ['客户名:', '户名:']
......@@ -321,6 +324,7 @@ def extract_nongye_info(ocr_results):
break
return results
# 提取银行流水信息总接口
def extract_bank_info(ocr_results):
results = []
......@@ -337,33 +341,89 @@ def extract_bank_info(ocr_results):
elif value[1].__contains__('中国银行'):
results = extract_zhongguo_info(ocr_results)
break
elif value[1].__contains__('中国邮政储蓄'):
results = extract_youchu_info(ocr_results)
if len(results) == 0:
results = extract_gongshang_info(ocr_results)
return results
if __name__ == '__main__':
def _youchu_right_neighbor(label, ocr_results):
    """Return the OCR entry on the same text line as *label*, nearest its right edge.

    An entry qualifies when its text differs from the label text and the
    vertical distance between its first corner (indices 0, 1) and the label's
    second corner (indices 2, 3) is less than half the label's height.
    Among qualifying entries, the one with the smallest horizontal distance
    wins; ties go to the first one encountered.

    NOTE(review): positions are assumed to be eight numbers
    [x0, y0, x1, y1, x2, y2, x3, y3] ordered top-left, top-right,
    bottom-right, bottom-left -- confirm against ``bill_ocr``.

    Returns:
        The matching ``[position, text]`` entry, or ``None`` if nothing
        lies on the label's line.
    """
    label_box, label_text = label[0], label[1]
    top_right_x, top_right_y = label_box[2], label_box[3]
    # Half of the label's right-edge height is the vertical tolerance.
    tolerance = abs(label_box[3] - label_box[5]) / 2
    candidates = [
        entry for entry in ocr_results.values()
        if entry[1] != label_text and abs(entry[0][1] - top_right_y) < tolerance
    ]
    return min(
        candidates,
        key=lambda entry: abs(entry[0][0] - top_right_x),
        default=None,
    )


def _youchu_match_prefixes(value, prefixes, ocr_results):
    """Build the result for *value* if its text contains one of *prefixes*.

    Only the first matching prefix is considered. When the text is longer
    than the prefix, the entry is reported as is. When the text is exactly
    the prefix (a bare label), it is merged with its right-hand neighbour.

    Returns:
        ``[text, position]`` for a match, otherwise ``None``.
    """
    box, text = value[0], value[1]
    for prefix in prefixes:
        if prefix not in text:
            continue
        if text != prefix:
            # Label and content were recognised as a single box.
            return [text, box]
        neighbor = _youchu_right_neighbor(value, ocr_results)
        if neighbor is None:
            # Nothing on the same line: report the bare label instead of
            # crashing while indexing a missing neighbour.
            return [text, box]
        neighbor_box = neighbor[0]
        # Left edge from the label, right edge from the neighbour.
        merged_box = [
            box[0], box[1],
            neighbor_box[2], neighbor_box[3],
            neighbor_box[4], neighbor_box[5],
            box[6], box[7],
        ]
        return [text + neighbor[1], merged_box]
    return None


def extract_youchu_info(ocr_results):
    """Extract account-holder name and account/card number boxes from OCR output.

    Args:
        ocr_results: mapping whose values are ``[position, text]`` entries.

    Returns:
        A list of ``[text, position]`` items, in the iteration order of
        ``ocr_results``. For each entry the name prefixes are checked before
        the account prefixes, and an entry matching both groups is reported
        once per group. A bare label with no entry on its line is reported
        on its own.
    """
    name_prefixes = ['户名:']
    account_prefixes = ['账号:', '卡号:']
    results = []
    for value in ocr_results.values():
        for prefixes in (name_prefixes, account_prefixes):
            found = _youchu_match_prefixes(value, prefixes, ocr_results)
            if found is not None:
                results.append(found)
    return results
# Batch evaluation: run OCR and bank-info extraction on every image in the
# validation folder, draw the extracted boxes, save the annotated image and
# print per-image timings.
bank = 'minsheng'
path = '/data/situ_invoice_bill_data/new_data/qfs_bank_bill_data/minsheng/authentic/images/val'
save_path = os.path.join('/data/situ_invoice_bill_data/new_data/results', bank)
if not os.path.exists(save_path):
    os.makedirs(save_path)

for file_name in tqdm.tqdm(os.listdir(path)):
    image = cv2.imread(os.path.join(path, file_name))
    # Debug alternative: load one fixed image instead of walking the folder.
    # image = cv2.imread('/data/situ_invoice_bill_data/new_data/results/nongye/6/_1597382769.6449914page_23_img_0.jpg')
    started = time.time()
    ocr_output = bill_ocr(image)
    ocr_done = time.time()
    extracted = extract_bank_info(ocr_output)
    extract_done = time.time()
    for item in extracted:
        box = item[1]
        # The rectangle spans position indices (0, 1) to (4, 5).
        cv2.rectangle(image, (box[0], box[1]), (box[4], box[5]), (0, 0, 255), 2)
    # Debug alternative: show the annotated image interactively.
    # cv2.imshow('img', image)
    # cv2.waitKey(0)
    cv2.imwrite(os.path.join(save_path, file_name), image)
    print('spend:{} ocr:{} extract:{}'.format(
        extract_done - started, ocr_done - started, extract_done - ocr_done))
#
\ No newline at end of file
if __name__ == '__main__':
    # Manual smoke test: OCR one sample statement image and print what the
    # postal-savings extractor finds in it.
    sample_image = cv2.imread('/home/situ/下载/邮储对账单/飞书20221020-155202.jpg')
    print(extract_youchu_info(bill_ocr(sample_image)))

    # Disabled batch driver, kept for reference: annotates every image of
    # one bank's validation set and prints timings.
    # path = '/data/situ_invoice_bill_data/new_data/qfs_bank_bill_data/minsheng/authentic/images/val'
    # save_path='/data/situ_invoice_bill_data/new_data/results'
    # bank='minsheng'
    # if not os.path.exists(os.path.join(save_path,bank)):
    #     os.makedirs(os.path.join(save_path,bank))
    # save_path=os.path.join(save_path,bank)
    # for j in tqdm.tqdm(os.listdir(path)):
    #     # if True:
    #     img=cv2.imread(os.path.join(path,j))
    #     # img = cv2.imread('/data/situ_invoice_bill_data/new_data/results/nongye/6/_1597382769.6449914page_23_img_0.jpg')
    #     st = time.time()
    #     ocr_result = bill_ocr(img)
    #     et1 = time.time()
    #     result = extract_bank_info(ocr_result)
    #     et2 = time.time()
    #     for i in range(len(result)):
    #         cv2.rectangle(img, (result[i][1][0], result[i][1][1]), (result[i][1][4], result[i][1][5]), (0, 0, 255), 2)
    #     # cv2.imshow('img',img)
    #     # cv2.waitKey(0)
    #     cv2.imwrite(os.path.join(save_path,j),img)
    #     print('spend:{} ocr:{} extract:{}'.format(et2 - st, et1 - st, et2 - et1))
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!