88211099 by 乔峰昇

Add Youchu (China Postal Savings Bank, 中国邮政储蓄) OCR info extraction

1 parent 64510cdb
...@@ -29,7 +29,6 @@ def bill_ocr(image): ...@@ -29,7 +29,6 @@ def bill_ocr(image):
29 29
30 # 提取民生银行信息 30 # 提取民生银行信息
31 def extract_minsheng_info(ocr_results): 31 def extract_minsheng_info(ocr_results):
32
33 name_prefix = '客户姓名:' 32 name_prefix = '客户姓名:'
34 account_prefix = '客户账号:' 33 account_prefix = '客户账号:'
35 results = [] 34 results = []
...@@ -74,6 +73,7 @@ def extract_minsheng_info(ocr_results): ...@@ -74,6 +73,7 @@ def extract_minsheng_info(ocr_results):
74 results.append([value[1], value[0]]) 73 results.append([value[1], value[0]])
75 return results 74 return results
76 75
76
77 # 提取工商银行信息 77 # 提取工商银行信息
78 def extract_gongshang_info(ocr_results): 78 def extract_gongshang_info(ocr_results):
79 name_prefix = '户名:' 79 name_prefix = '户名:'
...@@ -120,6 +120,7 @@ def extract_gongshang_info(ocr_results): ...@@ -120,6 +120,7 @@ def extract_gongshang_info(ocr_results):
120 results.append([value[1], value[0]]) 120 results.append([value[1], value[0]])
121 return results 121 return results
122 122
123
123 # 提取中国银行信息 124 # 提取中国银行信息
124 def extract_zhongguo_info(ocr_results): 125 def extract_zhongguo_info(ocr_results):
125 name_prefix = '客户姓名:' 126 name_prefix = '客户姓名:'
...@@ -166,6 +167,7 @@ def extract_zhongguo_info(ocr_results): ...@@ -166,6 +167,7 @@ def extract_zhongguo_info(ocr_results):
166 results.append([value[1], value[0]]) 167 results.append([value[1], value[0]])
167 return results 168 return results
168 169
170
169 # 提取建设银行信息 171 # 提取建设银行信息
170 def extract_jianshe_info(ocr_results): 172 def extract_jianshe_info(ocr_results):
171 name_prefixes = ['客户名称:', '户名:'] 173 name_prefixes = ['客户名称:', '户名:']
...@@ -218,6 +220,7 @@ def extract_jianshe_info(ocr_results): ...@@ -218,6 +220,7 @@ def extract_jianshe_info(ocr_results):
218 break 220 break
219 return results 221 return results
220 222
223
221 # 提取农业银行信息(比较复杂,目前训练的版式都支持) 224 # 提取农业银行信息(比较复杂,目前训练的版式都支持)
222 def extract_nongye_info(ocr_results): 225 def extract_nongye_info(ocr_results):
223 name_prefixes = ['客户名:', '户名:'] 226 name_prefixes = ['客户名:', '户名:']
...@@ -321,6 +324,7 @@ def extract_nongye_info(ocr_results): ...@@ -321,6 +324,7 @@ def extract_nongye_info(ocr_results):
321 break 324 break
322 return results 325 return results
323 326
327
324 # 提取银行流水信息总接口 328 # 提取银行流水信息总接口
325 def extract_bank_info(ocr_results): 329 def extract_bank_info(ocr_results):
326 results = [] 330 results = []
...@@ -337,33 +341,89 @@ def extract_bank_info(ocr_results): ...@@ -337,33 +341,89 @@ def extract_bank_info(ocr_results):
337 elif value[1].__contains__('中国银行'): 341 elif value[1].__contains__('中国银行'):
338 results = extract_zhongguo_info(ocr_results) 342 results = extract_zhongguo_info(ocr_results)
339 break 343 break
344 elif value[1].__contains__('中国邮政储蓄'):
345 results = extract_youchu_info(ocr_results)
340 if len(results) == 0: 346 if len(results) == 0:
341 results = extract_gongshang_info(ocr_results) 347 results = extract_gongshang_info(ocr_results)
342 348
343 return results 349 return results
344 350
345 351
def _find_row_neighbor(ocr_results, label_box, label_text, max_gap=999999):
    """Return the OCR entry sitting on the label's row, closest to its right edge.

    A candidate qualifies when its first y coordinate (box[1]) is within half
    the label's height of the label's top-right y, and its first x coordinate
    (box[0]) is closer than ``max_gap`` to the label's top-right x.  Entries
    whose text equals ``label_text`` are ignored.  Returns ``None`` when no
    entry qualifies.
    """
    anchor_x, anchor_y = label_box[2], label_box[3]
    # Half of the label's height (|y of point 2 - y of point 3|).
    row_tolerance = abs(label_box[3] - label_box[5]) / 2
    best, best_gap = None, max_gap
    for candidate in ocr_results.values():
        if candidate[1] == label_text:
            continue
        box = candidate[0]
        gap = abs(box[0] - anchor_x)
        if abs(box[1] - anchor_y) < row_tolerance and gap < best_gap:
            best, best_gap = candidate, gap
    return best


def extract_youchu_info(ocr_results):
    """Extract account-holder name and account/card number fields (Youchu layout).

    Args:
        ocr_results: mapping whose values are ``[box, text]`` pairs; ``box`` is
            indexed 0..7 (presumably x1, y1, ..., x4, y4 starting at the
            top-left corner -- TODO confirm against ``bill_ocr``).

    Returns:
        A list of ``[text, box]`` entries.  When an OCR entry contains a label
        together with more text, it is returned unchanged.  When an entry is
        exactly a label, it is merged with the nearest entry on the same row:
        the texts are concatenated and the box takes points 1 and 4 from the
        label and points 2 and 3 from the neighbor.  A bare label with no
        neighbor on its row is skipped (previously this raised IndexError).
    """
    name_prefixes = ['户名:']
    account_prefixes = ['账号:', '卡号:']
    results = []
    for value in ocr_results.values():
        box, text = value[0], value[1]
        # Name labels are checked before account labels for every entry, so
        # the output order matches the original two consecutive loops.
        for prefixes in (name_prefixes, account_prefixes):
            # Only the first matching label of each group is considered.
            prefix = next((p for p in prefixes if p in text), None)
            if prefix is None:
                continue
            if text != prefix:
                # Label and value were recognised as a single OCR entry.
                results.append([text, box])
                continue
            neighbor = _find_row_neighbor(ocr_results, box, prefix)
            if neighbor is None:
                # Nothing to merge with; do not emit a value-less label.
                continue
            neighbor_box = neighbor[0]
            merged_box = [
                box[0], box[1],
                neighbor_box[2], neighbor_box[3],
                neighbor_box[4], neighbor_box[5],
                box[6], box[7],
            ]
            results.append([text + neighbor[1], merged_box])
    return results
402
347 403
348 path = '/data/situ_invoice_bill_data/new_data/qfs_bank_bill_data/minsheng/authentic/images/val'
349 save_path='/data/situ_invoice_bill_data/new_data/results'
350 bank='minsheng'
351 if not os.path.exists(os.path.join(save_path,bank)):
352 os.makedirs(os.path.join(save_path,bank))
353 save_path=os.path.join(save_path,bank)
354 for j in tqdm.tqdm(os.listdir(path)):
355 # if True:
356 img=cv2.imread(os.path.join(path,j))
357 # img = cv2.imread('/data/situ_invoice_bill_data/new_data/results/nongye/6/_1597382769.6449914page_23_img_0.jpg')
358 st = time.time()
359 ocr_result = bill_ocr(img)
360 et1 = time.time()
361 result = extract_bank_info(ocr_result)
362 et2 = time.time()
363 for i in range(len(result)):
364 cv2.rectangle(img, (result[i][1][0], result[i][1][1]), (result[i][1][4], result[i][1][5]), (0, 0, 255), 2)
365 # cv2.imshow('img',img)
366 # cv2.waitKey(0)
367 cv2.imwrite(os.path.join(save_path,j),img)
368 print('spend:{} ocr:{} extract:{}'.format(et2 - st, et1 - st, et2 - et1))
369 #
...\ No newline at end of file ...\ No newline at end of file
if __name__ == '__main__':
    # Manual smoke test: run OCR on one Youchu statement image and print the
    # extracted name/account fields.
    image_path = '/home/situ/下载/邮储对账单/飞书20221020-155202.jpg'
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None
        # instead of raising; stop here with a clear message.
        raise SystemExit('cannot read image: {}'.format(image_path))
    ocr_results = bill_ocr(img)
    results = extract_youchu_info(ocr_results)
    print(results)

    # Previous batch evaluation script (disabled): runs OCR + extraction over
    # a validation folder, draws the extracted boxes and reports timings.
    # path = '/data/situ_invoice_bill_data/new_data/qfs_bank_bill_data/minsheng/authentic/images/val'
    # save_path='/data/situ_invoice_bill_data/new_data/results'
    # bank='minsheng'
    # if not os.path.exists(os.path.join(save_path,bank)):
    #     os.makedirs(os.path.join(save_path,bank))
    # save_path=os.path.join(save_path,bank)
    # for j in tqdm.tqdm(os.listdir(path)):
    #     # if True:
    #     img=cv2.imread(os.path.join(path,j))
    #     # img = cv2.imread('/data/situ_invoice_bill_data/new_data/results/nongye/6/_1597382769.6449914page_23_img_0.jpg')
    #     st = time.time()
    #     ocr_result = bill_ocr(img)
    #     et1 = time.time()
    #     result = extract_bank_info(ocr_result)
    #     et2 = time.time()
    #     for i in range(len(result)):
    #         cv2.rectangle(img, (result[i][1][0], result[i][1][1]), (result[i][1][4], result[i][1][5]), (0, 0, 255), 2)
    #     # cv2.imshow('img',img)
    #     # cv2.waitKey(0)
    #     cv2.imwrite(os.path.join(save_path,j),img)
    #     print('spend:{} ocr:{} extract:{}'.format(et2 - st, et1 - st, et2 - et1))
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!