add youchu OCR
Showing
1 changed file
with
84 additions
and
24 deletions
... | @@ -29,7 +29,6 @@ def bill_ocr(image): | ... | @@ -29,7 +29,6 @@ def bill_ocr(image): |
29 | 29 | ||
30 | # 提取民生银行信息 | 30 | # 提取民生银行信息 |
31 | def extract_minsheng_info(ocr_results): | 31 | def extract_minsheng_info(ocr_results): |
32 | |||
33 | name_prefix = '客户姓名:' | 32 | name_prefix = '客户姓名:' |
34 | account_prefix = '客户账号:' | 33 | account_prefix = '客户账号:' |
35 | results = [] | 34 | results = [] |
... | @@ -74,6 +73,7 @@ def extract_minsheng_info(ocr_results): | ... | @@ -74,6 +73,7 @@ def extract_minsheng_info(ocr_results): |
74 | results.append([value[1], value[0]]) | 73 | results.append([value[1], value[0]]) |
75 | return results | 74 | return results |
76 | 75 | ||
76 | |||
77 | # 提取工商银行信息 | 77 | # 提取工商银行信息 |
78 | def extract_gongshang_info(ocr_results): | 78 | def extract_gongshang_info(ocr_results): |
79 | name_prefix = '户名:' | 79 | name_prefix = '户名:' |
... | @@ -120,6 +120,7 @@ def extract_gongshang_info(ocr_results): | ... | @@ -120,6 +120,7 @@ def extract_gongshang_info(ocr_results): |
120 | results.append([value[1], value[0]]) | 120 | results.append([value[1], value[0]]) |
121 | return results | 121 | return results |
122 | 122 | ||
123 | |||
123 | # 提取中国银行信息 | 124 | # 提取中国银行信息 |
124 | def extract_zhongguo_info(ocr_results): | 125 | def extract_zhongguo_info(ocr_results): |
125 | name_prefix = '客户姓名:' | 126 | name_prefix = '客户姓名:' |
... | @@ -166,6 +167,7 @@ def extract_zhongguo_info(ocr_results): | ... | @@ -166,6 +167,7 @@ def extract_zhongguo_info(ocr_results): |
166 | results.append([value[1], value[0]]) | 167 | results.append([value[1], value[0]]) |
167 | return results | 168 | return results |
168 | 169 | ||
170 | |||
169 | # 提取建设银行信息 | 171 | # 提取建设银行信息 |
170 | def extract_jianshe_info(ocr_results): | 172 | def extract_jianshe_info(ocr_results): |
171 | name_prefixes = ['客户名称:', '户名:'] | 173 | name_prefixes = ['客户名称:', '户名:'] |
... | @@ -218,6 +220,7 @@ def extract_jianshe_info(ocr_results): | ... | @@ -218,6 +220,7 @@ def extract_jianshe_info(ocr_results): |
218 | break | 220 | break |
219 | return results | 221 | return results |
220 | 222 | ||
223 | |||
221 | # 提取农业银行信息(比较复杂,目前训练的版式都支持) | 224 | # 提取农业银行信息(比较复杂,目前训练的版式都支持) |
222 | def extract_nongye_info(ocr_results): | 225 | def extract_nongye_info(ocr_results): |
223 | name_prefixes = ['客户名:', '户名:'] | 226 | name_prefixes = ['客户名:', '户名:'] |
... | @@ -321,6 +324,7 @@ def extract_nongye_info(ocr_results): | ... | @@ -321,6 +324,7 @@ def extract_nongye_info(ocr_results): |
321 | break | 324 | break |
322 | return results | 325 | return results |
323 | 326 | ||
327 | |||
324 | # 提取银行流水信息总接口 | 328 | # 提取银行流水信息总接口 |
325 | def extract_bank_info(ocr_results): | 329 | def extract_bank_info(ocr_results): |
326 | results = [] | 330 | results = [] |
... | @@ -337,33 +341,89 @@ def extract_bank_info(ocr_results): | ... | @@ -337,33 +341,89 @@ def extract_bank_info(ocr_results): |
337 | elif value[1].__contains__('中国银行'): | 341 | elif value[1].__contains__('中国银行'): |
338 | results = extract_zhongguo_info(ocr_results) | 342 | results = extract_zhongguo_info(ocr_results) |
339 | break | 343 | break |
344 | elif value[1].__contains__('中国邮政储蓄'): | ||
345 | results = extract_youchu_info(ocr_results) | ||
340 | if len(results) == 0: | 346 | if len(results) == 0: |
341 | results = extract_gongshang_info(ocr_results) | 347 | results = extract_gongshang_info(ocr_results) |
342 | 348 | ||
343 | return results | 349 | return results |
344 | 350 | ||
345 | 351 | ||
def _attach_neighbour_value(label, ocr_results):
    """Merge a bare label box with the closest text box on the same line.

    Args:
        label: One OCR entry ``[position, text]`` whose text is exactly a
            label prefix (nothing follows the colon).
        ocr_results: Mapping whose values are OCR entries of the same
            ``[position, text]`` shape.

    Returns:
        ``[text, position]``. When a neighbouring box is found, the text is
        the label concatenated with the neighbour's text and the position is
        stitched from both boxes. When no box qualifies, the label is
        returned on its own instead of raising ``IndexError``.
    """
    label_box, label_text = label[0], label[1]
    # Indices 2/3 are treated as the label's top-right corner (named so in
    # the original code). NOTE(review): the box looks like 4 clockwise (x, y)
    # corners starting at top-left -- confirm against bill_ocr's output.
    anchor_x, anchor_y = label_box[2], label_box[3]
    # A candidate is "on the same line" when its first y coordinate is within
    # half of the label's right-edge height of the anchor.
    tolerance = abs(label_box[3] - label_box[5]) / 2
    candidates = [
        entry for entry in ocr_results.values()
        if entry[1] != label_text and abs(entry[0][1] - anchor_y) < tolerance
    ]
    if not candidates:
        # Best effort: keep the label so callers still get a box to draw.
        return [label_text, label_box]
    # min() returns the first minimal element, matching the original
    # strict-less-than scan order.
    nearest = min(candidates, key=lambda entry: abs(entry[0][0] - anchor_x))
    near_box = nearest[0]
    merged_box = [
        label_box[0], label_box[1],
        near_box[2], near_box[3],
        near_box[4], near_box[5],
        label_box[6], label_box[7],
    ]
    return [label_text + nearest[1], merged_box]


def extract_youchu_info(ocr_results):
    """Extract account-holder name and account/card number boxes (youchu layout).

    Each OCR entry is checked against the name prefixes and then, separately,
    against the account prefixes; only the first matching prefix of each
    group is used. If the entry text contains more than the prefix, the entry
    is reported as is. If the text is exactly the prefix, the nearest box on
    the same line is appended to it.

    Args:
        ocr_results: Mapping whose values are ``[position, text]`` entries,
            where ``position`` is indexable with at least 8 numbers.

    Returns:
        List of ``[text, position]`` pairs, in iteration order of
        ``ocr_results``. Empty when nothing matches.
    """
    prefix_groups = (
        ['户名:'],
        ['账号:', '卡号:'],
    )
    results = []
    for value in ocr_results.values():
        text = value[1]
        for prefixes in prefix_groups:
            matched = next((prefix for prefix in prefixes if prefix in text), None)
            if matched is None:
                continue
            if text == matched:
                # Label and value were recognised as separate boxes.
                results.append(_attach_neighbour_value(value, ocr_results))
            else:
                # Label and value already share one box.
                results.append([text, value[0]])
    return results
402 | |||
347 | 403 | ||
348 | path = '/data/situ_invoice_bill_data/new_data/qfs_bank_bill_data/minsheng/authentic/images/val' | ||
349 | save_path='/data/situ_invoice_bill_data/new_data/results' | ||
350 | bank='minsheng' | ||
351 | if not os.path.exists(os.path.join(save_path,bank)): | ||
352 | os.makedirs(os.path.join(save_path,bank)) | ||
353 | save_path=os.path.join(save_path,bank) | ||
354 | for j in tqdm.tqdm(os.listdir(path)): | ||
355 | # if True: | ||
356 | img=cv2.imread(os.path.join(path,j)) | ||
357 | # img = cv2.imread('/data/situ_invoice_bill_data/new_data/results/nongye/6/_1597382769.6449914page_23_img_0.jpg') | ||
358 | st = time.time() | ||
359 | ocr_result = bill_ocr(img) | ||
360 | et1 = time.time() | ||
361 | result = extract_bank_info(ocr_result) | ||
362 | et2 = time.time() | ||
363 | for i in range(len(result)): | ||
364 | cv2.rectangle(img, (result[i][1][0], result[i][1][1]), (result[i][1][4], result[i][1][5]), (0, 0, 255), 2) | ||
365 | # cv2.imshow('img',img) | ||
366 | # cv2.waitKey(0) | ||
367 | cv2.imwrite(os.path.join(save_path,j),img) | ||
368 | print('spend:{} ocr:{} extract:{}'.format(et2 - st, et1 - st, et2 - et1)) | ||
369 | # | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
if __name__ == '__main__':
    # Manual smoke test: OCR one sample statement image and print the
    # extracted fields.
    sample_image = cv2.imread('/home/situ/下载/邮储对账单/飞书20221020-155202.jpg')
    sample_ocr = bill_ocr(sample_image)
    print(extract_youchu_info(sample_ocr))

    # Earlier batch-evaluation driver, kept disabled for reference.
    # path = '/data/situ_invoice_bill_data/new_data/qfs_bank_bill_data/minsheng/authentic/images/val'
    # save_path='/data/situ_invoice_bill_data/new_data/results'
    # bank='minsheng'
    # if not os.path.exists(os.path.join(save_path,bank)):
    #     os.makedirs(os.path.join(save_path,bank))
    # save_path=os.path.join(save_path,bank)
    # for j in tqdm.tqdm(os.listdir(path)):
    #     # if True:
    #     img=cv2.imread(os.path.join(path,j))
    #     # img = cv2.imread('/data/situ_invoice_bill_data/new_data/results/nongye/6/_1597382769.6449914page_23_img_0.jpg')
    #     st = time.time()
    #     ocr_result = bill_ocr(img)
    #     et1 = time.time()
    #     result = extract_bank_info(ocr_result)
    #     et2 = time.time()
    #     for i in range(len(result)):
    #         cv2.rectangle(img, (result[i][1][0], result[i][1][1]), (result[i][1][4], result[i][1][5]), (0, 0, 255), 2)
    #     # cv2.imshow('img',img)
    #     # cv2.waitKey(0)
    #     cv2.imwrite(os.path.join(save_path,j),img)
    #     print('spend:{} ocr:{} extract:{}'.format(et2 - st, et1 - st, et2 - et1))
-
Please register or sign in to post a comment