Add Youchu (邮储, China Postal Savings Bank) statement OCR extraction
Showing
1 changed file
with
84 additions
and
24 deletions
| ... | @@ -29,7 +29,6 @@ def bill_ocr(image): | ... | @@ -29,7 +29,6 @@ def bill_ocr(image): |
| 29 | 29 | ||
| 30 | # 提取民生银行信息 | 30 | # 提取民生银行信息 |
| 31 | def extract_minsheng_info(ocr_results): | 31 | def extract_minsheng_info(ocr_results): |
| 32 | |||
| 33 | name_prefix = '客户姓名:' | 32 | name_prefix = '客户姓名:' |
| 34 | account_prefix = '客户账号:' | 33 | account_prefix = '客户账号:' |
| 35 | results = [] | 34 | results = [] |
| ... | @@ -74,6 +73,7 @@ def extract_minsheng_info(ocr_results): | ... | @@ -74,6 +73,7 @@ def extract_minsheng_info(ocr_results): |
| 74 | results.append([value[1], value[0]]) | 73 | results.append([value[1], value[0]]) |
| 75 | return results | 74 | return results |
| 76 | 75 | ||
| 76 | |||
| 77 | # 提取工商银行信息 | 77 | # 提取工商银行信息 |
| 78 | def extract_gongshang_info(ocr_results): | 78 | def extract_gongshang_info(ocr_results): |
| 79 | name_prefix = '户名:' | 79 | name_prefix = '户名:' |
| ... | @@ -120,6 +120,7 @@ def extract_gongshang_info(ocr_results): | ... | @@ -120,6 +120,7 @@ def extract_gongshang_info(ocr_results): |
| 120 | results.append([value[1], value[0]]) | 120 | results.append([value[1], value[0]]) |
| 121 | return results | 121 | return results |
| 122 | 122 | ||
| 123 | |||
| 123 | # 提取中国银行信息 | 124 | # 提取中国银行信息 |
| 124 | def extract_zhongguo_info(ocr_results): | 125 | def extract_zhongguo_info(ocr_results): |
| 125 | name_prefix = '客户姓名:' | 126 | name_prefix = '客户姓名:' |
| ... | @@ -166,6 +167,7 @@ def extract_zhongguo_info(ocr_results): | ... | @@ -166,6 +167,7 @@ def extract_zhongguo_info(ocr_results): |
| 166 | results.append([value[1], value[0]]) | 167 | results.append([value[1], value[0]]) |
| 167 | return results | 168 | return results |
| 168 | 169 | ||
| 170 | |||
| 169 | # 提取建设银行信息 | 171 | # 提取建设银行信息 |
| 170 | def extract_jianshe_info(ocr_results): | 172 | def extract_jianshe_info(ocr_results): |
| 171 | name_prefixes = ['客户名称:', '户名:'] | 173 | name_prefixes = ['客户名称:', '户名:'] |
| ... | @@ -218,6 +220,7 @@ def extract_jianshe_info(ocr_results): | ... | @@ -218,6 +220,7 @@ def extract_jianshe_info(ocr_results): |
| 218 | break | 220 | break |
| 219 | return results | 221 | return results |
| 220 | 222 | ||
| 223 | |||
| 221 | # 提取农业银行信息(比较复杂,目前训练的版式都支持) | 224 | # 提取农业银行信息(比较复杂,目前训练的版式都支持) |
| 222 | def extract_nongye_info(ocr_results): | 225 | def extract_nongye_info(ocr_results): |
| 223 | name_prefixes = ['客户名:', '户名:'] | 226 | name_prefixes = ['客户名:', '户名:'] |
| ... | @@ -321,6 +324,7 @@ def extract_nongye_info(ocr_results): | ... | @@ -321,6 +324,7 @@ def extract_nongye_info(ocr_results): |
| 321 | break | 324 | break |
| 322 | return results | 325 | return results |
| 323 | 326 | ||
| 327 | |||
| 324 | # 提取银行流水信息总接口 | 328 | # 提取银行流水信息总接口 |
| 325 | def extract_bank_info(ocr_results): | 329 | def extract_bank_info(ocr_results): |
| 326 | results = [] | 330 | results = [] |
| ... | @@ -337,33 +341,89 @@ def extract_bank_info(ocr_results): | ... | @@ -337,33 +341,89 @@ def extract_bank_info(ocr_results): |
| 337 | elif value[1].__contains__('中国银行'): | 341 | elif value[1].__contains__('中国银行'): |
| 338 | results = extract_zhongguo_info(ocr_results) | 342 | results = extract_zhongguo_info(ocr_results) |
| 339 | break | 343 | break |
| 344 | elif value[1].__contains__('中国邮政储蓄'): | ||
| 345 | results = extract_youchu_info(ocr_results) | ||
| 340 | if len(results) == 0: | 346 | if len(results) == 0: |
| 341 | results = extract_gongshang_info(ocr_results) | 347 | results = extract_gongshang_info(ocr_results) |
| 342 | 348 | ||
| 343 | return results | 349 | return results |
| 344 | 350 | ||
| 345 | 351 | ||
def _closest_same_line_box(label, prefix, ocr_results):
    """Find the OCR entry that sits on the same text line as a bare label.

    An entry qualifies when its text differs from ``prefix`` and the vertical
    offset between its ``box[1]`` and the label's ``box[3]`` is smaller than
    half of ``abs(box[3] - box[5])`` of the label. Among qualifying entries,
    the one whose ``box[0]`` is horizontally closest to the label's ``box[2]``
    wins; on a tie the entry encountered first is kept.

    Args:
        label: OCR entry ``[box, text]`` whose text equals ``prefix``.
        prefix: The label text; entries with exactly this text are ignored.
        ocr_results: Mapping whose values are ``[box, text]`` entries.

    Returns:
        The best matching entry, or ``None`` when nothing qualifies.
    """
    label_box = label[0]
    # The original code names box[2]/box[3] "top_right_x"/"top_right_y".
    # NOTE(review): presumably boxes are 8 numbers (4 corners, clockwise from
    # top-left) -- confirm against what bill_ocr actually returns.
    anchor_x, anchor_y = label_box[2], label_box[3]
    tolerance = abs(label_box[3] - label_box[5]) / 2
    candidates = [
        entry for entry in ocr_results.values()
        if entry[1] != prefix and abs(entry[0][1] - anchor_y) < tolerance
    ]
    if not candidates:
        return None
    # min() returns the first minimal element, matching the original
    # "strictly smaller distance replaces the current best" scan.
    return min(candidates, key=lambda entry: abs(entry[0][0] - anchor_x))


def _extract_prefixed_field(value, prefixes, ocr_results):
    """Build one ``[text, box]`` result for ``value`` if it carries a prefix.

    Only the first prefix in ``prefixes`` that occurs in the entry's text is
    considered. If the text is longer than the prefix, the entry is returned
    as-is. If the text is exactly the prefix, the entry is merged with the
    closest entry on the same line: texts are concatenated and the box takes
    indices 0, 1, 6, 7 from the label and 2-5 from the neighbour.

    Returns:
        ``[text, box]`` or ``None`` when no prefix occurs in the text.
    """
    box, text = value[0], value[1]
    for prefix in prefixes:
        if prefix not in text:
            continue
        if prefix != text:
            # Label and content were recognised as a single box.
            return [text, box]
        neighbour = _closest_same_line_box(value, prefix, ocr_results)
        if neighbour is None:
            # Previously this path raised IndexError (indexing an empty
            # list); fall back to the bare label with its own box instead.
            return [text, box]
        near_box = neighbour[0]
        merged_box = [box[0], box[1], near_box[2], near_box[3],
                      near_box[4], near_box[5], box[6], box[7]]
        return [text + neighbour[1], merged_box]
    return None


# Extract account-holder name and account/card number from a
# China Postal Savings Bank ("youchu") statement.
def extract_youchu_info(ocr_results):
    """Collect the name and account/card fields from OCR output.

    Args:
        ocr_results: Mapping whose values are ``[box, text]`` entries, where
            ``box`` is indexable up to index 7 and ``text`` is a string.

    Returns:
        A list of ``[text, box]`` pairs in the iteration order of
        ``ocr_results``. For each entry the name prefixes are checked before
        the account prefixes, so an entry containing both kinds of prefix
        contributes two results. An empty mapping yields ``[]``.
    """
    name_prefixes = ['户名:']
    account_prefixes = ['账号:', '卡号:']
    results = []
    for value in ocr_results.values():
        for prefixes in (name_prefixes, account_prefixes):
            field = _extract_prefixed_field(value, prefixes, ocr_results)
            if field is not None:
                results.append(field)
    return results
| 402 | |||
| 347 | 403 | ||
| 348 | path = '/data/situ_invoice_bill_data/new_data/qfs_bank_bill_data/minsheng/authentic/images/val' | ||
| 349 | save_path='/data/situ_invoice_bill_data/new_data/results' | ||
| 350 | bank='minsheng' | ||
| 351 | if not os.path.exists(os.path.join(save_path,bank)): | ||
| 352 | os.makedirs(os.path.join(save_path,bank)) | ||
| 353 | save_path=os.path.join(save_path,bank) | ||
| 354 | for j in tqdm.tqdm(os.listdir(path)): | ||
| 355 | # if True: | ||
| 356 | img=cv2.imread(os.path.join(path,j)) | ||
| 357 | # img = cv2.imread('/data/situ_invoice_bill_data/new_data/results/nongye/6/_1597382769.6449914page_23_img_0.jpg') | ||
| 358 | st = time.time() | ||
| 359 | ocr_result = bill_ocr(img) | ||
| 360 | et1 = time.time() | ||
| 361 | result = extract_bank_info(ocr_result) | ||
| 362 | et2 = time.time() | ||
| 363 | for i in range(len(result)): | ||
| 364 | cv2.rectangle(img, (result[i][1][0], result[i][1][1]), (result[i][1][4], result[i][1][5]), (0, 0, 255), 2) | ||
| 365 | # cv2.imshow('img',img) | ||
| 366 | # cv2.waitKey(0) | ||
| 367 | cv2.imwrite(os.path.join(save_path,j),img) | ||
| 368 | print('spend:{} ocr:{} extract:{}'.format(et2 - st, et1 - st, et2 - et1)) | ||
| 369 | # | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
if __name__ == '__main__':
    # Manual smoke test: OCR a single statement image and print the fields
    # extracted by extract_youchu_info.
    import sys

    # An image path may be given as the first command-line argument; without
    # one the original developer sample path is used.
    image_path = sys.argv[1] if len(sys.argv) > 1 else '/home/situ/下载/邮储对账单/飞书20221020-155202.jpg'
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread does not raise on a missing/unreadable file; it returns
        # None, which would otherwise be passed straight into bill_ocr.
        raise SystemExit('cannot read image: {}'.format(image_path))
    ocr_results = bill_ocr(img)
    results = extract_youchu_info(ocr_results)
    print(results)
    # Previous batch harness (runs extract_bank_info over a directory and
    # saves images with the detected regions drawn), kept disabled:
    # path = '/data/situ_invoice_bill_data/new_data/qfs_bank_bill_data/minsheng/authentic/images/val'
    # save_path='/data/situ_invoice_bill_data/new_data/results'
    # bank='minsheng'
    # if not os.path.exists(os.path.join(save_path,bank)):
    #     os.makedirs(os.path.join(save_path,bank))
    # save_path=os.path.join(save_path,bank)
    # for j in tqdm.tqdm(os.listdir(path)):
    #     # if True:
    #     img=cv2.imread(os.path.join(path,j))
    #     # img = cv2.imread('/data/situ_invoice_bill_data/new_data/results/nongye/6/_1597382769.6449914page_23_img_0.jpg')
    #     st = time.time()
    #     ocr_result = bill_ocr(img)
    #     et1 = time.time()
    #     result = extract_bank_info(ocr_result)
    #     et2 = time.time()
    #     for i in range(len(result)):
    #         cv2.rectangle(img, (result[i][1][0], result[i][1][1]), (result[i][1][4], result[i][1][5]), (0, 0, 255), 2)
    #     # cv2.imshow('img',img)
    #     # cv2.waitKey(0)
    #     cv2.imwrite(os.path.join(save_path,j),img)
    #     print('spend:{} ocr:{} extract:{}'.format(et2 - st, et1 - st, et2 - et1))
-
Please register or sign in to post a comment