d9b0ae8c by 周伟奇

add folder ocr process

1 parent 4076848e
......@@ -34,5 +34,4 @@ ocr/*
# 脚本
src/*.sh
test*
folder_ocr_process.py
\ No newline at end of file
test*
\ No newline at end of file
......
......@@ -83,6 +83,7 @@ RES_SHEET_HEADER = ('页码', '图片序号', '检测图片序号', '结果')
RES_SUCCESS = '识别成功'
RES_SUCCESS_OTHER = '识别成功(其他类)'
RES_SUCCESS_EMPTY = '识别成功(空数据)'
RES_FAILED = '识别失败'
RES_FAILED_1 = '识别失败(阶段1)'
RES_FAILED_2 = '识别失败(阶段2)'
RES_FAILED_3 = '识别失败(阶段1数据格式错误)'
......
import os
import re
import time
import json
import shutil
import base64
import signal
import asyncio
import aiohttp
import difflib
import requests
import traceback
from collections import Counter
from datetime import datetime, date
from django.utils import timezone
from django.core.management import BaseCommand
from multiprocessing import Process, Queue, Manager, Lock
from settings import conf
from common.mixins import LoggerMixin
from common.tools.file_tools import write_zip_file
from common.tools.pdf_to_img import PDFHandler
from apps.doc import consts
from apps.doc.ocr.edms import EDMS, rh
from apps.doc.named_enum import KeywordsType
from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception
from apps.doc.ocr.wb import BSWorkbook, Workbook
from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
class Command(BaseCommand, LoggerMixin):
    """Management command that OCRs files dropped into configured input folders.

    ``handle`` starts one worker process per configured input directory. Each
    worker repeatedly lists its directory, sends every file (PDF pages or a
    single image) to the OCR service, writes the recognised licence data to an
    Excel workbook and moves the processed file into a sibling ``Output`` tree.
    """

    def __init__(self):
        super().__init__()
        self.log_base = '[folder ocr process]'
        # Switch that keeps the worker loops running
        self.switch = True
        # Seconds to sleep when a pass over the input folder made no progress
        self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
        # Input folders; consumed in handle() as a mapping whose keys start
        # with the integer classify id followed by '_'
        self.input_dirs = conf.get_namespace('INPUT_DIR_')
        # OCR service endpoint
        self.ocr_url = conf.OCR_URL_FOLDER
        # Graceful-exit signal: 15 (SIGTERM)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, sig, frame):
        """SIGTERM handler: ask the processing loop to stop after the current file."""
        self.switch = False  # stop processing files

    def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino, part_idx):
        """Record the OCR outcome of one detected part of an image.

        Args:
            ocr_data: dict for one detected part; its ``'data'`` entry is the
                list of recognised licence dicts.
            license_summary: dict ``classify -> list`` that is extended in place.
            classify: key under which the licence data is stored.
            res_list: list of ``(pno, ino, part_idx, result)`` tuples, appended in place.
            pno: one-based page number.
            ino: one-based image number within the page.
            part_idx: one-based index of the detected part within the image.
        """
        # Category field inside the licence data: '0' ID card, '1' residence permit
        license_data = ocr_data.get('data', [])
        if not license_data:
            res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY))
            return
        res_list.append((pno, ino, part_idx, consts.RES_SUCCESS))
        license_summary.setdefault(classify, []).extend(license_data)

    @staticmethod
    def parse_img_path(img_path):
        """Split an image path into ``(image name, page number, image number)``.

        Images extracted from a PDF are named
        ``'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)`` with zero-based
        indices; the returned numbers are one-based. Any other name yields
        ``(name, 1, 1)``.
        """
        img_name, _ = os.path.splitext(os.path.basename(img_path))
        # Capture the digits directly instead of splitting on '_', so a name
        # such as 'page_1_img_2x' no longer makes int() raise ValueError.
        matched = re.match(r'page_(\d+)_img_(\d+)', img_name)
        if matched is None:
            return img_name, 1, 1
        return img_name, int(matched.group(1)) + 1, int(matched.group(2)) + 1

    @staticmethod
    def get_path(name, img_output_dir, wb_output_dir):
        """Return ``(img_save_path, excel_path)`` for an input file called ``name``.

        Both names are prefixed with the current Unix timestamp (whole
        seconds); the workbook name replaces the file extension with ``.xlsx``.
        """
        time_stamp = int(time.time())
        new_name = '{0}_{1}'.format(time_stamp, name)
        img_save_path = os.path.join(img_output_dir, new_name)
        excel_name = '{0}.xlsx'.format(os.path.splitext(new_name)[0])
        excel_path = os.path.join(wb_output_dir, excel_name)
        return img_save_path, excel_path

    def res_process(self, all_res, classify, excel_path):
        """Build the workbook for one input file and save it to ``excel_path``.

        Args:
            all_res: dict ``image path -> OCR response`` (decoded JSON, or
                ``None`` when OCR failed). Nothing is written when it is empty.
            classify: key under which recognised licences are grouped.
            excel_path: destination of the workbook.

        Any exception raised while building or saving is logged, not re-raised.
        """
        if not all_res:
            return
        try:
            license_summary = {}
            # NOTE(review): res_list is filled but never handed to the
            # workbook here — confirm whether a result sheet was intended.
            res_list = []
            for img_path, ocr_res in all_res.items():
                _, pno, ino = self.parse_img_path(img_path)
                if not isinstance(ocr_res, dict) or ocr_res.get('code') != 1:
                    res_list.append((pno, ino, 1, consts.RES_FAILED))
                    continue
                data_list = ocr_res.get('data', [])
                if not isinstance(data_list, list):
                    res_list.append((pno, ino, 1, consts.RES_FAILED_3))
                    continue
                for part_idx, ocr_data in enumerate(data_list, start=1):
                    self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino, part_idx)
            wb = BSWorkbook(set(), set(), set())
            wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0])
            wb.save(excel_path)
        except Exception:
            self.cronjob_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
                self.log_base, excel_path, traceback.format_exc()))

    def ocr_process(self, img_path, classify):
        """Send one image to the OCR service.

        The image is posted base64-encoded together with ``classify``; the
        request is attempted up to ``consts.RETRY_TIMES`` times.

        Returns:
            The decoded JSON response, or ``None`` when the image does not
            exist or every attempt failed.
        """
        if not os.path.exists(img_path):
            self.cronjob_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
            return None
        # TODO: validate the image
        with open(img_path, 'rb') as f:
            # base64 text of the raw image bytes
            file_data = base64.b64encode(f.read()).decode()
        json_data = {
            "file": file_data,
            "classify": classify
        }
        for times in range(consts.RETRY_TIMES):
            try:
                start_time = time.time()
                # NOTE(review): no timeout is passed, so a hung OCR service
                # blocks this worker — confirm whether that is intended.
                ocr_response = requests.post(self.ocr_url, json=json_data)
                if ocr_response.status_code != 200:
                    raise OCR1Exception('{0} ocr status code: {1}'.format(
                        self.log_base, ocr_response.status_code))
                # Decode inside the try so a malformed body counts as a failed
                # attempt instead of crashing the worker.
                ocr_res = ocr_response.json()
            except Exception:
                self.cronjob_log.warn('{0} [ocr failed] [times={1}] [img_path={2}] [error={3}]'.format(
                    self.log_base, times, img_path, traceback.format_exc()))
            else:
                speed_time = int(time.time() - start_time)
                self.cronjob_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format(
                    self.log_base, img_path, ocr_res, speed_time))
                return ocr_res
        return None

    def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir):
        """Extract the images of a PDF, OCR each one, write the workbook and archive the PDF.

        When image extraction fails the error is logged and the PDF is left in
        place. Nothing happens when ``path`` does not exist.
        """
        if not os.path.exists(path):
            return
        try:
            img_save_path, excel_path = self.get_path(name, img_output_dir, wb_output_dir)
            self.cronjob_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
            pdf_handler = PDFHandler(path, img_save_path)
            pdf_handler.extract_image()
            self.cronjob_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
        except Exception:
            self.cronjob_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
                self.log_base, path, traceback.format_exc()))
            return
        all_res = {}
        for img_path in pdf_handler.img_path_list:
            all_res[img_path] = self.ocr_process(img_path, classify)
        self.res_process(all_res, classify, excel_path)
        # shutil.move() into a directory raises when a file with the same name
        # is already there; fall back to the timestamped name in that case.
        pdf_save_path = os.path.join(pdf_output_dir, name)
        if os.path.exists(pdf_save_path):
            pdf_save_path = os.path.join(pdf_output_dir, os.path.basename(img_save_path))
        shutil.move(path, pdf_save_path)

    def img_process(self, name, path, classify, wb_output_dir, img_output_dir):
        """OCR a single image file, write the workbook and move the image to the output folder."""
        ocr_res = self.ocr_process(path, classify)
        all_res = {path: ocr_res}
        try:
            img_save_path, excel_path = self.get_path(name, img_output_dir, wb_output_dir)
        except Exception:
            self.cronjob_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
                self.log_base, path, traceback.format_exc()))
        else:
            self.res_process(all_res, classify, excel_path)
            shutil.move(path, img_save_path)

    def folder_process(self, input_dir, classify):
        """Worker loop: process every file that appears in ``input_dir`` until stopped.

        Output goes to ``Output/image``, ``Output/excel`` and ``Output/pdf``
        next to ``input_dir``. The loop sleeps ``self.sleep_time`` seconds
        whenever a pass removed no file from the input folder.
        """
        output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
        img_output_dir = os.path.join(output_dir, 'image')
        wb_output_dir = os.path.join(output_dir, 'excel')
        pdf_output_dir = os.path.join(output_dir, 'pdf')
        for directory in (output_dir, img_output_dir, wb_output_dir, pdf_output_dir):
            os.makedirs(directory, exist_ok=True)
        while self.switch:
            # 1. Fetch the pdf / image files from the input dir
            list_dir = os.listdir(input_dir)
            if not list_dir:
                self.cronjob_log.error('{0} [input dir empty] [input_dir={1}]'.format(self.log_base, input_dir))
                time.sleep(self.sleep_time)
                continue
            progressed = False
            for name in list_dir:
                if not self.switch:
                    # Honour a stop request between files, not only between passes
                    break
                path = os.path.join(input_dir, name)
                if not os.path.isfile(path):
                    continue
                self.cronjob_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
                try:
                    # Case-insensitive so 'SCAN.PDF' is not sent to OCR as an image
                    if name.lower().endswith('.pdf'):
                        self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir)
                    else:
                        self.img_process(name, path, classify, wb_output_dir, img_output_dir)
                except Exception:
                    # One bad file must not take the whole worker down
                    self.cronjob_log.error('{0} [file error] [path={1}] [error={2}]'.format(
                        self.log_base, path, traceback.format_exc()))
                self.cronjob_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
                if not os.path.exists(path):
                    progressed = True
            if not progressed:
                # Only sub-directories or files that could not be moved are
                # left: sleep instead of spinning over them.
                time.sleep(self.sleep_time)

    def handle(self, *args, **kwargs):
        """Start one worker process per configured input folder and wait for all of them."""
        process_list = []
        for classify_idx, input_dir in self.input_dirs.items():
            # The namespace key starts with the integer classify id
            classify = int(classify_idx.split('_')[0])
            process = Process(target=self.folder_process, args=(input_dir, classify))
            process_list.append(process)
        for p in process_list:
            p.start()
        for p in process_list:
            p.join()
        self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
......@@ -583,6 +583,29 @@ class BSWorkbook(Workbook):
count += 1
count_list.append((field_str, count))
def simple_license_rebuild(self, license_summary, document_scheme):
    """Write one sheet per licence type found in ``license_summary``.

    Iterates ``consts.LICENSE_ORDER``; licence types with no data are
    skipped. Each licence dict is written as ``(label, value...)`` rows
    followed by one blank row.

    Args:
        license_summary: dict ``classify -> list of licence dicts``. It is
            mutated: records filed under ``consts.IC_CLASSIFY`` whose
            ``'类别'`` is ``'1'`` are appended under ``consts.RP_CLASSIFY``
            instead of being written to the current sheet.
        document_scheme: compared with ``consts.DOC_SCHEME_LIST[1]`` to
            decide whether scheme-dependent types use ``consts.MVC_CLASSIFY_SE``.
    """
    for classify, order_info in consts.LICENSE_ORDER:
        _, name, field_order, side_diff, scheme_diff, _ = order_info
        records = license_summary.get(classify)
        if not records:
            continue
        sheet = self.create_sheet(name)
        if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]:
            classify = consts.MVC_CLASSIFY_SE
        for record in records:
            reroute = classify == consts.IC_CLASSIFY and record.get('类别') == '1'
            if reroute:
                # Hand the record over to the RP_CLASSIFY bucket
                license_summary.setdefault(consts.RP_CLASSIFY, []).append(record)
                continue
            if side_diff:
                # The field layout depends on whether `key` is present in the record
                key, order_with_key, order_without_key = consts.FIELD_ORDER_MAP.get(classify)
                field_order = order_with_key if key in record else order_without_key
            for search_field, write_field in field_order:
                value = record.get(search_field, '')
                if isinstance(value, list):
                    row = (write_field, *value)
                else:
                    row = (write_field, value)
                sheet.append(row)
            # Blank row between two licences
            sheet.append((None, ))
def res_sheet(self, res_list):
if res_list:
res_list.sort(key=lambda x: (x[0], x[1], x[2]))
......
......@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 2
SLEEP_SECOND_IMG_PUT = 2
SLEEP_SECOND_IMG_GET = 0.5
SLEEP_SECOND_TASK_GET = 2
SLEEP_SECOND_FOLDER = 2
IMG_QUEUE_SIZE = 500
EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx
......
......@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 10
SLEEP_SECOND_IMG_PUT = 2
SLEEP_SECOND_IMG_GET = 0.5
SLEEP_SECOND_TASK_GET = 2
SLEEP_SECOND_FOLDER = 2
IMG_QUEUE_SIZE = 500
EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx
......
......@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 2
SLEEP_SECOND_IMG_PUT = 2
SLEEP_SECOND_IMG_GET = 0.5
SLEEP_SECOND_TASK_GET = 2
SLEEP_SECOND_FOLDER = 2
IMG_QUEUE_SIZE = 500
EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!