add folder ocr process

周伟奇
Showing 7 changed files with 243 additions and 2 deletions
.gitignore
src/apps/doc/consts.py
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/ocr/wb.py
src/settings/conf/prd.ini
src/settings/conf/sit.ini
src/settings/conf/uat.ini
--- a/.gitignore
View file @d9b0ae8
+++ b/.gitignore
View file @d9b0ae8
@@ -34,5 +34,4 @@ ocr/*
 # 脚本
 src/*.sh

-test*
-folder_ocr_process.py
\ No newline at end of file
+test*
\ No newline at end of file
--- a/src/apps/doc/consts.py
View file @d9b0ae8
+++ b/src/apps/doc/consts.py
View file @d9b0ae8
@@ -83,6 +83,7 @@ RES_SHEET_HEADER = ('页码', '图片序号', '检测图片序号', '结果')
 RES_SUCCESS = '识别成功'
 RES_SUCCESS_OTHER = '识别成功（其他类）'
 RES_SUCCESS_EMPTY = '识别成功（空数据）'
+RES_FAILED = '识别失败'
 RES_FAILED_1 = '识别失败(阶段1)'
 RES_FAILED_2 = '识别失败(阶段2)'
 RES_FAILED_3 = '识别失败(阶段1数据格式错误)'
--- a/src/apps/doc/management/commands/folder_ocr_process.py 0 → 100644
View file @d9b0ae8
+++ b/src/apps/doc/management/commands/folder_ocr_process.py 0 → 100644
View file @d9b0ae8
+import os
+import re
+import time
+import json
+import shutil
+import base64
+import signal
+import asyncio
+import aiohttp
+import difflib
+import requests
+import traceback
+from collections import Counter
+from datetime import datetime, date
+from django.utils import timezone
+from django.core.management import BaseCommand
+from multiprocessing import Process, Queue, Manager, Lock
+
+from settings import conf
+from common.mixins import LoggerMixin
+from common.tools.file_tools import write_zip_file
+from common.tools.pdf_to_img import PDFHandler
+from apps.doc import consts
+from apps.doc.ocr.edms import EDMS, rh
+from apps.doc.named_enum import KeywordsType
+from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception
+from apps.doc.ocr.wb import BSWorkbook, Workbook
+from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
+
+
+class Command(BaseCommand, LoggerMixin):
+
+    def __init__(self):
+        """Load the folder-OCR settings from ``conf`` and register the SIGTERM handler."""
+        super().__init__()
+        self.log_base = '[folder ocr process]'
+        # File-processing switch: the polling loop in folder_process runs while this is True
+        self.switch = True
+        # Sleep time, passed to time.sleep() by folder_process
+        self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
+        # Input folders: the conf entries under the 'INPUT_DIR_' namespace
+        self.input_dirs = conf.get_namespace('INPUT_DIR_')
+        # OCR-related settings
+        self.ocr_url = conf.OCR_URL_FOLDER
+        # Graceful-exit signal: 15 (SIGTERM)
+        signal.signal(signal.SIGTERM, self.signal_handler)
+
+    def signal_handler(self, sig, frame):
+        """Signal callback: clear the processing switch so the polling loop can end."""
+        self.switch = False  # stop processing files
+
+    def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino, part_idx):
+        # 类别：'0'身份证， '1'居住证
+        license_data = ocr_data.get('data', [])
+        if not license_data:
+            res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY))
+            return
+        res_list.append((pno, ino, part_idx, consts.RES_SUCCESS))
+        license_summary.setdefault(classify, []).extend(license_data)
+
+    @staticmethod
+    def parse_img_path(img_path):
+        # 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
+        img_name, _ = os.path.splitext(os.path.basename(img_path))
+        if re.match(r'page_\d+_img_\d+', img_name):
+            part_list = img_name.split('_')
+            return img_name, int(part_list[1])+1, int(part_list[3])+1
+        else:
+            return img_name, 1, 1
+
+    @staticmethod
+    def get_path(name, img_output_dir, wb_output_dir):
+        time_stamp = int(time.time())
+        new_name = '{0}_{1}'.format(time_stamp, name)
+        img_save_path = os.path.join(img_output_dir, new_name)
+        excel_name = '{0}.xlsx'.format(os.path.splitext(new_name)[0])
+        excel_path = os.path.join(wb_output_dir, excel_name)
+        return img_save_path, excel_path
+
+    def res_process(self, all_res, classify, excel_path):
+        try:
+            license_summary = {}
+            res_list = []
+
+            if not all_res:
+                return
+            else:
+                for img_path, ocr_res in all_res.items():
+                    img_name, pno, ino = self.parse_img_path(img_path)
+                    part_idx = 1
+
+                    if isinstance(ocr_res, dict):
+                        if ocr_res.get('code') == 1:
+                            data_list = ocr_res.get('data', [])
+                            if isinstance(data_list, list):
+                                for part_idx, ocr_data in enumerate(data_list):
+                                    part_idx = part_idx + 1
+                                    self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino, part_idx)
+                            else:
+                                res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
+                        else:
+                            res_list.append((pno, ino, part_idx, consts.RES_FAILED))
+                    else:
+                        res_list.append((pno, ino, part_idx, consts.RES_FAILED))
+
+                wb = BSWorkbook(set(), set(), set())
+                wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0])
+                wb.save(excel_path)
+        except Exception as e:
+            self.cronjob_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
+                self.log_base, excel_path, traceback.format_exc()))
+
+    def ocr_process(self, img_path, classify):
+        if os.path.exists(img_path):
+            # TODO 图片验证
+            with open(img_path, 'rb') as f:
+                base64_data = base64.b64encode(f.read())
+                # 获取解码后的base64值
+                file_data = base64_data.decode()
+            json_data = {
+                "file": file_data,
+                "classify": classify
+            }
+
+            for times in range(consts.RETRY_TIMES):
+                try:
+                    start_time = time.time()
+                    ocr_response = requests.post(self.ocr_url, json=json_data)
+                    if ocr_response.status_code != 200:
+                        raise OCR1Exception('{0} ocr status code: {0}'.format(self.log_base, ocr_response.status_code))
+                except Exception as e:
+                    self.cronjob_log.warn('{0} [ocr failed] [times={1}] [img_path={2}] [error={3}]'.format(
+                        self.log_base, times, img_path, traceback.format_exc()))
+                else:
+                    ocr_res = ocr_response.json()
+                    end_time = time.time()
+                    speed_time = int(end_time - start_time)
+                    self.cronjob_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format(
+                        self.log_base, img_path, ocr_res, speed_time))
+                    return ocr_res
+            else:
+                self.cronjob_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
+
+    def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir):
+        if os.path.exists(path):
+            try:
+                img_save_path, excel_path= self.get_path(name, img_output_dir, wb_output_dir)
+                self.cronjob_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
+                pdf_handler = PDFHandler(path, img_save_path)
+                pdf_handler.extract_image()
+                self.cronjob_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
+            except Exception as e:
+                self.cronjob_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
+                    self.log_base, path, traceback.format_exc()))
+            else:
+                all_res = {}
+                for img_path in pdf_handler.img_path_list:
+                    ocr_res = self.ocr_process(img_path, classify)
+                    all_res[img_path] = ocr_res
+                self.res_process(all_res, classify, excel_path)
+
+            shutil.move(path, pdf_output_dir)
+
+    def img_process(self, name, path, classify, wb_output_dir, img_output_dir):
+        ocr_res = self.ocr_process(path, classify)
+        all_res = {path: ocr_res}
+
+        try:
+            img_save_path, excel_path = self.get_path(name, img_output_dir, wb_output_dir)
+        except Exception as e:
+            self.cronjob_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
+                self.log_base, path, traceback.format_exc()))
+        else:
+            self.res_process(all_res, classify, excel_path)
+            shutil.move(path, img_save_path)
+
+    def folder_process(self, input_dir, classify):
+        output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
+        img_output_dir = os.path.join(output_dir, 'image')
+        wb_output_dir = os.path.join(output_dir, 'excel')
+        pdf_output_dir = os.path.join(output_dir, 'pdf')
+        os.makedirs(output_dir, exist_ok=True)
+        os.makedirs(img_output_dir, exist_ok=True)
+        os.makedirs(wb_output_dir, exist_ok=True)
+        os.makedirs(pdf_output_dir, exist_ok=True)
+        while self.switch:
+            # 1. 从input dir获取pdf or image
+            list_dir = os.listdir(input_dir)
+            if not list_dir:
+                self.cronjob_log.error('{0} [input dir empty] [input_dir={1}]'.format(self.log_base, input_dir))
+                time.sleep(self.sleep_time)
+            for name in list_dir:
+                path = os.path.join(input_dir, name)
+                if os.path.isfile(path):
+                    self.cronjob_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
+                    if name.endswith('.pdf'):
+                        self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir)
+                    else:
+                        self.img_process(name, path, classify, wb_output_dir, img_output_dir)
+                    self.cronjob_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
+
+    def handle(self, *args, **kwargs):
+        process_list = []
+        for classify_idx, input_dir in self.input_dirs.items():
+            classify = int(classify_idx.split('_')[0])
+            process = Process(target=self.folder_process, args=(input_dir, classify))
+            process_list.append(process)
+
+        for p in process_list:
+            p.start()
+        for p in process_list:
+            p.join()
+
+        self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
--- a/src/apps/doc/ocr/wb.py
View file @d9b0ae8
+++ b/src/apps/doc/ocr/wb.py
View file @d9b0ae8
@@ -583,6 +583,29 @@ class BSWorkbook(Workbook):
                count += 1
            count_list.append((field_str, count))

+    def simple_license_rebuild(self, license_summary, document_scheme):
+        """Write one sheet per license class found in ``license_summary``.
+
+        :param license_summary: mapping of classify id -> list of license
+            dicts. NOTE: mutated in place - entries moved to
+            ``consts.RP_CLASSIFY`` are appended to it (see below).
+        :param document_scheme: compared against ``consts.DOC_SCHEME_LIST[1]``
+            to switch scheme-dependent classes to ``consts.MVC_CLASSIFY_SE``.
+        """
+        for classify, (_, name, field_order, side_diff, scheme_diff, _) in consts.LICENSE_ORDER:
+            license_list = license_summary.get(classify)
+            if not license_list:
+                continue
+            ws = self.create_sheet(name)
+            if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]:
+                classify = consts.MVC_CLASSIFY_SE
+            for license_dict in license_list:
+                # IC entries whose '类别' is '1' are handed over to the RP class instead of
+                # being written here. NOTE(review): this only produces output if RP_CLASSIFY
+                # comes after IC_CLASSIFY in consts.LICENSE_ORDER - confirm.
+                if classify == consts.IC_CLASSIFY and license_dict.get('类别') == '1':
+                    license_summary.setdefault(consts.RP_CLASSIFY, []).append(license_dict)
+                    continue
+                if side_diff:
+                    # Pick the field order depending on whether the marker key is present.
+                    # NOTE(review): assumes FIELD_ORDER_MAP has an entry for every
+                    # side_diff class; a missing one makes the unpacking raise TypeError.
+                    key, field_order_yes, field_order_no = consts.FIELD_ORDER_MAP.get(classify)
+                    field_order = field_order_yes if key in license_dict else field_order_no
+                for search_field, write_field in field_order:
+                    field_value = license_dict.get(search_field, '')
+                    # A list value is spread over the cells following the label
+                    if isinstance(field_value, list):
+                        ws.append((write_field, *field_value))
+                    else:
+                        ws.append((write_field, field_value))
+                # Row with a single empty cell between two licenses
+                ws.append((None, ))
+
    def res_sheet(self, res_list):
        if res_list:
            res_list.sort(key=lambda x: (x[0], x[1], x[2]))
--- a/src/settings/conf/prd.ini
View file @d9b0ae8
+++ b/src/settings/conf/prd.ini
View file @d9b0ae8
@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 2
 SLEEP_SECOND_IMG_PUT = 2
 SLEEP_SECOND_IMG_GET = 0.5
 SLEEP_SECOND_TASK_GET = 2
+SLEEP_SECOND_FOLDER = 2
+
 IMG_QUEUE_SIZE = 500

 EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx
--- a/src/settings/conf/sit.ini
View file @d9b0ae8
+++ b/src/settings/conf/sit.ini
View file @d9b0ae8
@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 10
 SLEEP_SECOND_IMG_PUT = 2
 SLEEP_SECOND_IMG_GET = 0.5
 SLEEP_SECOND_TASK_GET = 2
+SLEEP_SECOND_FOLDER = 2
+
 IMG_QUEUE_SIZE = 500

 EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx
--- a/src/settings/conf/uat.ini
View file @d9b0ae8
+++ b/src/settings/conf/uat.ini
View file @d9b0ae8
@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 2
 SLEEP_SECOND_IMG_PUT = 2
 SLEEP_SECOND_IMG_GET = 0.5
 SLEEP_SECOND_TASK_GET = 2
+SLEEP_SECOND_FOLDER = 2
+
 IMG_QUEUE_SIZE = 500

 EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx