周伟奇 / bmw-ocr
Commit 85a7f363, authored 2020-11-05 18:42:15 +0800 by 周伟奇
add new ocr flow
1 parent 0e0862dd
Showing 4 changed files with 651 additions and 4 deletions
.gitignore
src/apps/doc/consts.py
src/apps/doc/exceptions.py
src/apps/doc/management/commands/ocr_process.py
.gitignore
@@ -34,5 +34,3 @@ data/*
src/*.sh
test*
\ No newline at end of file
ocr_test.py
ocr_process.py
\ No newline at end of file
src/apps/doc/consts.py
@@ -36,7 +36,7 @@ APPLICATION_ID_META_FIELD_id = 1
DEALER_CODE_META_FIELD_id = 13
BUSINESS_TYPE_META_FIELD_id = 93
-RETRY_TIMES = 3
+RETRY_TIMES = 2
# ---------bank statement templates-----------------------------------------------------------------------------------
@@ -77,7 +77,8 @@ RES_SHEET_HEADER = ('页码', '序号', '结果')
RES_SUCCESS = '识别成功'
RES_SUCCESS_OTHER = '识别成功(其他类)'
RES_SUCCESS_EMPTY = '识别成功(空数据)'
RES_FAILED = '识别失败'
RES_FAILED_1 = '识别失败(阶段1)'
RES_FAILED_2 = '识别失败(阶段2)'
CARD_RATIO = 0.9
UNKNOWN_CARD = '未知卡号'
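Note on how two of these constants are used by the new command: RETRY_TIMES caps the for/else retry loops in ocr_process.py below, and CARD_RATIO is the difflib similarity threshold used when merging bank-statement summaries whose card numbers differ only by OCR noise. A minimal, self-contained sketch of that similarity check (the card numbers are made up for illustration):

import difflib

CARD_RATIO = 0.9  # same threshold as consts.CARD_RATIO

main_card = '6222021234567890'   # hypothetical card number
other_card = '6222021234567896'  # same card with one digit misread by OCR

# merge_card() in ocr_process.py treats two keys as the same account when their
# quick_ratio() similarity exceeds CARD_RATIO, and merges their summaries.
if difflib.SequenceMatcher(None, main_card, other_card).quick_ratio() > CARD_RATIO:
    print('merge these two card summaries')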
src/apps/doc/exceptions.py
class EDMSException(Exception):
    pass


class OCR1Exception(Exception):
    pass


class OCR2Exception(Exception):
    pass
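These exception types keep EDMS transport failures distinct from failures of the two OCR stages. In the command below they are paired with Python's for/else retry idiom; a minimal sketch of that pattern under assumed names (download_pdf stands in for the real edms.download call, RETRY_TIMES for consts.RETRY_TIMES):

class EDMSException(Exception):
    pass

RETRY_TIMES = 2  # mirrors consts.RETRY_TIMES after this commit

def download_with_retry(download_pdf, *args):
    # download_pdf: any callable that raises on failure (stand-in for the EDMS client)
    edms_exc = ''
    for times in range(RETRY_TIMES):
        try:
            download_pdf(*args)
        except Exception as e:
            edms_exc = str(e)  # remember the last error for the report
        else:
            break              # success: stop retrying
    else:
        # reached only if the loop never hit break, i.e. every attempt failed
        raise EDMSException(edms_exc)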
src/apps/doc/management/commands/ocr_process.py
new file, mode 100644
import os
import time
import json
import base64
import signal
import asyncio
import aiohttp
import difflib
import requests
from collections import Counter
from datetime import datetime, date
from django.core.management import BaseCommand
from multiprocessing import Process, Queue, Manager, Lock
from settings import conf
from common.mixins import LoggerMixin
from common.tools.file_tools import write_zip_file
from common.tools.pdf_to_img import PDFHandler
from apps.doc import consts
from apps.doc.ocr.edms import EDMS, rh
from apps.doc.named_enum import KeywordsType
from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception
from apps.doc.ocr.wb import BSWorkbook, Workbook
from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords


class Command(BaseCommand, LoggerMixin):

    def __init__(self):
        super().__init__()
        self.log_base = '[doc ocr process]'
        # switch controlling whether to keep processing files
        self.switch = True
        # sleep interval when the task queue is empty
        self.sleep_time = conf.SLEEP_SECOND
        # data directory
        self.data_dir = conf.DATA_DIR
        # OCR services
        self.ocr_1_urls = conf.get_namespace('OCR_URL_1_')
        self.ocr_url_2 = conf.OCR_URL_2
        self.ocr_url_3 = conf.BC_URL
        # EDMS web_service_api
        self.edms = EDMS()
        # graceful shutdown on signal 15 (SIGTERM)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, sig, frame):
        self.switch = False  # stop processing files

    @staticmethod
    def get_doc_object(task_str):
        business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
        doc_id = int(doc_id_str)
        doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
        # doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values(
        #     'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first()
        doc = doc_class.objects.filter(id=doc_id).first()
        return doc, business_type

    def get_doc_info(self):
        task_str, is_priority = rh.dequeue()
        if task_str is None:
            self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
            return None, None, None

        doc, business_type = self.get_doc_object(task_str)
        if doc is None:
            self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
                self.log_base, task_str, is_priority))
            return None, None, None
        elif doc.status != DocStatus.INIT.value:
            self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
                                  '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status))
            return None, None, None

        doc.status = DocStatus.PROCESSING.value
        # TODO update_time --> start_time
        doc.save()
        self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format(
            self.log_base, task_str, is_priority))
        return doc, business_type, task_str

    def pdf_download(self, doc, pdf_path):
        if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
            for times in range(consts.RETRY_TIMES):
                try:
                    self.edms.download(pdf_path, doc.metadata_version_id)
                except Exception as e:
                    self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] '
                                          '[error={3}]'.format(self.log_base, times, pdf_path, e))
                    edms_exc = str(e)
                else:
                    break
            else:
                raise EDMSException(edms_exc)
        self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path))

    def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino):
        sheets = ocr_data.get('data', [])
        if not sheets:
            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
            return
        confidence = ocr_data.get('confidence', 1)
        img_name = 'page_{0}_img_{1}'.format(pno, ino)
        cells_exists = False
        for i, sheet in enumerate(sheets):
            cells = sheet.get('cells')
            if not cells:
                continue
            cells_exists = True
            sheet_name = '{0}_{1}'.format(img_name, i)
            ws = wb.create_sheet(sheet_name)
            for cell in cells:
                c1 = cell.get('start_column')
                r1 = cell.get('start_row')
                words = cell.get('words')
                ws.cell(row=r1 + 1, column=c1 + 1, value=words)

            # summary: [account name(户名), card number(卡号), page(页码), receipt verification code(回单验证码),
            #           print time(打印时间), start date(起始时间), end date(终止时间)]
            summary = sheet.get('summary')
            card = summary[1]
            if card is None:
                classify_dict = unknown_summary.setdefault(classify, {})
                role = consts.UNKNOWN_ROLE if summary[0] is None else summary[0]
                role_dict = classify_dict.setdefault(role, {})
                role_dict['classify'] = classify
                role_dict['role'] = role
                role_dict.setdefault('sheet', []).append(sheet_name)
                role_dict.setdefault('confidence', []).append(confidence)
                code_list = role_dict.setdefault('code', [])
                pt_list = role_dict.setdefault('print_time', [])
                sd_list = role_dict.setdefault('start_date', [])
                ed_list = role_dict.setdefault('end_date', [])
                if summary[3] is not None:
                    code_list.append((summary[2], summary[3]))
                if summary[4] is not None:
                    pt_list.append(summary[4])
                if summary[5] is not None:
                    sd_list.append(summary[5])
                if summary[6] is not None:
                    ed_list.append(summary[6])
            else:
                card_dict = bs_summary.setdefault(card, {})
                card_dict['count'] = card_dict.get('count', 0) + 1
                card_dict.setdefault('classify', []).append(classify)
                card_dict.setdefault('confidence', []).append(confidence)
                card_dict.setdefault('sheet', []).append(sheet_name)
                role_list = card_dict.setdefault('role', [])
                role_set = card_dict.setdefault('role_set', set())
                code_list = card_dict.setdefault('code', [])
                pt_list = card_dict.setdefault('print_time', [])
                sd_list = card_dict.setdefault('start_date', [])
                ed_list = card_dict.setdefault('end_date', [])
                if summary[0] is not None:
                    role_list.append(summary[0])
                    role_set.add(summary[0])
                if summary[3] is not None:
                    code_list.append((summary[2], summary[3]))
                if summary[4] is not None:
                    pt_list.append(summary[4])
                if summary[5] is not None:
                    sd_list.append(summary[5])
                if summary[6] is not None:
                    ed_list.append(summary[6])

        if cells_exists:
            res_list.append((pno, ino, consts.RES_SUCCESS))
        else:
            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))

    def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino):
        # classes: '0' ID card, '1' residence permit
        license_data = ocr_data.get('data', [])
        if not license_data:
            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
            return
        res_list.append((pno, ino, consts.RES_SUCCESS))
        license_summary.setdefault(classify, []).extend(license_data)

    def license2_process(self, ocr_res_2, license_summary, pid, classify, res_list, pno, ino):
        if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
            res_list.append((pno, ino, consts.RES_SUCCESS))
            if pid == consts.BC_PID:
                # bank card
                # res_dict = {}
                # for en_key, chn_key in consts.BC_FIELD:
                #     res_dict[chn_key] = ocr_res_2.get(en_key, '')
                license_summary.setdefault(classify, []).append(ocr_res_2)
            else:
                # business licence and similar documents
                for result_dict in ocr_res_2.get('ResultList', []):
                    res_dict = {}
                    for field_dict in result_dict.get('FieldList', []):
                        res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
                    license_summary.setdefault(classify, []).append(res_dict)
        else:
            res_list.append((pno, ino, consts.RES_FAILED_2))

    @staticmethod
    def parse_img_path(img_path):
        img_name, _ = os.path.splitext(os.path.basename(img_path))
        part_list = img_name.split('_')
        # page_7_img_11_0
        return int(part_list[1]) + 1, int(part_list[3]) + 1

    @staticmethod
    def get_most(value_list):
        if value_list:
            most_common = Counter(value_list).most_common(1)
            return most_common[0][0] if most_common else None

    @staticmethod
    def date_format(date_str, format_str):
        try:
            date_res = datetime.strptime(date_str, format_str).date()
        except Exception as e:
            return
        else:
            return date_res

    def get_validate_date(self, date_list):
        for date_str in date_list:
            for format_str in consts.DATE_FORMAT:
                date_res = self.date_format(date_str, format_str)
                if isinstance(date_res, date):
                    return date_res

    def merge_card(self, bs_summary):
        merged_bs_summary = {}
        sorted_card = sorted(bs_summary.keys(), key=lambda x: bs_summary[x]['count'], reverse=True)
        for main_card in sorted_card:
            if bs_summary.get(main_card) is None:
                continue
            merged_bs_summary[main_card] = bs_summary.pop(main_card)
            del merged_bs_summary[main_card]['count']
            merge_cards = []
            for card in bs_summary.keys():
                if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO:
                    merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify'])
                    merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence'])
                    merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet'])
                    merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role'])
                    merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set'])
                    merged_bs_summary[main_card]['code'].extend(bs_summary[card]['code'])
                    merged_bs_summary[main_card]['print_time'].extend(bs_summary[card]['print_time'])
                    merged_bs_summary[main_card]['start_date'].extend(bs_summary[card]['start_date'])
                    merged_bs_summary[main_card]['end_date'].extend(bs_summary[card]['end_date'])
                    merge_cards.append(card)
            for card in merge_cards:
                del bs_summary[card]
            merged_bs_summary[main_card]['classify'] = self.get_most(merged_bs_summary[main_card]['classify'])
            merged_bs_summary[main_card]['role'] = self.get_most(merged_bs_summary[main_card]['role'])
        del bs_summary
        return merged_bs_summary

    def prune_bs_summary(self, bs_summary):
        for summary in bs_summary.values():
            del summary['count']
            summary['classify'] = self.get_most(summary['classify'])
            summary['role'] = self.get_most(summary['role'])
        return bs_summary

    def rebuild_bs_summary(self, bs_summary, unknown_summary):
        # bs_summary = {
        #     '卡号': {
        #         'count': 100,
        #         'classify': [],
        #         'confidence': [],
        #         'role': [],
        #         'code': [('page', 'code')],
        #         'print_time': [],
        #         'start_date': [],
        #         'end_date': [],
        #         'sheet': ['sheet_name']
        #     }
        # }
        #
        # unknown_summary = {
        #     0: {
        #         '户名': {
        #             'classify': 0,
        #             'confidence': [],
        #             'role': '户名',
        #             'code': [('page', 'code')],
        #             'print_time': [],
        #             'start_date': [],
        #             'end_date': [],
        #             'sheet': ['sheet_name']
        #         }
        #     }
        # }

        # no card number recognized at all
        if len(bs_summary) == 0:
            del bs_summary
            merged_bs_summary = {}
            card_num = 1
            for role_dict in unknown_summary.values():
                if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict:
                    summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {})
                    for summary in role_dict.values():
                        summary_dict['confidence'].extend(summary['confidence'])
                        summary_dict['role'] = summary['role']
                        summary_dict['code'].extend(summary['code'])
                        summary_dict['print_time'].extend(summary['print_time'])
                        summary_dict['start_date'].extend(summary['start_date'])
                        summary_dict['end_date'].extend(summary['end_date'])
                        summary_dict['sheet'].extend(summary['sheet'])
                    card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
                    merged_bs_summary[card] = summary_dict
                else:
                    for summary in role_dict.values():
                        card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
                        card_num += 1
                        merged_bs_summary[card] = summary
        else:
            # exactly one card number
            one_card = False
            if len(bs_summary) == 1:
                merged_bs_summary = self.prune_bs_summary(bs_summary)
                one_card = True
            # multiple card numbers
            else:
                merged_bs_summary = self.merge_card(bs_summary)

            for card_summary in merged_bs_summary.values():
                merge_role = []
                classify_summary = unknown_summary.get(card_summary['classify'], {})
                for role, summary in classify_summary.items():
                    if one_card or role in card_summary['role_set']:
                        merge_role.append(role)
                        card_summary['confidence'].extend(summary['confidence'])
                        card_summary['sheet'].extend(summary['sheet'])
                        card_summary['code'].extend(summary['code'])
                        card_summary['print_time'].extend(summary['print_time'])
                        card_summary['start_date'].extend(summary['start_date'])
                        card_summary['end_date'].extend(summary['end_date'])
                for role in merge_role:
                    del classify_summary[role]

            card_num = 1
            for role_dict in unknown_summary.values():
                for summary in role_dict.values():
                    card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
                    card_num += 1
                    merged_bs_summary[card] = summary

        del unknown_summary

        for summary in merged_bs_summary.values():
            if summary.get('role_set') is not None:
                del summary['role_set']
            summary['print_time'] = self.get_validate_date(summary['print_time'])
            summary['start_date'] = self.get_validate_date(summary['start_date'])
            summary['end_date'] = self.get_validate_date(summary['end_date'])
            summary['confidence'] = max(summary['confidence'])
        return merged_bs_summary

    def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock):
        while self.switch:
            # 1. fetch document info from the task queue
            doc, business_type, task_str = self.get_doc_info()
            # queue is empty: sleep, then try again
            if doc is None:
                time.sleep(self.sleep_time)
                continue
            try:
                # 2. download the PDF from EDMS
                doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
                os.makedirs(doc_data_path, exist_ok=True)
                pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
                img_save_path = os.path.join(doc_data_path, 'img')
                self.pdf_download(doc, pdf_path)
                # 3. extract images from the PDF
                self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str))
                pdf_handler = PDFHandler(pdf_path, img_save_path)
                pdf_handler.extract_image()
                self.cronjob_log.info('{0} [pdf to img end] [task={1}]'.format(self.log_base, task_str))
                with lock:
                    todo_count_dict[task_str] = len(pdf_handler.img_path_list)
                for img_path in pdf_handler.img_path_list:
                    img_queue.put(img_path)
                # TODO queue flow control
            except EDMSException as e:
                doc.status = DocStatus.PROCESS_FAILED.value
                doc.save()
                self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [err={2}]'.format(
                    self.log_base, task_str, e))
            except Exception as e:
                doc.status = DocStatus.PROCESS_FAILED.value
                doc.save()
                self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [err={2}]'.format(
                    self.log_base, task_str, e))

    def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url):
        while True:
            try:
                img_path = img_queue.get(block=False)
            except Exception as e:
                self.cronjob_log.info('{0} [img_2_ocr_1] [queue empty]'.format(self.log_base))
                time.sleep(0.5)
                continue
            else:
                self.cronjob_log.info('{0} [img_2_ocr_1] [get img] [img_path={1}]'.format(self.log_base, img_path))
                with open(img_path, 'rb') as f:
                    base64_data = base64.b64encode(f.read())
                    # decode to get the base64 string
                    file_data = base64_data.decode()
                json_data_1 = {"file": file_data}

                for times in range(consts.RETRY_TIMES):
                    try:
                        start_time = time.time()
                        ocr_1_response = requests.post(url, json=json_data_1)
                        if ocr_1_response.status_code != 200:
                            raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code))
                    except Exception as e:
                        self.cronjob_log.warn('{0} [ocr_1 failed] [times={1}] [img_path={2}] [error={3}]'.format(
                            self.log_base, times, img_path, e))
                    else:
                        ocr_1_res = ocr_1_response.json()
                        end_time = time.time()
                        speed_time = int(end_time - start_time)
                        self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}] [speed_time={3}]'.format(
                            self.log_base, img_path, ocr_1_res, speed_time))
                        break
                else:
                    ocr_1_res = {}
                    self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}]'.format(self.log_base, img_path))
                    # continue

                del json_data_1

                # /data/bmw-ocr-data/AFC/6/img/page_0_img_0.jpeg
                # AFC_2
                path_split = img_path.split('/')
                task_str = consts.SPLIT_STR.join((path_split[-4], path_split[-3]))

                with lock:
                    doc_res_dict = res_dict.setdefault(task_str, {})
                    doc_res_dict[os.path.basename(img_path)] = ocr_1_res
                    res_dict[task_str] = doc_res_dict
                    todo_count = todo_count_dict.get(task_str)
                    if todo_count == 1:
                        finish_queue.put(task_str)
                        del todo_count_dict[task_str]
                    else:
                        todo_count_dict[task_str] = todo_count - 1

    def res_2_wb(self, res_dict, finish_queue, lock):
        while True:
            try:
                task_str = finish_queue.get(block=False)
            except Exception as e:
                self.cronjob_log.info('{0} [res_2_wb] [queue empty]'.format(self.log_base))
                time.sleep(0.5)
                continue
            else:
                self.cronjob_log.info('{0} [res_2_wb] [get task] [task={1}]'.format(self.log_base, task_str))
                ocr_1_res = res_dict.get(task_str, {})
                self.cronjob_log.info('{0} [res_2_wb] [get task res] [task={1}] [res={2}]'.format(
                    self.log_base, task_str, ocr_1_res))

                try:
                    # 4. collect OCR results and build the Excel workbook
                    bs_summary = {}
                    license_summary = {}
                    unknown_summary = {}
                    res_list = []

                    interest_keyword = Keywords.objects.filter(
                        type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
                    salary_keyword = Keywords.objects.filter(
                        type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True)
                    loan_keyword = Keywords.objects.filter(
                        type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value],
                        on_off=True).values_list('keyword', flat=True)
                    wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)

                    for img_path, res in ocr_1_res.items():
                        pno, ino = self.parse_img_path(img_path)
                        if res.get('code') == 1:
                            ocr_data = res.get('data', {})
                            classify = ocr_data.get('classify')
                            if classify is None:
                                res_list.append((pno, ino, consts.RES_FAILED_1))
                                self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
                                    self.log_base, img_path, res))
                                continue
                            elif classify in consts.OTHER_CLASSIFY_SET:
                                # other/miscellaneous classes
                                res_list.append((pno, ino, consts.RES_SUCCESS_OTHER))
                                continue
                            elif classify in consts.LICENSE_CLASSIFY_SET_1:
                                # licence type 1
                                self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino)
                            elif classify in consts.LICENSE_CLASSIFY_SET_2:
                                # licence type 2
                                pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
                                with open(img_path, 'rb') as f:
                                    base64_data = base64.b64encode(f.read())
                                    # decode to get the base64 string
                                    file_data = base64_data.decode()
                                json_data_2 = {"pid": str(pid), "filedata": file_data}

                                for times in range(consts.RETRY_TIMES):
                                    try:
                                        start_time = time.time()
                                        ocr_2_response = requests.post(self.ocr_url_2, data=json_data_2)
                                        if ocr_2_response.status_code != 200:
                                            raise OCR2Exception('ocr_2 status code: {0}'.format(
                                                ocr_2_response.status_code))
                                    except Exception as e:
                                        self.cronjob_log.warn(
                                            '{0} [ocr_2 failed] [times={1}] [img_path={2}] [error={3}]'.format(
                                                self.log_base, times, img_path, e))
                                    else:
                                        ocr_2_res = json.loads(ocr_2_response.text)
                                        end_time = time.time()
                                        speed_time = int(end_time - start_time)
                                        self.cronjob_log.info(
                                            '{0} [ocr_2 success] [img={1}] [res={2}] [speed_time={3}]'.format(
                                                self.log_base, img_path, ocr_2_res, speed_time))

                                        if classify == consts.BC_CLASSIFY:
                                            name = '有'  # bank card: account name found on the card
                                            json_data_3 = {"file": file_data, 'card_res': ocr_2_res}
                                            card_name_response = requests.post(self.ocr_url_3, json_data_3)
                                            if card_name_response.status_code == 200:
                                                card_name_res = card_name_response.json()
                                                if isinstance(card_name_res, dict) and \
                                                        card_name_res.get('data', {}).get('is_exists_name') == 0:
                                                    name = '无'  # no name printed on the card
                                            ocr_2_res['Name'] = name

                                        self.license2_process(ocr_2_res, license_summary, pid, classify,
                                                              res_list, pno, ino)
                                        break
                                else:
                                    res_list.append((pno, ino, consts.RES_FAILED_2))
                                    self.cronjob_log.warn('{0} [ocr_2 failed] [img_path={1}]'.format(
                                        self.log_base, img_path))
                            else:
                                # bank statement processing
                                self.bs_process(wb, ocr_data, bs_summary, unknown_summary,
                                                classify, res_list, pno, ino)
                        else:
                            res_list.append((pno, ino, consts.RES_FAILED_1))
                            self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
                                self.log_base, img_path, res))

                    with lock:
                        del res_dict[task_str]
                        self.cronjob_log.info('{0} [res_dict record] [res_dict={1}]'.format(
                            self.log_base, res_dict))

                    self.cronjob_log.info('{0} [task={1}] [bs_summary={2}] [unknown_summary={3}] '
                                          '[license_summary={4}]'.format(self.log_base, task_str, bs_summary,
                                                                         unknown_summary, license_summary))

                    merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
                    self.cronjob_log.info('{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] '
                                          '[res_list={4}]'.format(self.log_base, task_str, merged_bs_summary,
                                                                  unknown_summary, res_list))
                    del unknown_summary

                    # 4.2 rebuild the Excel file
                    doc, business_type = self.get_doc_object(task_str)
                    doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
                    excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id))
                    img_save_path = os.path.join(doc_data_path, 'img')
                    # wb.save(src_excel_path)
                    wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme)
                    wb.save(excel_path)
                except Exception as e:
                    with lock:
                        if task_str in res_dict:
                            del res_dict[task_str]
                    doc, _ = self.get_doc_object(task_str)
                    doc.status = DocStatus.PROCESS_FAILED.value
                    doc.save()
                    # TODO end_time
                    self.cronjob_log.error('{0} [process failed (res to wb)] [task={1}] [err={2}]'.format(
                        self.log_base, task_str, e))
                else:
                    try:
                        # 5. upload to EDMS
                        for times in range(consts.RETRY_TIMES):
                            try:
                                self.edms.upload(excel_path, doc, business_type)
                            except Exception as e:
                                self.cronjob_log.warn(
                                    '{0} [edms upload failed] [times={1}] [task={2}] [error={3}]'.format(
                                        self.log_base, times, task_str, e))
                                edms_exc = str(e)
                            else:
                                break
                        else:
                            raise EDMSException(edms_exc)
                    except Exception as e:
                        doc.status = DocStatus.UPLOAD_FAILED.value
                        # TODO end_time
                        doc.save()
                        self.cronjob_log.error('{0} [process failed (edms upload)] [task={1}] [err={2}]'.format(
                            self.log_base, task_str, e))
                        write_zip_file(img_save_path,
                                       os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
                    else:
                        doc.status = DocStatus.COMPLETE.value
                        doc.save()
                        # TODO end_time
                        self.cronjob_log.info('{0} [process complete] [task={1}]'.format(self.log_base, task_str))
                        write_zip_file(img_save_path,
                                       os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))

    # TODO refine document statuses: for the different failure states, return the task to the queue
    #      and apply a different handling strategy on retry
    # TODO email notification on failures
    #  recognition failure: ordinary exceptions such as PDF errors or workbook-build errors
    #  EDMS failure: download error --> back to queue --> email; upload error --> re-upload queue --> email
    #  algorithm failure: stage-1 error --> mark as failed --> email; stage-2 error --> mark as failed --> email
    # TODO retry OCR API calls

    def handle(self, *args, **kwargs):
        lock = Lock()
        with Manager() as manager:
            todo_count_dict = manager.dict()
            res_dict = manager.dict()

            img_queue = Queue()
            finish_queue = Queue()

            process_list = []

            pdf_process = Process(target=self.pdf_2_img_2_queue, args=(img_queue, todo_count_dict, lock))
            process_list.append(pdf_process)

            for url in self.ocr_1_urls.values():
                ocr_1_process = Process(target=self.img_2_ocr_1,
                                        args=(img_queue, todo_count_dict, res_dict, finish_queue, lock, url))
                process_list.append(ocr_1_process)

            wb_process = Process(target=self.res_2_wb, args=(res_dict, finish_queue, lock))
            process_list.append(wb_process)

            for p in process_list:
                p.start()
                p.join()

        self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
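A note on how the pieces above fit together: handle() wires one PDF-to-image producer, one img_2_ocr_1 worker per configured OCR_URL_1_* endpoint, and one res_2_wb workbook builder through img_queue, finish_queue and two manager.dict() mappings. todo_count_dict holds, per task, the number of images still waiting for OCR stage 1; when the last image of a document comes back, the task string is pushed onto finish_queue for res_2_wb to pick up. A minimal single-process sketch of that countdown, without the multiprocessing lock the real code holds (the task key 'AFC_6' is only an example):

todo_count_dict = {'AFC_6': 3}  # task -> images still waiting for OCR stage 1
finish_queue = []               # stands in for the real multiprocessing Queue

def mark_image_done(task_str):
    todo_count = todo_count_dict.get(task_str)
    if todo_count == 1:
        # last pending image: the whole document is ready for workbook building
        finish_queue.append(task_str)
        del todo_count_dict[task_str]
    else:
        todo_count_dict[task_str] = todo_count - 1

for _ in range(3):
    mark_image_done('AFC_6')
print(finish_queue)             # ['AFC_6']

Since the module lives under management/commands/ and subclasses BaseCommand, the flow would presumably be started as a Django management command (python manage.py ocr_process) and stopped with SIGTERM, which flips self.switch so the PDF producer exits between documents.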