Merge branch 'feature/main' into feature/mssql
Showing 10 changed files with 207 additions and 672 deletions
... | @@ -152,7 +152,7 @@ RESULT_IDX = FIXED_HEADERS.index('核对结果') | ... | @@ -152,7 +152,7 @@ RESULT_IDX = FIXED_HEADERS.index('核对结果') |
152 | # '借贷': ('贷', '借'), # 竖版-无表格-广发银行 | 152 | # '借贷': ('贷', '借'), # 竖版-无表格-广发银行 |
153 | # '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行 | 153 | # '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行 |
154 | # '收/支': ('收入', '支出'), # 横版-表格-北京银行 | 154 | # '收/支': ('收入', '支出'), # 横版-表格-北京银行 |
155 | BORROW_HEADERS_SET = {'借贷', '借贷状态', '收/支', '收支标志'} | 155 | BORROW_HEADERS_SET = {'借贷', '借\n贷', '借贷状态', '收/支', '收支标志'} |
156 | BORROW_INCOME_SET = {'贷', '收入', '收', '收(Cr)'} | 156 | BORROW_INCOME_SET = {'贷', '收入', '收', '收(Cr)'} |
157 | BORROW_OUTLAY_SET = {'借', '支出', '支', '付(Dr)'} | 157 | BORROW_OUTLAY_SET = {'借', '支出', '支', '付(Dr)'} |
158 | INCOME_HEADERS_SET = {'收入金额', '收入', '存入', '存入金额(贷)', '存入金额(贷)'} | 158 | INCOME_HEADERS_SET = {'收入金额', '收入', '存入', '存入金额(贷)', '存入金额(贷)'} |
... | @@ -165,6 +165,7 @@ HEADERS_MAPPING = {} | ... | @@ -165,6 +165,7 @@ HEADERS_MAPPING = {} |
165 | HEADERS_MAPPING.update( | 165 | HEADERS_MAPPING.update( |
166 | { | 166 | { |
167 | '借贷': BORROW_KEY, | 167 | '借贷': BORROW_KEY, |
168 | '借\n贷': BORROW_KEY, | ||
168 | '借贷状态': BORROW_KEY, | 169 | '借贷状态': BORROW_KEY, |
169 | '收支标志': BORROW_KEY, | 170 | '收支标志': BORROW_KEY, |
170 | '收/支': BORROW_KEY, | 171 | '收/支': BORROW_KEY, | ... | ... |
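Context for this hunk: the new '借\n贷' entries cover OCR output where the vertical debit/credit header comes back with an embedded newline inside a single cell. A minimal sketch of the lookup this enables, reusing BORROW_HEADERS_SET, HEADERS_MAPPING, and BORROW_KEY from the hunk above (the sample cell value is illustrative):

    header_cell = '借\n贷'                 # header split across two lines by OCR
    assert header_cell in BORROW_HEADERS_SET
    key = HEADERS_MAPPING[header_cell]     # resolves to BORROW_KEY, same as the plain '借贷' header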
... | @@ -40,7 +40,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -40,7 +40,8 @@ class Command(BaseCommand, LoggerMixin): |
40 | print('excel dir not exists') | 40 | print('excel dir not exists') |
41 | return | 41 | return |
42 | excel_path = os.path.join(excel_dir, 'bs_{0}.xlsx'.format(date_str)) | 42 | excel_path = os.path.join(excel_dir, 'bs_{0}.xlsx'.format(date_str)) |
43 | log_path = os.path.join(conf.LOG_DIR, 'bs.log.{0}'.format(date_str)) | 43 | # log_path = os.path.join(conf.LOG_DIR, 'bs.log.{0}'.format(date_str)) |
44 | log_path = os.path.join(conf.LOG_DIR, 'bs_statistics.log.{0}'.format(date_str)) | ||
44 | if not os.path.exists(log_path): | 45 | if not os.path.exists(log_path): |
45 | print('log_path not exists') | 46 | print('log_path not exists') |
46 | return | 47 | return |
... | @@ -48,7 +49,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -48,7 +49,8 @@ class Command(BaseCommand, LoggerMixin): |
48 | summary_dict = {} | 49 | summary_dict = {} |
49 | with open(log_path, 'r', encoding='utf-8') as fp: | 50 | with open(log_path, 'r', encoding='utf-8') as fp: |
50 | for line in fp: | 51 | for line in fp: |
51 | search_obj = re.search(r'task=(.*) merged_bs_summary=(.*)', line) | 52 | # search_obj = re.search(r'task=(.*) merged_bs_summary=(.*)', line) |
53 | search_obj = re.search(r'\[task=(.*)] \[bs_summary=(.*)]', line) | ||
52 | task_str = search_obj.group(1) | 54 | task_str = search_obj.group(1) |
53 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | 55 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) |
54 | doc_id = int(doc_id_str) | 56 | doc_id = int(doc_id_str) | ... | ... |
1 | import os | ||
2 | import time | ||
3 | import json | ||
4 | import signal | ||
5 | import asyncio | ||
6 | import aiohttp | ||
7 | import difflib | ||
8 | import base64 | ||
9 | import requests | ||
10 | from datetime import datetime, date | ||
11 | from collections import Counter | ||
12 | from apps.doc.ocr.wb import BSWorkbook, Workbook | ||
13 | from django.core.management import BaseCommand | ||
14 | |||
15 | from settings import conf | ||
16 | from common.mixins import LoggerMixin | ||
17 | from common.tools.file_tools import write_zip_file | ||
18 | from common.tools.pdf_to_img import PDFHandler | ||
19 | from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords | ||
20 | from apps.doc.named_enum import KeywordsType | ||
21 | from apps.doc import consts | ||
22 | from apps.doc.ocr.edms import EDMS, rh | ||
23 | from apps.doc.exceptions import EDMSException | ||
24 | |||
25 | |||
26 | class Command(BaseCommand, LoggerMixin): | ||
27 | |||
28 | def __init__(self): | ||
29 | super().__init__() | ||
30 | self.log_base = '[doc ocr process]' | ||
31 | # switch controlling whether files keep being processed | ||
32 | self.switch = True | ||
33 | # data directory | ||
34 | self.data_dir = conf.DATA_DIR | ||
35 | # OCR service endpoints | ||
36 | self.ocr_url_1 = conf.OCR_URL_1 | ||
37 | self.ocr_url_2 = conf.OCR_URL_2 | ||
38 | self.ocr_url_3 = conf.BC_URL | ||
39 | # EDMS web_service_api | ||
40 | self.edms = EDMS() | ||
41 | # graceful shutdown on signal 15 (SIGTERM) | ||
42 | signal.signal(signal.SIGTERM, self.signal_handler) | ||
43 | |||
44 | def signal_handler(self, sig, frame): | ||
45 | self.switch = False # stop processing files | ||
46 | |||
47 | def get_doc_info(self): | ||
48 | task_str, is_priority = rh.dequeue() | ||
49 | if task_str is None: | ||
50 | self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) | ||
51 | return None, None | ||
52 | |||
53 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | ||
54 | doc_id = int(doc_id_str) | ||
55 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc | ||
56 | # doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values( | ||
57 | # 'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first() | ||
58 | doc = doc_class.objects.filter(id=doc_id).first() | ||
59 | if doc is None: | ||
60 | self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format( | ||
61 | self.log_base, task_str, is_priority)) | ||
62 | return None, None | ||
63 | elif doc.status != DocStatus.INIT.value: | ||
64 | self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] ' | ||
65 | '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status)) | ||
66 | return None, None | ||
67 | doc.status = DocStatus.PROCESSING.value | ||
68 | doc.save() | ||
69 | self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format( | ||
70 | self.log_base, task_str, is_priority)) | ||
71 | return doc, business_type | ||
72 | |||
73 | def pdf_download(self, doc, business_type): | ||
74 | doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id)) | ||
75 | os.makedirs(doc_data_path, exist_ok=True) | ||
76 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | ||
77 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | ||
78 | for times in range(consts.RETRY_TIMES): | ||
79 | try: | ||
80 | self.edms.download(pdf_path, doc.metadata_version_id) | ||
81 | except Exception as e: | ||
82 | self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] ' | ||
83 | '[error={4}]'.format(self.log_base, times, business_type, doc.id, e)) | ||
84 | edms_exc = str(e) | ||
85 | else: | ||
86 | break | ||
87 | else: | ||
88 | raise EDMSException(edms_exc) | ||
89 | |||
90 | excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id)) | ||
91 | src_excel_path = os.path.join(doc_data_path, 'src.xlsx') | ||
92 | self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format( | ||
93 | self.log_base, business_type, doc.id, pdf_path)) | ||
94 | return doc_data_path, excel_path, src_excel_path, pdf_path | ||
95 | |||
96 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino): | ||
97 | sheets = ocr_data.get('data', []) | ||
98 | if not sheets: | ||
99 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) | ||
100 | return | ||
101 | confidence = ocr_data.get('confidence', 1) | ||
102 | img_name = 'page_{0}_img_{1}'.format(pno, ino) | ||
103 | cells_exists = False | ||
104 | for i, sheet in enumerate(sheets): | ||
105 | cells = sheet.get('cells') | ||
106 | if not cells: | ||
107 | continue | ||
108 | cells_exists = True | ||
109 | sheet_name = '{0}_{1}'.format(img_name, i) | ||
110 | ws = wb.create_sheet(sheet_name) | ||
111 | for cell in cells: | ||
112 | c1 = cell.get('start_column') | ||
113 | r1 = cell.get('start_row') | ||
114 | words = cell.get('words') | ||
115 | ws.cell(row=r1 + 1, column=c1 + 1, value=words) | ||
116 | |||
117 | # summary fields: [account name, card number, page number, receipt verification code, print time, start date, end date] | ||
118 | summary = sheet.get('summary') | ||
119 | card = summary[1] | ||
120 | if card is None: | ||
121 | classify_dict = unknown_summary.setdefault(classify, {}) | ||
122 | role = consts.UNKNOWN_ROLE if summary[0] is None else summary[0] | ||
123 | role_dict = classify_dict.setdefault(role, {}) | ||
124 | role_dict['classify'] = classify | ||
125 | role_dict['role'] = role | ||
126 | role_dict.setdefault('sheet', []).append(sheet_name) | ||
127 | role_dict.setdefault('confidence', []).append(confidence) | ||
128 | code_list = role_dict.setdefault('code', []) | ||
129 | pt_list = role_dict.setdefault('print_time', []) | ||
130 | sd_list = role_dict.setdefault('start_date', []) | ||
131 | ed_list = role_dict.setdefault('end_date', []) | ||
132 | if summary[3] is not None: | ||
133 | code_list.append((summary[2], summary[3])) | ||
134 | if summary[4] is not None: | ||
135 | pt_list.append(summary[4]) | ||
136 | if summary[5] is not None: | ||
137 | sd_list.append(summary[5]) | ||
138 | if summary[6] is not None: | ||
139 | ed_list.append(summary[6]) | ||
140 | else: | ||
141 | card_dict = bs_summary.setdefault(card, {}) | ||
142 | card_dict['count'] = card_dict.get('count', 0) + 1 | ||
143 | card_dict.setdefault('classify', []).append(classify) | ||
144 | card_dict.setdefault('confidence', []).append(confidence) | ||
145 | card_dict.setdefault('sheet', []).append(sheet_name) | ||
146 | role_list = card_dict.setdefault('role', []) | ||
147 | role_set = card_dict.setdefault('role_set', set()) | ||
148 | code_list = card_dict.setdefault('code', []) | ||
149 | pt_list = card_dict.setdefault('print_time', []) | ||
150 | sd_list = card_dict.setdefault('start_date', []) | ||
151 | ed_list = card_dict.setdefault('end_date', []) | ||
152 | if summary[0] is not None: | ||
153 | role_list.append(summary[0]) | ||
154 | role_set.add(summary[0]) | ||
155 | if summary[3] is not None: | ||
156 | code_list.append((summary[2], summary[3])) | ||
157 | if summary[4] is not None: | ||
158 | pt_list.append(summary[4]) | ||
159 | if summary[5] is not None: | ||
160 | sd_list.append(summary[5]) | ||
161 | if summary[6] is not None: | ||
162 | ed_list.append(summary[6]) | ||
163 | |||
164 | if cells_exists: | ||
165 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
166 | else: | ||
167 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) | ||
168 | |||
169 | def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino): | ||
170 | # category: '0' ID card, '1' residence permit | ||
171 | license_data = ocr_data.get('data', []) | ||
172 | if not license_data: | ||
173 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) | ||
174 | return | ||
175 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
176 | license_summary.setdefault(classify, []).extend(license_data) | ||
177 | |||
178 | def license2_process(self, ocr_res_2, license_summary, pid, classify, res_list, pno, ino): | ||
179 | if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET: | ||
180 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
181 | if pid == consts.BC_PID: | ||
182 | # bank card | ||
183 | # res_dict = {} | ||
184 | # for en_key, chn_key in consts.BC_FIELD: | ||
185 | # res_dict[chn_key] = ocr_res_2.get(en_key, '') | ||
186 | license_summary.setdefault(classify, []).append(ocr_res_2) | ||
187 | else: | ||
188 | # business license and similar documents | ||
189 | for result_dict in ocr_res_2.get('ResultList', []): | ||
190 | res_dict = {} | ||
191 | for field_dict in result_dict.get('FieldList', []): | ||
192 | res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '') | ||
193 | license_summary.setdefault(classify, []).append(res_dict) | ||
194 | else: | ||
195 | res_list.append((pno, ino, consts.RES_FAILED)) | ||
196 | |||
197 | @staticmethod | ||
198 | async def fetch_ocr_1_result(url, json_data): | ||
199 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: | ||
200 | async with session.post(url, json=json_data) as response: | ||
201 | if response.status == 200: | ||
202 | return await response.json() | ||
203 | |||
204 | @staticmethod | ||
205 | async def fetch_ocr_2_result(url, json_data): | ||
206 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: | ||
207 | async with session.post(url, data=json_data) as response: | ||
208 | if response.status == 200: | ||
209 | return await response.text() | ||
210 | |||
211 | @staticmethod | ||
212 | async def fetch_bc_name_result(url, json_data): | ||
213 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: | ||
214 | async with session.post(url, json=json_data) as response: | ||
215 | if response.status == 200: | ||
216 | return await response.json() | ||
217 | |||
218 | async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, res_list): | ||
219 | pno, ino = self.parse_img_path(img_path) | ||
220 | with open(img_path, 'rb') as f: | ||
221 | base64_data = base64.b64encode(f.read()) | ||
222 | # decode the base64 bytes into a str | ||
223 | file_data = base64_data.decode() | ||
224 | json_data_1 = { | ||
225 | "file": file_data | ||
226 | } | ||
227 | ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1) | ||
228 | if ocr_res_1 is None: | ||
229 | res_list.append((pno, ino, consts.RES_FAILED)) | ||
230 | self.cronjob_log.info('{0} [ocr_1 failed] [img={1}]'.format(self.log_base, img_path)) | ||
231 | # raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | ||
232 | else: | ||
233 | self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}]'.format( | ||
234 | self.log_base, img_path, ocr_res_1)) | ||
235 | |||
236 | if ocr_res_1.get('code') == 1: | ||
237 | ocr_data = ocr_res_1.get('data', {}) | ||
238 | classify = ocr_data.get('classify') | ||
239 | if classify is None: | ||
240 | res_list.append((pno, ino, consts.RES_FAILED)) | ||
241 | self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format( | ||
242 | self.log_base, img_path, ocr_res_1)) | ||
243 | return | ||
244 | elif classify in consts.OTHER_CLASSIFY_SET: # other categories | ||
245 | res_list.append((pno, ino, consts.RES_SUCCESS_OTHER)) | ||
246 | return | ||
247 | elif classify in consts.LICENSE_CLASSIFY_SET_1: # license set 1 | ||
248 | self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino) | ||
249 | elif classify in consts.LICENSE_CLASSIFY_SET_2: # license set 2 | ||
250 | pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) | ||
251 | json_data_2 = { | ||
252 | "pid": str(pid), | ||
253 | # "key": conf.OCR_KEY, | ||
254 | # "secret": conf.OCR_SECRET, | ||
255 | "filedata": file_data | ||
256 | } | ||
257 | ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2) | ||
258 | if ocr_res_2 is None: | ||
259 | res_list.append((pno, ino, consts.RES_FAILED)) | ||
260 | self.cronjob_log.info('{0} [ocr_2 failed] [img={1}]'.format(self.log_base, img_path)) | ||
261 | # raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | ||
262 | else: | ||
263 | # recognition result | ||
264 | ocr_res_2 = json.loads(ocr_res_2) | ||
265 | self.cronjob_log.info('{0} [ocr_2 success] [img={1}] [res={2}]'.format( | ||
266 | self.log_base, img_path, ocr_res_2)) | ||
267 | if classify == consts.BC_CLASSIFY: | ||
268 | name = '有' # default: cardholder name present | ||
269 | json_data_1['card_res'] = ocr_res_2 | ||
270 | card_name_res = await self.fetch_bc_name_result(self.ocr_url_3, json_data_1) | ||
271 | if isinstance(card_name_res, dict) and \ | ||
272 | card_name_res.get('data', {}).get('is_exists_name') == 0: | ||
273 | name = '无' # cardholder name absent | ||
274 | ocr_res_2['Name'] = name | ||
275 | self.license2_process(ocr_res_2, license_summary, pid, classify, res_list, pno, ino) | ||
276 | else: # bank statement processing | ||
277 | self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino) | ||
278 | else: | ||
279 | res_list.append((pno, ino, consts.RES_FAILED)) | ||
280 | self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format( | ||
281 | self.log_base, img_path, ocr_res_1)) | ||
282 | |||
283 | # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): | ||
284 | # # bank statement | ||
285 | # # res = { | ||
286 | # # 'code': 1, | ||
287 | # # 'msg': 'success', | ||
288 | # # 'data': { | ||
289 | # # 'classify': 0, | ||
290 | # # 'confidence': 0.999, | ||
291 | # # 'data': [ | ||
292 | # # { | ||
293 | # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
294 | # # 'cells': [] | ||
295 | # # }, | ||
296 | # # { | ||
297 | # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
298 | # # 'cells': [] | ||
299 | # # } | ||
300 | # # ] | ||
301 | # # } | ||
302 | # # } | ||
303 | # # | ||
304 | # # # license-1 | ||
305 | # # res = { | ||
306 | # # 'code': 1, | ||
307 | # # 'msg': 'success', | ||
308 | # # 'data': { | ||
309 | # # 'classify': 0, | ||
310 | # # 'confidence': 0.999, | ||
311 | # # 'data': [ | ||
312 | # # { | ||
313 | # # 'cn_key': 'value', | ||
314 | # # 'cn_key': 'value', | ||
315 | # # }, | ||
316 | # # { | ||
317 | # # 'cn_key': 'value', | ||
318 | # # 'cn_key': 'value', | ||
319 | # # }, | ||
320 | # # ] | ||
321 | # # } | ||
322 | # # } | ||
323 | # # | ||
324 | # # # license-2 or other | ||
325 | # # res = { | ||
326 | # # 'code': 1, | ||
327 | # # 'msg': 'success', | ||
328 | # # 'data': { | ||
329 | # # 'classify': 0, | ||
330 | # # 'confidence': 0.999, | ||
331 | # # } | ||
332 | # # } | ||
333 | # with open(img_path, 'rb') as f: | ||
334 | # base64_data = base64.b64encode(f.read()) | ||
335 | # # decode the base64 bytes into a str | ||
336 | # file_data = base64_data.decode() | ||
337 | # json_data_1 = { | ||
338 | # "file": file_data | ||
339 | # } | ||
340 | # response_1 = requests.post(self.ocr_url_1, json=json_data_1) | ||
341 | # if response_1.status_code == 200: | ||
342 | # ocr_res_1 = response_1.json() | ||
343 | # self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format( | ||
344 | # self.log_base, img_path, ocr_res_1)) | ||
345 | # | ||
346 | # if ocr_res_1.get('code') == 1: | ||
347 | # ocr_data = ocr_res_1.get('data', {}) | ||
348 | # classify = ocr_data.get('classify') | ||
349 | # if classify is None: | ||
350 | # skip_img.append(self.parse_img_path(img_path)) | ||
351 | # return | ||
352 | # elif classify in consts.OTHER_CLASSIFY_SET: # other categories | ||
353 | # skip_img.append(self.parse_img_path(img_path)) | ||
354 | # return | ||
355 | # elif classify in consts.LICENSE_CLASSIFY_SET_1: # license set 1 | ||
356 | # self.license1_process(ocr_data, license_summary, classify, skip_img, img_path) | ||
357 | # elif classify in consts.LICENSE_CLASSIFY_SET_2: # license set 2 | ||
358 | # pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) | ||
359 | # json_data_2 = { | ||
360 | # "pid": str(pid), | ||
361 | # "key": conf.OCR_KEY, | ||
362 | # "secret": conf.OCR_SECRET, | ||
363 | # "file": file_data | ||
364 | # } | ||
365 | # response_2 = requests.post(self.ocr_url_2, data=json_data_2) | ||
366 | # if response_2.status_code == 200: | ||
367 | # # recognition result | ||
368 | # ocr_res_2 = response_2.json() | ||
369 | # self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format( | ||
370 | # self.log_base, img_path, ocr_res_2)) | ||
371 | # self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path) | ||
372 | # else: | ||
373 | # raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | ||
374 | # else: # bank statement processing | ||
375 | # self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img) | ||
376 | # else: | ||
377 | # skip_img.append(self.parse_img_path(img_path)) | ||
378 | # else: | ||
379 | # raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | ||
380 | |||
381 | @staticmethod | ||
382 | def parse_img_path(img_path): | ||
383 | img_name, _ = os.path.splitext(os.path.basename(img_path)) | ||
384 | part_list = img_name.split('_') | ||
385 | # page_7_img_11_0 | ||
386 | return int(part_list[1])+1, int(part_list[3])+1 | ||
387 | |||
388 | @staticmethod | ||
389 | def get_most(value_list): | ||
390 | if value_list: | ||
391 | most_common = Counter(value_list).most_common(1) | ||
392 | return most_common[0][0] if most_common else None | ||
393 | |||
394 | @staticmethod | ||
395 | def date_format(date_str, format_str): | ||
396 | try: | ||
397 | date_res = datetime.strptime(date_str, format_str).date() | ||
398 | except Exception: | ||
399 | return | ||
400 | else: | ||
401 | return date_res | ||
402 | |||
403 | def get_validate_date(self, date_list): | ||
404 | for date_str in date_list: | ||
405 | for format_str in consts.DATE_FORMAT: | ||
406 | date_res = self.date_format(date_str, format_str) | ||
407 | if isinstance(date_res, date): | ||
408 | return date_res | ||
409 | |||
410 | def merge_card(self, bs_summary): | ||
411 | merged_bs_summary = {} | ||
412 | sorted_card = sorted(bs_summary.keys(), key=lambda x: bs_summary[x]['count'], reverse=True) | ||
413 | for main_card in sorted_card: | ||
414 | if bs_summary.get(main_card) is None: | ||
415 | continue | ||
416 | merged_bs_summary[main_card] = bs_summary.pop(main_card) | ||
417 | del merged_bs_summary[main_card]['count'] | ||
418 | merge_cards = [] | ||
419 | for card in bs_summary.keys(): | ||
420 | if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO: | ||
421 | merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify']) | ||
422 | merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence']) | ||
423 | merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet']) | ||
424 | merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role']) | ||
425 | merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set']) | ||
426 | merged_bs_summary[main_card]['code'].extend(bs_summary[card]['code']) | ||
427 | merged_bs_summary[main_card]['print_time'].extend(bs_summary[card]['print_time']) | ||
428 | merged_bs_summary[main_card]['start_date'].extend(bs_summary[card]['start_date']) | ||
429 | merged_bs_summary[main_card]['end_date'].extend(bs_summary[card]['end_date']) | ||
430 | merge_cards.append(card) | ||
431 | for card in merge_cards: | ||
432 | del bs_summary[card] | ||
433 | merged_bs_summary[main_card]['classify'] = self.get_most(merged_bs_summary[main_card]['classify']) | ||
434 | merged_bs_summary[main_card]['role'] = self.get_most(merged_bs_summary[main_card]['role']) | ||
435 | del bs_summary | ||
436 | return merged_bs_summary | ||
437 | |||
438 | def prune_bs_summary(self, bs_summary): | ||
439 | for summary in bs_summary.values(): | ||
440 | del summary['count'] | ||
441 | summary['classify'] = self.get_most(summary['classify']) | ||
442 | summary['role'] = self.get_most(summary['role']) | ||
443 | return bs_summary | ||
444 | |||
445 | def rebuild_bs_summary(self, bs_summary, unknown_summary): | ||
446 | # bs_summary = { | ||
447 | # '卡号': { | ||
448 | # 'count': 100, | ||
449 | # 'classify': [], | ||
450 | # 'confidence': [], | ||
451 | # 'role': [], | ||
452 | # 'code': [('page', 'code')], | ||
453 | # 'print_time': [], | ||
454 | # 'start_date': [], | ||
455 | # 'end_date': [], | ||
456 | # 'sheet': ['sheet_name'] | ||
457 | # } | ||
458 | # } | ||
459 | # | ||
460 | # unknown_summary = { | ||
461 | # 0: { | ||
462 | # '户名': { | ||
463 | # 'classify': 0, | ||
464 | # 'confidence': [], | ||
465 | # 'role': '户名', | ||
466 | # 'code': [('page', 'code')], | ||
467 | # 'print_time': [], | ||
468 | # 'start_date': [], | ||
469 | # 'end_date': [], | ||
470 | # 'sheet': ['sheet_name'] | ||
471 | # } | ||
472 | # } | ||
473 | # } | ||
474 | # no card number found | ||
475 | if len(bs_summary) == 0: | ||
476 | del bs_summary | ||
477 | merged_bs_summary = {} | ||
478 | card_num = 1 | ||
479 | for role_dict in unknown_summary.values(): | ||
480 | if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict: | ||
481 | summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {}) | ||
482 | for summary in role_dict.values(): | ||
483 | summary_dict['confidence'].extend(summary['confidence']) | ||
484 | summary_dict['role'] = summary['role'] | ||
485 | summary_dict['code'].extend(summary['code']) | ||
486 | summary_dict['print_time'].extend(summary['print_time']) | ||
487 | summary_dict['start_date'].extend(summary['start_date']) | ||
488 | summary_dict['end_date'].extend(summary['end_date']) | ||
489 | summary_dict['sheet'].extend(summary['sheet']) | ||
490 | card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num) | ||
491 | merged_bs_summary[card] = summary_dict | ||
492 | else: | ||
493 | for summary in role_dict.values(): | ||
494 | card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num) | ||
495 | card_num += 1 | ||
496 | merged_bs_summary[card] = summary | ||
497 | else: | ||
498 | # exactly one card number | ||
499 | one_card = False | ||
500 | if len(bs_summary) == 1: | ||
501 | merged_bs_summary = self.prune_bs_summary(bs_summary) | ||
502 | one_card = True | ||
503 | # multiple card numbers | ||
504 | else: | ||
505 | merged_bs_summary = self.merge_card(bs_summary) | ||
506 | |||
507 | for card_summary in merged_bs_summary.values(): | ||
508 | merge_role = [] | ||
509 | classify_summary = unknown_summary.get(card_summary['classify'], {}) | ||
510 | for role, summary in classify_summary.items(): | ||
511 | if one_card or role in card_summary['role_set']: | ||
512 | merge_role.append(role) | ||
513 | card_summary['confidence'].extend(summary['confidence']) | ||
514 | card_summary['sheet'].extend(summary['sheet']) | ||
515 | card_summary['code'].extend(summary['code']) | ||
516 | card_summary['print_time'].extend(summary['print_time']) | ||
517 | card_summary['start_date'].extend(summary['start_date']) | ||
518 | card_summary['end_date'].extend(summary['end_date']) | ||
519 | |||
520 | for role in merge_role: | ||
521 | del classify_summary[role] | ||
522 | |||
523 | card_num = 1 | ||
524 | for role_dict in unknown_summary.values(): | ||
525 | for summary in role_dict.values(): | ||
526 | card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num) | ||
527 | card_num += 1 | ||
528 | merged_bs_summary[card] = summary | ||
529 | |||
530 | del unknown_summary | ||
531 | for summary in merged_bs_summary.values(): | ||
532 | if summary.get('role_set') is not None: | ||
533 | del summary['role_set'] | ||
534 | summary['print_time'] = self.get_validate_date(summary['print_time']) | ||
535 | summary['start_date'] = self.get_validate_date(summary['start_date']) | ||
536 | summary['end_date'] = self.get_validate_date(summary['end_date']) | ||
537 | summary['confidence'] = max(summary['confidence']) | ||
538 | return merged_bs_summary | ||
539 | |||
540 | # TODO refine document states: distinguish failure types, requeue, and retry each differently | ||
541 | # TODO email notification on failures | ||
542 | # recognition failure: ordinary exceptions, e.g. PDF errors or workbook-build errors | ||
543 | # EDMS failure: download error --> requeue --> email; upload error --> re-upload queue --> email | ||
544 | # algorithm failure: stage-1 error --> mark failed --> email; stage-2 error --> mark failed --> email | ||
545 | # TODO retry OCR API calls | ||
546 | def handle(self, *args, **kwargs): | ||
547 | sleep_second = int(conf.SLEEP_SECOND) | ||
548 | max_sleep_second = int(conf.MAX_SLEEP_SECOND) | ||
549 | |||
550 | while self.switch: | ||
551 | # 1. fetch document info from the queue | ||
552 | doc, business_type = self.get_doc_info() | ||
553 | # back off while the queue is empty | ||
554 | if doc is None: | ||
555 | time.sleep(sleep_second) | ||
556 | sleep_second = min(max_sleep_second, sleep_second + 5) | ||
557 | continue | ||
558 | sleep_second = int(conf.SLEEP_SECOND) | ||
559 | |||
560 | try: | ||
561 | start_time = time.time() | ||
562 | # 2. download the PDF from EDMS | ||
563 | doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type) | ||
564 | |||
566 | # 3. extract images from the PDF | ||
566 | img_save_path = os.path.join(doc_data_path, 'img') | ||
567 | self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format( | ||
568 | self.log_base, business_type, doc.id)) | ||
569 | pdf_handler = PDFHandler(pdf_path, img_save_path) | ||
570 | pdf_handler.extract_image() | ||
571 | self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format( | ||
572 | self.log_base, business_type, doc.id)) | ||
573 | |||
574 | # 4. run OCR and build the Excel workbook | ||
575 | bs_summary = {} | ||
576 | license_summary = {} | ||
577 | unknown_summary = {} | ||
578 | res_list = [] | ||
579 | interest_keyword = Keywords.objects.filter( | ||
580 | type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True) | ||
581 | salary_keyword = Keywords.objects.filter( | ||
582 | type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True) | ||
583 | loan_keyword = Keywords.objects.filter( | ||
584 | type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value], on_off=True).values_list( | ||
585 | 'keyword', flat=True) | ||
586 | wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword) | ||
587 | |||
588 | # wb = Workbook() | ||
589 | |||
590 | # 4.1 fetch OCR results | ||
591 | loop = asyncio.get_event_loop() | ||
592 | tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list) | ||
593 | for img_path in pdf_handler.img_path_list] | ||
594 | loop.run_until_complete(asyncio.wait(tasks)) | ||
595 | # loop.close() | ||
596 | |||
597 | # for img_path in pdf_handler.img_path_list: | ||
598 | # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list) | ||
599 | |||
600 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] ' | ||
601 | '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary, | ||
602 | unknown_summary, license_summary)) | ||
603 | |||
604 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) | ||
605 | |||
606 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] ' | ||
607 | '[unknown_summary={4}] [res_list={5}]'.format(self.log_base, business_type, | ||
608 | doc.id, merged_bs_summary, | ||
609 | unknown_summary, res_list)) | ||
610 | del unknown_summary | ||
611 | |||
612 | # 4.2 rebuild the Excel file | ||
613 | wb.save(src_excel_path) | ||
614 | wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme) | ||
615 | wb.save(excel_path) | ||
616 | except EDMSException as e: | ||
617 | doc.status = DocStatus.PROCESS_FAILED.value | ||
618 | doc.save() | ||
619 | self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] ' | ||
620 | '[err={3}]'.format(self.log_base, business_type, doc.id, e)) | ||
621 | except Exception as e: | ||
622 | doc.status = DocStatus.PROCESS_FAILED.value | ||
623 | doc.save() | ||
624 | self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] ' | ||
625 | '[err={3}]'.format(self.log_base, business_type, doc.id, e)) | ||
626 | else: | ||
627 | try: | ||
628 | # 5. upload to EDMS | ||
629 | for times in range(consts.RETRY_TIMES): | ||
630 | try: | ||
631 | self.edms.upload(excel_path, doc, business_type) | ||
632 | except Exception as e: | ||
633 | self.cronjob_log.warn( | ||
634 | '{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] ' | ||
635 | '[error={4}]'.format(self.log_base, times, business_type, doc.id, e)) | ||
636 | edms_exc = str(e) | ||
637 | else: | ||
638 | break | ||
639 | else: | ||
640 | raise EDMSException(edms_exc) | ||
641 | except Exception as e: | ||
642 | doc.status = DocStatus.UPLOAD_FAILED.value | ||
643 | doc.save() | ||
644 | end_time = time.time() | ||
645 | speed_time = int(end_time - start_time) | ||
646 | self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] ' | ||
647 | '[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id, | ||
648 | speed_time, e)) | ||
649 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | ||
650 | |||
651 | else: | ||
652 | doc.status = DocStatus.COMPLETE.value | ||
653 | doc.save() | ||
654 | end_time = time.time() | ||
655 | speed_time = int(end_time - start_time) | ||
656 | self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] ' | ||
657 | '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time)) | ||
658 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | ||
659 | |||
660 | self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) |
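One idiom worth calling out in the file above: both the EDMS download (pdf_download) and the upload step in handle use Python's for/else retry pattern, where the else clause of a for loop runs only if no break occurred, that is, only when every attempt failed. A standalone sketch with illustrative names:

    last_err = None
    for attempt in range(3):            # stands in for consts.RETRY_TIMES
        try:
            do_request()                # hypothetical fallible call
        except Exception as e:
            last_err = str(e)
        else:
            break                       # success skips the loop's else clause
    else:
        raise RuntimeError(last_err)    # reached only if every attempt raised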
1 | import re | ||
2 | import os | ||
3 | import ast | ||
4 | import datetime | ||
5 | from openpyxl import Workbook | ||
6 | from django.core.management import BaseCommand | ||
7 | from settings import conf | ||
8 | from common.mixins import LoggerMixin | ||
9 | from apps.doc.models import HILDoc, AFCDoc | ||
10 | from apps.doc import consts | ||
11 | |||
12 | |||
13 | class Command(BaseCommand, LoggerMixin): | ||
14 | |||
15 | def __init__(self): | ||
16 | super().__init__() | ||
17 | self.sheet_name = '身份证' | ||
18 | self.header = ('申请号', '身份证号', '民族', '时间戳') | ||
19 | |||
20 | def add_arguments(self, parser): | ||
21 | parser.add_argument( | ||
22 | '--date', | ||
23 | default=datetime.date.today() - datetime.timedelta(days=1), | ||
24 | dest='date', | ||
25 | help='Date to process, format: 2018-01-01' | ||
26 | ) | ||
27 | |||
28 | def handle(self, *args, **kwargs): | ||
29 | date = kwargs.get('date') | ||
30 | if isinstance(date, str): | ||
31 | if not re.match(r'\d{4}-\d{2}-\d{2}', date): | ||
32 | print('date format error') | ||
33 | return | ||
34 | date_str = date | ||
35 | else: | ||
36 | date_str = date.strftime('%Y-%m-%d') | ||
37 | |||
38 | afc_excel_dir = os.path.join(conf.DATA_DIR, 'AFC', 'IdCard') | ||
39 | hil_excel_dir = os.path.join(conf.DATA_DIR, 'HIL', 'IdCard') | ||
40 | if not os.path.exists(afc_excel_dir) or not os.path.exists(hil_excel_dir): | ||
41 | print('excel_dir not exist') | ||
42 | return | ||
43 | |||
44 | log_path = os.path.join(conf.LOG_DIR, 'idcard.log.{0}'.format(date_str)) | ||
45 | if not os.path.exists(log_path): | ||
46 | print('log_path not exists') | ||
47 | return | ||
48 | |||
49 | wb_afc = Workbook() | ||
50 | ws_afc = wb_afc.create_sheet(self.sheet_name) | ||
51 | ws_afc.append(self.header) | ||
52 | wb_afc.remove(wb_afc.get_sheet_by_name('Sheet')) | ||
53 | |||
54 | wb_hil = Workbook() | ||
55 | ws_hil = wb_hil.create_sheet(self.sheet_name) | ||
56 | ws_hil.append(self.header) | ||
57 | wb_hil.remove(wb_hil.get_sheet_by_name('Sheet')) | ||
58 | |||
59 | with open(log_path, 'r', encoding='utf-8') as fp: | ||
60 | for line in fp: | ||
61 | search_obj = re.match(r'\[(.*)] \[task=(.*)] \[idcard=(.*)]', line) | ||
62 | idcard_str = search_obj.group(3) | ||
63 | idcard_list = ast.literal_eval(idcard_str) | ||
64 | content_list = [] | ||
65 | for idcard_dict in idcard_list: | ||
66 | nation = idcard_dict.get('民族') | ||
67 | if nation is None: | ||
68 | continue | ||
69 | if idcard_dict.get('类别') == '1': # '1' = residence permit, skip | ||
70 | continue | ||
71 | content_list.append((idcard_dict.get('公民身份号码'), nation)) | ||
72 | if len(content_list) == 0: | ||
73 | continue | ||
74 | |||
75 | time_str = search_obj.group(1) | ||
76 | task_str = search_obj.group(2) | ||
77 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | ||
78 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc | ||
79 | application_id = doc_class.objects.filter(id=int(doc_id_str)).values_list('application_id', flat=True) | ||
80 | |||
81 | if business_type == consts.HIL_PREFIX: | ||
82 | for id_num, nation in content_list: | ||
83 | ws_hil.append((application_id[0], id_num, nation, time_str)) | ||
84 | else: | ||
85 | for id_num, nation in content_list: | ||
86 | ws_afc.append((application_id[0], id_num, nation, time_str)) | ||
87 | |||
88 | afc_excel_path = os.path.join(afc_excel_dir, 'idcard_{0}.xlsx'.format(date_str)) | ||
89 | hil_excel_path = os.path.join(hil_excel_dir, 'idcard_{0}.xlsx'.format(date_str)) | ||
90 | wb_afc.save(afc_excel_path) | ||
91 | wb_hil.save(hil_excel_path) |
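This command assumes every line of idcard.log has the exact shape produced by the idcard logger and SimpleFormatter added below in logging.cfg. A sketch of the parse on a hypothetical line:

    import ast
    import re

    line = "[2020-05-20 12:00:00,000] [task=AFC_123] [idcard=[{'公民身份号码': '11010119900101001X', '民族': '汉', '类别': '0'}]]"
    m = re.match(r'\[(.*)] \[task=(.*)] \[idcard=(.*)]', line)
    time_str, task_str = m.group(1), m.group(2)
    idcard_list = ast.literal_eval(m.group(3))   # list of dicts, as stored by license1_process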
1 | import os | ||
2 | import datetime | ||
3 | from calendar import monthrange | ||
4 | from openpyxl import Workbook, load_workbook | ||
5 | from django.core.management import BaseCommand | ||
6 | from settings import conf | ||
7 | from common.mixins import LoggerMixin | ||
8 | |||
9 | |||
10 | class Command(BaseCommand, LoggerMixin): | ||
11 | |||
12 | def __init__(self): | ||
13 | super().__init__() | ||
14 | self.dirs = ('AFC', 'HIL') | ||
15 | |||
16 | def handle(self, *args, **kwargs): | ||
17 | now_time = datetime.datetime.now() | ||
18 | first_day_of_month = now_time.replace(day=1) | ||
19 | pre_month = first_day_of_month - datetime.timedelta(days=1) | ||
20 | |||
21 | for target_dir in self.dirs: | ||
22 | excel_dir = os.path.join(conf.DATA_DIR, target_dir, 'IdCard') | ||
23 | if not os.path.exists(excel_dir): | ||
24 | print('excel dir not exists: {0}'.format(excel_dir)) | ||
25 | return | ||
26 | |||
27 | monthly_wb = Workbook() | ||
28 | |||
29 | for d in range(1, monthrange(pre_month.year, pre_month.month)[1] + 1): | ||
30 | date_str = '{:04d}-{:02d}-{:02d}'.format(pre_month.year, pre_month.month, d) | ||
31 | daily_excel_path = os.path.join(excel_dir, 'idcard_{0}.xlsx'.format(date_str)) | ||
32 | if not os.path.exists(daily_excel_path): | ||
33 | print('daily excel path not exists: {0}'.format(daily_excel_path)) | ||
34 | continue | ||
35 | |||
36 | monthly_ws = monthly_wb.create_sheet(date_str) | ||
37 | daily_wb = load_workbook(daily_excel_path) | ||
38 | daily_ws = daily_wb.get_sheet_by_name('身份证') | ||
39 | for row in daily_ws.iter_rows(min_row=1, values_only=True): | ||
40 | monthly_ws.append(row) | ||
41 | |||
42 | monthly_excel_path = os.path.join(excel_dir, 'idcard_{0}.xlsx'.format(pre_month.strftime('%Y-%m'))) | ||
43 | monthly_wb.remove(monthly_wb.get_sheet_by_name('Sheet')) | ||
44 | monthly_wb.save(monthly_excel_path) |
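For reference, calendar.monthrange(year, month) returns a pair (weekday of the first day, number of days in that month), so monthrange(...)[1] + 1 lets range() cover every day of the previous month:

    from calendar import monthrange

    monthrange(2020, 4)   # (2, 30): 2020-04-01 was a Wednesday, and April has 30 days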
... | @@ -14,7 +14,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -14,7 +14,6 @@ class Command(BaseCommand, LoggerMixin): |
14 | 14 | ||
15 | def __init__(self): | 15 | def __init__(self): |
16 | super().__init__() | 16 | super().__init__() |
17 | self.log_base = '[license statistics]' | ||
18 | self.header_map = { | 17 | self.header_map = { |
19 | consts.MVI_CLASSIFY: [('申请ID', '发票代码', '发票号码', '开票日期', '不含税价', '发票联', '购买方名称', | 18 | consts.MVI_CLASSIFY: [('申请ID', '发票代码', '发票号码', '开票日期', '不含税价', '发票联', '购买方名称', |
20 | '购买方证件号码', '纳税人识别号', '车架号', '价税合计小写', '销货单位名称', '增值税税额', | 19 | '购买方证件号码', '纳税人识别号', '车架号', '价税合计小写', '销货单位名称', '增值税税额', |
... | @@ -75,7 +74,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -75,7 +74,8 @@ class Command(BaseCommand, LoggerMixin): |
75 | print('excel dir not exists') | 74 | print('excel dir not exists') |
76 | return | 75 | return |
77 | excel_path = os.path.join(excel_dir, 'license_{0}.xlsx'.format(date_str)) | 76 | excel_path = os.path.join(excel_dir, 'license_{0}.xlsx'.format(date_str)) |
78 | log_path = os.path.join(conf.LOG_DIR, 'license.log.{0}'.format(date_str)) | 77 | # log_path = os.path.join(conf.LOG_DIR, 'license.log.{0}'.format(date_str)) |
78 | log_path = os.path.join(conf.LOG_DIR, 'license_statistics.log.{0}'.format(date_str)) | ||
79 | if not os.path.exists(log_path): | 79 | if not os.path.exists(log_path): |
80 | print('log_path not exists') | 80 | print('log_path not exists') |
81 | return | 81 | return |
... | @@ -92,7 +92,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -92,7 +92,8 @@ class Command(BaseCommand, LoggerMixin): |
92 | 92 | ||
93 | with open(log_path, 'r', encoding='utf-8') as fp: | 93 | with open(log_path, 'r', encoding='utf-8') as fp: |
94 | for line in fp: | 94 | for line in fp: |
95 | search_obj = re.search(r'task=(.*) license_summary=(.*)', line) | 95 | # search_obj = re.search(r'task=(.*) license_summary=(.*)', line) |
96 | search_obj = re.search(r'\[task=(.*)] \[license_summary=(.*)]', line) | ||
96 | task_str = search_obj.group(1) | 97 | task_str = search_obj.group(1) |
97 | license_summary = ast.literal_eval(search_obj.group(2)) | 98 | license_summary = ast.literal_eval(search_obj.group(2)) |
98 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | 99 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | ... | ... |
... | @@ -689,8 +689,15 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -689,8 +689,15 @@ class Command(BaseCommand, LoggerMixin): |
689 | '[license_summary={4}]'.format(self.log_base, task_str, bs_summary, | 689 | '[license_summary={4}]'.format(self.log_base, task_str, bs_summary, |
690 | unknown_summary, license_summary)) | 690 | unknown_summary, license_summary)) |
691 | 691 | ||
692 | self.license_log.info('[task={0}] [license_summary={1}]'.format(task_str, license_summary)) | ||
693 | idcard_list = license_summary.get(consts.IC_CLASSIFY) | ||
694 | if idcard_list: | ||
695 | self.idcard_log.info('[task={0}] [idcard={1}]'.format(task_str, idcard_list)) | ||
696 | |||
692 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) | 697 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) |
693 | 698 | ||
699 | self.bs_log.info('[task={0}] [bs_summary={1}]'.format(task_str, merged_bs_summary)) | ||
700 | |||
694 | self.cronjob_log.info('{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] ' | 701 | self.cronjob_log.info('{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] ' |
695 | '[res_list={4}]'.format(self.log_base, task_str, merged_bs_summary, | 702 | '[res_list={4}]'.format(self.log_base, task_str, merged_bs_summary, |
696 | unknown_summary, res_list)) | 703 | unknown_summary, res_list)) | ... | ... |
... | @@ -40,6 +40,9 @@ class LoggerMixin: | ... | @@ -40,6 +40,9 @@ class LoggerMixin: |
40 | exception_log = logging.getLogger('exception') | 40 | exception_log = logging.getLogger('exception') |
41 | cronjob_log = logging.getLogger('cronjob') | 41 | cronjob_log = logging.getLogger('cronjob') |
42 | folder_log = logging.getLogger('folder') | 42 | folder_log = logging.getLogger('folder') |
43 | bs_log = logging.getLogger('bs') | ||
44 | license_log = logging.getLogger('license') | ||
45 | idcard_log = logging.getLogger('idcard') | ||
43 | 46 | ||
44 | 47 | ||
45 | class GenericView(LoggerMixin, GenericExceptionMixin, GenericAPIView): | 48 | class GenericView(LoggerMixin, GenericExceptionMixin, GenericAPIView): | ... | ... |
... | @@ -84,9 +84,15 @@ class PDFHandler: | ... | @@ -84,9 +84,15 @@ class PDFHandler: |
84 | def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0): | 84 | def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0): |
85 | pix = self.recover_pix(pdf, xref, smask, colorspace) | 85 | pix = self.recover_pix(pdf, xref, smask, colorspace) |
86 | ext, img_data = self.get_img_data(pix) | 86 | ext, img_data = self.get_img_data(pix) |
87 | img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext) | 87 | if ext == 'jpx': |
88 | with open(img_save_path, "wb") as f: | 88 | img_save_path = self.get_img_save_path(pno, img_index=img_index, ext='jpeg') |
89 | f.write(img_data) | 89 | jpx_pix = fitz.Pixmap(img_data) |
90 | jpx_pix.writeImage(img_save_path) | ||
91 | jpx_pix = None | ||
92 | else: | ||
93 | img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext) | ||
94 | with open(img_save_path, "wb") as f: | ||
95 | f.write(img_data) | ||
90 | self.xref_set.add(xref) | 96 | self.xref_set.add(xref) |
91 | self.img_path_list.append(img_save_path) | 97 | self.img_path_list.append(img_save_path) |
92 | 98 | ... | ... |
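The new branch above re-encodes JPEG2000 ('jpx') image streams as JPEG by round-tripping through a PyMuPDF Pixmap, presumably because a downstream consumer cannot handle JPEG2000. Outside the extraction flow, the same conversion looks roughly like this (file names are hypothetical; Pixmap(img_data) and writeImage are the calls the hunk itself uses):

    import fitz  # PyMuPDF

    with open('page_0_img_0.jpx', 'rb') as f:   # hypothetical jpx stream dumped from a PDF
        jpx_pix = fitz.Pixmap(f.read())         # decode, as in the hunk above
    jpx_pix.writeImage('page_0_img_0.jpeg')     # re-encode; output format chosen by extension
    jpx_pix = None                              # drop the reference, mirroring the hunk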
1 | [loggers] | 1 | [loggers] |
2 | keys=root, running, exception, cronjob, folder, django.db.backends | 2 | keys=root, running, exception, cronjob, folder, bs, license, idcard, django.db.backends |
3 | 3 | ||
4 | [handlers] | 4 | [handlers] |
5 | keys=consoleHandler, django_rotateFileHandler, exceptionFileHandler, cronjobFileHandler, folderFileHandler, djangodbFileHandler | 5 | keys=consoleHandler, django_rotateFileHandler, exceptionFileHandler, cronjobFileHandler, folderFileHandler, bsFileHandler, licenseFileHandler, idcardFileHandler, djangodbFileHandler |
6 | 6 | ||
7 | [formatters] | 7 | [formatters] |
8 | keys=SituFormatter, dataLogFormatter | 8 | keys=SituFormatter, dataLogFormatter, SimpleFormatter |
9 | 9 | ||
10 | [formatter_SituFormatter] | 10 | [formatter_SituFormatter] |
11 | format=[%(asctime)s] [%(process)d] [%(thread)d] [%(threadName)s] [%(filename)s:%(lineno)d] %(levelname)s %(message)s | 11 | format=[%(asctime)s] [%(process)d] [%(thread)d] [%(threadName)s] [%(filename)s:%(lineno)d] %(levelname)s %(message)s |
... | @@ -15,6 +15,10 @@ datefmt= | ... | @@ -15,6 +15,10 @@ datefmt= |
15 | class=situlogger.JsonFormatter | 15 | class=situlogger.JsonFormatter |
16 | format=%(asctime)s %(levelname)s %(funcName)s | 16 | format=%(asctime)s %(levelname)s %(funcName)s |
17 | 17 | ||
18 | [formatter_SimpleFormatter] | ||
19 | format=[%(asctime)s] %(message)s | ||
20 | datefmt= | ||
21 | |||
18 | [handler_consoleHandler] | 22 | [handler_consoleHandler] |
19 | class=StreamHandler | 23 | class=StreamHandler |
20 | level=ERROR | 24 | level=ERROR |
... | @@ -45,6 +49,24 @@ level=DEBUG | ... | @@ -45,6 +49,24 @@ level=DEBUG |
45 | formatter=SituFormatter | 49 | formatter=SituFormatter |
46 | args=('../logs/folder_ocr.log',) | 50 | args=('../logs/folder_ocr.log',) |
47 | 51 | ||
52 | [handler_bsFileHandler] | ||
53 | class=situlogger.SituRotatingFileHandler | ||
54 | level=DEBUG | ||
55 | formatter=SimpleFormatter | ||
56 | args=('../logs/bs_statistics.log',) | ||
57 | |||
58 | [handler_licenseFileHandler] | ||
59 | class=situlogger.SituRotatingFileHandler | ||
60 | level=DEBUG | ||
61 | formatter=SimpleFormatter | ||
62 | args=('../logs/license_statistics.log',) | ||
63 | |||
64 | [handler_idcardFileHandler] | ||
65 | class=situlogger.SituRotatingFileHandler | ||
66 | level=DEBUG | ||
67 | formatter=SimpleFormatter | ||
68 | args=('../logs/idcard.log',) | ||
69 | |||
48 | [handler_djangodbFileHandler] | 70 | [handler_djangodbFileHandler] |
49 | class=situlogger.SituRotatingFileHandler | 71 | class=situlogger.SituRotatingFileHandler |
50 | level=DEBUG | 72 | level=DEBUG |
... | @@ -79,6 +101,24 @@ handlers=folderFileHandler | ... | @@ -79,6 +101,24 @@ handlers=folderFileHandler |
79 | qualname=folder | 101 | qualname=folder |
80 | propagate=0 | 102 | propagate=0 |
81 | 103 | ||
104 | [logger_bs] | ||
105 | level=INFO | ||
106 | handlers=bsFileHandler | ||
107 | qualname=bs | ||
108 | propagate=0 | ||
109 | |||
110 | [logger_license] | ||
111 | level=INFO | ||
112 | handlers=licenseFileHandler | ||
113 | qualname=license | ||
114 | propagate=0 | ||
115 | |||
116 | [logger_idcard] | ||
117 | level=INFO | ||
118 | handlers=idcardFileHandler | ||
119 | qualname=idcard | ||
120 | propagate=0 | ||
121 | |||
82 | [logger_django.db.backends] | 122 | [logger_django.db.backends] |
83 | level=DEBUG | 123 | level=DEBUG |
84 | handlers=djangodbFileHandler | 124 | handlers=djangodbFileHandler | ... | ... |
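Taken together: each new logger writes through SimpleFormatter, so every line of the three statistics logs is just '[asctime] message', which is the shape the updated regexes in bs_statistics, license_statistics, and idcard_daily expect. A sketch of the round trip, with an illustrative config path and task id:

    import logging
    import logging.config

    logging.config.fileConfig('logging.cfg')
    logging.getLogger('bs').info('[task=AFC_123] [bs_summary={}]')
    # appends to ../logs/bs_statistics.log a line like:
    # [2020-05-20 12:00:00,000] [task=AFC_123] [bs_summary={}]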