周伟奇 / bmw-ocr
Commit 97994674 authored 2020-07-22 17:14:09 +0800 by 周伟奇
ocr excel upload eDMS
1 parent 7aa0284c
Showing 8 changed files with 299 additions and 142 deletions
requirements/base.txt
src/apps/doc/consts.py
src/apps/doc/edms.py
src/apps/doc/management/commands/doc_process.py
src/apps/doc/management/commands/pdf_to_img.py
src/apps/doc/mixins.py
src/apps/doc/named_enum.py
src/apps/doc/views.py
requirements/base.txt
@@ -12,13 +12,16 @@ Django==2.1
 django-oauth-toolkit==1.3.2
 djangorestframework==3.9.0
 djangorestframework-jwt==1.11.0
+et-xmlfile==1.0.1
 idna==2.9
 idna-ssl==1.1.0
 isodate==0.6.0
+jdcal==1.4.1
 lxml==4.5.1
 marshmallow==3.6.1
 multidict==4.7.6
 oauthlib==3.1.0
+openpyxl==3.0.4
 pdfminer3k==1.3.4
 Pillow==7.1.2
 ply==3.11
...
src/apps/doc/consts.py
 PAGE_DEFAULT = 1
 PAGE_SIZE_DEFAULT = 10
-DOC_SCHEME_LIST = ['Acceptance', 'Settlement', 'Contract Management']
-DATA_SOURCE_LIST = ['POS', 'EAPP', 'Econtract']
-BUSINESS_TYPE_LIST = ['HIL', 'AFC']
-HIL_SET = {'HIL', 'HIl', 'HiL', 'Hil', 'hIL', 'hIl', 'hiL', 'hil', 'CO00002'}
 FIXED_APPLICATION_ID = '手工单'
+DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT']
+DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT']
+HIL_PREFIX = 'HIL'
+AFC_PREFIX = 'AFC'
+SPLIT_STR = '_'
+BUSINESS_TYPE_LIST = [HIL_PREFIX, AFC_PREFIX]
+HIL_SET = {'HIL', 'HIl', 'HiL', 'Hil', 'hIL', 'hIl', 'hiL', 'hil', 'CO00002'}
 SESSION_PREFIX = 'FHLSID'
 CUSTOM_CLIENT = 'CustomClient'
...
@@ -15,12 +19,22 @@ FIXED_FILE_SIZE = 0
 DOWNLOAD_ACTION_TYPE = 'Downloaded'
 DOC_SCHEMA_ID_FILL = {
-    'Acceptance': (1, 'DFE-AutoFilingScript'),
-    'Settlement': (20, 'DFE-AutoFilingScript'),
-    'Contract Management': (86, 'Schema-Based')
+    'ACCEPTANCE': (1, 'DFE-AutoFilingScript'),
+    'SETTLEMENT': (20, 'DFE-AutoFilingScript'),
+    'CONTRACT MANAGEMENT': (86, 'Schema-Based')
 }
+BUSINESS_TYPE_DICT = {
+    HIL_PREFIX: 'CO00002',
+    AFC_PREFIX: 'CO00001'
+}
 DOC_SCHEMA_TYPE = 'ElectronicRecord'
 APPLICATION_ID_META_FIELD_id = 1
 DEALER_CODE_META_FIELD_id = 13
 BUSINESS_TYPE_META_FIELD_id = 93
 DEALER_CODE = 'ocr_situ_group'
+AMOUNT_COL_TITLE_SET = {"交易金额", "金额", "收入/支出金额", "发生额"}
+OVERAGE_COL_TITLE_SET = {"账户余额", "余额"}
+PROOF_COL_TITLE = '核对结果'
+PROOF_RES = ('对', '错')
+META_SHEET_TITLE = '关键信息提取和展示'
...
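The new HIL_PREFIX/AFC_PREFIX and SPLIT_STR constants centralize the queue task-string convention that views.py and doc_process.py now share. A minimal sketch of that round trip, with the constants inlined from above and an invented doc id:

# Sketch of the task-string convention shared by views.py and doc_process.py;
# the prefix and separator mirror consts.py, the doc id (42) is made up.
HIL_PREFIX = 'HIL'
SPLIT_STR = '_'

def build_task_str(prefix, doc_id):
    # views.py enqueues '{prefix}{SPLIT_STR}{doc_id}', e.g. 'HIL_42'
    return '{0}{1}{2}'.format(prefix, SPLIT_STR, doc_id)

def parse_task_str(task_str):
    # doc_process.py splits on consts.SPLIT_STR to recover both parts
    business_type, doc_id_str = task_str.split(SPLIT_STR)
    return business_type, int(doc_id_str)

assert parse_task_str(build_task_str(HIL_PREFIX, 42)) == ('HIL', 42)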
src/apps/doc/edms.py
+import os
 import requests
 from zeep import Client, xsd
 from settings import conf
...
@@ -65,9 +66,9 @@ class EDMS:
         params = {'token': token}
         self.download_handler(params, headers, save_path)

-    def create_upload_token(self, headers, file_size):
+    def create_upload_token(self, headers):
         with self.rc_client.settings(extra_http_headers=headers):
-            token = self.rc_client.service.CreateUploadToken(fileSize=file_size)
+            token = self.rc_client.service.CreateUploadToken(fileSize=consts.FIXED_FILE_SIZE)
         return token

     def upload_handler(self, file_path, params, headers):
...
@@ -80,11 +81,19 @@ class EDMS:
         else:
             raise Exception

-    def get_doc_info(self, token, doc_info):
-        doc_schema_id, auto_filing = consts.DOC_SCHEMA_ID_FILL.get(doc_info.get('document_scheme'))
-        application_id = doc_info.get('application_id')
-        doc_file_name = doc_info.get('doc_file_name')
-        business_type = doc_info.get('business_type')
+    @staticmethod
+    def get_doc_file_name(doc_name):
+        if doc_name.endswith('pdf'):
+            name, _ = os.path.splitext(doc_name)
+            return name
+        return doc_name
+
+    def get_doc_info(self, token, doc, business_type, file_path):
+        business_type = consts.BUSINESS_TYPE_DICT.get(business_type)
+        doc_schema_id, auto_filing = consts.DOC_SCHEMA_ID_FILL.get(doc.document_scheme)
+        application_id = doc.application_id
+        doc_file_name = self.get_doc_file_name(doc.document_name)
+        origin_file_name = os.path.basename(file_path)
         fields_with_value = [
             {'FieldId': consts.APPLICATION_ID_META_FIELD_id,
              'FieldValue': xsd.AnyObject(xsd.String(), application_id)},
...
@@ -99,20 +108,20 @@ class EDMS:
             'DocumentName': doc_file_name,
             'FieldsWithValues': fields_with_values,
             'UploadToken': token,
-            'OriginalFileName': doc_file_name,
+            'OriginalFileName': origin_file_name,
             'SendEmailToMembers': False,
             'AutoFilingScriptToUse': auto_filing,
             'DocumentSchemaType': consts.DOC_SCHEMA_TYPE,
         }
         return info

-    def add_doc_info(self, headers, token, doc_info):
-        info = self.get_doc_info(token, doc_info)
+    def add_doc_info(self, headers, token, doc, business_type, file_path):
+        info = self.get_doc_info(token, doc, business_type, file_path)
         with self.dm_client.settings(extra_http_headers=headers):
             metadata_version_id = self.dm_client.service.AddDocumentInfo(info=info)
         return metadata_version_id

-    def upload(self, file_path, file_size, doc_info):
+    def upload(self, file_path, doc, business_type):
         # file_path = '/Users/clay/Postman/files/OCRuploadTest4.txt'
         # file_size = 16
         # doc_info = {
...
@@ -122,12 +131,12 @@ class EDMS:
         #     'business_type': 'CO00001',
         # }
         headers = self.get_headers()
-        token = self.create_upload_token(headers, file_size)
+        token = self.create_upload_token(headers)
         headers.update({'Content-Type': 'application/octet-stream'})
         params = {'token': token}
         self.upload_handler(file_path, params, headers)
         headers.pop('Content-Type')
-        metadata_version_id = self.add_doc_info(headers, token, doc_info)
+        metadata_version_id = self.add_doc_info(headers, token, doc, business_type, file_path)
         return metadata_version_id
...
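The upload metadata now distinguishes the display name from the physical file: DocumentName strips a trailing pdf extension from the stored document name, while OriginalFileName comes from the file actually being uploaded. A standalone sketch of those two pieces (sample names and path are invented):

import os

def get_doc_file_name(doc_name):
    # mirrors EDMS.get_doc_file_name: drop a trailing pdf extension
    if doc_name.endswith('pdf'):
        name, _ = os.path.splitext(doc_name)
        return name
    return doc_name

# DocumentName comes from the doc record, OriginalFileName from the upload path
assert get_doc_file_name('statement.pdf') == 'statement'
assert get_doc_file_name('statement.xls') == 'statement.xls'
assert os.path.basename('/tmp/42/42.xls') == '42.xls'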
src/apps/doc/management/commands/doc_process.py
 import os
 import time
 import fitz
-import xlwt
 import signal
 import base64
 import asyncio
 import aiohttp
+import locale
 from PIL import Image
 from io import BytesIO
 from zeep import Client
+from openpyxl import Workbook
+from openpyxl.styles import numbers
+from openpyxl.utils import get_column_letter
 from django.core.management import BaseCommand
 from common.mixins import LoggerMixin
...
@@ -23,7 +25,7 @@ class Command(BaseCommand, LoggerMixin):
     def __init__(self):
         super().__init__()
-        self.log_base = '[doc process]'
+        self.log_base = '[doc ocr process]'
         # file-processing switch
         self.switch = True
         # data directory
...
@@ -50,46 +52,54 @@ class Command(BaseCommand, LoggerMixin):
         task_str, is_priority = rh.dequeue()
         if task_str is None:
             self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
-            return None, None, None, None
+            return None, None

-        business_type, doc_id_str = task_str.split('_')
+        business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
         doc_id = int(doc_id_str)
         doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
-        doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values(
-            'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first()
-        if doc_info is None:
-            self.cronjob_log.warn('{0} [get_doc_info] [doc completed] [task_str={1}] [is_priority={2}]'.format(
-                self.log_base, task_str, is_priority))
-            return None, None, None, None
-        doc_class.objects.filter(id=doc_id).update(status=DocStatus.PROCESSING.value)
-        self.cronjob_log.info('{0} [get_doc_info] [task_str={1}] [is_priority={2}] [doc_info={3}]'.format(
-            self.log_base, task_str, is_priority, doc_info))
-        return doc_info, doc_class, doc_id, business_type
+        # doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values(
+        #     'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first()
+        doc = doc_class.objects.filter(id=doc_id).first()
+        if doc is None:
+            self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
+                self.log_base, task_str, is_priority))
+            return None, None
+        elif doc.status != DocStatus.INIT.value:
+            self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
+                                  '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status))
+            return None, None
+        doc.status = DocStatus.PROCESSING.value
+        doc.save()
+        self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format(
+            self.log_base, task_str, is_priority))
+        return doc, business_type

-    def pdf_download(self, doc_id, doc_info, business_type):
-        if doc_info is None:
-            return None, None
+    def pdf_download(self, doc, business_type):
+        if doc is None:
+            return None, None, None
         # TODO download the pdf from EDMS
-        doc_data_path = os.path.join(self.data_dir, business_type, str(doc_id))
-        pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc_id))
-        excel_path = os.path.join(doc_data_path, '{0}.xls'.format(doc_id))
-        self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_info={2}] [pdf_path={3}]'.format(
-            self.log_base, business_type, doc_info, pdf_path))
+        doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
+        pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
+        if doc.application_id != consts.FIXED_APPLICATION_ID:
+            self.edms.download(pdf_path, doc.metadata_version_id)
+        excel_path = os.path.join(doc_data_path, '{0}.xls'.format(doc.id))
+        self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
+            self.log_base, business_type, doc.id, pdf_path))
         return doc_data_path, excel_path, pdf_path

     @staticmethod
     def append_sheet(wb, sheets_list, img_name):
         for i, sheet in enumerate(sheets_list):
-            ws = wb.add_sheet('{0}_{1}'.format(img_name, i))
+            ws = wb.create_sheet('{0}_{1}'.format(img_name, i))
             cells = sheet.get('cells')
             for cell in cells:
                 c1 = cell.get('start_column')
-                c2 = cell.get('end_column')
+                # c2 = cell.get('end_column')
                 r1 = cell.get('start_row')
-                r2 = cell.get('end_row')
+                # r2 = cell.get('end_row')
                 label = cell.get('words')
-                ws.write_merge(r1, r2, c1, c2, label=label)
+                ws.cell(row=r1 + 1, column=c1 + 1, value=label)

     @staticmethod
     def get_ocr_json(img_path):
...
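append_sheet moves from xlwt (wb.add_sheet / ws.write_merge, 0-based) to openpyxl (wb.create_sheet / ws.cell, 1-based), which is why the new code adds 1 to the OCR row and column indices and drops the merged write. A minimal sketch under those assumptions; the cell dicts imitate the OCR payload shape used above, with invented values:

from openpyxl import Workbook

# cells as the OCR service appears to return them: 0-based coordinates
ocr_cells = [
    {'start_row': 0, 'start_column': 0, 'words': '交易金额'},
    {'start_row': 1, 'start_column': 0, 'words': '100.00'},
]

wb = Workbook()
ws = wb.create_sheet('demo_img_0')  # openpyxl: create_sheet, not xlwt's add_sheet
for cell in ocr_cells:
    # openpyxl is 1-based, hence the +1 on the 0-based OCR indices
    ws.cell(row=cell['start_row'] + 1, column=cell['start_column'] + 1,
            value=cell['words'])
wb.save('demo.xlsx')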
@@ -112,6 +122,46 @@ class Command(BaseCommand, LoggerMixin):
         img_name = os.path.basename(img_path)
         self.append_sheet(wb, sheets_list, img_name)

+    def proof(self, ws):
+        # locate the amount and balance columns
+        amount_col = overage_col = None
+        for i in ws[1]:
+            if i.value in consts.AMOUNT_COL_TITLE_SET:
+                amount_col = i.column
+                amount_col_letter = get_column_letter(amount_col)
+            elif i.value in consts.OVERAGE_COL_TITLE_SET:
+                overage_col = i.column
+                overage_col_letter = get_column_letter(overage_col)
+        if amount_col is None or overage_col is None:
+            return
+        # convert text to numeric values
+        for col_tuple in ws.iter_cols(min_row=2, min_col=amount_col, max_col=overage_col):
+            for c in col_tuple:
+                try:
+                    c.value = locale.atof(c.value)
+                    c.number_format = numbers.FORMAT_NUMBER_00
+                except Exception:
+                    continue
+        # append the verification-result column
+        proof_col_letter = get_column_letter(ws.max_column + 1)
+        for c in ws[proof_col_letter]:
+            if c.row == 1:
+                c.value = consts.PROOF_COL_TITLE
+            elif c.row == 2:
+                continue
+            else:
+                c.value = '=IF({3}{0}=SUM({2}{0},{3}{1}), "{4}", "{5}")'.format(
+                    c.row, c.row - 1, amount_col_letter, overage_col_letter, *consts.PROOF_RES)
+
+    def wb_process(self, wb, excel_path):
+        locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8')
+        for ws in wb.worksheets:
+            if ws.title == 'Sheet':
+                ws.title = consts.META_SHEET_TITLE
+            else:
+                self.proof(ws)
+        wb.save(excel_path)
+
+    # TODO no sheet (res always [])
     @staticmethod
     def getimage(pix):
         if pix.colorspace.n != 4:
...
@@ -124,7 +174,7 @@ class Command(BaseCommand, LoggerMixin):
             s = item[1]  # xref of its /SMask
             is_rgb = True if item[5] == 'DeviceRGB' else False
-            # GRAY/RGB  # TODO handle differing colorspaces
+            # RGB
             if is_rgb:
                 if s == 0:
                     return doc.extractImage(x)
...
@@ -158,7 +208,7 @@ class Command(BaseCommand, LoggerMixin):
             pix1 = pix2 = None  # free temp pixmaps
-        pix = fitz.Pixmap(fitz.csRGB, pix)  # CMYK to RGB
+        pix = fitz.Pixmap(fitz.csRGB, pix)  # GRAY/CMYK to RGB
         return self.getimage(pix)

     @staticmethod
...
@@ -200,10 +250,11 @@ class Command(BaseCommand, LoggerMixin):
         while self.switch:
             # 1. fetch the doc info from the queue
-            doc_info, doc_class, doc_id, business_type = self.get_doc_info()
+            doc, business_type = self.get_doc_info()
             try:
                 # 2. fetch the PDF from EDMS
-                doc_data_path, excel_path, pdf_path = self.pdf_download(doc_id, doc_info, business_type)
+                doc_data_path, excel_path, pdf_path = self.pdf_download(doc, business_type)
                 # handling for an empty queue
                 if pdf_path is None:
...
@@ -212,7 +263,7 @@ class Command(BaseCommand, LoggerMixin):
                     continue
                 sleep_second = int(conf.SLEEP_SECOND)
             try:
                 # 3. extract images from the PDF
                 img_save_path = os.path.join(doc_data_path, 'img')
                 os.makedirs(img_save_path, exist_ok=True)
...
@@ -233,8 +284,8 @@ class Command(BaseCommand, LoggerMixin):
                         save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
                         pm.writePNG(save_path)
                         img_path_list.append(save_path)
-                        self.cronjob_log.info('{0} [page to img success] [doc_id={1}] [pdf_path={2}] '
-                                              '[page={3}]'.format(self.log_base, doc_id, pdf_path, page.number))
+                        self.cronjob_log.info('{0} [page to img success] [pdf_path={1}] [page={2}]'.format(
+                            self.log_base, pdf_path, page.number))
                     else:
                         # extract the images
                         for img_index, img_il in enumerate(img_il_list):
                             if len(img_il) == 1:
                                 # simplified handling when there is only one image
...
@@ -246,8 +297,8 @@ class Command(BaseCommand, LoggerMixin):
                                     f.write(img_data)
                                 img_path_list.append(save_path)
                                 self.cronjob_log.info(
-                                    '{0} [extract img success] [doc_id={1}] [pdf_path={2}] [page={3}] '
-                                    '[img_index={4}]'.format(self.log_base, doc_id, pdf_path, pno, img_index))
+                                    '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format(
+                                        self.log_base, pdf_path, pno, img_index))
                             else:
                                 # multiple images: stitch vertically
                                 height_sum = 0
                                 im_list = []
...
@@ -276,28 +327,41 @@ class Command(BaseCommand, LoggerMixin):
                                 res.save(save_path)
                                 img_path_list.append(save_path)
                                 self.cronjob_log.info(
-                                    '{0} [extract img success] [doc_id={1}] [pdf_path={2}] [page={3}] '
-                                    '[img_index={4}]'.format(self.log_base, doc_id, pdf_path, pno, img_index))
-                self.cronjob_log.info('{0} [pdf to img success] [doc_id={1}]'.format(self.log_base, doc_id))
-                write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc_id)))
+                                    '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format(
+                                        self.log_base, pdf_path, pno, img_index))
+                self.cronjob_log.info('{0} [pdf to img success] [business_type={1}] [doc_id={2}]'.format(
+                    self.log_base, business_type, doc.id))
+                write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
                 # 4. run the algorithm over the images: detect bank statements, OCR them into an excel file
-                wb = xlwt.Workbook()
+                wb = Workbook()
                 loop = asyncio.get_event_loop()
                 tasks = [self.img_ocr_excel(wb, img_path) for img_path in img_path_list]
                 loop.run_until_complete(asyncio.wait(tasks))
                 # loop.close()
-                wb.save(excel_path)
-                # consolidate the excel file
+                # TODO no sheet (res always [])
+                # consolidate the excel file
+                # self.wb_process(wb, excel_path)
+                wb.save(excel_path)
             except Exception as e:
-                doc_class.objects.filter(id=doc_id).update(status=DocStatus.PROCESS_FAILED.value)
-                self.cronjob_log.error('{0} [process failed] [doc_id={1}] [err={2}]'.format(
-                    self.log_base, doc_id, e))
+                doc.status = DocStatus.PROCESS_FAILED.value
+                doc.save()
+                self.cronjob_log.error('{0} [process failed] [business_type={1}] [doc_id={2}] [err={3}]'.format(
+                    self.log_base, business_type, doc.id, e))
             else:
-                doc_class.objects.filter(id=doc_id).update(status=DocStatus.COMPLETE.value)
-                self.cronjob_log.info('{0} [doc process complete] [doc_id={1}]'.format(self.log_base, doc_id))
+                try:
+                    # 5. upload to EDMS
+                    self.edms.upload(excel_path, doc, business_type)
+                except Exception as e:
+                    doc.status = DocStatus.UPLOAD_FAILED.value
+                    doc.save()
+                    self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [err={3}]'.format(
+                        self.log_base, business_type, doc.id, e))
+                else:
+                    doc.status = DocStatus.COMPLETE.value
+                    doc.save()
+                    self.cronjob_log.info('{0} [doc process complete] [business_type={1}] [doc_id={2}]'.format(
                        self.log_base, business_type, doc.id))

         self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
...
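The new proof() hook (added here but still commented out in handle) validates each statement row with an Excel formula: the balance in row n should equal that row's amount plus the previous row's balance. A sketch of the exact string the format call above produces, with invented column letters and row number:

# Reproduces the proof() formula string; column letters and row are examples.
PROOF_RES = ('对', '错')
amount_col_letter, overage_col_letter = 'B', 'C'  # e.g. 金额 in B, 余额 in C
row = 3

formula = '=IF({3}{0}=SUM({2}{0},{3}{1}), "{4}", "{5}")'.format(
    row, row - 1, amount_col_letter, overage_col_letter, *PROOF_RES)
# balance C3 must equal amount B3 plus the previous balance C2
assert formula == '=IF(C3=SUM(B3,C2), "对", "错")'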
src/apps/doc/management/commands/pdf_to_img.py
...
@@ -86,73 +86,143 @@ class Command(BaseCommand, LoggerMixin):
     @staticmethod
     def split_il(il):
-        img_il_list = []
+        small_img_il_list = []
+        big_img_il_list = []
         start = 0
+        index = 0
         length = len(il)
         for i in range(length):
+            if il[i][2] >= 700 and il[i][3] >= 647:
+                if start < i:
+                    small_img_il_list.append((il[start: i], index))
+                    index += 1
+                else:
+                    start += 1
+                big_img_il_list.append((il[i], index))
+                index += 1
+                continue
             if i == start:
                 if i == length - 1:
-                    img_il_list.append(il[start: length])
+                    small_img_il_list.append((il[start: length], index))
                 continue
             elif i == length - 1:
-                img_il_list.append(il[start: length])
+                if il[i][2] == il[i - 1][2]:
+                    small_img_il_list.append((il[start: length], index))
+                else:
+                    small_img_il_list.append((il[start: i], index))
+                    small_img_il_list.append((il[i: length], index + 1))
+                continue
             if il[i][2] != il[i - 1][2]:
-                img_il_list.append(il[start: i])
+                small_img_il_list.append((il[start: i], index))
+                index += 1
                 start = i
-            elif il[i][3] != il[i - 1][3]:
-                img_il_list.append(il[start: i + 1])
+            elif il[i][3] != il[i - 1][3] and il[i][2] < 1200:
+                small_img_il_list.append((il[start: i + 1], index))
+                index += 1
                 start = i + 1
-        return img_il_list
+        return small_img_il_list, big_img_il_list

     def handle(self, *args, **kwargs):
-        pdf_dir = '/Users/clay/Desktop/普通打印-部分无线/竖版-无表格-农业银行'
-        img_dir = '/Users/clay/Desktop/普通打印-部分无线_img/竖版-无表格-农业银行'
-        os.makedirs(img_dir, exist_ok=True)
+        pdf_dir = '/Users/clay/Desktop/问题PDF'
+        img_dir = '/Users/clay/Desktop/问题PDF'
         for d in os.listdir(pdf_dir):
-            # if d in ['.DS_Store', 'CH-B008486764.pdf', 'CH-B008003736.pdf', 'CH-B008487476.pdf', 'CH-B006763780.pdf',
-            #          'CH-B009000564.pdf', 'CH-B009020488.pdf']:
-            if d in ['.DS_Store', '1竖版-无表格-农业银行样例.PNG']:
+            # if d in ['.DS_Store', 'CH-B008003736.pdf', 'CH-B006317088.pdf', 'CH-B008487476.pdf', 'CH-B006337608.pdf',
+            #          'CH-B006391612.pdf', 'CH-B006536124.pdf', 'CH-B006526652.pdf', 'CH-B009003592.pdf']:
+            #     continue
+            # if d != 'CH-B006393152.PDF':
+            # if d != 'CH-B006526652.pdf':
+            if d != 'CH-B008487944.pdf':
                 continue
             pdf_path = os.path.join(pdf_dir, d)
             # pdf_path = '/Users/clay/Desktop/普通打印part2/工商银行(标准版)/CH-B006754676.pdf'
             if os.path.isfile(pdf_path):
-                img_save_path = os.path.join(img_dir, d)
-                if os.path.exists(img_save_path):
-                    continue
+                img_save_path = os.path.join(img_dir, d[:-4])
+                # if os.path.exists(img_save_path):
+                #     continue
                 os.makedirs(img_save_path, exist_ok=True)
                 with fitz.Document(pdf_path) as pdf:
                     self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format(
                         self.log_base, pdf_path, pdf.metadata))
                     # xref_list = []
+                    xref_set = set()
                     for pno in range(pdf.pageCount):
                         print('---------------------------------------')
                         il = pdf.getPageImageList(pno)  # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
                         print(il)
                         # for img_index, img in enumerate(il):
                         #     pix = self.recoverpix(pdf, img)
                         #     ext, img_data = self.get_img_data(pix)
                         #     save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
                         #         pno, img_index, ext))
                         #     with open(save_path, "wb") as f:
                         #         f.write(img_data)
                         if len(il) == 0:
                             page = pdf.loadPage(pno)
                             pm = page.getPixmap(matrix=self.trans, alpha=False)
                             save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
                             pm.writePNG(save_path)
+                        elif len(il) == 1:
+                            width = il[0][2]
+                            height = il[0][3]
+                            colorspace = il[0][5]
+                            adobe_filter = il[0][-1]
+                            if colorspace == '' or adobe_filter in ['', '']:
+                                continue
+                            # small image: render the page instead
+                            if width < 500 and height < 500:
+                                page = pdf.loadPage(pno)
+                                pm = page.getPixmap(matrix=self.trans, alpha=False)
+                                save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
+                                pm.writePNG(save_path)
+                            # large image: extract it directly
+                            elif il[0][0] not in xref_set:
+                                pix = self.recoverpix(pdf, il[0])
+                                ext, img_data = self.get_img_data(pix)
+                                save_path = os.path.join(img_save_path, 'page_{0}_img_0.{1}'.format(pno, ext))
+                                with open(save_path, "wb") as f:
+                                    f.write(img_data)
+                                xref_set.add(il[0][0])
                         else:
                             il.sort(key=lambda x: x[0])
-                            img_il_list = self.split_il(il)
+                            small_img_il_list, big_img_il_list = self.split_il(il)
                             del il
+                            print(small_img_il_list)
+                            print(big_img_il_list)
                             print('+++++++++++++++++++++++++++++++++++')
-                            print(img_il_list)
-                            if len(img_il_list) > 3:
+                            if len(small_img_il_list) > 2:
                                 # too many irregular small images on one page: render the page instead
                                 page = pdf.loadPage(pno)
                                 pm = page.getPixmap(matrix=self.trans, alpha=False)
                                 save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
                                 pm.writePNG(save_path)
                                 # img_path_list.append(save_path)
                                 # self.cronjob_log.info('{0} [page to img success] [doc_id={1}] [pdf_path={2}] '
                                 #                       '[page={3}]'.format(self.log_base, doc_id, pdf_path, page.number))
                             else:
-                                # extract the images
-                                for img_index, img_il in enumerate(img_il_list):
-                                    if len(img_il) == 1:
-                                        # simplified handling when there is only one image
+                                for img_il, img_index in big_img_il_list:
+                                    if img_il[0] in xref_set:
+                                        continue
+                                    pix = self.recoverpix(pdf, img_il)
+                                    ext, img_data = self.get_img_data(pix)
+                                    save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
+                                        pno, img_index, ext))
+                                    with open(save_path, "wb") as f:
+                                        f.write(img_data)
+                                    xref_set.add(img_il[0])
+                                for img_il, img_index in small_img_il_list:
+                                    # small image: render the page instead
+                                    if len(img_il) == 1 and img_il[0][2] < 500 and img_il[0][3] < 500:
+                                        page = pdf.loadPage(pno)
+                                        pm = page.getPixmap(matrix=self.trans, alpha=False)
+                                        save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number))
+                                        pm.writePNG(save_path)
+                                    elif len(img_il) == 1 and img_il[0][0] not in xref_set:
+                                        # only one image: extract it directly
                                         pix = self.recoverpix(pdf, img_il[0])
                                         ext, img_data = self.get_img_data(pix)
                                         save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format(
                                             pno, img_index, ext))
                                         with open(save_path, "wb") as f:
                                             f.write(img_data)
                                         # img_path_list.append(save_path)
                                         # self.cronjob_log.info(
                                         #     '{0} [extract img success] [doc_id={1}] [pdf_path={2}] [page={3}] '
                                         #     '[img_index={4}]'.format(self.log_base, doc_id, pdf_path, pno, img_index))
+                                        xref_set.add(img_il[0][0])
                                     else:
                                         # multiple images: stitch vertically
                                         height_sum = 0
                                         im_list = []
...
@@ -179,6 +249,3 @@ class Command(BaseCommand, LoggerMixin):
                             res.paste(m, box=(0, h_now))
                             h_now += h
                         res.save(save_path)
-        # else:
-        #     img_dir_path = os.path.join(img_dir, d)
-        #     os.makedirs(img_dir_path, exist_ok=True)
...
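split_il now partitions a page's sorted image list into "big" entries (width >= 700 and height >= 647, extracted one by one) and runs of "small" entries (stitched or rendered later), each tagged with an output index. A simplified, standalone sketch of that partitioning rule on fake (xref, smask, width, height) tuples; it keeps only the big/small threshold, not the width/height run-splitting heuristics:

# Fake PyMuPDF image-list entries: (xref, smask, width, height).
il = [(10, 0, 640, 320), (11, 0, 640, 320), (12, 0, 1240, 1750)]

def split_il_simplified(il):
    # big images are extracted individually; consecutive small ones form runs
    small_runs, big_imgs, run = [], [], []
    for item in il:
        if item[2] >= 700 and item[3] >= 647:
            if run:
                small_runs.append(run)
                run = []
            big_imgs.append(item)
        else:
            run.append(item)
    if run:
        small_runs.append(run)
    return small_runs, big_imgs

small, big = split_il_simplified(il)
assert big == [(12, 0, 1240, 1750)]
assert small == [[(10, 0, 640, 320), (11, 0, 640, 320)]]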
src/apps/doc/mixins.py
...
@@ -26,7 +26,21 @@ class DocHandler:
     @staticmethod
     def get_doc_class(business_type):
-        is_hil = business_type in consts.HIL_SET
-        doc_class, prefix = (HILDoc, consts.HIL_PREFIX) if is_hil else (AFCDoc, consts.AFC_PREFIX)
-        return doc_class, prefix
+        return (HILDoc, consts.HIL_PREFIX) if business_type in consts.HIL_SET else (AFCDoc, consts.AFC_PREFIX)
+
+    def fix_scheme(self, scheme):
+        if scheme in consts.DOC_SCHEME_LIST:
+            return scheme
+        elif scheme.upper() in consts.DOC_SCHEME_LIST:
+            return scheme.upper()
+        else:
+            return consts.DOC_SCHEME_LIST[0]
+
+    def fix_data_source(self, data_source):
+        if data_source in consts.DATA_SOURCE_LIST:
+            return data_source
+        elif data_source.upper() in consts.DATA_SOURCE_LIST:
+            return data_source.upper()
+        else:
+            return consts.DATA_SOURCE_LIST[0]
...
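fix_scheme and fix_data_source make the upload callback tolerant of casing: a value is kept if it is already whitelisted, upper-cased if that form is whitelisted, and otherwise replaced by the list's first entry. A standalone sketch with the list inlined from consts.py:

DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT']

def fix_scheme(scheme):
    # keep exact matches, accept any casing, fall back to the first entry
    if scheme in DOC_SCHEME_LIST:
        return scheme
    elif scheme.upper() in DOC_SCHEME_LIST:
        return scheme.upper()
    else:
        return DOC_SCHEME_LIST[0]

assert fix_scheme('Settlement') == 'SETTLEMENT'
assert fix_scheme('unknown') == 'ACCEPTANCE'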
src/apps/doc/named_enum.py
...
@@ -7,20 +7,3 @@ class DocStatus(NamedEnum):
     PROCESS_FAILED = (2, '识别失败')
     UPLOAD_FAILED = (3, '同步失败')
     COMPLETE = (4, '已完成')
-
-
-class DocScheme(NamedEnum):
-    ACCEPTANCE = (0, "Acceptance")
-    SETTLEMENT = (1, 'Settlement')
-    CONTRACT_MANAGEMENT = (2, 'Contract Management')
-
-
-class BusinessType(NamedEnum):
-    AFC = (0, "CO00001")
-    HIL = (1, 'CO00002')
-
-
-class DataSource(NamedEnum):
-    POS = (0, "POS")
-    EAPP = (1, 'EAPP')
-    ECONTRACT = (2, 'Econtract')
...
src/apps/doc/views.py
...
@@ -60,7 +60,7 @@ doc_list_args = {
     'status': fields.Int(required=False, validate=validate.OneOf(DocStatus.get_value_lst())),
     'application_id': fields.Str(required=False, validate=validate.Length(max=64)),
-    'data_source': fields.Str(required=False, validate=validate.Length(max=64)),
+    'data_source': fields.Str(required=False, validate=validate.OneOf(consts.DATA_SOURCE_LIST)),
     'business_type': fields.Str(required=True, validate=validate.OneOf(consts.BUSINESS_TYPE_LIST)),
     'upload_time_start': fields.Date(required=False),
     'upload_time_end': fields.Date(required=False),
...
@@ -100,6 +100,8 @@ class UploadDocView(GenericView, DocHandler):
         document = args.get('document')
         business_type = document.get('businessType')
         application_id = application_data.get('applicationId')
+        document_scheme = document.get('documentScheme')
+        data_source = document.get('dataSource')
         try:
             # 1. record the upload info
             record = UploadDocRecords.objects.create(
...
@@ -110,9 +112,9 @@ class UploadDocView(GenericView, DocHandler):
                 guarantor_1=applicant_data.get('guarantor1Name'),
                 guarantor_2=applicant_data.get('guarantor2Name'),
                 document_name=document.get('documentName'),
-                document_scheme=document.get('documentScheme'),
+                document_scheme=document_scheme,
                 business_type=business_type,
-                data_source=document.get('dataSource'),
+                data_source=data_source,
                 upload_finish_time=document.get('uploadFinishTime'),
             )
         except IntegrityError as e:
...
@@ -130,17 +132,17 @@ class UploadDocView(GenericView, DocHandler):
                 guarantor_1=applicant_data.get('guarantor1Name'),
                 guarantor_2=applicant_data.get('guarantor2Name'),
                 document_name=document.get('documentName'),
-                document_scheme=document.get('documentScheme'),
-                data_source=document.get('dataSource'),
+                document_scheme=self.fix_scheme(document_scheme),
+                data_source=self.fix_data_source(data_source),
                 upload_finish_time=document.get('uploadFinishTime'),
             )
             # 3. pick the queue to enter
             is_priority = PriorityApplication.objects.filter(application_id=application_id, on_off=True).exists()
-            value = ['{0}_{1}'.format(prefix, doc.id)]
-            redis_res = rh.enqueue(value, is_priority)
-            self.running_log.info('[doc upload success] [args={0}] [record_id={1}] [prefix={2}] [doc_id={3}] '
+            tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
+            enqueue_res = rh.enqueue(tasks, is_priority)
+            self.running_log.info('[doc upload success] [args={0}] [record_id={1}] [business_type={2}] [doc_id={3}] '
                                   '[is_priority={4}] [enqueue_res={5}]'.format(
-                                      args, record.id, prefix, doc.id, is_priority, redis_res))
+                                      args, record.id, prefix, doc.id, is_priority, enqueue_res))
         return response.ok()

     post.openapi_doc = '''
...
@@ -174,6 +176,7 @@ class PriorityDocView(GenericView, DocHandler):
         application_id = application_info.get('APPLICATION_ID')
         submit_datetime = application_info.get('SUBMIT_DATETIME')
         entity = application_info.get('ENTITY')
+        if submit_datetime.utcoffset() is not None:
+            submit_datetime = timezone.make_naive(submit_datetime, timezone.get_current_timezone())
         GCAPRecords.objects.create(
             entity=entity,
...
@@ -190,14 +193,14 @@ class PriorityDocView(GenericView, DocHandler):
         doc_class, prefix = self.get_doc_class(entity)
         doc_ids = doc_class.objects.filter(application_id=application_id,
                                            status=DocStatus.INIT.value).values_list('id', flat=True)
-        task_str_list = ['{0}_{1}'.format(prefix, doc_id) for doc_id in doc_ids]
-        if not task_str_list:
+        tasks_list = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc_id) for doc_id in doc_ids]
+        if not tasks_list:
             self.running_log.info(
-                '[priority doc success] [args={0}] [task_str_list={1}]'.format(args, task_str_list))
+                '[priority doc success] [args={0}]'.format(args))
         else:
-            enqueue_res = rh.enqueue(task_str_list, is_priority=True)
-            self.running_log.info('[priority doc success] [args={0}] [task_str_list={1}] [enqueue_res={2}]'.format(
-                args, task_str_list, enqueue_res))
+            enqueue_res = rh.enqueue(tasks_list, is_priority=True)
+            self.running_log.info('[priority doc success] [args={0}] [tasks_list={1}] [enqueue_res={2}]'.format(
+                args, tasks_list, enqueue_res))
         return response.ok()

     post.openapi_doc = '''
...
@@ -268,7 +271,7 @@ class DocView(GenericView, DocHandler):
     @use_args(upload_pdf_args, location='files')
     def post(self, request, args):
         # 1. record the upload info
-        const_str = '手工单'
+        const_str = consts.FIXED_APPLICATION_ID
         metadata_version_id = str(int(time.time()))
         upload_finish_time = timezone.now()
         document_scheme = random.choice(consts.DOC_SCHEME_LIST)
...
@@ -305,8 +308,8 @@ class DocView(GenericView, DocHandler):
         )
         # 3. pick the queue to enter
         is_priority = False
-        value = ['{0}_{1}'.format(prefix, doc.id)]
-        redis_res = rh.enqueue(value, is_priority)
+        tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
+        enqueue_res = rh.enqueue(tasks, is_priority)
         pdf_file = args.get('pdf_file')
         save_dir_path = os.path.join(conf.DATA_DIR, business_type, str(doc.id))
...
@@ -314,7 +317,7 @@ class DocView(GenericView, DocHandler):
         os.makedirs(save_dir_path, exist_ok=True)
         file_write(pdf_file, save_file_path)
-        self.running_log.info('[mock doc upload success] [args={0}] [record_id={1}] [prefix={2}] [doc_id={3}] '
+        self.running_log.info('[mock doc upload success] [args={0}] [record_id={1}] [business_type={2}] [doc_id={3}] '
                               '[is_priority={4}] [enqueue_res={5}]'.format(
-                                  args, record.id, prefix, doc.id, is_priority, redis_res))
+                                  args, record.id, prefix, doc.id, is_priority, enqueue_res))
         return response.ok()
...
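PriorityDocView now normalizes SUBMIT_DATETIME before storing it: if the value is timezone-aware (utcoffset() is not None), Django's timezone.make_naive converts it to a naive datetime in the current timezone. A plain-datetime sketch of the same check and conversion, assuming a fixed UTC+8 zone purely for illustration:

from datetime import datetime, timedelta, timezone as tz

cst = tz(timedelta(hours=8))  # stand-in for the Django current timezone
submit_datetime = datetime(2020, 7, 22, 9, 14, tzinfo=tz.utc)

# same check views.py performs before calling timezone.make_naive(...)
if submit_datetime.utcoffset() is not None:
    submit_datetime = submit_datetime.astimezone(cst).replace(tzinfo=None)

assert submit_datetime == datetime(2020, 7, 22, 17, 14)  # naive local wall time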