ocr excel upload eDMS
Showing
8 changed files
with
299 additions
and
142 deletions
... | @@ -12,13 +12,16 @@ Django==2.1 | ... | @@ -12,13 +12,16 @@ Django==2.1 |
12 | django-oauth-toolkit==1.3.2 | 12 | django-oauth-toolkit==1.3.2 |
13 | djangorestframework==3.9.0 | 13 | djangorestframework==3.9.0 |
14 | djangorestframework-jwt==1.11.0 | 14 | djangorestframework-jwt==1.11.0 |
15 | et-xmlfile==1.0.1 | ||
15 | idna==2.9 | 16 | idna==2.9 |
16 | idna-ssl==1.1.0 | 17 | idna-ssl==1.1.0 |
17 | isodate==0.6.0 | 18 | isodate==0.6.0 |
19 | jdcal==1.4.1 | ||
18 | lxml==4.5.1 | 20 | lxml==4.5.1 |
19 | marshmallow==3.6.1 | 21 | marshmallow==3.6.1 |
20 | multidict==4.7.6 | 22 | multidict==4.7.6 |
21 | oauthlib==3.1.0 | 23 | oauthlib==3.1.0 |
24 | openpyxl==3.0.4 | ||
22 | pdfminer3k==1.3.4 | 25 | pdfminer3k==1.3.4 |
23 | Pillow==7.1.2 | 26 | Pillow==7.1.2 |
24 | ply==3.11 | 27 | ply==3.11 | ... | ... |
1 | PAGE_DEFAULT = 1 | 1 | PAGE_DEFAULT = 1 |
2 | PAGE_SIZE_DEFAULT = 10 | 2 | PAGE_SIZE_DEFAULT = 10 |
3 | 3 | ||
4 | DOC_SCHEME_LIST = ['Acceptance', 'Settlement', 'Contract Management'] | 4 | FIXED_APPLICATION_ID = '手工单' |
5 | DATA_SOURCE_LIST = ['POS', 'EAPP', 'Econtract'] | 5 | |
6 | BUSINESS_TYPE_LIST = ['HIL', 'AFC'] | 6 | DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT'] |
7 | HIL_SET = {'HIL', 'HIl', 'HiL', 'Hil', 'hIL', 'hIl', 'hiL', 'hil', 'CO00002'} | 7 | DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT'] |
8 | |||
8 | HIL_PREFIX = 'HIL' | 9 | HIL_PREFIX = 'HIL' |
9 | AFC_PREFIX = 'AFC' | 10 | AFC_PREFIX = 'AFC' |
11 | SPLIT_STR = '_' | ||
12 | BUSINESS_TYPE_LIST = [HIL_PREFIX, AFC_PREFIX] | ||
13 | HIL_SET = {'HIL', 'HIl', 'HiL', 'Hil', 'hIL', 'hIl', 'hiL', 'hil', 'CO00002'} | ||
10 | 14 | ||
11 | SESSION_PREFIX = 'FHLSID' | 15 | SESSION_PREFIX = 'FHLSID' |
12 | CUSTOM_CLIENT = 'CustomClient' | 16 | CUSTOM_CLIENT = 'CustomClient' |
... | @@ -15,12 +19,22 @@ FIXED_FILE_SIZE = 0 | ... | @@ -15,12 +19,22 @@ FIXED_FILE_SIZE = 0 |
15 | DOWNLOAD_ACTION_TYPE = 'Downloaded' | 19 | DOWNLOAD_ACTION_TYPE = 'Downloaded' |
16 | 20 | ||
17 | DOC_SCHEMA_ID_FILL = { | 21 | DOC_SCHEMA_ID_FILL = { |
18 | 'Acceptance': (1, 'DFE-AutoFilingScript'), | 22 | 'ACCEPTANCE': (1, 'DFE-AutoFilingScript'), |
19 | 'Settlement': (20, 'DFE-AutoFilingScript'), | 23 | 'SETTLEMENT': (20, 'DFE-AutoFilingScript'), |
20 | 'Contract Management': (86, 'Schema-Based') | 24 | 'CONTRACT MANAGEMENT': (86, 'Schema-Based') |
25 | } | ||
26 | BUSINESS_TYPE_DICT = { | ||
27 | HIL_PREFIX: 'CO00002', | ||
28 | AFC_PREFIX: 'CO00001' | ||
21 | } | 29 | } |
22 | DOC_SCHEMA_TYPE = 'ElectronicRecord' | 30 | DOC_SCHEMA_TYPE = 'ElectronicRecord' |
23 | APPLICATION_ID_META_FIELD_id = 1 | 31 | APPLICATION_ID_META_FIELD_id = 1 |
24 | DEALER_CODE_META_FIELD_id = 13 | 32 | DEALER_CODE_META_FIELD_id = 13 |
25 | BUSINESS_TYPE_META_FIELD_id = 93 | 33 | BUSINESS_TYPE_META_FIELD_id = 93 |
26 | DEALER_CODE = 'ocr_situ_group' | 34 | DEALER_CODE = 'ocr_situ_group' |
35 | |||
36 | AMOUNT_COL_TITLE_SET = {"交易金额", "金额", "收入/支出金额", "发生额"} | ||
37 | OVERAGE_COL_TITLE_SET = {"账户余额", "余额"} | ||
38 | PROOF_COL_TITLE = '核对结果' | ||
39 | PROOF_RES = ('对', '错') | ||
40 | META_SHEET_TITLE = '关键信息提取和展示' | ... | ... |
1 | import os | ||
1 | import requests | 2 | import requests |
2 | from zeep import Client, xsd | 3 | from zeep import Client, xsd |
3 | from settings import conf | 4 | from settings import conf |
... | @@ -65,9 +66,9 @@ class EDMS: | ... | @@ -65,9 +66,9 @@ class EDMS: |
65 | params = {'token': token} | 66 | params = {'token': token} |
66 | self.download_handler(params, headers, save_path) | 67 | self.download_handler(params, headers, save_path) |
67 | 68 | ||
68 | def create_upload_token(self, headers, file_size): | 69 | def create_upload_token(self, headers): |
69 | with self.rc_client.settings(extra_http_headers=headers): | 70 | with self.rc_client.settings(extra_http_headers=headers): |
70 | token = self.rc_client.service.CreateUploadToken(fileSize=file_size) | 71 | token = self.rc_client.service.CreateUploadToken(fileSize=consts.FIXED_FILE_SIZE) |
71 | return token | 72 | return token |
72 | 73 | ||
73 | def upload_handler(self, file_path, params, headers): | 74 | def upload_handler(self, file_path, params, headers): |
... | @@ -80,11 +81,19 @@ class EDMS: | ... | @@ -80,11 +81,19 @@ class EDMS: |
80 | else: | 81 | else: |
81 | raise Exception | 82 | raise Exception |
82 | 83 | ||
83 | def get_doc_info(self, token, doc_info): | 84 | @staticmethod |
84 | doc_schema_id, auto_filing = consts.DOC_SCHEMA_ID_FILL.get(doc_info.get('document_scheme')) | 85 | def get_doc_file_name(doc_name): |
85 | application_id = doc_info.get('application_id') | 86 | if doc_name.endswith('pdf'): |
86 | doc_file_name = doc_info.get('doc_file_name') | 87 | name, _ = os.path.splitext(doc_name) |
87 | business_type = doc_info.get('business_type') | 88 | return name |
89 | return doc_name | ||
90 | |||
91 | def get_doc_info(self, token, doc, business_type, file_path): | ||
92 | business_type = consts.BUSINESS_TYPE_DICT.get(business_type) | ||
93 | doc_schema_id, auto_filing = consts.DOC_SCHEMA_ID_FILL.get(doc.document_scheme) | ||
94 | application_id = doc.application_id | ||
95 | doc_file_name = self.get_doc_file_name(doc.document_name) | ||
96 | origin_file_name = os.path.basename(file_path) | ||
88 | fields_with_value = [ | 97 | fields_with_value = [ |
89 | {'FieldId': consts.APPLICATION_ID_META_FIELD_id, | 98 | {'FieldId': consts.APPLICATION_ID_META_FIELD_id, |
90 | 'FieldValue': xsd.AnyObject(xsd.String(), application_id)}, | 99 | 'FieldValue': xsd.AnyObject(xsd.String(), application_id)}, |
... | @@ -99,20 +108,20 @@ class EDMS: | ... | @@ -99,20 +108,20 @@ class EDMS: |
99 | 'DocumentName': doc_file_name, | 108 | 'DocumentName': doc_file_name, |
100 | 'FieldsWithValues': fields_with_values, | 109 | 'FieldsWithValues': fields_with_values, |
101 | 'UploadToken': token, | 110 | 'UploadToken': token, |
102 | 'OriginalFileName': doc_file_name, | 111 | 'OriginalFileName': origin_file_name, |
103 | 'SendEmailToMembers': False, | 112 | 'SendEmailToMembers': False, |
104 | 'AutoFilingScriptToUse': auto_filing, | 113 | 'AutoFilingScriptToUse': auto_filing, |
105 | 'DocumentSchemaType': consts.DOC_SCHEMA_TYPE, | 114 | 'DocumentSchemaType': consts.DOC_SCHEMA_TYPE, |
106 | } | 115 | } |
107 | return info | 116 | return info |
108 | 117 | ||
109 | def add_doc_info(self, headers, token, doc_info): | 118 | def add_doc_info(self, headers, token, doc, business_type, file_path): |
110 | info = self.get_doc_info(token, doc_info) | 119 | info = self.get_doc_info(token, doc, business_type, file_path) |
111 | with self.dm_client.settings(extra_http_headers=headers): | 120 | with self.dm_client.settings(extra_http_headers=headers): |
112 | metadata_version_id = self.dm_client.service.AddDocumentInfo(info=info) | 121 | metadata_version_id = self.dm_client.service.AddDocumentInfo(info=info) |
113 | return metadata_version_id | 122 | return metadata_version_id |
114 | 123 | ||
115 | def upload(self, file_path, file_size, doc_info): | 124 | def upload(self, file_path, doc, business_type): |
116 | # file_path = '/Users/clay/Postman/files/OCRuploadTest4.txt' | 125 | # file_path = '/Users/clay/Postman/files/OCRuploadTest4.txt' |
117 | # file_size = 16 | 126 | # file_size = 16 |
118 | # doc_info = { | 127 | # doc_info = { |
... | @@ -122,12 +131,12 @@ class EDMS: | ... | @@ -122,12 +131,12 @@ class EDMS: |
122 | # 'business_type': 'CO00001', | 131 | # 'business_type': 'CO00001', |
123 | # } | 132 | # } |
124 | headers = self.get_headers() | 133 | headers = self.get_headers() |
125 | token = self.create_upload_token(headers, file_size) | 134 | token = self.create_upload_token(headers) |
126 | headers.update({'Content-Type': 'application/octet-stream'}) | 135 | headers.update({'Content-Type': 'application/octet-stream'}) |
127 | params = {'token': token} | 136 | params = {'token': token} |
128 | self.upload_handler(file_path, params, headers) | 137 | self.upload_handler(file_path, params, headers) |
129 | headers.pop('Content-Type') | 138 | headers.pop('Content-Type') |
130 | metadata_version_id = self.add_doc_info(headers, token, doc_info) | 139 | metadata_version_id = self.add_doc_info(headers, token, doc, business_type, file_path) |
131 | return metadata_version_id | 140 | return metadata_version_id |
132 | 141 | ||
133 | 142 | ... | ... |
1 | import os | 1 | import os |
2 | import time | 2 | import time |
3 | import fitz | 3 | import fitz |
4 | import xlwt | ||
5 | import signal | 4 | import signal |
6 | import base64 | 5 | import base64 |
7 | import asyncio | 6 | import asyncio |
8 | import aiohttp | 7 | import aiohttp |
8 | import locale | ||
9 | from PIL import Image | 9 | from PIL import Image |
10 | from io import BytesIO | 10 | from io import BytesIO |
11 | from zeep import Client | 11 | from openpyxl import Workbook |
12 | from openpyxl.styles import numbers | ||
13 | from openpyxl.utils import get_column_letter | ||
12 | 14 | ||
13 | from django.core.management import BaseCommand | 15 | from django.core.management import BaseCommand |
14 | from common.mixins import LoggerMixin | 16 | from common.mixins import LoggerMixin |
... | @@ -23,7 +25,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -23,7 +25,7 @@ class Command(BaseCommand, LoggerMixin): |
23 | 25 | ||
24 | def __init__(self): | 26 | def __init__(self): |
25 | super().__init__() | 27 | super().__init__() |
26 | self.log_base = '[doc process]' | 28 | self.log_base = '[doc ocr process]' |
27 | # 处理文件开关 | 29 | # 处理文件开关 |
28 | self.switch = True | 30 | self.switch = True |
29 | # 数据目录 | 31 | # 数据目录 |
... | @@ -50,46 +52,54 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -50,46 +52,54 @@ class Command(BaseCommand, LoggerMixin): |
50 | task_str, is_priority = rh.dequeue() | 52 | task_str, is_priority = rh.dequeue() |
51 | if task_str is None: | 53 | if task_str is None: |
52 | self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) | 54 | self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) |
53 | return None, None, None, None | 55 | return None, None |
54 | 56 | ||
55 | business_type, doc_id_str = task_str.split('_') | 57 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) |
56 | doc_id = int(doc_id_str) | 58 | doc_id = int(doc_id_str) |
57 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc | 59 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc |
58 | doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values( | 60 | # doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values( |
59 | 'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first() | 61 | # 'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first() |
60 | if doc_info is None: | 62 | doc = doc_class.objects.filter(id=doc_id).first() |
61 | self.cronjob_log.warn('{0} [get_doc_info] [doc completed] [task_str={1}] [is_priority={2}]'.format( | 63 | if doc is None: |
64 | self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format( | ||
62 | self.log_base, task_str, is_priority)) | 65 | self.log_base, task_str, is_priority)) |
63 | return None, None, None, None | 66 | return None, None |
64 | doc_class.objects.filter(id=doc_id).update(status=DocStatus.PROCESSING.value) | 67 | elif doc.status != DocStatus.INIT.value: |
65 | self.cronjob_log.info('{0} [get_doc_info] [task_str={1}] [is_priority={2}] [doc_info={3}]'.format( | 68 | self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] ' |
66 | self.log_base, task_str, is_priority, doc_info)) | 69 | '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status)) |
67 | return doc_info, doc_class, doc_id, business_type | 70 | return None, None |
68 | 71 | doc.status = DocStatus.PROCESSING.value | |
69 | def pdf_download(self, doc_id, doc_info, business_type): | 72 | doc.save() |
70 | if doc_info is None: | 73 | self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format( |
74 | self.log_base, task_str, is_priority)) | ||
75 | return doc, business_type | ||
76 | |||
77 | def pdf_download(self, doc, business_type): | ||
78 | if doc is None: | ||
71 | return None, None, None | 79 | return None, None, None |
72 | # TODO EDMS下载pdf | 80 | # TODO EDMS下载pdf |
73 | 81 | doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id)) | |
74 | doc_data_path = os.path.join(self.data_dir, business_type, str(doc_id)) | 82 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
75 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc_id)) | 83 | if doc.application_id != consts.FIXED_APPLICATION_ID: |
76 | excel_path = os.path.join(doc_data_path, '{0}.xls'.format(doc_id)) | 84 | self.edms.download(pdf_path, doc.metadata_version_id) |
77 | self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_info={2}] [pdf_path={3}]'.format( | 85 | |
78 | self.log_base, business_type, doc_info, pdf_path)) | 86 | excel_path = os.path.join(doc_data_path, '{0}.xls'.format(doc.id)) |
87 | self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format( | ||
88 | self.log_base, business_type, doc.id, pdf_path)) | ||
79 | return doc_data_path, excel_path, pdf_path | 89 | return doc_data_path, excel_path, pdf_path |
80 | 90 | ||
81 | @staticmethod | 91 | @staticmethod |
82 | def append_sheet(wb, sheets_list, img_name): | 92 | def append_sheet(wb, sheets_list, img_name): |
83 | for i, sheet in enumerate(sheets_list): | 93 | for i, sheet in enumerate(sheets_list): |
84 | ws = wb.add_sheet('{0}_{1}'.format(img_name, i)) | 94 | ws = wb.create_sheet('{0}_{1}'.format(img_name, i)) |
85 | cells = sheet.get('cells') | 95 | cells = sheet.get('cells') |
86 | for cell in cells: | 96 | for cell in cells: |
87 | c1 = cell.get('start_column') | 97 | c1 = cell.get('start_column') |
88 | c2 = cell.get('end_column') | 98 | # c2 = cell.get('end_column') |
89 | r1 = cell.get('start_row') | 99 | r1 = cell.get('start_row') |
90 | r2 = cell.get('end_row') | 100 | # r2 = cell.get('end_row') |
91 | label = cell.get('words') | 101 | label = cell.get('words') |
92 | ws.write_merge(r1, r2, c1, c2, label=label) | 102 | ws.cell(row=r1+1, column=c1+1, value=label) |
93 | 103 | ||
94 | @staticmethod | 104 | @staticmethod |
95 | def get_ocr_json(img_path): | 105 | def get_ocr_json(img_path): |
... | @@ -112,6 +122,46 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -112,6 +122,46 @@ class Command(BaseCommand, LoggerMixin): |
112 | img_name = os.path.basename(img_path) | 122 | img_name = os.path.basename(img_path) |
113 | self.append_sheet(wb, sheets_list, img_name) | 123 | self.append_sheet(wb, sheets_list, img_name) |
114 | 124 | ||
125 | def proof(self, ws): | ||
126 | # 找到金额、余额列 | ||
127 | amount_col = overage_col = None | ||
128 | for i in ws[1]: | ||
129 | if i.value in consts.AMOUNT_COL_TITLE_SET: | ||
130 | amount_col = i.column | ||
131 | amount_col_letter = get_column_letter(amount_col) | ||
132 | elif i.value in consts.OVERAGE_COL_TITLE_SET: | ||
133 | overage_col = i.column | ||
134 | overage_col_letter = get_column_letter(overage_col) | ||
135 | if amount_col is None or overage_col is None: | ||
136 | return | ||
137 | # 文本转数值 | ||
138 | for col_tuple in ws.iter_cols(min_row=2, min_col=amount_col, max_col=overage_col): | ||
139 | for c in col_tuple: | ||
140 | try: | ||
141 | c.value = locale.atof(c.value) | ||
142 | c.number_format = numbers.FORMAT_NUMBER_00 | ||
143 | except Exception: | ||
144 | continue | ||
145 | # 增加核对结果列 | ||
146 | proof_col_letter = get_column_letter(ws.max_column + 1) | ||
147 | for c in ws[proof_col_letter]: | ||
148 | if c.row == 1: | ||
149 | c.value = consts.PROOF_COL_TITLE | ||
150 | elif c.row == 2: | ||
151 | continue | ||
152 | else: | ||
153 | c.value = '=IF({3}{0}=SUM({2}{0},{3}{1}), "{4}", "{5}")'.format( | ||
154 | c.row, c.row - 1, amount_col_letter, overage_col_letter, *consts.PROOF_RES) | ||
155 | |||
156 | def wb_process(self, wb, excel_path): | ||
157 | locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8') | ||
158 | for ws in wb.worksheets: | ||
159 | if ws.title == 'Sheet': | ||
160 | ws.title = consts.META_SHEET_TITLE | ||
161 | else: | ||
162 | self.proof(ws) | ||
163 | wb.save(excel_path) # TODO no sheet (res always []) | ||
164 | |||
115 | @staticmethod | 165 | @staticmethod |
116 | def getimage(pix): | 166 | def getimage(pix): |
117 | if pix.colorspace.n != 4: | 167 | if pix.colorspace.n != 4: |
... | @@ -124,7 +174,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -124,7 +174,7 @@ class Command(BaseCommand, LoggerMixin): |
124 | s = item[1] # xref of its /SMask | 174 | s = item[1] # xref of its /SMask |
125 | is_rgb = True if item[5] == 'DeviceRGB' else False | 175 | is_rgb = True if item[5] == 'DeviceRGB' else False |
126 | 176 | ||
127 | # GRAY/RGB # TODO 颜色空间不同处理 | 177 | # RGB |
128 | if is_rgb: | 178 | if is_rgb: |
129 | if s == 0: | 179 | if s == 0: |
130 | return doc.extractImage(x) | 180 | return doc.extractImage(x) |
... | @@ -158,7 +208,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -158,7 +208,7 @@ class Command(BaseCommand, LoggerMixin): |
158 | 208 | ||
159 | pix1 = pix2 = None # free temp pixmaps | 209 | pix1 = pix2 = None # free temp pixmaps |
160 | 210 | ||
161 | pix = fitz.Pixmap(fitz.csRGB, pix) # CMYK to RGB | 211 | pix = fitz.Pixmap(fitz.csRGB, pix) # GRAY/CMYK to RGB |
162 | return self.getimage(pix) | 212 | return self.getimage(pix) |
163 | 213 | ||
164 | @staticmethod | 214 | @staticmethod |
... | @@ -200,10 +250,11 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -200,10 +250,11 @@ class Command(BaseCommand, LoggerMixin): |
200 | 250 | ||
201 | while self.switch: | 251 | while self.switch: |
202 | # 1. 从队列获取文件信息 | 252 | # 1. 从队列获取文件信息 |
203 | doc_info, doc_class, doc_id, business_type = self.get_doc_info() | 253 | doc, business_type = self.get_doc_info() |
204 | 254 | ||
255 | try: | ||
205 | # 2. 从EDMS获取PDF文件 | 256 | # 2. 从EDMS获取PDF文件 |
206 | doc_data_path, excel_path, pdf_path = self.pdf_download(doc_id, doc_info, business_type) | 257 | doc_data_path, excel_path, pdf_path = self.pdf_download(doc, business_type) |
207 | 258 | ||
208 | # 队列为空时的处理 | 259 | # 队列为空时的处理 |
209 | if pdf_path is None: | 260 | if pdf_path is None: |
... | @@ -212,7 +263,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -212,7 +263,7 @@ class Command(BaseCommand, LoggerMixin): |
212 | continue | 263 | continue |
213 | 264 | ||
214 | sleep_second = int(conf.SLEEP_SECOND) | 265 | sleep_second = int(conf.SLEEP_SECOND) |
215 | try: | 266 | |
216 | # 3.PDF文件提取图片 | 267 | # 3.PDF文件提取图片 |
217 | img_save_path = os.path.join(doc_data_path, 'img') | 268 | img_save_path = os.path.join(doc_data_path, 'img') |
218 | os.makedirs(img_save_path, exist_ok=True) | 269 | os.makedirs(img_save_path, exist_ok=True) |
... | @@ -233,8 +284,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -233,8 +284,8 @@ class Command(BaseCommand, LoggerMixin): |
233 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) | 284 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) |
234 | pm.writePNG(save_path) | 285 | pm.writePNG(save_path) |
235 | img_path_list.append(save_path) | 286 | img_path_list.append(save_path) |
236 | self.cronjob_log.info('{0} [page to img success] [doc_id={1}] [pdf_path={2}] ' | 287 | self.cronjob_log.info('{0} [page to img success] [pdf_path={1}] [page={2}]'.format( |
237 | '[page={3}]'.format(self.log_base, doc_id, pdf_path, page.number)) | 288 | self.log_base, pdf_path, page.number)) |
238 | else: # 提取图片 | 289 | else: # 提取图片 |
239 | for img_index, img_il in enumerate(img_il_list): | 290 | for img_index, img_il in enumerate(img_il_list): |
240 | if len(img_il) == 1: # 当只有一张图片时, 简化处理 | 291 | if len(img_il) == 1: # 当只有一张图片时, 简化处理 |
... | @@ -246,8 +297,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -246,8 +297,8 @@ class Command(BaseCommand, LoggerMixin): |
246 | f.write(img_data) | 297 | f.write(img_data) |
247 | img_path_list.append(save_path) | 298 | img_path_list.append(save_path) |
248 | self.cronjob_log.info( | 299 | self.cronjob_log.info( |
249 | '{0} [extract img success] [doc_id={1}] [pdf_path={2}] [page={3}] ' | 300 | '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format( |
250 | '[img_index={4}]'.format(self.log_base, doc_id, pdf_path, pno, img_index)) | 301 | self.log_base, pdf_path, pno, img_index)) |
251 | else: # 多张图片,竖向拼接 | 302 | else: # 多张图片,竖向拼接 |
252 | height_sum = 0 | 303 | height_sum = 0 |
253 | im_list = [] | 304 | im_list = [] |
... | @@ -276,28 +327,41 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -276,28 +327,41 @@ class Command(BaseCommand, LoggerMixin): |
276 | res.save(save_path) | 327 | res.save(save_path) |
277 | img_path_list.append(save_path) | 328 | img_path_list.append(save_path) |
278 | self.cronjob_log.info( | 329 | self.cronjob_log.info( |
279 | '{0} [extract img success] [doc_id={1}] [pdf_path={2}] [page={3}] ' | 330 | '{0} [extract img success] [pdf_path={1}] [page={2}] [img_index={3}]'.format( |
280 | '[img_index={4}]'.format(self.log_base, doc_id, pdf_path, pno, img_index)) | 331 | self.log_base, pdf_path, pno, img_index)) |
281 | self.cronjob_log.info('{0} [pdf to img success] [doc_id={1}]'.format(self.log_base, doc_id)) | 332 | self.cronjob_log.info('{0} [pdf to img success] [business_type={1}] [doc_id={2}]'.format( |
333 | self.log_base, business_type, doc.id)) | ||
282 | 334 | ||
283 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc_id))) | 335 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) |
284 | 336 | ||
285 | # 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件 | 337 | # 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件 |
286 | wb = xlwt.Workbook() | 338 | wb = Workbook() |
287 | loop = asyncio.get_event_loop() | 339 | loop = asyncio.get_event_loop() |
288 | tasks = [self.img_ocr_excel(wb, img_path) for img_path in img_path_list] | 340 | tasks = [self.img_ocr_excel(wb, img_path) for img_path in img_path_list] |
289 | loop.run_until_complete(asyncio.wait(tasks)) | 341 | loop.run_until_complete(asyncio.wait(tasks)) |
290 | # loop.close() | 342 | # loop.close() |
291 | wb.save(excel_path) # TODO no sheet (res always []) | ||
292 | # 整合excel文件 | ||
293 | 343 | ||
344 | # 整合excel文件 | ||
345 | # self.wb_process(wb, excel_path) | ||
346 | wb.save(excel_path) | ||
347 | except Exception as e: | ||
348 | doc.status = DocStatus.PROCESS_FAILED.value | ||
349 | doc.save() | ||
350 | self.cronjob_log.error('{0} [process failed] [business_type={1}] [doc_id={2}] [err={3}]'.format( | ||
351 | self.log_base, business_type, doc.id, e)) | ||
352 | else: | ||
353 | try: | ||
294 | # 5.上传至EDMS | 354 | # 5.上传至EDMS |
295 | 355 | self.edms.upload(excel_path, doc, business_type) | |
296 | except Exception as e: | 356 | except Exception as e: |
297 | doc_class.objects.filter(id=doc_id).update(status=DocStatus.PROCESS_FAILED.value) | 357 | doc.status = DocStatus.UPLOAD_FAILED.value |
298 | self.cronjob_log.error('{0} [process failed] [doc_id={1}] [err={2}]'.format(self.log_base, doc_id, e)) | 358 | doc.save() |
359 | self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [err={3}]'.format( | ||
360 | self.log_base, business_type, doc.id, e)) | ||
299 | else: | 361 | else: |
300 | doc_class.objects.filter(id=doc_id).update(status=DocStatus.COMPLETE.value) | 362 | doc.status = DocStatus.COMPLETE.value |
301 | self.cronjob_log.info('{0} [doc process complete] [doc_id={1}]'.format(self.log_base, doc_id)) | 363 | doc.save() |
364 | self.cronjob_log.info('{0} [doc process complete] [business_type={1}] [doc_id={2}]'.format( | ||
365 | self.log_base, business_type, doc.id)) | ||
302 | 366 | ||
303 | self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) | 367 | self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) | ... | ... |
... | @@ -86,73 +86,143 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -86,73 +86,143 @@ class Command(BaseCommand, LoggerMixin): |
86 | 86 | ||
87 | @staticmethod | 87 | @staticmethod |
88 | def split_il(il): | 88 | def split_il(il): |
89 | img_il_list = [] | 89 | small_img_il_list = [] |
90 | big_img_il_list = [] | ||
90 | start = 0 | 91 | start = 0 |
92 | index = 0 | ||
91 | length = len(il) | 93 | length = len(il) |
92 | for i in range(length): | 94 | for i in range(length): |
95 | if il[i][2] >= 700 and il[i][3] >= 647: | ||
96 | if start < i: | ||
97 | small_img_il_list.append((il[start: i], index)) | ||
98 | index += 1 | ||
99 | else: | ||
100 | start += 1 | ||
101 | big_img_il_list.append((il[i], index)) | ||
102 | index += 1 | ||
103 | continue | ||
93 | if i == start: | 104 | if i == start: |
94 | if i == length - 1: | 105 | if i == length - 1: |
95 | img_il_list.append(il[start: length]) | 106 | small_img_il_list.append((il[start: length], index)) |
96 | continue | 107 | continue |
97 | elif i == length - 1: | 108 | elif i == length - 1: |
98 | img_il_list.append(il[start: length]) | 109 | if il[i][2] == il[i - 1][2]: |
110 | small_img_il_list.append((il[start: length], index)) | ||
111 | else: | ||
112 | small_img_il_list.append((il[start: i], index)) | ||
113 | small_img_il_list.append((il[i: length], index+1)) | ||
99 | continue | 114 | continue |
100 | if il[i][2] != il[i - 1][2]: | 115 | if il[i][2] != il[i - 1][2]: |
101 | img_il_list.append(il[start: i]) | 116 | small_img_il_list.append((il[start: i], index)) |
117 | index += 1 | ||
102 | start = i | 118 | start = i |
103 | elif il[i][3] != il[i - 1][3]: | 119 | elif il[i][3] != il[i - 1][3] and il[i][2] < 1200: |
104 | img_il_list.append(il[start: i + 1]) | 120 | small_img_il_list.append((il[start: i + 1], index)) |
121 | index += 1 | ||
105 | start = i + 1 | 122 | start = i + 1 |
106 | return img_il_list | 123 | return small_img_il_list, big_img_il_list |
107 | 124 | ||
108 | def handle(self, *args, **kwargs): | 125 | def handle(self, *args, **kwargs): |
109 | pdf_dir = '/Users/clay/Desktop/普通打印-部分无线/竖版-无表格-农业银行' | 126 | pdf_dir = '/Users/clay/Desktop/问题PDF' |
110 | img_dir = '/Users/clay/Desktop/普通打印-部分无线_img/竖版-无表格-农业银行' | 127 | img_dir = '/Users/clay/Desktop/问题PDF' |
111 | os.makedirs(img_dir, exist_ok=True) | ||
112 | for d in os.listdir(pdf_dir): | 128 | for d in os.listdir(pdf_dir): |
113 | # if d in ['.DS_Store', 'CH-B008486764.pdf', 'CH-B008003736.pdf', 'CH-B008487476.pdf', 'CH-B006763780.pdf', | 129 | # if d in ['.DS_Store', 'CH-B008003736.pdf', 'CH-B006317088.pdf', 'CH-B008487476.pdf', 'CH-B006337608.pdf', |
114 | # 'CH-B009000564.pdf', 'CH-B009020488.pdf']: | 130 | # 'CH-B006391612.pdf', 'CH-B006536124.pdf', 'CH-B006526652.pdf', 'CH-B009003592.pdf']: |
115 | if d in ['.DS_Store', '1竖版-无表格-农业银行样例.PNG']: | 131 | # continue |
132 | # if d != 'CH-B006393152.PDF': | ||
133 | # if d != 'CH-B006526652.pdf': | ||
134 | if d != 'CH-B008487944.pdf': | ||
116 | continue | 135 | continue |
117 | pdf_path = os.path.join(pdf_dir, d) | 136 | pdf_path = os.path.join(pdf_dir, d) |
118 | # pdf_path = '/Users/clay/Desktop/普通打印part2/工商银行(标准版)/CH-B006754676.pdf' | ||
119 | if os.path.isfile(pdf_path): | 137 | if os.path.isfile(pdf_path): |
120 | img_save_path = os.path.join(img_dir, d) | 138 | img_save_path = os.path.join(img_dir, d[:-4]) |
121 | if os.path.exists(img_save_path): | 139 | # if os.path.exists(img_save_path): |
122 | continue | 140 | # continue |
123 | os.makedirs(img_save_path, exist_ok=True) | 141 | os.makedirs(img_save_path, exist_ok=True) |
124 | with fitz.Document(pdf_path) as pdf: | 142 | with fitz.Document(pdf_path) as pdf: |
125 | self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format( | 143 | self.cronjob_log.info('{0} [pdf_path={1}] [metadata={2}]'.format( |
126 | self.log_base, pdf_path, pdf.metadata)) | 144 | self.log_base, pdf_path, pdf.metadata)) |
127 | # xref_list = [] | 145 | xref_set = set() |
128 | for pno in range(pdf.pageCount): | 146 | for pno in range(pdf.pageCount): |
147 | print('---------------------------------------') | ||
129 | il = pdf.getPageImageList(pno) | 148 | il = pdf.getPageImageList(pno) |
149 | # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) | ||
150 | print(il) | ||
151 | |||
152 | # for img_index, img in enumerate(il): | ||
153 | # pix = self.recoverpix(pdf, img) | ||
154 | # ext, img_data = self.get_img_data(pix) | ||
155 | # save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | ||
156 | # pno, img_index, ext)) | ||
157 | # with open(save_path, "wb") as f: | ||
158 | # f.write(img_data) | ||
159 | |||
160 | if len(il) == 0: | ||
161 | page = pdf.loadPage(pno) | ||
162 | pm = page.getPixmap(matrix=self.trans, alpha=False) | ||
163 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) | ||
164 | pm.writePNG(save_path) | ||
165 | elif len(il) == 1: | ||
166 | width = il[0][2] | ||
167 | height = il[0][3] | ||
168 | colorspace = il[0][5] | ||
169 | adobe_filter = il[0][-1] | ||
170 | if colorspace == '' or adobe_filter in ['', '']: | ||
171 | continue | ||
172 | # 小图 | ||
173 | if width < 500 and height < 500: | ||
174 | page = pdf.loadPage(pno) | ||
175 | pm = page.getPixmap(matrix=self.trans, alpha=False) | ||
176 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) | ||
177 | pm.writePNG(save_path) | ||
178 | # 大图 | ||
179 | elif il[0][0] not in xref_set: | ||
180 | pix = self.recoverpix(pdf, il[0]) | ||
181 | ext, img_data = self.get_img_data(pix) | ||
182 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.{1}'.format(pno, ext)) | ||
183 | with open(save_path, "wb") as f: | ||
184 | f.write(img_data) | ||
185 | xref_set.add(il[0][0]) | ||
186 | else: | ||
130 | il.sort(key=lambda x: x[0]) | 187 | il.sort(key=lambda x: x[0]) |
131 | img_il_list = self.split_il(il) | 188 | small_img_il_list, big_img_il_list = self.split_il(il) |
132 | del il | 189 | print(small_img_il_list) |
190 | print(big_img_il_list) | ||
191 | print('+++++++++++++++++++++++++++++++++++') | ||
133 | 192 | ||
134 | print(img_il_list) | 193 | if len(small_img_il_list) > 2: # 单页无规律小图过多时,使用页面转图片 |
135 | if len(img_il_list) > 3: # 单页无规律小图过多时,使用页面转图片 | ||
136 | page = pdf.loadPage(pno) | 194 | page = pdf.loadPage(pno) |
137 | pm = page.getPixmap(matrix=self.trans, alpha=False) | 195 | pm = page.getPixmap(matrix=self.trans, alpha=False) |
138 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) | 196 | save_path = os.path.join(img_save_path, 'page_{0}_img_0.png'.format(page.number)) |
139 | pm.writePNG(save_path) | 197 | pm.writePNG(save_path) |
140 | # img_path_list.append(save_path) | ||
141 | # self.cronjob_log.info('{0} [page to img success] [doc_id={1}] [pdf_path={2}] ' | ||
142 | # '[page={3}]'.format(self.log_base, doc_id, pdf_path, page.number)) | ||
143 | else: # 提取图片 | 198 | else: # 提取图片 |
144 | for img_index, img_il in enumerate(img_il_list): | 199 | for img_il, img_index in big_img_il_list: |
145 | if len(img_il) == 1: # 当只有一张图片时, 简化处理 | 200 | if img_il[0] in xref_set: |
201 | continue | ||
202 | pix = self.recoverpix(pdf, img_il) | ||
203 | ext, img_data = self.get_img_data(pix) | ||
204 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | ||
205 | pno, img_index, ext)) | ||
206 | with open(save_path, "wb") as f: | ||
207 | f.write(img_data) | ||
208 | xref_set.add(img_il[0]) | ||
209 | |||
210 | for img_il, img_index in small_img_il_list: | ||
211 | # 小图 | ||
212 | if len(img_il) == 1 and img_il[0][2] < 500 and img_il[0][3] < 500: | ||
213 | page = pdf.loadPage(pno) | ||
214 | pm = page.getPixmap(matrix=self.trans, alpha=False) | ||
215 | save_path = os.path.join(img_save_path, | ||
216 | 'page_{0}_img_0.png'.format(page.number)) | ||
217 | pm.writePNG(save_path) | ||
218 | elif len(img_il) == 1 and img_il[0][0] not in xref_set: # 当只有一张图片时, 简化处理 | ||
146 | pix = self.recoverpix(pdf, img_il[0]) | 219 | pix = self.recoverpix(pdf, img_il[0]) |
147 | ext, img_data = self.get_img_data(pix) | 220 | ext, img_data = self.get_img_data(pix) |
148 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( | 221 | save_path = os.path.join(img_save_path, 'page_{0}_img_{1}.{2}'.format( |
149 | pno, img_index, ext)) | 222 | pno, img_index, ext)) |
150 | with open(save_path, "wb") as f: | 223 | with open(save_path, "wb") as f: |
151 | f.write(img_data) | 224 | f.write(img_data) |
152 | # img_path_list.append(save_path) | 225 | xref_set.add(img_il[0][0]) |
153 | # self.cronjob_log.info( | ||
154 | # '{0} [extract img success] [doc_id={1}] [pdf_path={2}] [page={3}] ' | ||
155 | # '[img_index={4}]'.format(self.log_base, doc_id, pdf_path, pno, img_index)) | ||
156 | else: # 多张图片,竖向拼接 | 226 | else: # 多张图片,竖向拼接 |
157 | height_sum = 0 | 227 | height_sum = 0 |
158 | im_list = [] | 228 | im_list = [] |
... | @@ -179,6 +249,3 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -179,6 +249,3 @@ class Command(BaseCommand, LoggerMixin): |
179 | res.paste(m, box=(0, h_now)) | 249 | res.paste(m, box=(0, h_now)) |
180 | h_now += h | 250 | h_now += h |
181 | res.save(save_path) | 251 | res.save(save_path) |
182 | # else: | ||
183 | # img_dir_path = os.path.join(img_dir, d) | ||
184 | # os.makedirs(img_dir_path, exist_ok=True) | ... | ... |
... | @@ -26,7 +26,21 @@ class DocHandler: | ... | @@ -26,7 +26,21 @@ class DocHandler: |
26 | 26 | ||
27 | @staticmethod | 27 | @staticmethod |
28 | def get_doc_class(business_type): | 28 | def get_doc_class(business_type): |
29 | is_hil = business_type in consts.HIL_SET | 29 | return (HILDoc, consts.HIL_PREFIX) if business_type in consts.HIL_SET else (AFCDoc, consts.AFC_PREFIX) |
30 | doc_class, prefix = (HILDoc, consts.HIL_PREFIX) if is_hil else (AFCDoc, consts.AFC_PREFIX) | 30 | |
31 | return doc_class, prefix | 31 | def fix_scheme(self, scheme): |
32 | if scheme in consts.DOC_SCHEME_LIST: | ||
33 | return scheme | ||
34 | elif scheme.upper() in consts.DOC_SCHEME_LIST: | ||
35 | return scheme.upper() | ||
36 | else: | ||
37 | return consts.DOC_SCHEME_LIST[0] | ||
38 | |||
39 | def fix_data_source(self, data_source): | ||
40 | if data_source in consts.DATA_SOURCE_LIST: | ||
41 | return data_source | ||
42 | elif data_source.upper() in consts.DATA_SOURCE_LIST: | ||
43 | return data_source.upper() | ||
44 | else: | ||
45 | return consts.DATA_SOURCE_LIST[0] | ||
32 | 46 | ... | ... |
... | @@ -7,20 +7,3 @@ class DocStatus(NamedEnum): | ... | @@ -7,20 +7,3 @@ class DocStatus(NamedEnum): |
7 | PROCESS_FAILED = (2, '识别失败') | 7 | PROCESS_FAILED = (2, '识别失败') |
8 | UPLOAD_FAILED = (3, '同步失败') | 8 | UPLOAD_FAILED = (3, '同步失败') |
9 | COMPLETE = (4, '已完成') | 9 | COMPLETE = (4, '已完成') |
10 | |||
11 | |||
12 | class DocScheme(NamedEnum): | ||
13 | ACCEPTANCE = (0, "Acceptance") | ||
14 | SETTLEMENT = (1, 'Settlement') | ||
15 | CONTRACT_MANAGEMENT = (2, 'Contract Management') | ||
16 | |||
17 | |||
18 | class BusinessType(NamedEnum): | ||
19 | AFC = (0, "CO00001") | ||
20 | HIL = (1, 'CO00002') | ||
21 | |||
22 | |||
23 | class DataSource(NamedEnum): | ||
24 | POS = (0, "POS") | ||
25 | EAPP = (1, 'EAPP') | ||
26 | ECONTRACT = (2, 'Econtract') | ... | ... |
... | @@ -60,7 +60,7 @@ doc_list_args = { | ... | @@ -60,7 +60,7 @@ doc_list_args = { |
60 | 'status': fields.Int(required=False, | 60 | 'status': fields.Int(required=False, |
61 | validate=validate.OneOf(DocStatus.get_value_lst())), | 61 | validate=validate.OneOf(DocStatus.get_value_lst())), |
62 | 'application_id': fields.Str(required=False, validate=validate.Length(max=64)), | 62 | 'application_id': fields.Str(required=False, validate=validate.Length(max=64)), |
63 | 'data_source': fields.Str(required=False, validate=validate.Length(max=64)), | 63 | 'data_source': fields.Str(required=False, validate=validate.OneOf(consts.DATA_SOURCE_LIST)), |
64 | 'business_type': fields.Str(required=True, validate=validate.OneOf(consts.BUSINESS_TYPE_LIST)), | 64 | 'business_type': fields.Str(required=True, validate=validate.OneOf(consts.BUSINESS_TYPE_LIST)), |
65 | 'upload_time_start': fields.Date(required=False), | 65 | 'upload_time_start': fields.Date(required=False), |
66 | 'upload_time_end': fields.Date(required=False), | 66 | 'upload_time_end': fields.Date(required=False), |
... | @@ -100,6 +100,8 @@ class UploadDocView(GenericView, DocHandler): | ... | @@ -100,6 +100,8 @@ class UploadDocView(GenericView, DocHandler): |
100 | document = args.get('document') | 100 | document = args.get('document') |
101 | business_type = document.get('businessType') | 101 | business_type = document.get('businessType') |
102 | application_id = application_data.get('applicationId') | 102 | application_id = application_data.get('applicationId') |
103 | document_scheme = document.get('documentScheme') | ||
104 | data_source = document.get('dataSource') | ||
103 | try: | 105 | try: |
104 | # 1. 上传信息记录 | 106 | # 1. 上传信息记录 |
105 | record = UploadDocRecords.objects.create( | 107 | record = UploadDocRecords.objects.create( |
... | @@ -110,9 +112,9 @@ class UploadDocView(GenericView, DocHandler): | ... | @@ -110,9 +112,9 @@ class UploadDocView(GenericView, DocHandler): |
110 | guarantor_1=applicant_data.get('guarantor1Name'), | 112 | guarantor_1=applicant_data.get('guarantor1Name'), |
111 | guarantor_2=applicant_data.get('guarantor2Name'), | 113 | guarantor_2=applicant_data.get('guarantor2Name'), |
112 | document_name=document.get('documentName'), | 114 | document_name=document.get('documentName'), |
113 | document_scheme=document.get('documentScheme'), | 115 | document_scheme=document_scheme, |
114 | business_type=business_type, | 116 | business_type=business_type, |
115 | data_source=document.get('dataSource'), | 117 | data_source=data_source, |
116 | upload_finish_time=document.get('uploadFinishTime'), | 118 | upload_finish_time=document.get('uploadFinishTime'), |
117 | ) | 119 | ) |
118 | except IntegrityError as e: | 120 | except IntegrityError as e: |
... | @@ -130,17 +132,17 @@ class UploadDocView(GenericView, DocHandler): | ... | @@ -130,17 +132,17 @@ class UploadDocView(GenericView, DocHandler): |
130 | guarantor_1=applicant_data.get('guarantor1Name'), | 132 | guarantor_1=applicant_data.get('guarantor1Name'), |
131 | guarantor_2=applicant_data.get('guarantor2Name'), | 133 | guarantor_2=applicant_data.get('guarantor2Name'), |
132 | document_name=document.get('documentName'), | 134 | document_name=document.get('documentName'), |
133 | document_scheme=document.get('documentScheme'), | 135 | document_scheme=self.fix_scheme(document_scheme), |
134 | data_source=document.get('dataSource'), | 136 | data_source=self.fix_data_source(data_source), |
135 | upload_finish_time=document.get('uploadFinishTime'), | 137 | upload_finish_time=document.get('uploadFinishTime'), |
136 | ) | 138 | ) |
137 | # 3. 选择队列进入 | 139 | # 3. 选择队列进入 |
138 | is_priority = PriorityApplication.objects.filter(application_id=application_id, on_off=True).exists() | 140 | is_priority = PriorityApplication.objects.filter(application_id=application_id, on_off=True).exists() |
139 | value = ['{0}_{1}'.format(prefix, doc.id)] | 141 | tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] |
140 | redis_res = rh.enqueue(value, is_priority) | 142 | enqueue_res = rh.enqueue(tasks, is_priority) |
141 | self.running_log.info('[doc upload success] [args={0}] [record_id={1}] [prefix={2}] [doc_id={3}] ' | 143 | self.running_log.info('[doc upload success] [args={0}] [record_id={1}] [business_type={2}] [doc_id={3}] ' |
142 | '[is_priority={4}] [enqueue_res={5}]'.format(args, record.id, prefix, doc.id, | 144 | '[is_priority={4}] [enqueue_res={5}]'.format(args, record.id, prefix, doc.id, |
143 | is_priority, redis_res)) | 145 | is_priority, enqueue_res)) |
144 | return response.ok() | 146 | return response.ok() |
145 | 147 | ||
146 | post.openapi_doc = ''' | 148 | post.openapi_doc = ''' |
... | @@ -174,6 +176,7 @@ class PriorityDocView(GenericView, DocHandler): | ... | @@ -174,6 +176,7 @@ class PriorityDocView(GenericView, DocHandler): |
174 | application_id = application_info.get('APPLICATION_ID') | 176 | application_id = application_info.get('APPLICATION_ID') |
175 | submit_datetime = application_info.get('SUBMIT_DATETIME') | 177 | submit_datetime = application_info.get('SUBMIT_DATETIME') |
176 | entity = application_info.get('ENTITY') | 178 | entity = application_info.get('ENTITY') |
179 | if submit_datetime.utcoffset() is not None: | ||
177 | submit_datetime = timezone.make_naive(submit_datetime, timezone.get_current_timezone()) | 180 | submit_datetime = timezone.make_naive(submit_datetime, timezone.get_current_timezone()) |
178 | GCAPRecords.objects.create( | 181 | GCAPRecords.objects.create( |
179 | entity=entity, | 182 | entity=entity, |
... | @@ -190,14 +193,14 @@ class PriorityDocView(GenericView, DocHandler): | ... | @@ -190,14 +193,14 @@ class PriorityDocView(GenericView, DocHandler): |
190 | doc_class, prefix = self.get_doc_class(entity) | 193 | doc_class, prefix = self.get_doc_class(entity) |
191 | doc_ids = doc_class.objects.filter(application_id=application_id, | 194 | doc_ids = doc_class.objects.filter(application_id=application_id, |
192 | status=DocStatus.INIT.value).values_list('id', flat=True) | 195 | status=DocStatus.INIT.value).values_list('id', flat=True) |
193 | task_str_list = ['{0}_{1}'.format(prefix, doc_id) for doc_id in doc_ids] | 196 | tasks_list = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc_id) for doc_id in doc_ids] |
194 | if not task_str_list: | 197 | if not tasks_list: |
195 | self.running_log.info( | 198 | self.running_log.info( |
196 | '[priority doc success] [args={0}] [task_str_list={1}]'.format(args, task_str_list)) | 199 | '[priority doc success] [args={0}]'.format(args)) |
197 | else: | 200 | else: |
198 | enqueue_res = rh.enqueue(task_str_list, is_priority=True) | 201 | enqueue_res = rh.enqueue(tasks_list, is_priority=True) |
199 | self.running_log.info('[priority doc success] [args={0}] [task_str_list={1}] [enqueue_res={2}]'.format( | 202 | self.running_log.info('[priority doc success] [args={0}] [tasks_list={1}] [enqueue_res={2}]'.format( |
200 | args, task_str_list, enqueue_res)) | 203 | args, tasks_list, enqueue_res)) |
201 | return response.ok() | 204 | return response.ok() |
202 | 205 | ||
203 | post.openapi_doc = ''' | 206 | post.openapi_doc = ''' |
... | @@ -268,7 +271,7 @@ class DocView(GenericView, DocHandler): | ... | @@ -268,7 +271,7 @@ class DocView(GenericView, DocHandler): |
268 | @use_args(upload_pdf_args, location='files') | 271 | @use_args(upload_pdf_args, location='files') |
269 | def post(self, request, args): | 272 | def post(self, request, args): |
270 | # 1. 上传信息记录 | 273 | # 1. 上传信息记录 |
271 | const_str = '手工单' | 274 | const_str = consts.FIXED_APPLICATION_ID |
272 | metadata_version_id = str(int(time.time())) | 275 | metadata_version_id = str(int(time.time())) |
273 | upload_finish_time = timezone.now() | 276 | upload_finish_time = timezone.now() |
274 | document_scheme = random.choice(consts.DOC_SCHEME_LIST) | 277 | document_scheme = random.choice(consts.DOC_SCHEME_LIST) |
... | @@ -305,8 +308,8 @@ class DocView(GenericView, DocHandler): | ... | @@ -305,8 +308,8 @@ class DocView(GenericView, DocHandler): |
305 | ) | 308 | ) |
306 | # 3. 选择队列进入 | 309 | # 3. 选择队列进入 |
307 | is_priority = False | 310 | is_priority = False |
308 | value = ['{0}_{1}'.format(prefix, doc.id)] | 311 | tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] |
309 | redis_res = rh.enqueue(value, is_priority) | 312 | enqueue_res = rh.enqueue(tasks, is_priority) |
310 | 313 | ||
311 | pdf_file = args.get('pdf_file') | 314 | pdf_file = args.get('pdf_file') |
312 | save_dir_path = os.path.join(conf.DATA_DIR, business_type, str(doc.id)) | 315 | save_dir_path = os.path.join(conf.DATA_DIR, business_type, str(doc.id)) |
... | @@ -314,7 +317,7 @@ class DocView(GenericView, DocHandler): | ... | @@ -314,7 +317,7 @@ class DocView(GenericView, DocHandler): |
314 | os.makedirs(save_dir_path, exist_ok=True) | 317 | os.makedirs(save_dir_path, exist_ok=True) |
315 | file_write(pdf_file, save_file_path) | 318 | file_write(pdf_file, save_file_path) |
316 | 319 | ||
317 | self.running_log.info('[mock doc upload success] [args={0}] [record_id={1}] [prefix={2}] [doc_id={3}] ' | 320 | self.running_log.info('[mock doc upload success] [args={0}] [record_id={1}] [business_type={2}] [doc_id={3}] ' |
318 | '[is_priority={4}] [enqueue_res={5}]'.format(args, record.id, prefix, doc.id, | 321 | '[is_priority={4}] [enqueue_res={5}]'.format(args, record.id, prefix, doc.id, |
319 | is_priority, redis_res)) | 322 | is_priority, enqueue_res)) |
320 | return response.ok() | 323 | return response.ok() | ... | ... |
-
Please register or sign in to post a comment