Merge branch 'feature/zip'
Showing 4 changed files with 311 additions and 32 deletions
| ... | @@ -16,7 +16,7 @@ from multiprocessing import Process, Queue, Manager, Lock | ... | @@ -16,7 +16,7 @@ from multiprocessing import Process, Queue, Manager, Lock |
| 16 | 16 | ||
| 17 | from settings import conf | 17 | from settings import conf |
| 18 | from common.mixins import LoggerMixin | 18 | from common.mixins import LoggerMixin |
| 19 | from common.tools.file_tools import write_zip_file | 19 | from common.tools.file_tools import get_pwd_list_from_str, extract_zip_or_rar, get_file_paths |
| 20 | from common.tools.pdf_to_img import PDFHandler | 20 | from common.tools.pdf_to_img import PDFHandler |
| 21 | from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict | 21 | from common.electronic_afc_contract.afc_contract_ocr import predict as afc_predict |
| 22 | from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict | 22 | from common.electronic_hil_contract.hil_contract_ocr import predict as hil_predict |
| ... | @@ -89,14 +89,39 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -89,14 +89,39 @@ class Command(BaseCommand, LoggerMixin): |
| 89 | # doc = doc_class.objects.filter(id=doc_id).first() | 89 | # doc = doc_class.objects.filter(id=doc_id).first() |
| 90 | # return doc, business_type | 90 | # return doc, business_type |
| 91 | 91 | ||
| 92 | def get_doc_info(self): | 92 | def get_zip_doc_info(self, task_str): |
| 93 | task_str, is_priority = rh.dequeue() | 93 | try: |
| 94 | if task_str is None: | 94 | info_tuple = task_str.split(consts.SPLIT_STR) |
| 95 | self.online_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) | 95 | if len(info_tuple) == 2: |
| 96 | return None, None, None, None | 96 | business_type, doc_id_str = info_tuple |
| 97 | else: | ||
| 98 | business_type, doc_id_str, classify_1_str = info_tuple | ||
| 99 | doc_id = int(doc_id_str) | ||
| 100 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc | ||
| 101 | zip_doc = doc_class.objects.filter(id=doc_id).first() | ||
| 102 | |||
| 103 | if zip_doc is None: | ||
| 104 | self.online_log.warn('{0} [zip_2_pdfs] [doc not exist] [task_str={1}]'.format( | ||
| 105 | self.log_base, task_str)) | ||
| 106 | return None, business_type | ||
| 107 | elif zip_doc.status != DocStatus.INIT.value: | ||
| 108 | self.online_log.warn('{0} [zip_2_pdfs] [doc status error] [task_str={1}] [doc_status={2}]'.format( | ||
| 109 | self.log_base, task_str, zip_doc.status)) | ||
| 110 | return None, business_type | ||
| 111 | |||
| 112 | zip_doc.status = DocStatus.PROCESSING.value | ||
| 113 | zip_doc.start_time = timezone.now() | ||
| 114 | zip_doc.save() | ||
| 115 | except Exception as e: | ||
| 116 | self.online_log.error('{0} [process error (zip_2_pdfs)] [error={1}]'.format( | ||
| 117 | self.log_base, traceback.format_exc())) | ||
| 118 | return None, None | ||
| 119 | else: | ||
| 120 | self.online_log.info('{0} [zip_2_pdfs] [db save end] [task_str={1}]'.format( | ||
| 121 | self.log_base, task_str)) | ||
| 122 | return zip_doc, business_type | ||
| 97 | 123 | ||
| 98 | self.online_log.info('{0} [get_doc_info] [task={1}] [is_priority={2}]'.format( | 124 | def get_doc_info(self, task_str, is_priority=False): |
| 99 | self.log_base, task_str, is_priority)) | ||
| 100 | try: | 125 | try: |
| 101 | # doc, business_type = self.get_doc_object(task_str) | 126 | # doc, business_type = self.get_doc_object(task_str) |
| 102 | info_tuple = task_str.split(consts.SPLIT_STR) | 127 | info_tuple = task_str.split(consts.SPLIT_STR) |
| ... | @@ -1094,11 +1119,153 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1094,11 +1119,153 @@ class Command(BaseCommand, LoggerMixin): |
| 1094 | # summary['confidence'] = max(summary['confidence']) | 1119 | # summary['confidence'] = max(summary['confidence']) |
| 1095 | return merged_bs_summary | 1120 | return merged_bs_summary |
| 1096 | 1121 | ||
| 1097 | def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock, error_list, res_dict, finish_queue): | 1122 | def zip_2_pdfs(self, zip_task_queue, error_list): |
| 1123 | while len(error_list) == 0: | ||
| 1124 | # 1. 从redis队列中读取任务: AFC_111_0 | ||
| 1125 | task_str = rh.dequeue_zip() | ||
| 1126 | if task_str is None: | ||
| 1127 | self.online_log.info('{0} [zip_2_pdfs] [zip queue empty]'.format(self.log_base)) | ||
| 1128 | time.sleep(self.sleep_time_doc_get) | ||
| 1129 | continue | ||
| 1130 | |||
| 1131 | self.online_log.info('{0} [zip_2_pdfs] [task={1}]'.format(self.log_base, task_str)) | ||
| 1132 | |||
| 1133 | # 2. 修改doc状态: 识别中 | ||
| 1134 | zip_doc, business_type = self.get_zip_doc_info(task_str) | ||
| 1135 | if zip_doc is None: | ||
| 1136 | time.sleep(self.sleep_time_doc_get) | ||
| 1137 | continue | ||
| 1138 | |||
| 1139 | # 3. 从ECM下载压缩包 | ||
| 1140 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(zip_doc.id)) | ||
| 1141 | os.makedirs(doc_data_path, exist_ok=True) | ||
| 1142 | zip_path = os.path.join(doc_data_path, zip_doc.document_name) | ||
| 1143 | for times in range(consts.RETRY_TIMES): | ||
| 1144 | try: | ||
| 1145 | self.edms.download(zip_path, zip_doc.metadata_version_id, zip_doc.document_scheme, business_type) | ||
| 1146 | except Exception as e: | ||
| 1147 | self.online_log.warn('{0} [zip_2_pdfs] [ecm download failed] [task={1}] [times={2}] ' | ||
| 1148 | '[error={3}]'.format(self.log_base, task_str, times, | ||
| 1149 | traceback.format_exc())) | ||
| 1150 | else: | ||
| 1151 | self.online_log.info('{0} [zip_2_pdfs] [ecm download success] [task={1}] [times={2}] ' | ||
| 1152 | '[zip_path={3}]'.format(self.log_base, task_str, times, zip_path)) | ||
| 1153 | break | ||
| 1154 | else: | ||
| 1155 | try: | ||
| 1156 | zip_doc.status = DocStatus.PROCESS_FAILED.value | ||
| 1157 | zip_doc.save() | ||
| 1158 | except Exception as e: | ||
| 1159 | self.online_log.error('{0} [zip_2_pdfs] [process error (db save)] [task={1}] [error={2}]'.format( | ||
| 1160 | self.log_base, task_str, traceback.format_exc())) | ||
| 1161 | time.sleep(self.sleep_time_doc_get) | ||
| 1162 | continue | ||
| 1163 | |||
| 1164 | # 4. 解压 | ||
| 1165 | extract_path = os.path.join(doc_data_path, 'extract_content') | ||
| 1166 | os.makedirs(extract_path, exist_ok=True) | ||
| 1167 | try: | ||
| 1168 | pwd_list = get_pwd_list_from_str(zip_doc.document_name) | ||
| 1169 | is_success = extract_zip_or_rar(zip_path, extract_path, pwd_list) | ||
| 1170 | except Exception as e: | ||
| 1171 | is_success = False | ||
| 1172 | |||
| 1173 | if not is_success: | ||
| 1174 | self.online_log.warn('{0} [zip_2_pdfs] [extract failed] [task={1}] [error={2}]'.format( | ||
| 1175 | self.log_base, task_str, traceback.format_exc())) | ||
| 1176 | try: | ||
| 1177 | zip_doc.status = DocStatus.PROCESS_FAILED.value | ||
| 1178 | zip_doc.save() | ||
| 1179 | except Exception as e: | ||
| 1180 | self.online_log.error('{0} [zip_2_pdfs] [process error (db save)] [task={1}] [error={2}]'.format( | ||
| 1181 | self.log_base, task_str, traceback.format_exc())) | ||
| 1182 | time.sleep(self.sleep_time_doc_get) | ||
| 1183 | continue | ||
| 1184 | |||
| 1185 | self.online_log.info('{0} [zip_2_pdfs] [extract success] [task={1}] [extract_path={2}]'.format( | ||
| 1186 | self.log_base, task_str, extract_path)) | ||
| 1187 | |||
| 1188 | # 5. 找出PDF文件重命名并移动到目标文件夹中。新建doc记录,新建task_str进入队列 | ||
| 1189 | pdf_paths = get_file_paths(extract_path, ['.pdf', '.PDF']) | ||
| 1190 | count = 0 | ||
| 1191 | pdf_task_str_list = [] | ||
| 1192 | for pdf_path in pdf_paths: | ||
| 1193 | if count > 50: | ||
| 1194 | self.online_log.info('{0} [zip_2_pdfs] [pdf count > 50, skip] [task={1}]'.format( | ||
| 1195 | self.log_base, task_str)) | ||
| 1196 | break | ||
| 1197 | |||
| 1198 | count += 1 | ||
| 1199 | try: | ||
| 1200 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc | ||
| 1201 | pdf_doc = doc_class.objects.create( | ||
| 1202 | metadata_version_id='from: {0}'.format(zip_doc.id), | ||
| 1203 | application_id=zip_doc.application_id, | ||
| 1204 | # main_applicant=applicant_data.get('mainApplicantName'), | ||
| 1205 | # co_applicant=applicant_data.get('coApplicantName'), | ||
| 1206 | # guarantor_1=applicant_data.get('guarantor1Name'), | ||
| 1207 | # guarantor_2=applicant_data.get('guarantor2Name'), | ||
| 1208 | document_name=os.path.basename(pdf_path), | ||
| 1209 | document_scheme=zip_doc.document_scheme, | ||
| 1210 | data_source=zip_doc.data_source, | ||
| 1211 | upload_finish_time=zip_doc.upload_finish_time, | ||
| 1212 | ) | ||
| 1213 | |||
| 1214 | pdf_doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(pdf_doc.id)) | ||
| 1215 | os.makedirs(pdf_doc_data_path, exist_ok=True) | ||
| 1216 | target_pdf_path = os.path.join(pdf_doc_data_path, '{0}.pdf'.format(pdf_doc.id)) | ||
| 1217 | shutil.move(pdf_path, target_pdf_path) | ||
| 1218 | |||
| 1219 | pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0']) | ||
| 1220 | pdf_task_str_list.append(pdf_task_str) | ||
| 1221 | except Exception as e: | ||
| 1222 | self.online_log.warn('{0} [zip_2_pdfs] [recreate pdf task failed] [task={1}] [pdf_path={2}]' | ||
| 1223 | ' [error={3}]'.format(self.log_base, task_str, pdf_path, | ||
| 1224 | traceback.format_exc())) | ||
| 1225 | else: | ||
| 1226 | self.online_log.info('{0} [zip_2_pdfs] [recreate pdf task success] [task={1}] ' | ||
| 1227 | '[pdf_task={2}]'.format(self.log_base, task_str, pdf_path, | ||
| 1228 | traceback.format_exc())) | ||
| 1229 | |||
| 1230 | if len(pdf_task_str_list) > 0: | ||
| 1231 | for pdf_task_str in pdf_task_str_list: | ||
| 1232 | try: | ||
| 1233 | zip_task_queue.put(pdf_task_str) | ||
| 1234 | except Exception as e: | ||
| 1235 | self.online_log.warn('{0} [zip_2_pdfs] [put pdf task failed] [task={1}] [pdf_task={2}]' | ||
| 1236 | ' [error={3}]'.format(self.log_base, task_str, pdf_task_str, | ||
| 1237 | traceback.format_exc())) | ||
| 1238 | else: | ||
| 1239 | self.online_log.info('{0} [zip_2_pdfs] [zip task no pdf] [task={1}]'.format(self.log_base, task_str)) | ||
| 1240 | |||
| 1241 | # 6. 完成,修改doc状态:识别完成 | ||
| 1242 | try: | ||
| 1243 | zip_doc.status = DocStatus.COMPLETE.value | ||
| 1244 | zip_doc.end_time = timezone.now() | ||
| 1245 | zip_doc.duration = min((zip_doc.end_time - zip_doc.start_time).seconds, 32760) | ||
| 1246 | zip_doc.save() | ||
| 1247 | except Exception as e: | ||
| 1248 | self.online_log.error('{0} [zip_2_pdfs] [process error (db save)] [task={1}] [error={2}]'.format( | ||
| 1249 | self.log_base, task_str, traceback.format_exc())) | ||
| 1250 | |||
| 1251 | def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock, error_list, res_dict, finish_queue, zip_task_queue): | ||
| 1098 | while self.switch: | 1252 | while self.switch: |
| 1099 | try: | 1253 | try: |
| 1254 | task_str = zip_task_queue.get(block=False) | ||
| 1255 | is_priority = False | ||
| 1256 | except Exception as e: | ||
| 1257 | task_str, is_priority = rh.dequeue() | ||
| 1258 | if task_str is None: | ||
| 1259 | self.online_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) | ||
| 1260 | time.sleep(self.sleep_time_doc_get) | ||
| 1261 | continue | ||
| 1262 | |||
| 1263 | self.online_log.info('{0} [get_doc_info] [task={1}] [is_priority={2}]'.format( | ||
| 1264 | self.log_base, task_str, is_priority)) | ||
| 1265 | |||
| 1266 | try: | ||
| 1100 | # 1. 从队列获取文件信息 | 1267 | # 1. 从队列获取文件信息 |
| 1101 | doc, business_type, task_str, classify_1_str = self.get_doc_info() | 1268 | doc, business_type, task_str, classify_1_str = self.get_doc_info(task_str, is_priority) |
| 1102 | # 队列为空时的处理 | 1269 | # 队列为空时的处理 |
| 1103 | if doc is None: | 1270 | if doc is None: |
| 1104 | time.sleep(self.sleep_time_doc_get) | 1271 | time.sleep(self.sleep_time_doc_get) |
| ... | @@ -1119,19 +1286,29 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1119,19 +1286,29 @@ class Command(BaseCommand, LoggerMixin): |
| 1119 | if classify_1_str == '0': | 1286 | if classify_1_str == '0': |
| 1120 | try: | 1287 | try: |
| 1121 | # 2. 从EDMS获取PDF文件 | 1288 | # 2. 从EDMS获取PDF文件 |
| 1122 | max_count_obj = Configs.objects.filter(id=2).first() | 1289 | # max_count_obj = Configs.objects.filter(id=2).first() |
| 1123 | try: | 1290 | # try: |
| 1124 | max_img_count = int(max_count_obj.value) | 1291 | # max_img_count = int(max_count_obj.value) |
| 1125 | except Exception as e: | 1292 | # except Exception as e: |
| 1126 | max_img_count = 500 | 1293 | max_img_count = 500 |
| 1127 | 1294 | ||
| 1128 | for times in range(consts.RETRY_TIMES): | 1295 | for times in range(consts.RETRY_TIMES): |
| 1129 | try: | 1296 | try: |
| 1130 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | 1297 | if doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
| 1298 | self.online_log.info('{0} [mo ni xia dan] [task={1}] [times={2}] ' | ||
| 1299 | '[pdf_path={3}]'.format(self.log_base, task_str, | ||
| 1300 | times, pdf_path)) | ||
| 1301 | elif os.path.exists(pdf_path): | ||
| 1302 | self.online_log.info('{0} [pdf from zip file] [task={1}] [times={2}] ' | ||
| 1303 | '[pdf_path={3}]'.format(self.log_base, task_str, | ||
| 1304 | times, pdf_path)) | ||
| 1305 | else: | ||
| 1131 | # self.edms.download(pdf_path, doc.metadata_version_id) | 1306 | # self.edms.download(pdf_path, doc.metadata_version_id) |
| 1132 | self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme, business_type) | 1307 | self.edms.download(pdf_path, doc.metadata_version_id, doc.document_scheme, |
| 1133 | self.online_log.info('{0} [edms download success] [task={1}] [times={2}] ' | 1308 | business_type) |
| 1134 | '[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path)) | 1309 | self.online_log.info('{0} [ecm download success] [task={1}] [times={2}] ' |
| 1310 | '[pdf_path={3}]'.format(self.log_base, task_str, | ||
| 1311 | times, pdf_path)) | ||
| 1135 | 1312 | ||
| 1136 | # 3.PDF文件提取图片 | 1313 | # 3.PDF文件提取图片 |
| 1137 | self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( | 1314 | self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( |
| ... | @@ -2098,9 +2275,16 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -2098,9 +2275,16 @@ class Command(BaseCommand, LoggerMixin): |
| 2098 | res_dict = manager.dict() | 2275 | res_dict = manager.dict() |
| 2099 | img_queue = Queue(self.img_queue_size) | 2276 | img_queue = Queue(self.img_queue_size) |
| 2100 | finish_queue = Queue() | 2277 | finish_queue = Queue() |
| 2278 | zip_task_queue = Queue() | ||
| 2101 | 2279 | ||
| 2102 | process_list = [] | 2280 | process_list = [] |
| 2103 | pdf_process = Process(target=self.pdf_2_img_2_queue, args=(img_queue, todo_count_dict, lock, error_list, res_dict, finish_queue)) | 2281 | zip_process = Process(target=self.zip_2_pdfs, |
| 2282 | args=(zip_task_queue, error_list)) | ||
| 2283 | process_list.append(zip_process) | ||
| 2284 | |||
| 2285 | pdf_process = Process(target=self.pdf_2_img_2_queue, | ||
| 2286 | args=(img_queue, todo_count_dict, lock, error_list, res_dict, | ||
| 2287 | finish_queue, zip_task_queue)) | ||
| 2104 | process_list.append(pdf_process) | 2288 | process_list.append(pdf_process) |
| 2105 | 2289 | ||
| 2106 | for url in self.ocr_1_urls.values(): | 2290 | for url in self.ocr_1_urls.values(): | ... | ... |
| ... | @@ -570,15 +570,14 @@ class UploadDocView(GenericView, DocHandler): | ... | @@ -570,15 +570,14 @@ class UploadDocView(GenericView, DocHandler): |
| 570 | data_source = self.fix_data_source(data_source) | 570 | data_source = self.fix_data_source(data_source) |
| 571 | document_scheme = self.fix_scheme(document_scheme) | 571 | document_scheme = self.fix_scheme(document_scheme) |
| 572 | 572 | ||
| 573 | if document_name.endswith('.zip'): | 573 | # if document_name.endswith('.zip'): |
| 574 | self.running_log.info('[doc upload success] [zip file skip] [args={0}]'.format(args)) | 574 | # self.running_log.info('[doc upload success] [zip file skip] [args={0}]'.format(args)) |
| 575 | return response.ok() | 575 | # return response.ok() |
| 576 | 576 | ||
| 577 | if data_source == consts.DATA_SOURCE_LIST[1]: | 577 | if data_source == consts.DATA_SOURCE_LIST[1]: |
| 578 | if isinstance(document_name, str): | 578 | if document_name.endswith('-证书.pdf') or document_name.endswith('-证书'): |
| 579 | if document_name.endswith('-证书.pdf') or document_name.endswith('-证书'): | 579 | self.running_log.info('[doc upload success] [eapp license skip] [args={0}]'.format(args)) |
| 580 | self.running_log.info('[doc upload success] [eapp license skip] [args={0}]'.format(args)) | 580 | return response.ok() |
| 581 | return response.ok() | ||
| 582 | 581 | ||
| 583 | # 2. 根据业务类型分库存储 | 582 | # 2. 根据业务类型分库存储 |
| 584 | doc_class, prefix = self.get_doc_class(business_type) | 583 | doc_class, prefix = self.get_doc_class(business_type) |
| ... | @@ -594,17 +593,24 @@ class UploadDocView(GenericView, DocHandler): | ... | @@ -594,17 +593,24 @@ class UploadDocView(GenericView, DocHandler): |
| 594 | data_source=data_source, | 593 | data_source=data_source, |
| 595 | upload_finish_time=document.get('uploadFinishTime'), | 594 | upload_finish_time=document.get('uploadFinishTime'), |
| 596 | ) | 595 | ) |
| 596 | |||
| 597 | # 3. 选择队列进入 | 597 | # 3. 选择队列进入 |
| 598 | is_priority = PriorityApplication.objects.filter(application_id=application_id, on_off=True).exists() | 598 | is_priority = PriorityApplication.objects.filter(application_id=application_id, on_off=True).exists() |
| 599 | 599 | is_zip = False | |
| 600 | |||
| 600 | classify_1 = 0 | 601 | classify_1 = 0 |
| 602 | # 电子合同 | ||
| 601 | if data_source == consts.DATA_SOURCE_LIST[-1] and document_scheme == consts.DOC_SCHEME_LIST[1]: | 603 | if data_source == consts.DATA_SOURCE_LIST[-1] and document_scheme == consts.DOC_SCHEME_LIST[1]: |
| 602 | for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix): | 604 | for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix): |
| 603 | if keyword in document_name: | 605 | if keyword in document_name: |
| 604 | classify_1 = classify_1_tmp | 606 | classify_1 = classify_1_tmp |
| 605 | break | 607 | break |
| 608 | elif document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \ | ||
| 609 | or document_name.endswith('.RAR'): | ||
| 610 | is_zip = True | ||
| 611 | |||
| 606 | task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)]) | 612 | task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)]) |
| 607 | enqueue_res = rh.enqueue([task], is_priority) | 613 | enqueue_res = rh.enqueue([task], is_priority, is_zip) |
| 608 | self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] ' | 614 | self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] ' |
| 609 | '[is_priority={3}] [enqueue_res={4}]'.format(args, prefix, doc.id, | 615 | '[is_priority={3}] [enqueue_res={4}]'.format(args, prefix, doc.id, |
| 610 | is_priority, enqueue_res)) | 616 | is_priority, enqueue_res)) |
| ... | @@ -669,7 +675,7 @@ class PriorityDocView(GenericView, DocHandler): | ... | @@ -669,7 +675,7 @@ class PriorityDocView(GenericView, DocHandler): |
| 669 | self.running_log.info( | 675 | self.running_log.info( |
| 670 | '[priority doc success] [args={0}]'.format(args)) | 676 | '[priority doc success] [args={0}]'.format(args)) |
| 671 | else: | 677 | else: |
| 672 | enqueue_res = rh.enqueue(tasks_list, is_priority=True) | 678 | enqueue_res = rh.enqueue(tasks_list, is_priority=True) # TODO 可能把压缩文件放入优先队列 |
| 673 | self.running_log.info('[priority doc success] [args={0}] [tasks_list={1}] [enqueue_res={2}]'.format( | 679 | self.running_log.info('[priority doc success] [args={0}] [tasks_list={1}] [enqueue_res={2}]'.format( |
| 674 | args, tasks_list, enqueue_res)) | 680 | args, tasks_list, enqueue_res)) |
| 675 | return response.ok() | 681 | return response.ok() | ... | ... |
| ... | @@ -35,16 +35,27 @@ class RedisHandler: | ... | @@ -35,16 +35,27 @@ class RedisHandler: |
| 35 | self.prefix = 'bwm_ocr' | 35 | self.prefix = 'bwm_ocr' |
| 36 | self.common_queue_key = '{0}:common_queue'.format(self.prefix) | 36 | self.common_queue_key = '{0}:common_queue'.format(self.prefix) |
| 37 | self.priority_queue_key = '{0}:priority_queue'.format(self.prefix) | 37 | self.priority_queue_key = '{0}:priority_queue'.format(self.prefix) |
| 38 | self.zip_queue_key = '{0}:zip_queue'.format(self.prefix) | ||
| 38 | self.session_id_key = '{0}:session_id'.format(self.prefix) | 39 | self.session_id_key = '{0}:session_id'.format(self.prefix) |
| 39 | self.cms_token_key = '{0}:cms_token'.format(self.prefix) | 40 | self.cms_token_key = '{0}:cms_token'.format(self.prefix) |
| 40 | self.ecm_token_key = '{0}:ecm_token'.format(self.prefix) | 41 | self.ecm_token_key = '{0}:ecm_token'.format(self.prefix) |
| 41 | self.login_limit_key = '{0}:login_limit'.format(self.prefix) | 42 | self.login_limit_key = '{0}:login_limit'.format(self.prefix) |
| 42 | 43 | ||
| 43 | def enqueue(self, tasks, is_priority=False): | 44 | def enqueue(self, tasks, is_priority=False, is_zip=False): |
| 44 | # 1 | 45 | # 1 |
| 45 | key = self.priority_queue_key if is_priority else self.common_queue_key | 46 | if is_zip: |
| 47 | key = self.zip_queue_key | ||
| 48 | elif is_priority: | ||
| 49 | key = self.priority_queue_key | ||
| 50 | else: | ||
| 51 | key = self.common_queue_key | ||
| 46 | return self.redis.lpush(key, tasks) | 52 | return self.redis.lpush(key, tasks) |
| 47 | 53 | ||
| 54 | def dequeue_zip(self): | ||
| 55 | # task or None | ||
| 56 | task = self.redis.rpop(self.zip_queue_key) | ||
| 57 | return task | ||
| 58 | |||
| 48 | def dequeue(self): | 59 | def dequeue(self): |
| 49 | # task or None | 60 | # task or None |
| 50 | task = self.redis.rpop(self.priority_queue_key) | 61 | task = self.redis.rpop(self.priority_queue_key) | ... | ... |
| 1 | import os | 1 | import os |
| 2 | import re | ||
| 3 | import zipfile | ||
| 4 | |||
| 5 | import rarfile | ||
| 2 | from zipfile import ZipFile | 6 | from zipfile import ZipFile |
| 3 | 7 | ||
| 4 | 8 | ||
| ... | @@ -18,3 +22,77 @@ def write_zip_file(dir_name, zipfile_path): | ... | @@ -18,3 +22,77 @@ def write_zip_file(dir_name, zipfile_path): |
| 18 | src_file_path = os.path.join(root, single_file) | 22 | src_file_path = os.path.join(root, single_file) |
| 19 | file_target_path = os.path.join(root_target_path, single_file) | 23 | file_target_path = os.path.join(root_target_path, single_file) |
| 20 | z.write(src_file_path, file_target_path) | 24 | z.write(src_file_path, file_target_path) |
| 25 | |||
| 26 | |||
def get_pwd_list_from_str(doc_name):
    """Collect candidate archive passwords from a document name.

    Every non-overlapping run of six consecutive digits found in
    ``doc_name`` is treated as one candidate password.

    Args:
        doc_name: str document (file) name to scan.

    Returns:
        list of six-digit strings in order of first appearance, without
        duplicates. An empty list when ``doc_name`` is not a string or
        contains no six-digit run.
    """
    if not isinstance(doc_name, str):
        # Non-string input (e.g. None) yields no candidates instead of raising.
        return []
    # dict.fromkeys drops repeated candidates while keeping first-seen order,
    # so the same password is never tried twice by the caller.
    return list(dict.fromkeys(re.findall(r'\d{6}', doc_name)))
| 33 | |||
| 34 | |||
def extract_zip_or_rar(file_path, extract_path, pwd_list=None):
    """Extract a .zip or .rar archive, trying candidate passwords in order.

    Args:
        file_path: str path of the archive. The archive format is chosen
            from the file extension, matched case-insensitively.
        extract_path: str directory the archive members are extracted into.
        pwd_list: optional iterable of str candidate passwords. When it is
            None or empty, a single extraction without a password is
            attempted.

    Returns:
        bool: True as soon as one attempt extracts without raising; False
        when every attempt fails or the extension is neither .zip nor .rar.
    """
    lowered = file_path.lower()
    if lowered.endswith('.zip'):
        archive_class = zipfile.ZipFile
        encode_password = True  # zipfile takes the password as bytes
    elif lowered.endswith('.rar'):
        archive_class = rarfile.RarFile
        encode_password = False  # the password is handed over as str, unchanged
    else:
        return False

    candidates = list(pwd_list) if pwd_list is not None else []
    if not candidates:
        # No candidate password: make exactly one attempt without a password.
        candidates = [None]

    for password in candidates:
        try:
            if password is not None and encode_password:
                password = password.encode('utf-8')
            with archive_class(file_path) as archive:
                archive.extractall(extract_path, pwd=password)
        except Exception:
            # Deliberate best effort: a wrong password, a corrupt archive or an
            # unusable candidate all mean "this attempt failed, try the next".
            continue
        return True
    return False
| 78 | |||
| 79 | |||
def get_file_paths(input_path, suffix_list):
    """Lazily walk a directory tree and yield files with a matching suffix.

    Args:
        input_path: str root directory; it is searched recursively.
        suffix_list: list of str file-name suffixes to accept, for example
            ['.pdf', '.PDF']. Matching is case-sensitive.

    Yields:
        str: path of each matching file, built by joining the directory
        being walked with the file name. The path is absolute only when
        ``input_path`` itself is absolute.
    """
    # str.endswith accepts a tuple of alternatives, so one call covers them all.
    suffixes = tuple(suffix_list)
    for parent, _, filenames in os.walk(input_path):
        for filename in filenames:
            if filename.endswith(suffixes):
                yield os.path.join(parent, filename)
-
Please register or sign in to post a comment