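Fix bug: prefix output file names with a human-readable timestamp, and catch exceptions inside the pdf_2_img_2_queue and img_2_ocr_1 worker loops so a single failure is logged instead of killing the worker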
Showing 2 changed files with 74 additions and 64 deletions
| ... | @@ -6,6 +6,7 @@ import base64 | ... | @@ -6,6 +6,7 @@ import base64 | 
| 6 | import signal | 6 | import signal | 
| 7 | import requests | 7 | import requests | 
| 8 | import traceback | 8 | import traceback | 
| 9 | from datetime import datetime | ||
| 9 | from django.core.management import BaseCommand | 10 | from django.core.management import BaseCommand | 
| 10 | from multiprocessing import Process | 11 | from multiprocessing import Process | 
| 11 | 12 | ||
| ... | @@ -57,7 +58,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -57,7 +58,7 @@ class Command(BaseCommand, LoggerMixin): | 
| 57 | 58 | ||
| 58 | @staticmethod | 59 | @staticmethod | 
| 59 | def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir): | 60 | def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir): | 
| 60 | time_stamp = int(time.time()) | 61 | time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') | 
| 61 | new_name = '{0}_{1}'.format(time_stamp, name) | 62 | new_name = '{0}_{1}'.format(time_stamp, name) | 
| 62 | img_save_path = os.path.join(img_output_dir, new_name) | 63 | img_save_path = os.path.join(img_output_dir, new_name) | 
| 63 | pdf_save_path = os.path.join(pdf_output_dir, new_name) | 64 | pdf_save_path = os.path.join(pdf_output_dir, new_name) | ... | ... | 
| ... | @@ -365,48 +365,52 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -365,48 +365,52 @@ class Command(BaseCommand, LoggerMixin): | 
| 365 | 365 | ||
| 366 | def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock): | 366 | def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock): | 
| 367 | while self.switch: | 367 | while self.switch: | 
| 368 | # 1. 从队列获取文件信息 | ||
| 369 | doc, business_type, task_str = self.get_doc_info() | ||
| 370 | # 队列为空时的处理 | ||
| 371 | if doc is None: | ||
| 372 | time.sleep(self.sleep_time_doc_get) | ||
| 373 | continue | ||
| 374 | |||
| 375 | try: | 368 | try: | 
| 376 | # 2. 从EDMS获取PDF文件 | 369 | # 1. 从队列获取文件信息 | 
| 377 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) | 370 | doc, business_type, task_str = self.get_doc_info() | 
| 378 | os.makedirs(doc_data_path, exist_ok=True) | 371 | # 队列为空时的处理 | 
| 379 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 372 | if doc is None: | 
| 380 | self.pdf_download(doc, pdf_path) | 373 | time.sleep(self.sleep_time_doc_get) | 
| 381 | 374 | continue | |
| 382 | # 3.PDF文件提取图片 | ||
| 383 | self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str)) | ||
| 384 | start_time = time.time() | ||
| 385 | img_save_path = os.path.join(doc_data_path, 'img') | ||
| 386 | pdf_handler = PDFHandler(pdf_path, img_save_path) | ||
| 387 | pdf_handler.extract_image() | ||
| 388 | end_time = time.time() | ||
| 389 | speed_time = int(end_time - start_time) | ||
| 390 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( | ||
| 391 | self.log_base, task_str, speed_time)) | ||
| 392 | |||
| 393 | with lock: | ||
| 394 | todo_count_dict[task_str] = len(pdf_handler.img_path_list) | ||
| 395 | for img_path in pdf_handler.img_path_list: | ||
| 396 | while img_queue.full(): | ||
| 397 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | ||
| 398 | time.sleep(self.sleep_time_img_put) | ||
| 399 | img_queue.put(img_path) | ||
| 400 | except EDMSException as e: | ||
| 401 | doc.status = DocStatus.PROCESS_FAILED.value | ||
| 402 | doc.save() | ||
| 403 | self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( | ||
| 404 | self.log_base, task_str, traceback.format_exc())) | ||
| 405 | except Exception as e: | 375 | except Exception as e: | 
| 406 | doc.status = DocStatus.PROCESS_FAILED.value | 376 | self.cronjob_log.error('{0} [process failed (get doc into)] [error={1}]'.format( | 
| 407 | doc.save() | 377 | self.log_base, traceback.format_exc())) | 
| 408 | self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( | 378 | else: | 
| 409 | self.log_base, task_str, traceback.format_exc())) | 379 | try: | 
| 380 | # 2. 从EDMS获取PDF文件 | ||
| 381 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) | ||
| 382 | os.makedirs(doc_data_path, exist_ok=True) | ||
| 383 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | ||
| 384 | self.pdf_download(doc, pdf_path) | ||
| 385 | |||
| 386 | # 3.PDF文件提取图片 | ||
| 387 | self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str)) | ||
| 388 | start_time = time.time() | ||
| 389 | img_save_path = os.path.join(doc_data_path, 'img') | ||
| 390 | pdf_handler = PDFHandler(pdf_path, img_save_path) | ||
| 391 | pdf_handler.extract_image() | ||
| 392 | end_time = time.time() | ||
| 393 | speed_time = int(end_time - start_time) | ||
| 394 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( | ||
| 395 | self.log_base, task_str, speed_time)) | ||
| 396 | |||
| 397 | with lock: | ||
| 398 | todo_count_dict[task_str] = len(pdf_handler.img_path_list) | ||
| 399 | for img_path in pdf_handler.img_path_list: | ||
| 400 | while img_queue.full(): | ||
| 401 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | ||
| 402 | time.sleep(self.sleep_time_img_put) | ||
| 403 | img_queue.put(img_path) | ||
| 404 | except EDMSException as e: | ||
| 405 | doc.status = DocStatus.PROCESS_FAILED.value | ||
| 406 | doc.save() | ||
| 407 | self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( | ||
| 408 | self.log_base, task_str, traceback.format_exc())) | ||
| 409 | except Exception as e: | ||
| 410 | doc.status = DocStatus.PROCESS_FAILED.value | ||
| 411 | doc.save() | ||
| 412 | self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( | ||
| 413 | self.log_base, task_str, traceback.format_exc())) | ||
| 410 | 414 | ||
| 411 | def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url): | 415 | def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url): | 
| 412 | while True: | 416 | while True: | 
| ... | @@ -418,16 +422,17 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -418,16 +422,17 @@ class Command(BaseCommand, LoggerMixin): | 
| 418 | continue | 422 | continue | 
| 419 | else: | 423 | else: | 
| 420 | self.cronjob_log.info('{0} [img_2_ocr_1] [get img] [img_path={1}]'.format(self.log_base, img_path)) | 424 | self.cronjob_log.info('{0} [img_2_ocr_1] [get img] [img_path={1}]'.format(self.log_base, img_path)) | 
| 421 | with open(img_path, 'rb') as f: | ||
| 422 | base64_data = base64.b64encode(f.read()) | ||
| 423 | # 获取解码后的base64值 | ||
| 424 | file_data = base64_data.decode() | ||
| 425 | json_data_1 = { | ||
| 426 | "file": file_data | ||
| 427 | } | ||
| 428 | 425 | ||
| 429 | for times in range(consts.RETRY_TIMES): | 426 | for times in range(consts.RETRY_TIMES): | 
| 430 | try: | 427 | try: | 
| 428 | with open(img_path, 'rb') as f: | ||
| 429 | base64_data = base64.b64encode(f.read()) | ||
| 430 | # 获取解码后的base64值 | ||
| 431 | file_data = base64_data.decode() | ||
| 432 | json_data_1 = { | ||
| 433 | "file": file_data | ||
| 434 | } | ||
| 435 | |||
| 431 | start_time = time.time() | 436 | start_time = time.time() | 
| 432 | ocr_1_response = requests.post(url, json=json_data_1) | 437 | ocr_1_response = requests.post(url, json=json_data_1) | 
| 433 | if ocr_1_response.status_code != 200: | 438 | if ocr_1_response.status_code != 200: | 
| ... | @@ -447,22 +452,26 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -447,22 +452,26 @@ class Command(BaseCommand, LoggerMixin): | 
| 447 | self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}]'.format(self.log_base, img_path)) | 452 | self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}]'.format(self.log_base, img_path)) | 
| 448 | # continue | 453 | # continue | 
| 449 | 454 | ||
| 450 | del json_data_1 | 455 | try: | 
| 451 | # /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg | 456 | del json_data_1 | 
| 452 | # AFC_2 | 457 | # /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg | 
| 453 | path_split = img_path.split('/') | 458 | # AFC_2 | 
| 454 | task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3])) | 459 | path_split = img_path.split('/') | 
| 455 | 460 | task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3])) | |
| 456 | with lock: | 461 | |
| 457 | doc_res_dict = res_dict.setdefault(task_str, {}) | 462 | with lock: | 
| 458 | doc_res_dict[img_path] = ocr_1_res | 463 | doc_res_dict = res_dict.setdefault(task_str, {}) | 
| 459 | res_dict[task_str] = doc_res_dict | 464 | doc_res_dict[img_path] = ocr_1_res | 
| 460 | todo_count = todo_count_dict.get(task_str) | 465 | res_dict[task_str] = doc_res_dict | 
| 461 | if todo_count == 1: | 466 | todo_count = todo_count_dict.get(task_str) | 
| 462 | finish_queue.put(task_str) | 467 | if todo_count == 1: | 
| 463 | del todo_count_dict[task_str] | 468 | finish_queue.put(task_str) | 
| 464 | else: | 469 | del todo_count_dict[task_str] | 
| 465 | todo_count_dict[task_str] = todo_count - 1 | 470 | else: | 
| 471 | todo_count_dict[task_str] = todo_count - 1 | ||
| 472 | except Exception as e: | ||
| 473 | self.cronjob_log.error('{0} [process failed (store ocr res)] [img_path={1}] [error={2}]'.format( | ||
| 474 | self.log_base, img_path, traceback.format_exc())) | ||
| 466 | 475 | ||
| 467 | def res_2_wb(self, res_dict, finish_queue, lock): | 476 | def res_2_wb(self, res_dict, finish_queue, lock): | 
| 468 | while True: | 477 | while True: | ... | ... | 
- 
Please register or sign in to post a comment