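fix bug: keep worker loops alive on errors (wrap doc fetch, image read and OCR result bookkeeping in try/except) and use a readable timestamp in output file names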
Showing 2 changed files with 74 additions and 64 deletions
... | @@ -6,6 +6,7 @@ import base64 | ... | @@ -6,6 +6,7 @@ import base64 |
6 | import signal | 6 | import signal |
7 | import requests | 7 | import requests |
8 | import traceback | 8 | import traceback |
9 | from datetime import datetime | ||
9 | from django.core.management import BaseCommand | 10 | from django.core.management import BaseCommand |
10 | from multiprocessing import Process | 11 | from multiprocessing import Process |
11 | 12 | ||
... | @@ -57,7 +58,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -57,7 +58,7 @@ class Command(BaseCommand, LoggerMixin): |
57 | 58 | ||
58 | @staticmethod | 59 | @staticmethod |
59 | def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir): | 60 | def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir): |
60 | time_stamp = int(time.time()) | 61 | time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') |
61 | new_name = '{0}_{1}'.format(time_stamp, name) | 62 | new_name = '{0}_{1}'.format(time_stamp, name) |
62 | img_save_path = os.path.join(img_output_dir, new_name) | 63 | img_save_path = os.path.join(img_output_dir, new_name) |
63 | pdf_save_path = os.path.join(pdf_output_dir, new_name) | 64 | pdf_save_path = os.path.join(pdf_output_dir, new_name) | ... | ... |
... | @@ -365,48 +365,52 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -365,48 +365,52 @@ class Command(BaseCommand, LoggerMixin): |
365 | 365 | ||
366 | def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock): | 366 | def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock): |
367 | while self.switch: | 367 | while self.switch: |
368 | # 1. 从队列获取文件信息 | ||
369 | doc, business_type, task_str = self.get_doc_info() | ||
370 | # 队列为空时的处理 | ||
371 | if doc is None: | ||
372 | time.sleep(self.sleep_time_doc_get) | ||
373 | continue | ||
374 | |||
375 | try: | 368 | try: |
376 | # 2. 从EDMS获取PDF文件 | 369 | # 1. 从队列获取文件信息 |
377 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) | 370 | doc, business_type, task_str = self.get_doc_info() |
378 | os.makedirs(doc_data_path, exist_ok=True) | 371 | # 队列为空时的处理 |
379 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 372 | if doc is None: |
380 | self.pdf_download(doc, pdf_path) | 373 | time.sleep(self.sleep_time_doc_get) |
381 | 374 | continue | |
382 | # 3.PDF文件提取图片 | ||
383 | self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str)) | ||
384 | start_time = time.time() | ||
385 | img_save_path = os.path.join(doc_data_path, 'img') | ||
386 | pdf_handler = PDFHandler(pdf_path, img_save_path) | ||
387 | pdf_handler.extract_image() | ||
388 | end_time = time.time() | ||
389 | speed_time = int(end_time - start_time) | ||
390 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( | ||
391 | self.log_base, task_str, speed_time)) | ||
392 | |||
393 | with lock: | ||
394 | todo_count_dict[task_str] = len(pdf_handler.img_path_list) | ||
395 | for img_path in pdf_handler.img_path_list: | ||
396 | while img_queue.full(): | ||
397 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | ||
398 | time.sleep(self.sleep_time_img_put) | ||
399 | img_queue.put(img_path) | ||
400 | except EDMSException as e: | ||
401 | doc.status = DocStatus.PROCESS_FAILED.value | ||
402 | doc.save() | ||
403 | self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( | ||
404 | self.log_base, task_str, traceback.format_exc())) | ||
405 | except Exception as e: | 375 | except Exception as e: |
406 | doc.status = DocStatus.PROCESS_FAILED.value | 376 | self.cronjob_log.error('{0} [process failed (get doc into)] [error={1}]'.format( |
407 | doc.save() | 377 | self.log_base, traceback.format_exc())) |
408 | self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( | 378 | else: |
409 | self.log_base, task_str, traceback.format_exc())) | 379 | try: |
380 | # 2. 从EDMS获取PDF文件 | ||
381 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) | ||
382 | os.makedirs(doc_data_path, exist_ok=True) | ||
383 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | ||
384 | self.pdf_download(doc, pdf_path) | ||
385 | |||
386 | # 3.PDF文件提取图片 | ||
387 | self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str)) | ||
388 | start_time = time.time() | ||
389 | img_save_path = os.path.join(doc_data_path, 'img') | ||
390 | pdf_handler = PDFHandler(pdf_path, img_save_path) | ||
391 | pdf_handler.extract_image() | ||
392 | end_time = time.time() | ||
393 | speed_time = int(end_time - start_time) | ||
394 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( | ||
395 | self.log_base, task_str, speed_time)) | ||
396 | |||
397 | with lock: | ||
398 | todo_count_dict[task_str] = len(pdf_handler.img_path_list) | ||
399 | for img_path in pdf_handler.img_path_list: | ||
400 | while img_queue.full(): | ||
401 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | ||
402 | time.sleep(self.sleep_time_img_put) | ||
403 | img_queue.put(img_path) | ||
404 | except EDMSException as e: | ||
405 | doc.status = DocStatus.PROCESS_FAILED.value | ||
406 | doc.save() | ||
407 | self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( | ||
408 | self.log_base, task_str, traceback.format_exc())) | ||
409 | except Exception as e: | ||
410 | doc.status = DocStatus.PROCESS_FAILED.value | ||
411 | doc.save() | ||
412 | self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( | ||
413 | self.log_base, task_str, traceback.format_exc())) | ||
410 | 414 | ||
411 | def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url): | 415 | def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url): |
412 | while True: | 416 | while True: |
... | @@ -418,16 +422,17 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -418,16 +422,17 @@ class Command(BaseCommand, LoggerMixin): |
418 | continue | 422 | continue |
419 | else: | 423 | else: |
420 | self.cronjob_log.info('{0} [img_2_ocr_1] [get img] [img_path={1}]'.format(self.log_base, img_path)) | 424 | self.cronjob_log.info('{0} [img_2_ocr_1] [get img] [img_path={1}]'.format(self.log_base, img_path)) |
421 | with open(img_path, 'rb') as f: | ||
422 | base64_data = base64.b64encode(f.read()) | ||
423 | # 获取解码后的base64值 | ||
424 | file_data = base64_data.decode() | ||
425 | json_data_1 = { | ||
426 | "file": file_data | ||
427 | } | ||
428 | 425 | ||
429 | for times in range(consts.RETRY_TIMES): | 426 | for times in range(consts.RETRY_TIMES): |
430 | try: | 427 | try: |
428 | with open(img_path, 'rb') as f: | ||
429 | base64_data = base64.b64encode(f.read()) | ||
430 | # 获取解码后的base64值 | ||
431 | file_data = base64_data.decode() | ||
432 | json_data_1 = { | ||
433 | "file": file_data | ||
434 | } | ||
435 | |||
431 | start_time = time.time() | 436 | start_time = time.time() |
432 | ocr_1_response = requests.post(url, json=json_data_1) | 437 | ocr_1_response = requests.post(url, json=json_data_1) |
433 | if ocr_1_response.status_code != 200: | 438 | if ocr_1_response.status_code != 200: |
... | @@ -447,22 +452,26 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -447,22 +452,26 @@ class Command(BaseCommand, LoggerMixin): |
447 | self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}]'.format(self.log_base, img_path)) | 452 | self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}]'.format(self.log_base, img_path)) |
448 | # continue | 453 | # continue |
449 | 454 | ||
450 | del json_data_1 | 455 | try: |
451 | # /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg | 456 | del json_data_1 |
452 | # AFC_2 | 457 | # /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg |
453 | path_split = img_path.split('/') | 458 | # AFC_2 |
454 | task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3])) | 459 | path_split = img_path.split('/') |
455 | 460 | task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3])) | |
456 | with lock: | 461 | |
457 | doc_res_dict = res_dict.setdefault(task_str, {}) | 462 | with lock: |
458 | doc_res_dict[img_path] = ocr_1_res | 463 | doc_res_dict = res_dict.setdefault(task_str, {}) |
459 | res_dict[task_str] = doc_res_dict | 464 | doc_res_dict[img_path] = ocr_1_res |
460 | todo_count = todo_count_dict.get(task_str) | 465 | res_dict[task_str] = doc_res_dict |
461 | if todo_count == 1: | 466 | todo_count = todo_count_dict.get(task_str) |
462 | finish_queue.put(task_str) | 467 | if todo_count == 1: |
463 | del todo_count_dict[task_str] | 468 | finish_queue.put(task_str) |
464 | else: | 469 | del todo_count_dict[task_str] |
465 | todo_count_dict[task_str] = todo_count - 1 | 470 | else: |
471 | todo_count_dict[task_str] = todo_count - 1 | ||
472 | except Exception as e: | ||
473 | self.cronjob_log.error('{0} [process failed (store ocr res)] [img_path={1}] [error={2}]'.format( | ||
474 | self.log_base, img_path, traceback.format_exc())) | ||
466 | 475 | ||
467 | def res_2_wb(self, res_dict, finish_queue, lock): | 476 | def res_2_wb(self, res_dict, finish_queue, lock): |
468 | while True: | 477 | while True: | ... | ... |
-
Please register or sign in to post a comment