6934c592 by 周伟奇

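Fix bug: use a human-readable timestamp in output file names, and add exception handling around fetching doc info, reading image files, and storing OCR results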

1 parent b34cd942
...@@ -6,6 +6,7 @@ import base64 ...@@ -6,6 +6,7 @@ import base64
6 import signal 6 import signal
7 import requests 7 import requests
8 import traceback 8 import traceback
9 from datetime import datetime
9 from django.core.management import BaseCommand 10 from django.core.management import BaseCommand
10 from multiprocessing import Process 11 from multiprocessing import Process
11 12
...@@ -57,7 +58,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -57,7 +58,7 @@ class Command(BaseCommand, LoggerMixin):
57 58
58 @staticmethod 59 @staticmethod
59 def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir): 60 def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir):
60 time_stamp = int(time.time()) 61 time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
61 new_name = '{0}_{1}'.format(time_stamp, name) 62 new_name = '{0}_{1}'.format(time_stamp, name)
62 img_save_path = os.path.join(img_output_dir, new_name) 63 img_save_path = os.path.join(img_output_dir, new_name)
63 pdf_save_path = os.path.join(pdf_output_dir, new_name) 64 pdf_save_path = os.path.join(pdf_output_dir, new_name)
......
...@@ -365,48 +365,52 @@ class Command(BaseCommand, LoggerMixin): ...@@ -365,48 +365,52 @@ class Command(BaseCommand, LoggerMixin):
365 365
366 def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock): 366 def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock):
367 while self.switch: 367 while self.switch:
368 # 1. 从队列获取文件信息
369 doc, business_type, task_str = self.get_doc_info()
370 # 队列为空时的处理
371 if doc is None:
372 time.sleep(self.sleep_time_doc_get)
373 continue
374
375 try: 368 try:
376 # 2. 从EDMS获取PDF文件 369 # 1. 从队列获取文件信息
377 doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) 370 doc, business_type, task_str = self.get_doc_info()
378 os.makedirs(doc_data_path, exist_ok=True) 371 # 队列为空时的处理
379 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) 372 if doc is None:
380 self.pdf_download(doc, pdf_path) 373 time.sleep(self.sleep_time_doc_get)
381 374 continue
382 # 3.PDF文件提取图片
383 self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str))
384 start_time = time.time()
385 img_save_path = os.path.join(doc_data_path, 'img')
386 pdf_handler = PDFHandler(pdf_path, img_save_path)
387 pdf_handler.extract_image()
388 end_time = time.time()
389 speed_time = int(end_time - start_time)
390 self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format(
391 self.log_base, task_str, speed_time))
392
393 with lock:
394 todo_count_dict[task_str] = len(pdf_handler.img_path_list)
395 for img_path in pdf_handler.img_path_list:
396 while img_queue.full():
397 self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
398 time.sleep(self.sleep_time_img_put)
399 img_queue.put(img_path)
400 except EDMSException as e:
401 doc.status = DocStatus.PROCESS_FAILED.value
402 doc.save()
403 self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
404 self.log_base, task_str, traceback.format_exc()))
405 except Exception as e: 375 except Exception as e:
406 doc.status = DocStatus.PROCESS_FAILED.value 376 self.cronjob_log.error('{0} [process failed (get doc into)] [error={1}]'.format(
407 doc.save() 377 self.log_base, traceback.format_exc()))
408 self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( 378 else:
409 self.log_base, task_str, traceback.format_exc())) 379 try:
380 # 2. 从EDMS获取PDF文件
381 doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id))
382 os.makedirs(doc_data_path, exist_ok=True)
383 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
384 self.pdf_download(doc, pdf_path)
385
386 # 3.PDF文件提取图片
387 self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str))
388 start_time = time.time()
389 img_save_path = os.path.join(doc_data_path, 'img')
390 pdf_handler = PDFHandler(pdf_path, img_save_path)
391 pdf_handler.extract_image()
392 end_time = time.time()
393 speed_time = int(end_time - start_time)
394 self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format(
395 self.log_base, task_str, speed_time))
396
397 with lock:
398 todo_count_dict[task_str] = len(pdf_handler.img_path_list)
399 for img_path in pdf_handler.img_path_list:
400 while img_queue.full():
401 self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
402 time.sleep(self.sleep_time_img_put)
403 img_queue.put(img_path)
404 except EDMSException as e:
405 doc.status = DocStatus.PROCESS_FAILED.value
406 doc.save()
407 self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
408 self.log_base, task_str, traceback.format_exc()))
409 except Exception as e:
410 doc.status = DocStatus.PROCESS_FAILED.value
411 doc.save()
412 self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format(
413 self.log_base, task_str, traceback.format_exc()))
410 414
411 def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url): 415 def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url):
412 while True: 416 while True:
...@@ -418,16 +422,17 @@ class Command(BaseCommand, LoggerMixin): ...@@ -418,16 +422,17 @@ class Command(BaseCommand, LoggerMixin):
418 continue 422 continue
419 else: 423 else:
420 self.cronjob_log.info('{0} [img_2_ocr_1] [get img] [img_path={1}]'.format(self.log_base, img_path)) 424 self.cronjob_log.info('{0} [img_2_ocr_1] [get img] [img_path={1}]'.format(self.log_base, img_path))
421 with open(img_path, 'rb') as f:
422 base64_data = base64.b64encode(f.read())
423 # 获取解码后的base64值
424 file_data = base64_data.decode()
425 json_data_1 = {
426 "file": file_data
427 }
428 425
429 for times in range(consts.RETRY_TIMES): 426 for times in range(consts.RETRY_TIMES):
430 try: 427 try:
428 with open(img_path, 'rb') as f:
429 base64_data = base64.b64encode(f.read())
430 # 获取解码后的base64值
431 file_data = base64_data.decode()
432 json_data_1 = {
433 "file": file_data
434 }
435
431 start_time = time.time() 436 start_time = time.time()
432 ocr_1_response = requests.post(url, json=json_data_1) 437 ocr_1_response = requests.post(url, json=json_data_1)
433 if ocr_1_response.status_code != 200: 438 if ocr_1_response.status_code != 200:
...@@ -447,22 +452,26 @@ class Command(BaseCommand, LoggerMixin): ...@@ -447,22 +452,26 @@ class Command(BaseCommand, LoggerMixin):
447 self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}]'.format(self.log_base, img_path)) 452 self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}]'.format(self.log_base, img_path))
448 # continue 453 # continue
449 454
450 del json_data_1 455 try:
451 # /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg 456 del json_data_1
452 # AFC_2 457 # /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg
453 path_split = img_path.split('/') 458 # AFC_2
454 task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3])) 459 path_split = img_path.split('/')
455 460 task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3]))
456 with lock: 461
457 doc_res_dict = res_dict.setdefault(task_str, {}) 462 with lock:
458 doc_res_dict[img_path] = ocr_1_res 463 doc_res_dict = res_dict.setdefault(task_str, {})
459 res_dict[task_str] = doc_res_dict 464 doc_res_dict[img_path] = ocr_1_res
460 todo_count = todo_count_dict.get(task_str) 465 res_dict[task_str] = doc_res_dict
461 if todo_count == 1: 466 todo_count = todo_count_dict.get(task_str)
462 finish_queue.put(task_str) 467 if todo_count == 1:
463 del todo_count_dict[task_str] 468 finish_queue.put(task_str)
464 else: 469 del todo_count_dict[task_str]
465 todo_count_dict[task_str] = todo_count - 1 470 else:
471 todo_count_dict[task_str] = todo_count - 1
472 except Exception as e:
473 self.cronjob_log.error('{0} [process failed (store ocr res)] [img_path={1}] [error={2}]'.format(
474 self.log_base, img_path, traceback.format_exc()))
466 475
467 def res_2_wb(self, res_dict, finish_queue, lock): 476 def res_2_wb(self, res_dict, finish_queue, lock):
468 while True: 477 while True:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!