d9b0ae8c by 周伟奇

add folder ocr process

1 parent 4076848e
...@@ -35,4 +35,3 @@ ocr/* ...@@ -35,4 +35,3 @@ ocr/*
35 src/*.sh 35 src/*.sh
36 36
37 test* 37 test*
...\ No newline at end of file ...\ No newline at end of file
38 folder_ocr_process.py
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -83,6 +83,7 @@ RES_SHEET_HEADER = ('页码', '图片序号', '检测图片序号', '结果') ...@@ -83,6 +83,7 @@ RES_SHEET_HEADER = ('页码', '图片序号', '检测图片序号', '结果')
83 RES_SUCCESS = '识别成功' 83 RES_SUCCESS = '识别成功'
84 RES_SUCCESS_OTHER = '识别成功(其他类)' 84 RES_SUCCESS_OTHER = '识别成功(其他类)'
85 RES_SUCCESS_EMPTY = '识别成功(空数据)' 85 RES_SUCCESS_EMPTY = '识别成功(空数据)'
86 RES_FAILED = '识别失败'
86 RES_FAILED_1 = '识别失败(阶段1)' 87 RES_FAILED_1 = '识别失败(阶段1)'
87 RES_FAILED_2 = '识别失败(阶段2)' 88 RES_FAILED_2 = '识别失败(阶段2)'
88 RES_FAILED_3 = '识别失败(阶段1数据格式错误)' 89 RES_FAILED_3 = '识别失败(阶段1数据格式错误)'
......
1 import os
2 import re
3 import time
4 import json
5 import shutil
6 import base64
7 import signal
8 import asyncio
9 import aiohttp
10 import difflib
11 import requests
12 import traceback
13 from collections import Counter
14 from datetime import datetime, date
15 from django.utils import timezone
16 from django.core.management import BaseCommand
17 from multiprocessing import Process, Queue, Manager, Lock
18
19 from settings import conf
20 from common.mixins import LoggerMixin
21 from common.tools.file_tools import write_zip_file
22 from common.tools.pdf_to_img import PDFHandler
23 from apps.doc import consts
24 from apps.doc.ocr.edms import EDMS, rh
25 from apps.doc.named_enum import KeywordsType
26 from apps.doc.exceptions import EDMSException, OCR1Exception, OCR2Exception
27 from apps.doc.ocr.wb import BSWorkbook, Workbook
28 from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
29
30
class Command(BaseCommand, LoggerMixin):
    """Management command that polls input folders and runs OCR on their files.

    One worker process is started per configured input folder. Each worker
    repeatedly lists its folder, sends every file (PDF pages or a single
    image) to the OCR service, writes the recognised licence data to an
    Excel workbook and moves the processed file into a sibling ``Output``
    directory.
    """

    def __init__(self):
        super().__init__()
        self.log_base = '[folder ocr process]'
        # Switch polled by the worker loop; cleared to stop picking up files.
        self.switch = True
        # Seconds to sleep when a poll finds nothing to process.
        self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
        # Input folders, taken from every config key prefixed with INPUT_DIR_.
        self.input_dirs = conf.get_namespace('INPUT_DIR_')
        # OCR service endpoint.
        self.ocr_url = conf.OCR_URL_FOLDER
        # Graceful shutdown on SIGTERM (signal 15).
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, sig, frame):
        """Signal callback: stop picking up further files."""
        self.switch = False

    def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino, part_idx):
        """Record one OCR part in ``res_list`` and collect its licence data.

        Appends ``(pno, ino, part_idx, status)`` to ``res_list``. When the
        part carries a non-empty ``'data'`` list, that list is appended to
        ``license_summary[classify]``.
        """
        # Category field in the data: '0' = ID card, '1' = residence permit.
        license_data = ocr_data.get('data', [])
        if not license_data:
            res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY))
            return
        res_list.append((pno, ino, part_idx, consts.RES_SUCCESS))
        license_summary.setdefault(classify, []).extend(license_data)

    @staticmethod
    def parse_img_path(img_path):
        """Return ``(image name, page number, image number)`` for an image path.

        Names starting with ``page_<p>_img_<i>`` (zero-based indices) yield
        one-based numbers; any other name yields ``(name, 1, 1)``.
        """
        # Expected layout: 'page_{0}_img_{1}.{2}'.format(pno, img_index, ext)
        img_name, _ = os.path.splitext(os.path.basename(img_path))
        # Take the numbers from the regex groups rather than from split('_'),
        # so trailing characters after the image index cannot break int().
        matched = re.match(r'page_(\d+)_img_(\d+)', img_name)
        if matched is None:
            return img_name, 1, 1
        return img_name, int(matched.group(1)) + 1, int(matched.group(2)) + 1

    @staticmethod
    def get_path(name, img_output_dir, wb_output_dir):
        """Build the timestamp-prefixed image and workbook paths for ``name``.

        Returns ``(img_save_path, excel_path)``.
        """
        time_stamp = int(time.time())
        new_name = '{0}_{1}'.format(time_stamp, name)
        img_save_path = os.path.join(img_output_dir, new_name)
        excel_name = '{0}.xlsx'.format(os.path.splitext(new_name)[0])
        excel_path = os.path.join(wb_output_dir, excel_name)
        return img_save_path, excel_path

    def res_process(self, all_res, classify, excel_path):
        """Turn the OCR results of one input file into an Excel workbook.

        ``all_res`` maps image path to the OCR response (or ``None`` when the
        OCR call failed). Nothing is written when ``all_res`` is empty. Any
        error while building or saving the workbook is logged, not raised.
        """
        if not all_res:
            return

        try:
            license_summary = {}
            # NOTE(review): res_list is filled per image part but is not
            # written to the workbook by this method.
            res_list = []

            for img_path, ocr_res in all_res.items():
                _, pno, ino = self.parse_img_path(img_path)

                if not isinstance(ocr_res, dict) or ocr_res.get('code') != 1:
                    res_list.append((pno, ino, 1, consts.RES_FAILED))
                    continue

                data_list = ocr_res.get('data', [])
                if not isinstance(data_list, list):
                    res_list.append((pno, ino, 1, consts.RES_FAILED_3))
                    continue

                for part_idx, ocr_data in enumerate(data_list, start=1):
                    self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino, part_idx)

            wb = BSWorkbook(set(), set(), set())
            wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0])
            wb.save(excel_path)
        except Exception:
            self.cronjob_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
                self.log_base, excel_path, traceback.format_exc()))

    def ocr_process(self, img_path, classify):
        """Send one image to the OCR service and return the decoded response.

        The request is attempted up to ``consts.RETRY_TIMES`` times. Returns
        the parsed JSON body of the first successful attempt, or ``None``
        when the image is missing or every attempt failed.
        """
        if not os.path.exists(img_path):
            self.cronjob_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
            return None

        # TODO: validate the image before sending it.
        with open(img_path, 'rb') as f:
            # The service expects the file as a base64 text string.
            file_data = base64.b64encode(f.read()).decode()
        json_data = {
            "file": file_data,
            "classify": classify
        }

        for times in range(consts.RETRY_TIMES):
            try:
                start_time = time.time()
                # NOTE(review): no timeout is set, so a stalled service blocks
                # this worker indefinitely.
                ocr_response = requests.post(self.ocr_url, json=json_data)
                if ocr_response.status_code != 200:
                    raise OCR1Exception('{0} ocr status code: {1}'.format(
                        self.log_base, ocr_response.status_code))
                # Decoded inside the try so an invalid body is retried like
                # any other failed attempt instead of crashing the worker.
                ocr_res = ocr_response.json()
            except Exception:
                self.cronjob_log.warn('{0} [ocr failed] [times={1}] [img_path={2}] [error={3}]'.format(
                    self.log_base, times, img_path, traceback.format_exc()))
            else:
                speed_time = int(time.time() - start_time)
                self.cronjob_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format(
                    self.log_base, img_path, ocr_res, speed_time))
                return ocr_res
        return None

    def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir):
        """Extract the images of a PDF, OCR each one and build the workbook.

        The PDF is moved into ``pdf_output_dir`` afterwards. Does nothing
        when ``path`` no longer exists.
        """
        if not os.path.exists(path):
            return

        try:
            img_save_path, excel_path = self.get_path(name, img_output_dir, wb_output_dir)
            self.cronjob_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
            pdf_handler = PDFHandler(path, img_save_path)
            pdf_handler.extract_image()
            self.cronjob_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
        except Exception:
            self.cronjob_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
                self.log_base, path, traceback.format_exc()))
        else:
            all_res = {
                img_path: self.ocr_process(img_path, classify)
                for img_path in pdf_handler.img_path_list
            }
            self.res_process(all_res, classify, excel_path)

        # Moved out of the input folder whether or not extraction succeeded,
        # so an unreadable PDF is not picked up again on every poll.
        # NOTE(review): confirm a failed PDF should also land in the output dir.
        shutil.move(path, pdf_output_dir)

    def img_process(self, name, path, classify, wb_output_dir, img_output_dir):
        """OCR a single image file, build its workbook and move the image."""
        ocr_res = self.ocr_process(path, classify)
        all_res = {path: ocr_res}

        try:
            img_save_path, excel_path = self.get_path(name, img_output_dir, wb_output_dir)
        except Exception:
            self.cronjob_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
                self.log_base, path, traceback.format_exc()))
        else:
            self.res_process(all_res, classify, excel_path)
            shutil.move(path, img_save_path)

    def folder_process(self, input_dir, classify):
        """Worker loop: poll ``input_dir`` and process each file found.

        Output goes to ``Output/image``, ``Output/excel`` and ``Output/pdf``
        next to ``input_dir``. The loop runs until ``self.switch`` is cleared.
        """
        output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
        img_output_dir = os.path.join(output_dir, 'image')
        wb_output_dir = os.path.join(output_dir, 'excel')
        pdf_output_dir = os.path.join(output_dir, 'pdf')
        for directory in (output_dir, img_output_dir, wb_output_dir, pdf_output_dir):
            os.makedirs(directory, exist_ok=True)

        while self.switch:
            # 1. Fetch the PDFs / images waiting in the input dir.
            list_dir = os.listdir(input_dir)
            if not list_dir:
                self.cronjob_log.error('{0} [input dir empty] [input_dir={1}]'.format(self.log_base, input_dir))
                time.sleep(self.sleep_time)
                continue

            handled_any = False
            for name in list_dir:
                if not self.switch:
                    # Shutdown requested: leave the remaining files in place.
                    break
                path = os.path.join(input_dir, name)
                if not os.path.isfile(path):
                    continue
                handled_any = True
                self.cronjob_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
                # Extension check is case-insensitive so '.PDF' is not
                # mistaken for an image.
                if name.lower().endswith('.pdf'):
                    self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir)
                else:
                    self.img_process(name, path, classify, wb_output_dir, img_output_dir)
                self.cronjob_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))

            if not handled_any:
                # Only sub-directories present: sleep instead of spinning.
                time.sleep(self.sleep_time)

    def handle(self, *args, **kwargs):
        """Start one worker process per input folder and wait for them all."""
        process_list = []
        for classify_idx, input_dir in self.input_dirs.items():
            # The classify id is the leading integer of the config key suffix.
            classify = int(classify_idx.split('_')[0])
            process_list.append(Process(target=self.folder_process, args=(input_dir, classify)))

        for p in process_list:
            p.start()
        for p in process_list:
            p.join()

        self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
...@@ -583,6 +583,29 @@ class BSWorkbook(Workbook): ...@@ -583,6 +583,29 @@ class BSWorkbook(Workbook):
583 count += 1 583 count += 1
584 count_list.append((field_str, count)) 584 count_list.append((field_str, count))
585 585
def simple_license_rebuild(self, license_summary, document_scheme):
    """Write one worksheet per licence class found in ``license_summary``.

    Walks ``consts.LICENSE_ORDER``; every class with a non-empty list in
    ``license_summary`` gets a sheet named after that class. Each licence
    dict is written as ``(write_field, value...)`` rows followed by one
    separator row.
    """
    for classify, order_entry in consts.LICENSE_ORDER:
        _, name, field_order, side_diff, scheme_diff, _ = order_entry

        entries = license_summary.get(classify)
        if not entries:
            continue

        sheet = self.create_sheet(name)

        # Classes flagged as scheme-dependent switch to the alternative
        # classify id when the second document scheme is in use.
        if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]:
            classify = consts.MVC_CLASSIFY_SE

        for entry in entries:
            # Entries of the IC class whose category is '1' are handed over
            # to the RP class list instead of being written to this sheet.
            redirect = classify == consts.IC_CLASSIFY and entry.get('类别') == '1'
            if redirect:
                license_summary.setdefault(consts.RP_CLASSIFY, []).append(entry)
                continue

            if side_diff:
                # The field order depends on whether the marker key is
                # present in this entry.
                key, order_with_key, order_without_key = consts.FIELD_ORDER_MAP.get(classify)
                if key in entry:
                    field_order = order_with_key
                else:
                    field_order = order_without_key

            for search_field, write_field in field_order:
                value = entry.get(search_field, '')
                # A list value is spread across the columns after the label.
                row = (write_field, *value) if isinstance(value, list) else (write_field, value)
                sheet.append(row)

            # Separator row between two licences.
            sheet.append((None, ))
608
586 def res_sheet(self, res_list): 609 def res_sheet(self, res_list):
587 if res_list: 610 if res_list:
588 res_list.sort(key=lambda x: (x[0], x[1], x[2])) 611 res_list.sort(key=lambda x: (x[0], x[1], x[2]))
......
...@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 2 ...@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 2
4 SLEEP_SECOND_IMG_PUT = 2 4 SLEEP_SECOND_IMG_PUT = 2
5 SLEEP_SECOND_IMG_GET = 0.5 5 SLEEP_SECOND_IMG_GET = 0.5
6 SLEEP_SECOND_TASK_GET = 2 6 SLEEP_SECOND_TASK_GET = 2
7 SLEEP_SECOND_FOLDER = 2
8
7 IMG_QUEUE_SIZE = 500 9 IMG_QUEUE_SIZE = 500
8 10
9 EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx 11 EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx
......
...@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 10 ...@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 10
4 SLEEP_SECOND_IMG_PUT = 2 4 SLEEP_SECOND_IMG_PUT = 2
5 SLEEP_SECOND_IMG_GET = 0.5 5 SLEEP_SECOND_IMG_GET = 0.5
6 SLEEP_SECOND_TASK_GET = 2 6 SLEEP_SECOND_TASK_GET = 2
7 SLEEP_SECOND_FOLDER = 2
8
7 IMG_QUEUE_SIZE = 500 9 IMG_QUEUE_SIZE = 500
8 10
9 EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx 11 EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx
......
...@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 2 ...@@ -4,6 +4,8 @@ SLEEP_SECOND_DOC_GET = 2
4 SLEEP_SECOND_IMG_PUT = 2 4 SLEEP_SECOND_IMG_PUT = 2
5 SLEEP_SECOND_IMG_GET = 0.5 5 SLEEP_SECOND_IMG_GET = 0.5
6 SLEEP_SECOND_TASK_GET = 2 6 SLEEP_SECOND_TASK_GET = 2
7 SLEEP_SECOND_FOLDER = 2
8
7 IMG_QUEUE_SIZE = 500 9 IMG_QUEUE_SIZE = 500
8 10
9 EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx 11 EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!