init reocr
Showing
2 changed files
with
67 additions
and
11 deletions
... | @@ -100,7 +100,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -100,7 +100,7 @@ class Command(BaseCommand, LoggerMixin): |
100 | if len(info_tuple) == 2: | 100 | if len(info_tuple) == 2: |
101 | business_type, doc_id_str = info_tuple | 101 | business_type, doc_id_str = info_tuple |
102 | else: | 102 | else: |
103 | business_type, doc_id_str, classify_1_str = info_tuple | 103 | business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple |
104 | doc_id = int(doc_id_str) | 104 | doc_id = int(doc_id_str) |
105 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc | 105 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc |
106 | zip_doc = doc_class.objects.filter(id=doc_id).first() | 106 | zip_doc = doc_class.objects.filter(id=doc_id).first() |
... | @@ -124,7 +124,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -124,7 +124,7 @@ class Command(BaseCommand, LoggerMixin): |
124 | else: | 124 | else: |
125 | self.online_log.info('{0} [zip_2_pdfs] [db save end] [task_str={1}]'.format( | 125 | self.online_log.info('{0} [zip_2_pdfs] [db save end] [task_str={1}]'.format( |
126 | self.log_base, task_str)) | 126 | self.log_base, task_str)) |
127 | return zip_doc, business_type | 127 | return zip_doc, business_type, re_ocr_flag |
128 | 128 | ||
129 | def get_doc_info(self, task_str, is_priority=False): | 129 | def get_doc_info(self, task_str, is_priority=False): |
130 | try: | 130 | try: |
... | @@ -135,7 +135,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -135,7 +135,7 @@ class Command(BaseCommand, LoggerMixin): |
135 | classify_1_str = '0' | 135 | classify_1_str = '0' |
136 | rebuild_task_str = task_str | 136 | rebuild_task_str = task_str |
137 | else: | 137 | else: |
138 | business_type, doc_id_str, classify_1_str = info_tuple | 138 | business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple |
139 | rebuild_task_str = '{0}{1}{2}'.format(business_type, consts.SPLIT_STR, doc_id_str) | 139 | rebuild_task_str = '{0}{1}{2}'.format(business_type, consts.SPLIT_STR, doc_id_str) |
140 | doc_id = int(doc_id_str) | 140 | doc_id = int(doc_id_str) |
141 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc | 141 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc |
... | @@ -160,7 +160,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -160,7 +160,7 @@ class Command(BaseCommand, LoggerMixin): |
160 | else: | 160 | else: |
161 | self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format( | 161 | self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format( |
162 | self.log_base, task_str, is_priority)) | 162 | self.log_base, task_str, is_priority)) |
163 | return doc, business_type, rebuild_task_str, classify_1_str | 163 | return doc, business_type, rebuild_task_str, classify_1_str, re_ocr_flag |
164 | 164 | ||
165 | # def pdf_download(self, doc, pdf_path): | 165 | # def pdf_download(self, doc, pdf_path): |
166 | # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | 166 | # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
... | @@ -1202,7 +1202,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1202,7 +1202,7 @@ class Command(BaseCommand, LoggerMixin): |
1202 | self.online_log.info('{0} [zip_2_pdfs] [task={1}]'.format(self.log_base, task_str)) | 1202 | self.online_log.info('{0} [zip_2_pdfs] [task={1}]'.format(self.log_base, task_str)) |
1203 | 1203 | ||
1204 | # 2. 修改doc状态: 识别中 | 1204 | # 2. 修改doc状态: 识别中 |
1205 | zip_doc, business_type = self.get_zip_doc_info(task_str) | 1205 | zip_doc, business_type, re_ocr_flag = self.get_zip_doc_info(task_str) |
1206 | if zip_doc is None: | 1206 | if zip_doc is None: |
1207 | time.sleep(self.sleep_time_doc_get) | 1207 | time.sleep(self.sleep_time_doc_get) |
1208 | continue | 1208 | continue |
... | @@ -1287,7 +1287,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1287,7 +1287,7 @@ class Command(BaseCommand, LoggerMixin): |
1287 | target_pdf_path = os.path.join(pdf_doc_data_path, '{0}.pdf'.format(pdf_doc.id)) | 1287 | target_pdf_path = os.path.join(pdf_doc_data_path, '{0}.pdf'.format(pdf_doc.id)) |
1288 | shutil.move(pdf_path, target_pdf_path) | 1288 | shutil.move(pdf_path, target_pdf_path) |
1289 | 1289 | ||
1290 | pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0']) | 1290 | pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0', re_ocr_flag]) |
1291 | pdf_task_str_list.append(pdf_task_str) | 1291 | pdf_task_str_list.append(pdf_task_str) |
1292 | except Exception as e: | 1292 | except Exception as e: |
1293 | self.online_log.warn('{0} [zip_2_pdfs] [recreate pdf task failed] [task={1}] [pdf_path={2}]' | 1293 | self.online_log.warn('{0} [zip_2_pdfs] [recreate pdf task failed] [task={1}] [pdf_path={2}]' |
... | @@ -1336,7 +1336,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1336,7 +1336,7 @@ class Command(BaseCommand, LoggerMixin): |
1336 | 1336 | ||
1337 | try: | 1337 | try: |
1338 | # 1. 从队列获取文件信息 | 1338 | # 1. 从队列获取文件信息 |
1339 | doc, business_type, task_str, classify_1_str = self.get_doc_info(task_str, is_priority) | 1339 | doc, business_type, task_str, classify_1_str, re_ocr_flag = self.get_doc_info(task_str, is_priority) |
1340 | # 队列为空时的处理 | 1340 | # 队列为空时的处理 |
1341 | if doc is None: | 1341 | if doc is None: |
1342 | time.sleep(self.sleep_time_doc_get) | 1342 | time.sleep(self.sleep_time_doc_get) |
... | @@ -1386,7 +1386,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1386,7 +1386,8 @@ class Command(BaseCommand, LoggerMixin): |
1386 | self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( | 1386 | self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( |
1387 | self.log_base, task_str, times)) | 1387 | self.log_base, task_str, times)) |
1388 | start_time = time.time() | 1388 | start_time = time.time() |
1389 | pdf_handler.extract_image(max_img_count) | 1389 | max_img_count_or_none = None if re_ocr_flag == 'Y' else max_img_count |
1390 | pdf_handler.extract_image(max_img_count_or_none) | ||
1390 | end_time = time.time() | 1391 | end_time = time.time() |
1391 | speed_time = int(end_time - start_time) | 1392 | speed_time = int(end_time - start_time) |
1392 | self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format( | 1393 | self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format( |
... | @@ -1404,7 +1405,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -1404,7 +1405,7 @@ class Command(BaseCommand, LoggerMixin): |
1404 | self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format( | 1405 | self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format( |
1405 | self.log_base, task_str)) | 1406 | self.log_base, task_str)) |
1406 | raise Exception('pdf img empty') | 1407 | raise Exception('pdf img empty') |
1407 | elif pdf_handler.img_count >= max_img_count: | 1408 | elif re_ocr_flag == 'N' and pdf_handler.img_count >= max_img_count: |
1408 | self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format( | 1409 | self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format( |
1409 | self.log_base, task_str, pdf_handler.img_count)) | 1410 | self.log_base, task_str, pdf_handler.img_count)) |
1410 | 1411 | ... | ... |
... | @@ -589,6 +589,11 @@ invoice_download_args = { | ... | @@ -589,6 +589,11 @@ invoice_download_args = { |
589 | 'application_ids': fields.Str(required=True), | 589 | 'application_ids': fields.Str(required=True), |
590 | } | 590 | } |
591 | 591 | ||
592 | doc_reocr_args = { | ||
593 | 'doc_id': fields.Int(required=True), | ||
594 | 'application_entity': fields.Int(required=True), | ||
595 | } | ||
596 | |||
592 | 597 | ||
593 | class UploadDocView(GenericView, DocHandler): | 598 | class UploadDocView(GenericView, DocHandler): |
594 | # permission_classes = [] | 599 | # permission_classes = [] |
... | @@ -698,7 +703,7 @@ class UploadDocView(GenericView, DocHandler): | ... | @@ -698,7 +703,7 @@ class UploadDocView(GenericView, DocHandler): |
698 | or document_name.endswith('.RAR'): | 703 | or document_name.endswith('.RAR'): |
699 | is_zip = True | 704 | is_zip = True |
700 | 705 | ||
701 | task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)]) | 706 | task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N']) |
702 | enqueue_res = rh.enqueue([task], is_priority, is_zip) | 707 | enqueue_res = rh.enqueue([task], is_priority, is_zip) |
703 | self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] ' | 708 | self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] ' |
704 | '[is_priority={3}] [enqueue_res={4}] [is_fsm={5} [classify_1={6}]]'.format(args, prefix, doc.id, | 709 | '[is_priority={3}] [enqueue_res={4}] [is_fsm={5} [classify_1={6}]]'.format(args, prefix, doc.id, |
... | @@ -1249,7 +1254,7 @@ class DocView(DocGenericView, DocHandler): | ... | @@ -1249,7 +1254,7 @@ class DocView(DocGenericView, DocHandler): |
1249 | break | 1254 | break |
1250 | 1255 | ||
1251 | # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] | 1256 | # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] |
1252 | task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)]) | 1257 | task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N']) |
1253 | enqueue_res = rh.enqueue([task], is_priority) | 1258 | enqueue_res = rh.enqueue([task], is_priority) |
1254 | 1259 | ||
1255 | self.running_log.info('[mock doc upload success] [args={0}] [business_type={1}] [doc_id={2}] ' | 1260 | self.running_log.info('[mock doc upload success] [args={0}] [business_type={1}] [doc_id={2}] ' |
... | @@ -2069,3 +2074,53 @@ class InvoiceQueryInfoView(GenericView): | ... | @@ -2069,3 +2074,53 @@ class InvoiceQueryInfoView(GenericView): |
2069 | except Exception as e: | 2074 | except Exception as e: |
2070 | self.running_log.error("invoice info request to java error, url:{0}, param:{1}, errorMsg:{2}".format( | 2075 | self.running_log.error("invoice info request to java error, url:{0}, param:{1}, errorMsg:{2}".format( |
2071 | url, json.dumps(body), traceback.format_exc())) | 2076 | url, json.dumps(body), traceback.format_exc())) |
2077 | |||
2078 | class DocReOcrView(GenericView, DocHandler): | ||
2079 | permission_classes = [IsAuthenticated] | ||
2080 | authentication_classes = [OAuth2AuthenticationWithUser] | ||
2081 | |||
2082 | # required_scopes = ['write'] | ||
2083 | |||
2084 | # 现有文件重新识别接口 | ||
2085 | @use_args(doc_reocr_args, location='data') | ||
2086 | def post(self, request, args): | ||
2087 | start_time = time.time() | ||
2088 | |||
2089 | application_entity = args.get('application_entity') | ||
2090 | doc_id = args.get('doc_id') | ||
2091 | |||
2092 | doc_class, prefix = self.get_doc_class(application_entity) | ||
2093 | doc = doc_class.objects.filter(id=doc_id).first() | ||
2094 | |||
2095 | # 3. 选择队列进入 | ||
2096 | is_priority = PriorityApplication.objects.filter(application_id=doc.application_id, on_off=True).exists() | ||
2097 | is_zip = False | ||
2098 | |||
2099 | classify_1 = 0 | ||
2100 | # 电子合同 Econtract or OVP(FSM) | ||
2101 | if doc.data_source == consts.DATA_SOURCE_LIST[2] or doc.data_source == consts.DATA_SOURCE_LIST[3]: | ||
2102 | if doc.document_scheme == consts.DOC_SCHEME_LIST[1]: | ||
2103 | for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix): | ||
2104 | if keyword in doc.document_name: | ||
2105 | classify_1 = classify_1_tmp | ||
2106 | break | ||
2107 | # FSM合同:WEP/MSI/SC/SC2 | ||
2108 | elif doc.data_source == consts.DATA_SOURCE_LIST[0] and doc.document_scheme == consts.DOC_SCHEME_LIST[0]: | ||
2109 | for keyword, classify_1_tmp in consts.FSM_ECONTRACT_KEYWORDS_MAP.get(prefix): | ||
2110 | if keyword in doc.document_name: | ||
2111 | classify_1 = classify_1_tmp | ||
2112 | break | ||
2113 | |||
2114 | |||
2115 | if doc.document_name.endswith('.zip') or doc.document_name.endswith('.rar') or doc.document_name.endswith('.ZIP') \ | ||
2116 | or doc.document_name.endswith('.RAR'): | ||
2117 | is_zip = True | ||
2118 | |||
2119 | # task = 'AFC_11001_0_Y' 'AFC_11001_0_N' 最后的Y,N表示是否是reocr,N否,Y是 | ||
2120 | task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'Y']) | ||
2121 | enqueue_res = rh.enqueue([task], is_priority, is_zip) | ||
2122 | self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] ' | ||
2123 | '[is_priority={3}] [enqueue_res={4}] [classify_1={5}]'.format(args, prefix, doc.id, | ||
2124 | is_priority, enqueue_res, classify_1)) | ||
2125 | |||
2126 | return response.ok() | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
-
Please register or sign in to post a comment