91ff8153 by 冯轩

init reocr

1 parent e08e5c00
...@@ -100,7 +100,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -100,7 +100,7 @@ class Command(BaseCommand, LoggerMixin):
100 if len(info_tuple) == 2: 100 if len(info_tuple) == 2:
101 business_type, doc_id_str = info_tuple 101 business_type, doc_id_str = info_tuple
102 else: 102 else:
103 business_type, doc_id_str, classify_1_str = info_tuple 103 business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple
104 doc_id = int(doc_id_str) 104 doc_id = int(doc_id_str)
105 doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc 105 doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
106 zip_doc = doc_class.objects.filter(id=doc_id).first() 106 zip_doc = doc_class.objects.filter(id=doc_id).first()
...@@ -124,7 +124,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -124,7 +124,7 @@ class Command(BaseCommand, LoggerMixin):
124 else: 124 else:
125 self.online_log.info('{0} [zip_2_pdfs] [db save end] [task_str={1}]'.format( 125 self.online_log.info('{0} [zip_2_pdfs] [db save end] [task_str={1}]'.format(
126 self.log_base, task_str)) 126 self.log_base, task_str))
127 return zip_doc, business_type 127 return zip_doc, business_type, re_ocr_flag
128 128
129 def get_doc_info(self, task_str, is_priority=False): 129 def get_doc_info(self, task_str, is_priority=False):
130 try: 130 try:
...@@ -135,7 +135,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -135,7 +135,7 @@ class Command(BaseCommand, LoggerMixin):
135 classify_1_str = '0' 135 classify_1_str = '0'
136 rebuild_task_str = task_str 136 rebuild_task_str = task_str
137 else: 137 else:
138 business_type, doc_id_str, classify_1_str = info_tuple 138 business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple
139 rebuild_task_str = '{0}{1}{2}'.format(business_type, consts.SPLIT_STR, doc_id_str) 139 rebuild_task_str = '{0}{1}{2}'.format(business_type, consts.SPLIT_STR, doc_id_str)
140 doc_id = int(doc_id_str) 140 doc_id = int(doc_id_str)
141 doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc 141 doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
...@@ -160,7 +160,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -160,7 +160,7 @@ class Command(BaseCommand, LoggerMixin):
160 else: 160 else:
161 self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format( 161 self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format(
162 self.log_base, task_str, is_priority)) 162 self.log_base, task_str, is_priority))
163 return doc, business_type, rebuild_task_str, classify_1_str 163 return doc, business_type, rebuild_task_str, classify_1_str, re_ocr_flag
164 164
165 # def pdf_download(self, doc, pdf_path): 165 # def pdf_download(self, doc, pdf_path):
166 # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): 166 # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
...@@ -1202,7 +1202,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1202,7 +1202,7 @@ class Command(BaseCommand, LoggerMixin):
1202 self.online_log.info('{0} [zip_2_pdfs] [task={1}]'.format(self.log_base, task_str)) 1202 self.online_log.info('{0} [zip_2_pdfs] [task={1}]'.format(self.log_base, task_str))
1203 1203
1204 # 2. 修改doc状态: 识别中 1204 # 2. 修改doc状态: 识别中
1205 zip_doc, business_type = self.get_zip_doc_info(task_str) 1205 zip_doc, business_type, re_ocr_flag = self.get_zip_doc_info(task_str)
1206 if zip_doc is None: 1206 if zip_doc is None:
1207 time.sleep(self.sleep_time_doc_get) 1207 time.sleep(self.sleep_time_doc_get)
1208 continue 1208 continue
...@@ -1287,7 +1287,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1287,7 +1287,7 @@ class Command(BaseCommand, LoggerMixin):
1287 target_pdf_path = os.path.join(pdf_doc_data_path, '{0}.pdf'.format(pdf_doc.id)) 1287 target_pdf_path = os.path.join(pdf_doc_data_path, '{0}.pdf'.format(pdf_doc.id))
1288 shutil.move(pdf_path, target_pdf_path) 1288 shutil.move(pdf_path, target_pdf_path)
1289 1289
1290 pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0']) 1290 pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0', re_ocr_flag])
1291 pdf_task_str_list.append(pdf_task_str) 1291 pdf_task_str_list.append(pdf_task_str)
1292 except Exception as e: 1292 except Exception as e:
1293 self.online_log.warn('{0} [zip_2_pdfs] [recreate pdf task failed] [task={1}] [pdf_path={2}]' 1293 self.online_log.warn('{0} [zip_2_pdfs] [recreate pdf task failed] [task={1}] [pdf_path={2}]'
...@@ -1336,7 +1336,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1336,7 +1336,7 @@ class Command(BaseCommand, LoggerMixin):
1336 1336
1337 try: 1337 try:
1338 # 1. 从队列获取文件信息 1338 # 1. 从队列获取文件信息
1339 doc, business_type, task_str, classify_1_str = self.get_doc_info(task_str, is_priority) 1339 doc, business_type, task_str, classify_1_str, re_ocr_flag = self.get_doc_info(task_str, is_priority)
1340 # 队列为空时的处理 1340 # 队列为空时的处理
1341 if doc is None: 1341 if doc is None:
1342 time.sleep(self.sleep_time_doc_get) 1342 time.sleep(self.sleep_time_doc_get)
...@@ -1386,7 +1386,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1386,7 +1386,8 @@ class Command(BaseCommand, LoggerMixin):
1386 self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( 1386 self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
1387 self.log_base, task_str, times)) 1387 self.log_base, task_str, times))
1388 start_time = time.time() 1388 start_time = time.time()
1389 pdf_handler.extract_image(max_img_count) 1389 max_img_count_or_none = None if re_ocr_flag == 'Y' else max_img_count
1390 pdf_handler.extract_image(max_img_count_or_none)
1390 end_time = time.time() 1391 end_time = time.time()
1391 speed_time = int(end_time - start_time) 1392 speed_time = int(end_time - start_time)
1392 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format( 1393 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
...@@ -1404,7 +1405,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1404,7 +1405,7 @@ class Command(BaseCommand, LoggerMixin):
1404 self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format( 1405 self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
1405 self.log_base, task_str)) 1406 self.log_base, task_str))
1406 raise Exception('pdf img empty') 1407 raise Exception('pdf img empty')
1407 elif pdf_handler.img_count >= max_img_count: 1408 elif re_ocr_flag == 'N' and pdf_handler.img_count >= max_img_count:
1408 self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format( 1409 self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
1409 self.log_base, task_str, pdf_handler.img_count)) 1410 self.log_base, task_str, pdf_handler.img_count))
1410 1411
......
...@@ -589,6 +589,11 @@ invoice_download_args = { ...@@ -589,6 +589,11 @@ invoice_download_args = {
589 'application_ids': fields.Str(required=True), 589 'application_ids': fields.Str(required=True),
590 } 590 }
591 591
# Request schema for the re-OCR endpoint: both values are mandatory integers.
# `doc_id` is used to look the document up by primary key; `application_entity`
# is passed to `get_doc_class` to choose the document model and task prefix.
doc_reocr_args = {
    'doc_id': fields.Int(required=True),
    'application_entity': fields.Int(required=True),
}
596
592 597
593 class UploadDocView(GenericView, DocHandler): 598 class UploadDocView(GenericView, DocHandler):
594 # permission_classes = [] 599 # permission_classes = []
...@@ -698,7 +703,7 @@ class UploadDocView(GenericView, DocHandler): ...@@ -698,7 +703,7 @@ class UploadDocView(GenericView, DocHandler):
698 or document_name.endswith('.RAR'): 703 or document_name.endswith('.RAR'):
699 is_zip = True 704 is_zip = True
700 705
701 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)]) 706 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N'])
702 enqueue_res = rh.enqueue([task], is_priority, is_zip) 707 enqueue_res = rh.enqueue([task], is_priority, is_zip)
703 self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] ' 708 self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
704 '[is_priority={3}] [enqueue_res={4}] [is_fsm={5} [classify_1={6}]]'.format(args, prefix, doc.id, 709 '[is_priority={3}] [enqueue_res={4}] [is_fsm={5} [classify_1={6}]]'.format(args, prefix, doc.id,
...@@ -1249,7 +1254,7 @@ class DocView(DocGenericView, DocHandler): ...@@ -1249,7 +1254,7 @@ class DocView(DocGenericView, DocHandler):
1249 break 1254 break
1250 1255
1251 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)] 1256 # tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
1252 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)]) 1257 task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N'])
1253 enqueue_res = rh.enqueue([task], is_priority) 1258 enqueue_res = rh.enqueue([task], is_priority)
1254 1259
1255 self.running_log.info('[mock doc upload success] [args={0}] [business_type={1}] [doc_id={2}] ' 1260 self.running_log.info('[mock doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
...@@ -2068,4 +2073,54 @@ class InvoiceQueryInfoView(GenericView): ...@@ -2068,4 +2073,54 @@ class InvoiceQueryInfoView(GenericView):
2068 return response2.ok(data=java_result) 2073 return response2.ok(data=java_result)
2069 except Exception as e: 2074 except Exception as e:
2070 self.running_log.error("invoice info request to java error, url:{0}, param:{1}, errorMsg:{2}".format( 2075 self.running_log.error("invoice info request to java error, url:{0}, param:{1}, errorMsg:{2}".format(
2071 url, json.dumps(body), traceback.format_exc()))
...\ No newline at end of file ...\ No newline at end of file
2076 url, json.dumps(body), traceback.format_exc()))
2077
class DocReOcrView(GenericView, DocHandler):
    """Re-enqueue an existing document for OCR with the re-OCR flag set.

    The view looks the document up by id, recomputes its first-level
    classification from its data source / scheme / file name, and pushes a
    task string of the form ``<prefix><SPLIT><doc_id><SPLIT><classify_1><SPLIT>Y``
    onto the queue. The trailing ``Y`` marks the task as a re-OCR request
    (ordinary uploads enqueue ``N``).
    """
    permission_classes = [IsAuthenticated]
    authentication_classes = [OAuth2AuthenticationWithUser]

    # required_scopes = ['write']

    # Endpoint: re-recognize (re-OCR) a file that already exists.
    @use_args(doc_reocr_args, location='data')
    def post(self, request, args):
        """Queue the document identified by ``args['doc_id']`` for re-OCR.

        Args:
            request: the incoming HTTP request (not read directly here).
            args: parsed body containing ``doc_id`` and ``application_entity``.

        Returns:
            ``response.ok()`` once the task has been handed to the queue.

        Raises:
            Http404: when no document with the given id exists for the
                selected document class.
        """
        application_entity = args.get('application_entity')
        doc_id = args.get('doc_id')

        doc_class, prefix = self.get_doc_class(application_entity)
        doc = doc_class.objects.filter(id=doc_id).first()
        if doc is None:
            # `.first()` yields None for an unknown id; without this guard the
            # attribute accesses below fail with AttributeError (HTTP 500).
            # NOTE(review): assumes the view stack maps Http404 to a 404
            # response, as Django and DRF do by default -- confirm.
            from django.http import Http404
            raise Http404('doc not found: {0}'.format(doc_id))

        # Choose which queue the task enters.
        is_priority = PriorityApplication.objects.filter(
            application_id=doc.application_id, on_off=True).exists()

        # Pick the keyword table that applies to this document, if any.
        keywords_map = None
        # E-contract: Econtract or OVP (FSM)
        if doc.data_source == consts.DATA_SOURCE_LIST[2] or doc.data_source == consts.DATA_SOURCE_LIST[3]:
            if doc.document_scheme == consts.DOC_SCHEME_LIST[1]:
                keywords_map = consts.ECONTRACT_KEYWORDS_MAP
        # FSM contracts: WEP/MSI/SC/SC2
        elif doc.data_source == consts.DATA_SOURCE_LIST[0] and doc.document_scheme == consts.DOC_SCHEME_LIST[0]:
            keywords_map = consts.FSM_ECONTRACT_KEYWORDS_MAP

        classify_1 = 0
        if keywords_map is not None:
            # `or ()` keeps classify_1 at 0 when the prefix has no entry
            # instead of raising TypeError while iterating None.
            for keyword, classify_1_tmp in keywords_map.get(prefix) or ():
                if keyword in doc.document_name:
                    # First matching keyword wins.
                    classify_1 = classify_1_tmp
                    break

        # Archives are flagged as zip for the enqueue call; same four suffix
        # spellings as before.
        is_zip = doc.document_name.endswith(('.zip', '.rar', '.ZIP', '.RAR'))

        # task = 'AFC_11001_0_Y' / 'AFC_11001_0_N': the trailing Y/N says
        # whether this is a re-OCR task (N = no, Y = yes).
        task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'Y'])
        enqueue_res = rh.enqueue([task], is_priority, is_zip)
        self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
                              '[is_priority={3}] [enqueue_res={4}] [classify_1={5}]'.format(args, prefix, doc.id,
                                                                                           is_priority, enqueue_res, classify_1))

        return response.ok()
...\ No newline at end of file ...\ No newline at end of file
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!