add zip & rar file

周伟奇
Showing 4 changed files with 107 additions and 12 deletions
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/views.py
src/common/redis_cache/handler.py
src/common/tools/file_tools.py
--- a/src/apps/doc/management/commands/ocr_process.py
View file @6010c32
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @6010c32
--- a/src/apps/doc/views.py
View file @6010c32
+++ b/src/apps/doc/views.py
View file @6010c32
@@ -566,15 +566,14 @@ class UploadDocView(GenericView, DocHandler):
        data_source = self.fix_data_source(data_source)
        document_scheme = self.fix_scheme(document_scheme)
-        if document_name.endswith('.zip'):
+        # if document_name.endswith('.zip'):
-            self.running_log.info('[doc upload success] [zip file skip] [args={0}]'.format(args))
+        #     self.running_log.info('[doc upload success] [zip file skip] [args={0}]'.format(args))
-            return response.ok()
+        #     return response.ok()
        if data_source == consts.DATA_SOURCE_LIST[1]:
-            if isinstance(document_name, str):
+            if document_name.endswith('-证书.pdf') or document_name.endswith('-证书'):
-                if document_name.endswith('-证书.pdf') or document_name.endswith('-证书'):
+                self.running_log.info('[doc upload success] [eapp license skip] [args={0}]'.format(args))
-                    self.running_log.info('[doc upload success] [eapp license skip] [args={0}]'.format(args))
+                return response.ok()
-                    return response.ok()
        # 2. 根据业务类型分库存储
        doc_class, prefix = self.get_doc_class(business_type)
@@ -590,17 +589,24 @@ class UploadDocView(GenericView, DocHandler):
            data_source=data_source,
            upload_finish_time=document.get('uploadFinishTime'),
        )
        # 3. 选择队列进入
        is_priority = PriorityApplication.objects.filter(application_id=application_id, on_off=True).exists()
+        is_zip = False
        classify_1 = 0
+        # 电子合同
        if data_source == consts.DATA_SOURCE_LIST[-1] and document_scheme == consts.DOC_SCHEME_LIST[1]:
            for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix):
                if keyword in document_name:
                    classify_1 = classify_1_tmp
                    break
+        elif document_name.endswith('.zip') or document_name.endswith('.rar') or document_name.endswith('.ZIP') \
+                or document_name.endswith('.RAR'):
+            is_zip = True
        task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
-        enqueue_res = rh.enqueue([task], is_priority)
+        enqueue_res = rh.enqueue([task], is_priority, is_zip)
        self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
                              '[is_priority={3}] [enqueue_res={4}]'.format(args, prefix, doc.id,
                                                                           is_priority, enqueue_res))
@@ -665,7 +671,7 @@ class PriorityDocView(GenericView, DocHandler):
                self.running_log.info(
                    '[priority doc success] [args={0}]'.format(args))
            else:
-                enqueue_res = rh.enqueue(tasks_list, is_priority=True)
+                enqueue_res = rh.enqueue(tasks_list, is_priority=True)  # TODO 可能把压缩文件放入优先队列
                self.running_log.info('[priority doc success] [args={0}] [tasks_list={1}] [enqueue_res={2}]'.format(
                    args, tasks_list, enqueue_res))
        return response.ok()
--- a/src/common/redis_cache/handler.py
View file @6010c32
+++ b/src/common/redis_cache/handler.py
View file @6010c32
@@ -35,16 +35,27 @@ class RedisHandler:
        self.prefix = 'bwm_ocr'
        self.common_queue_key = '{0}:common_queue'.format(self.prefix)
        self.priority_queue_key = '{0}:priority_queue'.format(self.prefix)
+        self.zip_queue_key = '{0}:zip_queue'.format(self.prefix)
        self.session_id_key = '{0}:session_id'.format(self.prefix)
        self.cms_token_key = '{0}:cms_token'.format(self.prefix)
        self.ecm_token_key = '{0}:ecm_token'.format(self.prefix)
        self.login_limit_key = '{0}:login_limit'.format(self.prefix)
-    def enqueue(self, tasks, is_priority=False):
+    def enqueue(self, tasks, is_priority=False, is_zip=False):
        # 1
-        key = self.priority_queue_key if is_priority else self.common_queue_key
+        if is_zip:
+            key = self.zip_queue_key
+        elif is_priority:
+            key = self.priority_queue_key
+        else:
+            key = self.common_queue_key
        return self.redis.lpush(key, tasks)
+    def dequeue_zip(self):
+        # task or None
+        task = self.redis.rpop(self.zip_queue_key)
+        return task
    def dequeue(self):
        # task or None
        task = self.redis.rpop(self.priority_queue_key)
--- a/src/common/tools/file_tools.py
View file @6010c32
+++ b/src/common/tools/file_tools.py
View file @6010c32
 import os
+import re
+import zipfile
+import rarfile
 from zipfile import ZipFile
@@ -18,3 +22,77 @@ def write_zip_file(dir_name, zipfile_path):
                src_file_path = os.path.join(root, single_file)
                file_target_path = os.path.join(root_target_path, single_file)
                z.write(src_file_path, file_target_path)
def get_pwd_list_from_str(doc_name):
    """Collect candidate archive passwords embedded in a document name.

    Every non-overlapping run of six digits in ``doc_name`` (scanned left to
    right) is treated as one candidate, so a 12-digit run yields two entries.

    Args:
        doc_name: the document name to scan.

    Returns:
        list of 6-character digit strings in order of appearance; an empty
        list when nothing matches or when ``doc_name`` is not a ``str``.
    """
    try:
        return re.findall(r'\d{6}', doc_name)
    except TypeError:
        # re.findall raises TypeError for non-text input (None, bytes with a
        # str pattern, ...); callers expect "no candidates" in that case.
        return []
def _try_extract(archive_cls, file_path, extract_path,
                 with_password=False, password=None, as_bytes=False):
    """Run one extraction attempt; return True on success, False on any failure."""
    try:
        extract_kwargs = {}
        if with_password:
            # zipfile wants the password as bytes, rarfile takes it as given.
            extract_kwargs['pwd'] = bytes(password, 'utf-8') if as_bytes else password
        with archive_cls(file_path) as archive:
            archive.extractall(extract_path, **extract_kwargs)
    except Exception:
        # Deliberately broad (best effort): a wrong password, a corrupt or
        # missing archive and an unusable password value all just mean
        # "this attempt failed" to the caller.
        return False
    return True


def extract_zip_or_rar(file_path, extract_path, pwd_list=None):
    """Extract a ZIP or RAR archive, optionally trying a list of passwords.

    The archive type is chosen from the file name suffix, compared
    case-insensitively (``.zip`` / ``.rar``).

    Args:
        file_path: path of the archive; must be a ``str``.
        extract_path: directory the members are extracted into.
        pwd_list: candidate passwords, tried in order until one succeeds.
            ``None`` or an empty sequence means a single attempt without a
            password.

    Returns:
        True when an extraction attempt succeeded; False when the suffix is
        not supported or every attempt failed.
    """
    lowered = file_path.lower()
    if lowered.endswith('.zip'):
        archive_cls, as_bytes = zipfile.ZipFile, True
    elif lowered.endswith('.rar'):
        archive_cls, as_bytes = rarfile.RarFile, False
    else:
        return False

    if not pwd_list:
        return _try_extract(archive_cls, file_path, extract_path)

    # any() stops at the first password that works.
    return any(
        _try_extract(archive_cls, file_path, extract_path,
                     with_password=True, password=password, as_bytes=as_bytes)
        for password in pwd_list
    )
def get_file_paths(input_path, suffix_list):
    """Lazily find files under a directory tree by file-name suffix.

    This is a generator, not a list: paths are produced as ``os.walk``
    visits each directory.

    Args:
        input_path: str, root directory to search (recursively).
        suffix_list: iterable of str, file-name suffixes to match
            (case-sensitive).

    Yields:
        str: ``os.path.join(parent, filename)`` for every matching file. The
        path is absolute only when ``input_path`` itself is absolute.
    """
    # str.endswith accepts a tuple, which replaces the manual suffix loop.
    suffixes = tuple(suffix_list)
    for parent, _, filenames in os.walk(input_path):
        for filename in filenames:
            if filename.endswith(suffixes):
                yield os.path.join(parent, filename)