From b331bf72257f709acf3ebb1185448ab05708fd99 Mon Sep 17 00:00:00 2001 From: chenyao <chenyao@situdata.com> Date: Tue, 3 Dec 2024 19:12:16 +0800 Subject: [PATCH] 添加income_keywords的处理 --- src/apps/doc/management/commands/ocr_process.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/apps/doc/management/commands/ocr_process.py b/src/apps/doc/management/commands/ocr_process.py index 8416348..df8d0f8 100644 --- a/src/apps/doc/management/commands/ocr_process.py +++ b/src/apps/doc/management/commands/ocr_process.py @@ -177,7 +177,7 @@ class Command(BaseCommand, LoggerMixin): # raise EDMSException(edms_exc) # self.online_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path)) - def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx): + def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx, income_keywords_dictionary): sheets = ocr_data.get('data', []) if not sheets: res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY)) @@ -196,6 +196,10 @@ class Command(BaseCommand, LoggerMixin): c1 = cell.get('start_column') r1 = cell.get('start_row') words = cell.get('words') + if words is not None: + if words in consts.INCOME_KEYWORDS_LIST: + if consts.INCOME_KEYWORDS_DICT.get(words) not in income_keywords_dictionary["income_keywords"]: + income_keywords_dictionary["income_keywords"].append(consts.INCOME_KEYWORDS_DICT.setdefault(words, "")) ws.cell(row=r1 + 1, column=c1 + 1, value=words) # 真伪 @@ -921,7 +925,7 @@ class Command(BaseCommand, LoggerMixin): summary['role'] = self.get_most(summary['role']) return bs_summary - def rebuild_bs(self, bs_summary): + def rebuild_bs(self, bs_summary, income_keywords_dictionary): # bs_summary = { # '卡号': { # 'classify': 0, @@ -935,7 +939,23 @@ class Command(BaseCommand, LoggerMixin): # 'sheet': ['sheet_name'] # } # } + + # income_keywords_dictionary = { + # 'income_keywords': [ # 其中 0-8 个 + # 'yanglaojin', + # "shebao", + # "daifagongzi", + # "gongziruzhang", + # "jiangjin", + # "yanglaobaoxian", + # "daifa", + # "gongzi" + # ] + # } res = [] + income_keywords_list = income_keywords_dictionary.get('income_keywords', []) + income_filtered_keywords = [keyword_str for keyword_str in income_keywords_list if keyword_str] + income_keywords_str = ",".join(income_filtered_keywords) for bs_info in bs_summary.values(): try: print_date = bs_info.get('print_time', '').strftime("%Y-%m-%d") @@ -951,7 +971,8 @@ class Command(BaseCommand, LoggerMixin): 'print_time': print_date, 'timedelta': bs_info.get('timedelta', ''), 'verify': bs_info.get('verify_res_ebank', True), - 'e_bank': bs_info.get('e_bank', False) + 'e_bank': bs_info.get('e_bank', False), + 'income_keywords': income_keywords_str } ) return res @@ -1742,6 +1763,7 @@ class Command(BaseCommand, LoggerMixin): license_summary = {} contract_result = {} contract_result_compare = {} + income_keywords_dictionary = {"income_keywords": []} # 添加财报三个报表的处理 financial_statement_dict = {} # 添加财报情况说明的处理 @@ -1939,7 +1961,7 @@ class Command(BaseCommand, LoggerMixin): ino, part_idx, img_path, contract_result_compare) else: # 流水处理 bs_classify_set.add(classify) - self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx) + self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx, income_keywords_dictionary) else: res_list.append((pno, ino, part_idx, consts.RES_FAILED_1)) self.online_log.info('{0} [ocr_1 res error] [img={1}]'.format(self.log_base, img_path)) @@ -2095,7 +2117,7 @@ class Command(BaseCommand, LoggerMixin): license_summary, contract_result_compare)) self.rebuild_contract(license_summary, contract_result_compare) - bs_rebuild = self.rebuild_bs(merged_bs_summary) + bs_rebuild = self.rebuild_bs(merged_bs_summary, income_keywords_dictionary) if len(bs_rebuild) > 0: license_summary[consts.BS_CLASSIFY] = bs_rebuild -- libgit2 0.24.0