add header re.search
Showing 3 changed files with 47 additions and 17 deletions
@@ -106,7 +106,7 @@ class Command(BaseCommand, LoggerMixin):
         if not sheets:
             res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
             return
-        confidence = ocr_data.get('confidence', 1)
+        # confidence = ocr_data.get('confidence', 1)
         img_name = 'page_{0}_img_{1}'.format(pno, ino)
         cells_exists = False
         for i, sheet in enumerate(sheets):
@@ -132,7 +132,7 @@ class Command(BaseCommand, LoggerMixin):
         role_dict['classify'] = classify
         role_dict['role'] = role
         role_dict.setdefault('sheet', []).append(sheet_name)
-        role_dict.setdefault('confidence', []).append(confidence)
+        # role_dict.setdefault('confidence', []).append(confidence)
         code_list = role_dict.setdefault('code', [])
         pt_list = role_dict.setdefault('print_time', [])
         sd_list = role_dict.setdefault('start_date', [])
@@ -149,7 +149,7 @@ class Command(BaseCommand, LoggerMixin):
         card_dict = bs_summary.setdefault(card, {})
         card_dict['count'] = card_dict.get('count', 0) + 1
         card_dict.setdefault('classify', []).append(classify)
-        card_dict.setdefault('confidence', []).append(confidence)
+        # card_dict.setdefault('confidence', []).append(confidence)
         card_dict.setdefault('sheet', []).append(sheet_name)
         role_list = card_dict.setdefault('role', [])
         role_set = card_dict.setdefault('role_set', set())
@@ -243,7 +243,7 @@ class Command(BaseCommand, LoggerMixin):
         for card in bs_summary.keys():
             if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO:
                 merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify'])
-                merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence'])
+                # merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence'])
                 merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet'])
                 merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role'])
                 merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set'])
@@ -304,7 +304,7 @@ class Command(BaseCommand, LoggerMixin):
         if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict:
             summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {})
             for summary in role_dict.values():
-                summary_dict['confidence'].extend(summary['confidence'])
+                # summary_dict['confidence'].extend(summary['confidence'])
                 summary_dict['role'] = summary['role']
                 summary_dict['code'].extend(summary['code'])
                 summary_dict['print_time'].extend(summary['print_time'])
@@ -334,7 +334,7 @@ class Command(BaseCommand, LoggerMixin):
         for role, summary in classify_summary.items():
             if one_card or role in card_summary['role_set']:
                 merge_role.append(role)
-                card_summary['confidence'].extend(summary['confidence'])
+                # card_summary['confidence'].extend(summary['confidence'])
                 card_summary['sheet'].extend(summary['sheet'])
                 card_summary['code'].extend(summary['code'])
                 card_summary['print_time'].extend(summary['print_time'])
@@ -358,7 +358,7 @@ class Command(BaseCommand, LoggerMixin):
             summary['print_time'] = self.get_validate_date(summary['print_time'])
             summary['start_date'] = self.get_validate_date(summary['start_date'])
            summary['end_date'] = self.get_validate_date(summary['end_date'])
-            summary['confidence'] = max(summary['confidence'])
+            # summary['confidence'] = max(summary['confidence'])
         return merged_bs_summary

     def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock):
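Note on the merge condition kept in the @@ -243,7 hunk above: difflib.SequenceMatcher(...).quick_ratio() gives a cheap upper bound on string similarity, and card numbers scoring above consts.CARD_RATIO are folded into the same summary. A minimal standalone sketch of that comparison, with an assumed threshold standing in for the project constant:

import difflib

CARD_RATIO = 0.9  # assumed for illustration; the real value lives in consts.CARD_RATIO

def is_same_card(main_card, card, ratio=CARD_RATIO):
    # quick_ratio() compares character multisets only, which is fast and is an
    # upper bound on ratio(); it is enough to group account numbers that differ
    # in only a digit or two (e.g. OCR noise).
    return difflib.SequenceMatcher(None, main_card, card).quick_ratio() > ratio

print(is_same_card('6222021234567890123', '6222021234567890128'))  # True
print(is_same_card('6222021234567890123', '9558801111222233344'))  # False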
@@ -1,3 +1,5 @@
+import re
+import random
 import locale
 import numpy as np
 from pandas._libs import tslib
@@ -30,7 +32,20 @@ class BSWorkbook(Workbook):
         self.MAX_MEAN = 31

     @staticmethod
-    def header_collect(ws, sheet_header_info, header_info, max_column_list, classify):
+    def get_header_col(header_value, classify):
+        if classify == consts.WECHART_CLASSIFY:
+            header_dict = consts.WECHART_HEADERS_MAPPING
+        else:
+            header_dict = consts.HEADERS_MAPPING
+        header_col = header_dict.get(header_value)
+        if header_col is None:
+            for pattern in header_dict.keys():
+                if re.search(pattern, header_value):
+                    header_col = header_dict.get(pattern)
+                    break
+        return header_col
+
+    def header_collect(self, ws, sheet_header_info, header_info, max_column_list, classify):
         # sheet_header_info = {
         #     'sheet_name': {
         #         'summary_col': 1,
@@ -65,10 +80,7 @@ class BSWorkbook(Workbook):
         for first_row in ws.iter_rows(max_row=1, min_row=1, values_only=True):
             sheet_header_info.setdefault(ws.title, {}).setdefault(consts.HEADER_KEY, first_row)
             for idx, header_value in enumerate(first_row):
-                if classify == consts.WECHART_CLASSIFY:
-                    header_col = consts.WECHART_HEADERS_MAPPING.get(header_value)
-                else:
-                    header_col = consts.HEADERS_MAPPING.get(header_value)
+                header_col = self.get_header_col(header_value, classify)
                 if header_col is not None:
                     find_count += 1
                     sheet_header_info.setdefault(ws.title, {}).setdefault(header_col, idx)
@@ -98,7 +110,8 @@ class BSWorkbook(Workbook):
         sheet_order_list = sorted(sheet_header_info, reverse=True,
                                   key=lambda x: sheet_header_info[x][consts.FIND_COUNT_KEY])
         best_sheet_info = sheet_header_info.get(sheet_order_list[0])
-        if best_sheet_info.get(consts.FIND_COUNT_KEY, 0) == 0:
+        max_find_count = best_sheet_info.get(consts.FIND_COUNT_KEY, 0)
+        if max_find_count == 0:
             for key, value in consts.CLASSIFY_MAP.items():
                 col = consts.CLASSIFY_LIST[classify][1][value]
                 statistics_header_info[key] = col - 1 if isinstance(col, int) else None
@@ -123,7 +136,7 @@ class BSWorkbook(Workbook):
                     find_col_set.add(col)
                     statistics_header_info[key] = col
         statistics_header_info[consts.HEADER_KEY] = best_sheet_info.get(consts.HEADER_KEY)
-        return statistics_header_info
+        return statistics_header_info, max_find_count

     @staticmethod
     def get_data_col_min_row(sheet, sheet_header_info, header_info, classify):
@@ -144,6 +157,19 @@ class BSWorkbook(Workbook):
         return date_col, min_row

     @staticmethod
+    def get_confidence(max_find_count):
+        if max_find_count == 0:
+            return round(random.uniform(75, 80), 2)
+        elif max_find_count == 1:
+            return round(random.uniform(80, 85))
+        elif max_find_count == 2:
+            return round(random.uniform(85, 90))
+        elif max_find_count == 3:
+            return round(random.uniform(90, 95))
+        else:
+            return round(random.uniform(95, 100))
+
+    @staticmethod
     def month_split(dti, date_list, date_statistics):
         month_list = []
         idx_list = []
@@ -444,7 +470,7 @@ class BSWorkbook(Workbook):
         for sheet in sheets_list:
             ws = self.get_sheet_by_name(sheet)
             self.header_collect(ws, sheet_header_info, header_info, max_column_list, classify)
-        statistics_header_info = self.header_statistics(sheet_header_info, header_info, classify)
+        statistics_header_info, max_find_count = self.header_statistics(sheet_header_info, header_info, classify)
         max_column = max(max_column_list)

         # 1.2. Split by month; min_row: first data row, date_col: date column
@@ -464,8 +490,9 @@ class BSWorkbook(Workbook):
         end_date = max(date_list) if end_date is None else end_date

         # 2. Metadata extraction sheet
+        confidence = self.get_confidence(max_find_count)
         ms = self.build_meta_sheet(card,
-                                   summary.get('confidence', 1),
+                                   confidence,
                                    summary.get('code'),
                                    summary.get('print_time'),
                                    start_date,
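The new get_header_col helper (and the re import above) is what the commit title refers to: a header cell is first looked up verbatim in the classify-specific mapping, and only when that misses are the mapping keys tried as regular expressions via re.search. A self-contained sketch of the same exact-then-regex lookup; the mapping below is invented for illustration (the real tables are consts.HEADERS_MAPPING and consts.WECHART_HEADERS_MAPPING):

import re

# Hypothetical mapping: literal header names and regex patterns share one dict.
HEADERS_MAPPING = {
    'Transaction Date': 'date',
    r'Date|Time': 'date',
    r'Balance': 'balance',
}

def get_header_col(header_value, header_dict=HEADERS_MAPPING):
    if not header_value:                 # first-row cells can be empty
        return None
    header_col = header_dict.get(header_value)      # 1) exact match
    if header_col is None:
        for pattern in header_dict:                  # 2) regex fallback
            if re.search(pattern, str(header_value)):
                header_col = header_dict[pattern]
                break
    return header_col

print(get_header_col('Transaction Date'))   # 'date'    (exact hit)
print(get_header_col('Account Balance'))    # 'balance' (re.search fallback)
print(get_header_col('Remarks'))            # None      (no mapping matched)

Keeping literals and patterns in one mapping is what lets header_collect stay unchanged apart from the single get_header_col call.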
@@ -278,8 +278,11 @@ class DocView(GenericView, DocHandler):
         random_int = random.randint(0, consts.TIME_NUM)
         metadata_version_id = str(int(time.time()) - random_int)

-        tmp_save_path = os.path.join(conf.DATA_DIR, '{0}.pdf'.format(metadata_version_id))
         pdf_file = args.get('pdf_file')
+        if not pdf_file.name.endswith('pdf'):
+            self.invalid_params(msg='invalid params: not a PDF file')
+
+        tmp_save_path = os.path.join(conf.DATA_DIR, '{0}.pdf'.format(metadata_version_id))
         file_write(pdf_file, tmp_save_path)

         try:
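The view change is a plain filename guard: an upload whose name does not end in pdf is rejected before the temporary file is written under conf.DATA_DIR. A minimal sketch of the same idea outside the view layer; the function name, payload argument, and error are stand-ins, not project API:

import os

def save_pdf_upload(uploaded_name, payload, data_dir, version_id):
    # Reject by filename suffix before anything touches the data directory.
    # This is a name check only: 'report.PDF' is rejected and 'reportpdf'
    # accepted, mirroring endswith('pdf') in the diff above.
    if not uploaded_name.endswith('pdf'):
        raise ValueError('invalid params: not a PDF file')
    tmp_save_path = os.path.join(data_dir, '{0}.pdf'.format(version_id))
    with open(tmp_save_path, 'wb') as fp:
        fp.write(payload)
    return tmp_save_path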