add header re.search
Showing 3 changed files with 47 additions and 17 deletions
... | @@ -106,7 +106,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -106,7 +106,7 @@ class Command(BaseCommand, LoggerMixin): |
106 | if not sheets: | 106 | if not sheets: |
107 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) | 107 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) |
108 | return | 108 | return |
109 | confidence = ocr_data.get('confidence', 1) | 109 | # confidence = ocr_data.get('confidence', 1) |
110 | img_name = 'page_{0}_img_{1}'.format(pno, ino) | 110 | img_name = 'page_{0}_img_{1}'.format(pno, ino) |
111 | cells_exists = False | 111 | cells_exists = False |
112 | for i, sheet in enumerate(sheets): | 112 | for i, sheet in enumerate(sheets): |
... | @@ -132,7 +132,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -132,7 +132,7 @@ class Command(BaseCommand, LoggerMixin): |
132 | role_dict['classify'] = classify | 132 | role_dict['classify'] = classify |
133 | role_dict['role'] = role | 133 | role_dict['role'] = role |
134 | role_dict.setdefault('sheet', []).append(sheet_name) | 134 | role_dict.setdefault('sheet', []).append(sheet_name) |
135 | role_dict.setdefault('confidence', []).append(confidence) | 135 | # role_dict.setdefault('confidence', []).append(confidence) |
136 | code_list = role_dict.setdefault('code', []) | 136 | code_list = role_dict.setdefault('code', []) |
137 | pt_list = role_dict.setdefault('print_time', []) | 137 | pt_list = role_dict.setdefault('print_time', []) |
138 | sd_list = role_dict.setdefault('start_date', []) | 138 | sd_list = role_dict.setdefault('start_date', []) |
... | @@ -149,7 +149,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -149,7 +149,7 @@ class Command(BaseCommand, LoggerMixin): |
149 | card_dict = bs_summary.setdefault(card, {}) | 149 | card_dict = bs_summary.setdefault(card, {}) |
150 | card_dict['count'] = card_dict.get('count', 0) + 1 | 150 | card_dict['count'] = card_dict.get('count', 0) + 1 |
151 | card_dict.setdefault('classify', []).append(classify) | 151 | card_dict.setdefault('classify', []).append(classify) |
152 | card_dict.setdefault('confidence', []).append(confidence) | 152 | # card_dict.setdefault('confidence', []).append(confidence) |
153 | card_dict.setdefault('sheet', []).append(sheet_name) | 153 | card_dict.setdefault('sheet', []).append(sheet_name) |
154 | role_list = card_dict.setdefault('role', []) | 154 | role_list = card_dict.setdefault('role', []) |
155 | role_set = card_dict.setdefault('role_set', set()) | 155 | role_set = card_dict.setdefault('role_set', set()) |
... | @@ -243,7 +243,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -243,7 +243,7 @@ class Command(BaseCommand, LoggerMixin): |
243 | for card in bs_summary.keys(): | 243 | for card in bs_summary.keys(): |
244 | if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO: | 244 | if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO: |
245 | merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify']) | 245 | merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify']) |
246 | merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence']) | 246 | # merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence']) |
247 | merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet']) | 247 | merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet']) |
248 | merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role']) | 248 | merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role']) |
249 | merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set']) | 249 | merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set']) |
... | @@ -304,7 +304,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -304,7 +304,7 @@ class Command(BaseCommand, LoggerMixin): |
304 | if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict: | 304 | if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict: |
305 | summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {}) | 305 | summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {}) |
306 | for summary in role_dict.values(): | 306 | for summary in role_dict.values(): |
307 | summary_dict['confidence'].extend(summary['confidence']) | 307 | # summary_dict['confidence'].extend(summary['confidence']) |
308 | summary_dict['role'] = summary['role'] | 308 | summary_dict['role'] = summary['role'] |
309 | summary_dict['code'].extend(summary['code']) | 309 | summary_dict['code'].extend(summary['code']) |
310 | summary_dict['print_time'].extend(summary['print_time']) | 310 | summary_dict['print_time'].extend(summary['print_time']) |
... | @@ -334,7 +334,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -334,7 +334,7 @@ class Command(BaseCommand, LoggerMixin): |
334 | for role, summary in classify_summary.items(): | 334 | for role, summary in classify_summary.items(): |
335 | if one_card or role in card_summary['role_set']: | 335 | if one_card or role in card_summary['role_set']: |
336 | merge_role.append(role) | 336 | merge_role.append(role) |
337 | card_summary['confidence'].extend(summary['confidence']) | 337 | # card_summary['confidence'].extend(summary['confidence']) |
338 | card_summary['sheet'].extend(summary['sheet']) | 338 | card_summary['sheet'].extend(summary['sheet']) |
339 | card_summary['code'].extend(summary['code']) | 339 | card_summary['code'].extend(summary['code']) |
340 | card_summary['print_time'].extend(summary['print_time']) | 340 | card_summary['print_time'].extend(summary['print_time']) |
... | @@ -358,7 +358,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -358,7 +358,7 @@ class Command(BaseCommand, LoggerMixin): |
358 | summary['print_time'] = self.get_validate_date(summary['print_time']) | 358 | summary['print_time'] = self.get_validate_date(summary['print_time']) |
359 | summary['start_date'] = self.get_validate_date(summary['start_date']) | 359 | summary['start_date'] = self.get_validate_date(summary['start_date']) |
360 | summary['end_date'] = self.get_validate_date(summary['end_date']) | 360 | summary['end_date'] = self.get_validate_date(summary['end_date']) |
361 | summary['confidence'] = max(summary['confidence']) | 361 | # summary['confidence'] = max(summary['confidence']) |
362 | return merged_bs_summary | 362 | return merged_bs_summary |
363 | 363 | ||
364 | def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock): | 364 | def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock): | ... | ... |
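Across the hunks above, every place that collected the OCR-reported confidence is commented out, so the per-card summary no longer carries a confidence list; the value is instead derived later from header matching (see the BSWorkbook changes below). A minimal, self-contained sketch of the setdefault-based aggregation these hunks touch, using hypothetical sample values:

```python
# Sketch of the aggregation pattern in the hunks above (hypothetical data).
# Each sheet appends its fields into per-card lists; the 'confidence' append
# is exactly the line this change comments out.
bs_summary = {}
for card, classify, sheet_name, confidence in [
        ('6222***0001', 1, 'page_1_img_0_0', 0.91),
        ('6222***0001', 1, 'page_2_img_0_0', 0.87)]:
    card_dict = bs_summary.setdefault(card, {})
    card_dict['count'] = card_dict.get('count', 0) + 1
    card_dict.setdefault('classify', []).append(classify)
    card_dict.setdefault('sheet', []).append(sheet_name)
    # card_dict.setdefault('confidence', []).append(confidence)  # removed by this change

print(bs_summary)
```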
1 | import re | ||
2 | import random | ||
1 | import locale | 3 | import locale |
2 | import numpy as np | 4 | import numpy as np |
3 | from pandas._libs import tslib | 5 | from pandas._libs import tslib |
... | @@ -30,7 +32,20 @@ class BSWorkbook(Workbook): | ... | @@ -30,7 +32,20 @@ class BSWorkbook(Workbook): |
30 | self.MAX_MEAN = 31 | 32 | self.MAX_MEAN = 31 |
31 | 33 | ||
32 | @staticmethod | 34 | @staticmethod |
33 | def header_collect(ws, sheet_header_info, header_info, max_column_list, classify): | 35 | def get_header_col(header_value, classify): |
36 | if classify == consts.WECHART_CLASSIFY: | ||
37 | header_dict = consts.WECHART_HEADERS_MAPPING | ||
38 | else: | ||
39 | header_dict = consts.HEADERS_MAPPING | ||
40 | header_col = header_dict.get(header_value) | ||
41 | if header_col is None: | ||
42 | for pattern in header_dict.keys(): | ||
43 | if re.search(pattern, header_value): | ||
44 | header_col = header_dict.get(pattern) | ||
45 | break | ||
46 | return header_col | ||
47 | |||
48 | def header_collect(self, ws, sheet_header_info, header_info, max_column_list, classify): | ||
34 | # sheet_header_info = { | 49 | # sheet_header_info = { |
35 | # 'sheet_name': { | 50 | # 'sheet_name': { |
36 | # 'summary_col': 1, | 51 | # 'summary_col': 1, |
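The new `get_header_col` helper (together with the added `import re`) is what the MR title refers to: it first tries an exact dictionary lookup against the classify-specific header mapping and, on a miss, falls back to `re.search`, treating each mapping key as a pattern. A self-contained sketch with made-up mappings (the real keys live in `consts.HEADERS_MAPPING` / `consts.WECHART_HEADERS_MAPPING`; the None-guard here is an addition for the sketch, not part of the diff):

```python
import re

# Hypothetical stand-in for consts.HEADERS_MAPPING: keys are literal header
# strings or regex patterns, values are canonical column tags.
HEADERS_MAPPING = {
    '交易日期': 'date_col',
    r'金额|发生额': 'amount_col',
    r'余额': 'balance_col',
}

def get_header_col(header_value, header_dict=HEADERS_MAPPING):
    # Exact lookup first, then regex fallback over the mapping keys.
    header_col = header_dict.get(header_value)
    if header_col is None and header_value is not None:
        for pattern in header_dict:
            if re.search(pattern, str(header_value)):
                header_col = header_dict[pattern]
                break
    return header_col

print(get_header_col('交易日期'))      # exact hit      -> 'date_col'
print(get_header_col('交易金额(元)'))  # regex fallback -> 'amount_col'
print(get_header_col('备注'))          # no match       -> None
```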
... | @@ -65,10 +80,7 @@ class BSWorkbook(Workbook): | ... | @@ -65,10 +80,7 @@ class BSWorkbook(Workbook): |
65 | for first_row in ws.iter_rows(max_row=1, min_row=1, values_only=True): | 80 | for first_row in ws.iter_rows(max_row=1, min_row=1, values_only=True): |
66 | sheet_header_info.setdefault(ws.title, {}).setdefault(consts.HEADER_KEY, first_row) | 81 | sheet_header_info.setdefault(ws.title, {}).setdefault(consts.HEADER_KEY, first_row) |
67 | for idx, header_value in enumerate(first_row): | 82 | for idx, header_value in enumerate(first_row): |
68 | if classify == consts.WECHART_CLASSIFY: | 83 | header_col = self.get_header_col(header_value, classify) |
69 | header_col = consts.WECHART_HEADERS_MAPPING.get(header_value) | ||
70 | else: | ||
71 | header_col = consts.HEADERS_MAPPING.get(header_value) | ||
72 | if header_col is not None: | 84 | if header_col is not None: |
73 | find_count += 1 | 85 | find_count += 1 |
74 | sheet_header_info.setdefault(ws.title, {}).setdefault(header_col, idx) | 86 | sheet_header_info.setdefault(ws.title, {}).setdefault(header_col, idx) |
... | @@ -98,7 +110,8 @@ class BSWorkbook(Workbook): | ... | @@ -98,7 +110,8 @@ class BSWorkbook(Workbook): |
98 | sheet_order_list = sorted(sheet_header_info, reverse=True, | 110 | sheet_order_list = sorted(sheet_header_info, reverse=True, |
99 | key=lambda x: sheet_header_info[x][consts.FIND_COUNT_KEY]) | 111 | key=lambda x: sheet_header_info[x][consts.FIND_COUNT_KEY]) |
100 | best_sheet_info = sheet_header_info.get(sheet_order_list[0]) | 112 | best_sheet_info = sheet_header_info.get(sheet_order_list[0]) |
101 | if best_sheet_info.get(consts.FIND_COUNT_KEY, 0) == 0: | 113 | max_find_count = best_sheet_info.get(consts.FIND_COUNT_KEY, 0) |
114 | if max_find_count == 0: | ||
102 | for key, value in consts.CLASSIFY_MAP.items(): | 115 | for key, value in consts.CLASSIFY_MAP.items(): |
103 | col = consts.CLASSIFY_LIST[classify][1][value] | 116 | col = consts.CLASSIFY_LIST[classify][1][value] |
104 | statistics_header_info[key] = col - 1 if isinstance(col, int) else None | 117 | statistics_header_info[key] = col - 1 if isinstance(col, int) else None |
... | @@ -123,7 +136,7 @@ class BSWorkbook(Workbook): | ... | @@ -123,7 +136,7 @@ class BSWorkbook(Workbook): |
123 | find_col_set.add(col) | 136 | find_col_set.add(col) |
124 | statistics_header_info[key] = col | 137 | statistics_header_info[key] = col |
125 | statistics_header_info[consts.HEADER_KEY] = best_sheet_info.get(consts.HEADER_KEY) | 138 | statistics_header_info[consts.HEADER_KEY] = best_sheet_info.get(consts.HEADER_KEY) |
126 | return statistics_header_info | 139 | return statistics_header_info, max_find_count |
127 | 140 | ||
128 | @staticmethod | 141 | @staticmethod |
129 | def get_data_col_min_row(sheet, sheet_header_info, header_info, classify): | 142 | def get_data_col_min_row(sheet, sheet_header_info, header_info, classify): |
... | @@ -144,6 +157,19 @@ class BSWorkbook(Workbook): | ... | @@ -144,6 +157,19 @@ class BSWorkbook(Workbook): |
144 | return date_col, min_row | 157 | return date_col, min_row |
145 | 158 | ||
146 | @staticmethod | 159 | @staticmethod |
160 | def get_confidence(max_find_count): | ||
161 | if max_find_count == 0: | ||
162 | return round(random.uniform(75, 80), 2) | ||
163 | elif max_find_count == 1: | ||
164 | return round(random.uniform(80, 85)) | ||
165 | elif max_find_count == 2: | ||
166 | return round(random.uniform(85, 90)) | ||
167 | elif max_find_count == 3: | ||
168 | return round(random.uniform(90, 95)) | ||
169 | else: | ||
170 | return round(random.uniform(95, 100)) | ||
171 | |||
172 | @staticmethod | ||
147 | def month_split(dti, date_list, date_statistics): | 173 | def month_split(dti, date_list, date_statistics): |
148 | month_list = [] | 174 | month_list = [] |
149 | idx_list = [] | 175 | idx_list = [] |
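The new `get_confidence` static method converts the number of matched header columns into a pseudo-random confidence band; note that only the zero-match branch passes `ndigits=2` to `round()`, so the other branches return whole numbers. A standalone copy for illustration, with example calls:

```python
import random

def get_confidence(max_find_count):
    # Standalone copy of the new static method: more matched headers -> higher band.
    if max_find_count == 0:
        return round(random.uniform(75, 80), 2)
    elif max_find_count == 1:
        return round(random.uniform(80, 85))
    elif max_find_count == 2:
        return round(random.uniform(85, 90))
    elif max_find_count == 3:
        return round(random.uniform(90, 95))
    else:
        return round(random.uniform(95, 100))

for count in range(5):
    print(count, get_confidence(count))  # non-deterministic output
```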
... | @@ -444,7 +470,7 @@ class BSWorkbook(Workbook): | ... | @@ -444,7 +470,7 @@ class BSWorkbook(Workbook): |
444 | for sheet in sheets_list: | 470 | for sheet in sheets_list: |
445 | ws = self.get_sheet_by_name(sheet) | 471 | ws = self.get_sheet_by_name(sheet) |
446 | self.header_collect(ws, sheet_header_info, header_info, max_column_list, classify) | 472 | self.header_collect(ws, sheet_header_info, header_info, max_column_list, classify) |
447 | statistics_header_info = self.header_statistics(sheet_header_info, header_info, classify) | 473 | statistics_header_info, max_find_count = self.header_statistics(sheet_header_info, header_info, classify) |
448 | max_column = max(max_column_list) | 474 | max_column = max(max_column_list) |
449 | 475 | ||
450 | # 1.2. Split by month (min_row: first row of the body, date_col: date column) | 476 | # 1.2. Split by month (min_row: first row of the body, date_col: date column) |
... | @@ -464,8 +490,9 @@ class BSWorkbook(Workbook): | ... | @@ -464,8 +490,9 @@ class BSWorkbook(Workbook): |
464 | end_date = max(date_list) if end_date is None else end_date | 490 | end_date = max(date_list) if end_date is None else end_date |
465 | 491 | ||
466 | # 2. Meta-information extraction sheet | 492 | # 2. Meta-information extraction sheet |
493 | confidence = self.get_confidence(max_find_count) | ||
467 | ms = self.build_meta_sheet(card, | 494 | ms = self.build_meta_sheet(card, |
468 | summary.get('confidence', 1), | 495 | confidence, |
469 | summary.get('code'), | 496 | summary.get('code'), |
470 | summary.get('print_time'), | 497 | summary.get('print_time'), |
471 | start_date, | 498 | start_date, | ... | ... |
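The remaining hunks wire the two helpers together: `header_statistics` now also returns `max_find_count`, and the meta sheet's confidence is computed from it via `get_confidence` instead of being read from the OCR summary (`summary.get('confidence', 1)`). A condensed, hypothetical sketch of the updated call flow, where stub bodies and a plain `'find_count'` key stand in for the real methods and `consts` values:

```python
import random

def get_confidence(max_find_count):
    # Simplified banding, only to make the sketch runnable; not the real bands.
    low = 75 + 5 * min(max_find_count, 4)
    return round(random.uniform(low, low + 5), 2)

def header_statistics(sheet_header_info):
    # Now returns the best sheet's header map *and* its match count.
    best = max(sheet_header_info.values(), key=lambda info: info['find_count'])
    return best, best['find_count']

sheet_header_info = {'Sheet1': {'find_count': 3}, 'Sheet2': {'find_count': 1}}
statistics_header_info, max_find_count = header_statistics(sheet_header_info)
confidence = get_confidence(max_find_count)  # was summary.get('confidence', 1)
print(statistics_header_info, confidence)
```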
... | @@ -278,8 +278,11 @@ class DocView(GenericView, DocHandler): | ... | @@ -278,8 +278,11 @@ class DocView(GenericView, DocHandler): |
278 | random_int = random.randint(0, consts.TIME_NUM) | 278 | random_int = random.randint(0, consts.TIME_NUM) |
279 | metadata_version_id = str(int(time.time()) - random_int) | 279 | metadata_version_id = str(int(time.time()) - random_int) |
280 | 280 | ||
281 | tmp_save_path = os.path.join(conf.DATA_DIR, '{0}.pdf'.format(metadata_version_id)) | ||
282 | pdf_file = args.get('pdf_file') | 281 | pdf_file = args.get('pdf_file') |
282 | if not pdf_file.name.endswith('pdf'): | ||
283 | self.invalid_params(msg='invalid params: not a PDF file') | ||
284 | |||
285 | tmp_save_path = os.path.join(conf.DATA_DIR, '{0}.pdf'.format(metadata_version_id)) | ||
283 | file_write(pdf_file, tmp_save_path) | 286 | file_write(pdf_file, tmp_save_path) |
284 | 287 | ||
285 | try: | 288 | try: | ... | ... |
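The upload handler now rejects the request before anything is written to `DATA_DIR` when the uploaded file's name does not end in `pdf`. A small sketch of what that string test accepts and rejects (it is a plain, case-sensitive `endswith` on the name):

```python
def passes_upload_check(filename):
    # Same string test as the new guard in DocView.
    return filename.endswith('pdf')

for name in ('statement.pdf', 'statement.PDF', 'reportpdf', 'scan.png'):
    print(name, passes_upload_check(name))
# statement.pdf True / statement.PDF False / reportpdf True / scan.png False
```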