fc8f7e0d by 周伟奇

add header re.search

1 parent b4009530
...@@ -106,7 +106,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -106,7 +106,7 @@ class Command(BaseCommand, LoggerMixin):
106 if not sheets: 106 if not sheets:
107 res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) 107 res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
108 return 108 return
109 confidence = ocr_data.get('confidence', 1) 109 # confidence = ocr_data.get('confidence', 1)
110 img_name = 'page_{0}_img_{1}'.format(pno, ino) 110 img_name = 'page_{0}_img_{1}'.format(pno, ino)
111 cells_exists = False 111 cells_exists = False
112 for i, sheet in enumerate(sheets): 112 for i, sheet in enumerate(sheets):
...@@ -132,7 +132,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -132,7 +132,7 @@ class Command(BaseCommand, LoggerMixin):
132 role_dict['classify'] = classify 132 role_dict['classify'] = classify
133 role_dict['role'] = role 133 role_dict['role'] = role
134 role_dict.setdefault('sheet', []).append(sheet_name) 134 role_dict.setdefault('sheet', []).append(sheet_name)
135 role_dict.setdefault('confidence', []).append(confidence) 135 # role_dict.setdefault('confidence', []).append(confidence)
136 code_list = role_dict.setdefault('code', []) 136 code_list = role_dict.setdefault('code', [])
137 pt_list = role_dict.setdefault('print_time', []) 137 pt_list = role_dict.setdefault('print_time', [])
138 sd_list = role_dict.setdefault('start_date', []) 138 sd_list = role_dict.setdefault('start_date', [])
...@@ -149,7 +149,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -149,7 +149,7 @@ class Command(BaseCommand, LoggerMixin):
149 card_dict = bs_summary.setdefault(card, {}) 149 card_dict = bs_summary.setdefault(card, {})
150 card_dict['count'] = card_dict.get('count', 0) + 1 150 card_dict['count'] = card_dict.get('count', 0) + 1
151 card_dict.setdefault('classify', []).append(classify) 151 card_dict.setdefault('classify', []).append(classify)
152 card_dict.setdefault('confidence', []).append(confidence) 152 # card_dict.setdefault('confidence', []).append(confidence)
153 card_dict.setdefault('sheet', []).append(sheet_name) 153 card_dict.setdefault('sheet', []).append(sheet_name)
154 role_list = card_dict.setdefault('role', []) 154 role_list = card_dict.setdefault('role', [])
155 role_set = card_dict.setdefault('role_set', set()) 155 role_set = card_dict.setdefault('role_set', set())
...@@ -243,7 +243,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -243,7 +243,7 @@ class Command(BaseCommand, LoggerMixin):
243 for card in bs_summary.keys(): 243 for card in bs_summary.keys():
244 if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO: 244 if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO:
245 merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify']) 245 merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify'])
246 merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence']) 246 # merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence'])
247 merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet']) 247 merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet'])
248 merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role']) 248 merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role'])
249 merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set']) 249 merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set'])
...@@ -304,7 +304,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -304,7 +304,7 @@ class Command(BaseCommand, LoggerMixin):
304 if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict: 304 if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict:
305 summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {}) 305 summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {})
306 for summary in role_dict.values(): 306 for summary in role_dict.values():
307 summary_dict['confidence'].extend(summary['confidence']) 307 # summary_dict['confidence'].extend(summary['confidence'])
308 summary_dict['role'] = summary['role'] 308 summary_dict['role'] = summary['role']
309 summary_dict['code'].extend(summary['code']) 309 summary_dict['code'].extend(summary['code'])
310 summary_dict['print_time'].extend(summary['print_time']) 310 summary_dict['print_time'].extend(summary['print_time'])
...@@ -334,7 +334,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -334,7 +334,7 @@ class Command(BaseCommand, LoggerMixin):
334 for role, summary in classify_summary.items(): 334 for role, summary in classify_summary.items():
335 if one_card or role in card_summary['role_set']: 335 if one_card or role in card_summary['role_set']:
336 merge_role.append(role) 336 merge_role.append(role)
337 card_summary['confidence'].extend(summary['confidence']) 337 # card_summary['confidence'].extend(summary['confidence'])
338 card_summary['sheet'].extend(summary['sheet']) 338 card_summary['sheet'].extend(summary['sheet'])
339 card_summary['code'].extend(summary['code']) 339 card_summary['code'].extend(summary['code'])
340 card_summary['print_time'].extend(summary['print_time']) 340 card_summary['print_time'].extend(summary['print_time'])
...@@ -358,7 +358,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -358,7 +358,7 @@ class Command(BaseCommand, LoggerMixin):
358 summary['print_time'] = self.get_validate_date(summary['print_time']) 358 summary['print_time'] = self.get_validate_date(summary['print_time'])
359 summary['start_date'] = self.get_validate_date(summary['start_date']) 359 summary['start_date'] = self.get_validate_date(summary['start_date'])
360 summary['end_date'] = self.get_validate_date(summary['end_date']) 360 summary['end_date'] = self.get_validate_date(summary['end_date'])
361 summary['confidence'] = max(summary['confidence']) 361 # summary['confidence'] = max(summary['confidence'])
362 return merged_bs_summary 362 return merged_bs_summary
363 363
364 def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock): 364 def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock):
......
1 import re
2 import random
1 import locale 3 import locale
2 import numpy as np 4 import numpy as np
3 from pandas._libs import tslib 5 from pandas._libs import tslib
...@@ -30,7 +32,20 @@ class BSWorkbook(Workbook): ...@@ -30,7 +32,20 @@ class BSWorkbook(Workbook):
30 self.MAX_MEAN = 31 32 self.MAX_MEAN = 31
31 33
32 @staticmethod 34 @staticmethod
def get_header_col(header_value, classify):
    """Resolve a header cell value to its mapped column key.

    The mapping is ``consts.WECHART_HEADERS_MAPPING`` when ``classify``
    equals ``consts.WECHART_CLASSIFY`` and ``consts.HEADERS_MAPPING``
    otherwise. An exact key lookup is tried first; if it yields nothing,
    every mapping key is treated as a regular expression and searched for
    inside ``header_value``; the first key that matches wins.

    :param header_value: raw value of a header cell.
    :param classify: classification id used to choose the mapping.
    :return: the mapped column key, or ``None`` when nothing matches.
    """
    if classify == consts.WECHART_CLASSIFY:
        header_dict = consts.WECHART_HEADERS_MAPPING
    else:
        header_dict = consts.HEADERS_MAPPING

    # Exact match is the cheap, unambiguous path.
    header_col = header_dict.get(header_value)
    if header_col is not None:
        return header_col

    # re.search() raises TypeError for None / non-string input; such a cell
    # cannot match any textual pattern, so report "not found" instead.
    # NOTE(review): presumably empty header cells arrive here as None —
    # confirm against the caller.
    if not isinstance(header_value, str):
        return None

    # Fallback: mapping keys double as regex patterns; first match wins.
    for pattern, col in header_dict.items():
        if re.search(pattern, header_value):
            return col
    return None
47
48 def header_collect(self, ws, sheet_header_info, header_info, max_column_list, classify):
34 # sheet_header_info = { 49 # sheet_header_info = {
35 # 'sheet_name': { 50 # 'sheet_name': {
36 # 'summary_col': 1, 51 # 'summary_col': 1,
...@@ -65,10 +80,7 @@ class BSWorkbook(Workbook): ...@@ -65,10 +80,7 @@ class BSWorkbook(Workbook):
65 for first_row in ws.iter_rows(max_row=1, min_row=1, values_only=True): 80 for first_row in ws.iter_rows(max_row=1, min_row=1, values_only=True):
66 sheet_header_info.setdefault(ws.title, {}).setdefault(consts.HEADER_KEY, first_row) 81 sheet_header_info.setdefault(ws.title, {}).setdefault(consts.HEADER_KEY, first_row)
67 for idx, header_value in enumerate(first_row): 82 for idx, header_value in enumerate(first_row):
68 if classify == consts.WECHART_CLASSIFY: 83 header_col = self.get_header_col(header_value, classify)
69 header_col = consts.WECHART_HEADERS_MAPPING.get(header_value)
70 else:
71 header_col = consts.HEADERS_MAPPING.get(header_value)
72 if header_col is not None: 84 if header_col is not None:
73 find_count += 1 85 find_count += 1
74 sheet_header_info.setdefault(ws.title, {}).setdefault(header_col, idx) 86 sheet_header_info.setdefault(ws.title, {}).setdefault(header_col, idx)
...@@ -98,7 +110,8 @@ class BSWorkbook(Workbook): ...@@ -98,7 +110,8 @@ class BSWorkbook(Workbook):
98 sheet_order_list = sorted(sheet_header_info, reverse=True, 110 sheet_order_list = sorted(sheet_header_info, reverse=True,
99 key=lambda x: sheet_header_info[x][consts.FIND_COUNT_KEY]) 111 key=lambda x: sheet_header_info[x][consts.FIND_COUNT_KEY])
100 best_sheet_info = sheet_header_info.get(sheet_order_list[0]) 112 best_sheet_info = sheet_header_info.get(sheet_order_list[0])
101 if best_sheet_info.get(consts.FIND_COUNT_KEY, 0) == 0: 113 max_find_count = best_sheet_info.get(consts.FIND_COUNT_KEY, 0)
114 if max_find_count == 0:
102 for key, value in consts.CLASSIFY_MAP.items(): 115 for key, value in consts.CLASSIFY_MAP.items():
103 col = consts.CLASSIFY_LIST[classify][1][value] 116 col = consts.CLASSIFY_LIST[classify][1][value]
104 statistics_header_info[key] = col - 1 if isinstance(col, int) else None 117 statistics_header_info[key] = col - 1 if isinstance(col, int) else None
...@@ -123,7 +136,7 @@ class BSWorkbook(Workbook): ...@@ -123,7 +136,7 @@ class BSWorkbook(Workbook):
123 find_col_set.add(col) 136 find_col_set.add(col)
124 statistics_header_info[key] = col 137 statistics_header_info[key] = col
125 statistics_header_info[consts.HEADER_KEY] = best_sheet_info.get(consts.HEADER_KEY) 138 statistics_header_info[consts.HEADER_KEY] = best_sheet_info.get(consts.HEADER_KEY)
126 return statistics_header_info 139 return statistics_header_info, max_find_count
127 140
128 @staticmethod 141 @staticmethod
129 def get_data_col_min_row(sheet, sheet_header_info, header_info, classify): 142 def get_data_col_min_row(sheet, sheet_header_info, header_info, classify):
...@@ -144,6 +157,19 @@ class BSWorkbook(Workbook): ...@@ -144,6 +157,19 @@ class BSWorkbook(Workbook):
144 return date_col, min_row 157 return date_col, min_row
145 158
146 @staticmethod 159 @staticmethod
def get_confidence(max_find_count):
    """Return a randomised confidence score for a header match count.

    The score is drawn uniformly from a band selected by
    ``max_find_count``: 0 -> 75-80, 1 -> 80-85, 2 -> 85-90, 3 -> 90-95,
    any other value -> 95-100.

    :param max_find_count: the header match count used to pick the band.
    :return: a float rounded to two decimal places, inside the band.
    """
    bands = {
        0: (75, 80),
        1: (80, 85),
        2: (85, 90),
        3: (90, 95),
    }
    low, high = bands.get(max_find_count, (95, 100))
    # Round every band the same way; previously only the zero-match band
    # kept two decimals while the others collapsed to whole integers.
    return round(random.uniform(low, high), 2)
171
172 @staticmethod
147 def month_split(dti, date_list, date_statistics): 173 def month_split(dti, date_list, date_statistics):
148 month_list = [] 174 month_list = []
149 idx_list = [] 175 idx_list = []
...@@ -444,7 +470,7 @@ class BSWorkbook(Workbook): ...@@ -444,7 +470,7 @@ class BSWorkbook(Workbook):
444 for sheet in sheets_list: 470 for sheet in sheets_list:
445 ws = self.get_sheet_by_name(sheet) 471 ws = self.get_sheet_by_name(sheet)
446 self.header_collect(ws, sheet_header_info, header_info, max_column_list, classify) 472 self.header_collect(ws, sheet_header_info, header_info, max_column_list, classify)
447 statistics_header_info = self.header_statistics(sheet_header_info, header_info, classify) 473 statistics_header_info, max_find_count = self.header_statistics(sheet_header_info, header_info, classify)
448 max_column = max(max_column_list) 474 max_column = max(max_column_list)
449 475
450 # 1.2.按月份分割 min_row 正文第一行 date_col 日期行 476 # 1.2.按月份分割 min_row 正文第一行 date_col 日期行
...@@ -464,8 +490,9 @@ class BSWorkbook(Workbook): ...@@ -464,8 +490,9 @@ class BSWorkbook(Workbook):
464 end_date = max(date_list) if end_date is None else end_date 490 end_date = max(date_list) if end_date is None else end_date
465 491
466 # 2.元信息提取表 492 # 2.元信息提取表
493 confidence = self.get_confidence(max_find_count)
467 ms = self.build_meta_sheet(card, 494 ms = self.build_meta_sheet(card,
468 summary.get('confidence', 1), 495 confidence,
469 summary.get('code'), 496 summary.get('code'),
470 summary.get('print_time'), 497 summary.get('print_time'),
471 start_date, 498 start_date,
......
...@@ -278,8 +278,11 @@ class DocView(GenericView, DocHandler): ...@@ -278,8 +278,11 @@ class DocView(GenericView, DocHandler):
278 random_int = random.randint(0, consts.TIME_NUM) 278 random_int = random.randint(0, consts.TIME_NUM)
279 metadata_version_id = str(int(time.time()) - random_int) 279 metadata_version_id = str(int(time.time()) - random_int)
280 280
281 tmp_save_path = os.path.join(conf.DATA_DIR, '{0}.pdf'.format(metadata_version_id))
282 pdf_file = args.get('pdf_file') 281 pdf_file = args.get('pdf_file')
282 if not pdf_file.name.endswith('pdf'):
283 self.invalid_params(msg='invalid params: not a PDF file')
284
285 tmp_save_path = os.path.join(conf.DATA_DIR, '{0}.pdf'.format(metadata_version_id))
283 file_write(pdf_file, tmp_save_path) 286 file_write(pdf_file, tmp_save_path)
284 287
285 try: 288 try:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!