fc8f7e0d by 周伟奇

add header re.search

1 parent b4009530
......@@ -106,7 +106,7 @@ class Command(BaseCommand, LoggerMixin):
if not sheets:
res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
return
confidence = ocr_data.get('confidence', 1)
# confidence = ocr_data.get('confidence', 1)
img_name = 'page_{0}_img_{1}'.format(pno, ino)
cells_exists = False
for i, sheet in enumerate(sheets):
......@@ -132,7 +132,7 @@ class Command(BaseCommand, LoggerMixin):
role_dict['classify'] = classify
role_dict['role'] = role
role_dict.setdefault('sheet', []).append(sheet_name)
role_dict.setdefault('confidence', []).append(confidence)
# role_dict.setdefault('confidence', []).append(confidence)
code_list = role_dict.setdefault('code', [])
pt_list = role_dict.setdefault('print_time', [])
sd_list = role_dict.setdefault('start_date', [])
......@@ -149,7 +149,7 @@ class Command(BaseCommand, LoggerMixin):
card_dict = bs_summary.setdefault(card, {})
card_dict['count'] = card_dict.get('count', 0) + 1
card_dict.setdefault('classify', []).append(classify)
card_dict.setdefault('confidence', []).append(confidence)
# card_dict.setdefault('confidence', []).append(confidence)
card_dict.setdefault('sheet', []).append(sheet_name)
role_list = card_dict.setdefault('role', [])
role_set = card_dict.setdefault('role_set', set())
......@@ -243,7 +243,7 @@ class Command(BaseCommand, LoggerMixin):
for card in bs_summary.keys():
if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO:
merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify'])
merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence'])
# merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence'])
merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet'])
merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role'])
merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set'])
......@@ -304,7 +304,7 @@ class Command(BaseCommand, LoggerMixin):
if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict:
summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {})
for summary in role_dict.values():
summary_dict['confidence'].extend(summary['confidence'])
# summary_dict['confidence'].extend(summary['confidence'])
summary_dict['role'] = summary['role']
summary_dict['code'].extend(summary['code'])
summary_dict['print_time'].extend(summary['print_time'])
......@@ -334,7 +334,7 @@ class Command(BaseCommand, LoggerMixin):
for role, summary in classify_summary.items():
if one_card or role in card_summary['role_set']:
merge_role.append(role)
card_summary['confidence'].extend(summary['confidence'])
# card_summary['confidence'].extend(summary['confidence'])
card_summary['sheet'].extend(summary['sheet'])
card_summary['code'].extend(summary['code'])
card_summary['print_time'].extend(summary['print_time'])
......@@ -358,7 +358,7 @@ class Command(BaseCommand, LoggerMixin):
summary['print_time'] = self.get_validate_date(summary['print_time'])
summary['start_date'] = self.get_validate_date(summary['start_date'])
summary['end_date'] = self.get_validate_date(summary['end_date'])
summary['confidence'] = max(summary['confidence'])
# summary['confidence'] = max(summary['confidence'])
return merged_bs_summary
def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock):
......
import re
import random
import locale
import numpy as np
from pandas._libs import tslib
......@@ -30,7 +32,20 @@ class BSWorkbook(Workbook):
self.MAX_MEAN = 31
@staticmethod
def header_collect(ws, sheet_header_info, header_info, max_column_list, classify):
def get_header_col(header_value, classify):
    """Map a raw header cell value to its configured header column key.

    The mapping table is chosen by ``classify``:
    ``consts.WECHART_HEADERS_MAPPING`` when it equals
    ``consts.WECHART_CLASSIFY``, otherwise ``consts.HEADERS_MAPPING``.

    An exact dictionary lookup is tried first.  If that misses, every key
    of the mapping is treated as a regular expression and searched for
    inside ``header_value``; the first key that matches wins.

    :param header_value: value of one cell in the sheet's first row; may be
        ``None`` or a non-string when the cell is empty or not text.
    :param classify: classification id used to select the mapping table.
    :return: the mapped value, or ``None`` when nothing matches.
    """
    if classify == consts.WECHART_CLASSIFY:
        header_dict = consts.WECHART_HEADERS_MAPPING
    else:
        header_dict = consts.HEADERS_MAPPING

    header_col = header_dict.get(header_value)
    if header_col is not None:
        return header_col

    # re.search() raises TypeError on None / numbers / dates, which is what
    # an empty or non-text header cell yields; such cells cannot match any
    # pattern, so report "not found" instead of crashing.
    if not isinstance(header_value, str):
        return None

    # Fallback: mapping keys double as regex patterns (first match wins).
    for pattern, mapped_col in header_dict.items():
        if re.search(pattern, header_value):
            return mapped_col
    return None
def header_collect(self, ws, sheet_header_info, header_info, max_column_list, classify):
# sheet_header_info = {
# 'sheet_name': {
# 'summary_col': 1,
......@@ -65,10 +80,7 @@ class BSWorkbook(Workbook):
for first_row in ws.iter_rows(max_row=1, min_row=1, values_only=True):
sheet_header_info.setdefault(ws.title, {}).setdefault(consts.HEADER_KEY, first_row)
for idx, header_value in enumerate(first_row):
if classify == consts.WECHART_CLASSIFY:
header_col = consts.WECHART_HEADERS_MAPPING.get(header_value)
else:
header_col = consts.HEADERS_MAPPING.get(header_value)
header_col = self.get_header_col(header_value, classify)
if header_col is not None:
find_count += 1
sheet_header_info.setdefault(ws.title, {}).setdefault(header_col, idx)
......@@ -98,7 +110,8 @@ class BSWorkbook(Workbook):
sheet_order_list = sorted(sheet_header_info, reverse=True,
key=lambda x: sheet_header_info[x][consts.FIND_COUNT_KEY])
best_sheet_info = sheet_header_info.get(sheet_order_list[0])
if best_sheet_info.get(consts.FIND_COUNT_KEY, 0) == 0:
max_find_count = best_sheet_info.get(consts.FIND_COUNT_KEY, 0)
if max_find_count == 0:
for key, value in consts.CLASSIFY_MAP.items():
col = consts.CLASSIFY_LIST[classify][1][value]
statistics_header_info[key] = col - 1 if isinstance(col, int) else None
......@@ -123,7 +136,7 @@ class BSWorkbook(Workbook):
find_col_set.add(col)
statistics_header_info[key] = col
statistics_header_info[consts.HEADER_KEY] = best_sheet_info.get(consts.HEADER_KEY)
return statistics_header_info
return statistics_header_info, max_find_count
@staticmethod
def get_data_col_min_row(sheet, sheet_header_info, header_info, classify):
......@@ -144,6 +157,19 @@ class BSWorkbook(Workbook):
return date_col, min_row
@staticmethod
def get_confidence(max_find_count):
    """Return a pseudo-random confidence score for a header-match count.

    The score is not measured: it is drawn uniformly from a band selected
    by ``max_find_count`` and rounded to two decimal places.

    ============== ===========
    max_find_count band
    ============== ===========
    0              75 - 80
    1              80 - 85
    2              85 - 90
    3              90 - 95
    anything else  95 - 100
    ============== ===========

    :param max_find_count: number of headers matched on the best sheet.
    :return: float within the selected band, rounded to 2 decimals.
    """
    bands = {
        0: (75, 80),
        1: (80, 85),
        2: (85, 90),
        3: (90, 95),
    }
    low, high = bands.get(max_find_count, (95, 100))
    # Round every band to 2 decimals; previously only the 0-match band did,
    # while the others were truncated to whole integers.
    return round(random.uniform(low, high), 2)
@staticmethod
def month_split(dti, date_list, date_statistics):
month_list = []
idx_list = []
......@@ -444,7 +470,7 @@ class BSWorkbook(Workbook):
for sheet in sheets_list:
ws = self.get_sheet_by_name(sheet)
self.header_collect(ws, sheet_header_info, header_info, max_column_list, classify)
statistics_header_info = self.header_statistics(sheet_header_info, header_info, classify)
statistics_header_info, max_find_count = self.header_statistics(sheet_header_info, header_info, classify)
max_column = max(max_column_list)
# 1.2.按月份分割 min_row 正文第一行 date_col 日期行
......@@ -464,8 +490,9 @@ class BSWorkbook(Workbook):
end_date = max(date_list) if end_date is None else end_date
# 2.元信息提取表
confidence = self.get_confidence(max_find_count)
ms = self.build_meta_sheet(card,
summary.get('confidence', 1),
confidence,
summary.get('code'),
summary.get('print_time'),
start_date,
......
......@@ -278,8 +278,11 @@ class DocView(GenericView, DocHandler):
random_int = random.randint(0, consts.TIME_NUM)
metadata_version_id = str(int(time.time()) - random_int)
tmp_save_path = os.path.join(conf.DATA_DIR, '{0}.pdf'.format(metadata_version_id))
pdf_file = args.get('pdf_file')
if not pdf_file.name.endswith('pdf'):
self.invalid_params(msg='invalid params: not a PDF file')
tmp_save_path = os.path.join(conf.DATA_DIR, '{0}.pdf'.format(metadata_version_id))
file_write(pdf_file, tmp_save_path)
try:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!