7dfc2ee8 by 周伟奇

merge license

2 parents 1242adb8 e570371a
......@@ -33,6 +33,5 @@ data/*
# 脚本
src/*.sh
test.py
ocr_test.py
ocr_test_2.py
\ No newline at end of file
test*
ocr_test.py
\ No newline at end of file
......
class EDMSException(Exception):
    """Project-specific exception type; adds nothing on top of ``Exception``."""
......@@ -141,32 +141,22 @@ class BSWorkbook(Workbook):
# month_info process
month_info = month_mapping.setdefault('xxxx-xx', [])
month_info.append((ws.title, min_row, ws.max_row, 0))
elif len(month_list) == 1:
# reverse_trend_list process
reverse_trend = self.get_reverse_trend(dti.day, idx_list)
reverse_trend_list.append(reverse_trend)
# month_info process
month_info = month_mapping.setdefault(month_list[0], [])
day_mean = np.mean(dti.day.dropna())
if len(month_info) == 0:
month_info.append((ws.title, min_row, ws.max_row, day_mean))
else:
for i, item in enumerate(month_info):
if day_mean <= item[-1]:
month_info.insert(i, (ws.title, min_row, ws.max_row, day_mean))
break
else:
month_info.append((ws.title, min_row, ws.max_row, day_mean))
else:
# reverse_trend_list process
reverse_trend = self.get_reverse_trend(dti.day, idx_list)
reverse_trend_list.append(reverse_trend)
# month_info process
for i, item in enumerate(month_list[:-1]):
month_mapping.setdefault(item, []).append(
(ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, self.MAX_MEAN))
month_mapping.setdefault(month_list[-1], []).insert(
0, (ws.title, idx_list[-1] + min_row, ws.max_row, 0))
day_idx = dti.day
idx_list_max_idx = len(idx_list) - 1
for i, item in enumerate(month_list):
if i == idx_list_max_idx:
day_mean = np.mean(day_idx[idx_list[i]:].dropna())
month_mapping.setdefault(item, []).append(
(ws.title, idx_list[i] + min_row, ws.max_row, day_mean))
else:
day_mean = np.mean(day_idx[idx_list[i]: idx_list[i + 1]].dropna())
month_mapping.setdefault(item, []).append(
(ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, day_mean))
def build_metadata_rows(self, confidence, code, print_time, start_date, end_date):
if start_date is None or end_date is None:
......@@ -191,9 +181,9 @@ class BSWorkbook(Workbook):
def create_meta_sheet(self, card):
    """Return the worksheet that holds the meta information for ``card``.

    The sheet title is ``'<meta_sheet_title>(<last six characters of card>)'``.
    When the first worksheet is still titled ``'Sheet'`` it is renamed and
    reused; otherwise a new sheet is created via ``self.create_sheet``.

    :param card: card identifier; only its last six characters go into the title.
    :return: the renamed or newly created worksheet.
    """
    # Build the title once so both branches agree on it. Only the tail of the
    # card is used (NOTE(review): presumably to keep the title short - confirm).
    title = '{0}({1})'.format(self.meta_sheet_title, card[-6:])
    first_sheet = self.worksheets[0]
    if first_sheet.title == 'Sheet':
        # Reuse the existing sheet instead of adding another one.
        first_sheet.title = title
        return first_sheet
    # Exactly one sheet is created; the superseded full-card variant is gone.
    return self.create_sheet(title)
def build_meta_sheet(self, card, confidence, code, print_time, start_date, end_date):
......@@ -203,6 +193,26 @@ class BSWorkbook(Workbook):
ms.append(row)
return ms
@staticmethod
def amount_format(amount_str):
    """Normalise an amount string before it is parsed as a number.

    Non-string values and the empty string are returned unchanged. Otherwise:

    1. characters are substituted through ``consts.TRANS``;
    2. every ``-`` after the first character is removed;
    3. a first character listed in ``consts.ERROR_CHARS`` becomes ``-``;
    4. the separator three characters from the end is repaired:
       ``",."`` collapses to ``"."`` and a lone ``","`` becomes ``"."``.

    :param amount_str: raw cell value.
    :return: the cleaned string, or ``amount_str`` itself when it is not a
        non-empty string.
    """
    if not (isinstance(amount_str, str) and amount_str != ''):
        return amount_str

    # 1. character substitution
    cleaned = amount_str.translate(consts.TRANS)

    # 2. only the leading position may keep a minus sign
    head, tail = cleaned[0], cleaned[1:].replace('-', '')

    # 3. a known bad leading character stands for a minus sign
    if head in consts.ERROR_CHARS:
        head = '-'
    cleaned = head + tail

    # 4. comma / period repair at the position three from the end
    if len(cleaned) < 4:
        return cleaned
    sep_at = len(cleaned) - 3
    sep = cleaned[sep_at]
    if sep == '.' and cleaned[sep_at - 1] == ',':
        # ",." -> "." : drop the comma directly before the period
        return cleaned[:sep_at - 1] + cleaned[sep_at:]
    if sep == ',':
        # a comma in the decimal position is treated as a period
        return cleaned[:sep_at] + '.' + cleaned[sep_at + 1:]
    return cleaned
def build_month_sheet(self, card, month_mapping, ms, is_reverse):
tmp_ws = self.create_sheet('tmp_ws')
for month in sorted(month_mapping.keys()):
......@@ -235,29 +245,25 @@ class BSWorkbook(Workbook):
# 3.3.余额转数值
over_cell = rows[consts.OVER_IDX]
try:
if isinstance(over_cell.value, str):
over_cell.value = over_cell.value.translate(consts.TRANS)
over_cell.value = locale.atof(over_cell.value)
over_cell.value = locale.atof(self.amount_format(over_cell.value))
except Exception as e:
continue
else:
over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1
# 3.4.额转数值
# 3.4.额转数值
try:
try:
if isinstance(amount_cell.value, str): # TODO 可在转化数字失败后,再替换
amount_cell.value = amount_cell.value.translate(consts.TRANS)
amount_cell.value = locale.atof(amount_cell.value)
amount_cell.value = locale.atof(self.amount_format(amount_cell.value))
except Exception as e:
try:
if isinstance(rows[consts.INCOME_IDX].value, str):
rows[consts.OUTLAY_IDX].value = rows[consts.INCOME_IDX].value.translate(consts.TRANS)
amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value)
amount_cell.value = locale.atof(self.amount_format(rows[consts.INCOME_IDX].value))
if amount_cell.value == 0:
raise
elif amount_cell.value < 0:
amount_cell.value = -amount_cell.value
except Exception as e:
if isinstance(rows[consts.OUTLAY_IDX].value, str):
rows[consts.OUTLAY_IDX].value = rows[consts.OUTLAY_IDX].value.translate(consts.TRANS)
amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value)
amount_cell.value = locale.atof(self.amount_format(rows[consts.OUTLAY_IDX].value))
if amount_cell.value > 0:
amount_cell.value = -amount_cell.value
except Exception as e:
......@@ -313,18 +319,18 @@ class BSWorkbook(Workbook):
# }
for card, summary in bs_summary.items():
# 1.原表修剪、排列、按照月份分割
start_date = summary['start_date']
end_date = summary['end_date']
start_date = summary.get('start_date')
end_date = summary.get('end_date')
date_statistics = False
if start_date is None or end_date is None:
date_statistics = True
date_list = []
month_mapping = {}
reverse_trend_list = []
for sheet in summary['sheet']:
for sheet in summary.get('sheet', []):
ws = self.get_sheet_by_name(sheet)
# 1.1.删除多余列、排列
min_row = self.sheet_prune(ws, summary['classify'])
min_row = self.sheet_prune(ws, summary.get('classify', 0))
# 1.2.按月份分割
self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics)
......@@ -334,32 +340,43 @@ class BSWorkbook(Workbook):
# 2.元信息提取表
ms = self.build_meta_sheet(card,
summary['confidence'],
summary['code'],
summary['print_time'],
summary.get('confidence', 1),
summary.get('code'),
summary.get('print_time'),
start_date,
end_date)
# 3.创建月份表、提取/高亮关键行
is_reverse = False
if sum(reverse_trend_list) > 0: # 倒序处理
is_reverse = True
for month_list in month_mapping.values():
month_list.sort(key=lambda x: x[-1], reverse=True)
# 倒序处理
is_reverse = True if sum(reverse_trend_list) > 0 else False
for month_list in month_mapping.values():
month_list.sort(key=lambda x: x[-1], reverse=is_reverse)
self.build_month_sheet(card, month_mapping, ms, is_reverse)
# 4.删除原表
for sheet in summary['sheet']:
for sheet in summary.get('sheet'):
self.remove(self.get_sheet_by_name(sheet))
def license_rebuild(self, license_summary):
    """Write the entries of ``license_summary`` into dedicated worksheets.

    ``consts.LICENSE_ORDER`` is iterated as ``(classify, (_, name))`` pairs.
    For each ``classify`` key present in ``license_summary`` a sheet titled
    ``name`` is created; keys that are absent produce no sheet at all.

    :param license_summary: mapping from classify key to an iterable of
        records, each record being an iterable of rows to append.
    """
    for classify, (_, name) in consts.LICENSE_ORDER:
        res = license_summary.get(classify)
        if res is None:
            # Nothing for this key: skip it rather than add an empty sheet.
            continue
        ws = self.create_sheet(name)
        for bl in res:
            for bl_field in bl:
                ws.append(bl_field)
            # Blank row between records.
            # NOTE(review): nesting level inferred from flattened source - confirm.
            ws.append((None, ))
def rebuild(self, bs_summary, license_summary):
def skip_img_sheet(self, skip_img):
    """Add a sheet listing the entries of ``skip_img``, if there are any.

    Does nothing when ``skip_img`` is empty or otherwise falsy. Otherwise a
    sheet named ``consts.SKIP_IMG_SHEET_NAME`` is created, starting with the
    ``consts.SKIP_IMG_SHEET_HEADER`` row and followed by one row per entry.

    :param skip_img: iterable of row tuples.
    """
    if not skip_img:
        return
    sheet = self.create_sheet(consts.SKIP_IMG_SHEET_NAME)
    sheet.append(consts.SKIP_IMG_SHEET_HEADER)
    for entry in skip_img:
        sheet.append(entry)
def rebuild(self, bs_summary, license_summary, skip_img):
    """Run the three rebuild steps in a fixed order.

    :param bs_summary: passed to ``bs_rebuild``.
    :param license_summary: passed to ``license_rebuild``.
    :param skip_img: passed to ``skip_img_sheet``.
    """
    self.bs_rebuild(bs_summary)
    self.license_rebuild(license_summary)
    self.skip_img_sheet(skip_img)
......
......@@ -25,7 +25,7 @@ class PDFHandler:
def __init__(self, path, img_dir_path):
    """Store the source path and image directory, and reset collected state.

    :param path: stored as ``self.path``.
    :param img_dir_path: stored as ``self.img_dir_path``.
    """
    self.path = path
    self.img_dir_path = img_dir_path
    # NOTE(review): both lists are initialised here; this looks like the
    # before/after of a rename (info tuples -> plain paths) - confirm which
    # one callers actually read.
    self.img_info_list = []
    self.img_path_list = []
    # Set of xref values that have already been written out.
    self.xref_set = set()
def get_img_save_path(self, pno, img_index=0, ext='png'):
......@@ -38,7 +38,7 @@ class PDFHandler:
pm = page.getPixmap(matrix=trans_2, alpha=False)
img_save_path = self.get_img_save_path(page.number)
pm.writePNG(img_save_path)
self.img_info_list.append((img_save_path, page.number, 0))
self.img_path_list.append(img_save_path)
@staticmethod
def getimage(pix):
......@@ -88,7 +88,7 @@ class PDFHandler:
with open(img_save_path, "wb") as f:
f.write(img_data)
self.xref_set.add(xref)
self.img_info_list.append((img_save_path, pno, img_index))
self.img_path_list.append(img_save_path)
@staticmethod
def split_il(il):
......@@ -179,7 +179,7 @@ class PDFHandler:
img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2])
new_img.save(img_save_path)
page_to_png = False
self.img_info_list.append((img_save_path, pno, img_index))
self.img_path_list.append(img_save_path)
# 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片
if page_to_png:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!