92b21d6a by 周伟奇

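Fix bug: also search 'CNY' lines for the credit amount, highlight contract numbers that differ from the most common one in the Excel output, and re-enable the e-bank check in PDFHandler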

1 parent 6ba8d65d
......@@ -19,16 +19,16 @@ from common.mixins import LoggerMixin
from common.tools.pdf_to_img import PDFHandler
from apps.doc import consts
from apps.doc.exceptions import OCR1Exception, OCR4Exception
from apps.doc.ocr.wb import BSWorkbook
from apps.doc.ocr.wb import BSWorkbook, PatternFill
class Finder:
"""Summary
Attributes:
ocr_results (TYPE): Description
"""
def __init__(self, ocr_results=None):
self.ocr_results = ocr_results
......@@ -82,15 +82,15 @@ class Finder:
for key in ocr_results[pno]:
bbox, text = ocr_results[pno][key]
ocr_texts += text
pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符
pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符
ocr_texts = pattern.sub('', ocr_texts)
score = fuzz.ratio(page_template, ocr_texts) / 100.
score = fuzz.ratio(page_template, ocr_texts)/100.
classes.append([pno, score])
pred = sorted(classes, key=lambda x: x[1], reverse=True)[0]
return pred
def get_top_key(self, ocr_results, key_string): # 加入过滤词典
def get_top_key(self, ocr_results, key_string): # 加入过滤词典
"""找到与 key_string 最匹配的字段的 key
"""
if len(ocr_results) == 0:
......@@ -111,7 +111,7 @@ class Finder:
continue
inter = Polygon(g).intersection(Polygon(p)).area
union = g.area + p.area - inter
iou = inter / union
iou = inter/union
iou_list.append([iou, key])
if len(iou_list) == 0:
return -1, -1
......@@ -128,8 +128,8 @@ class Finder:
bbox, text = ocr_results[key]
# 定制化规则, 比如过滤一些词呀什么的
# 该例中, 我们要去掉非中文字符
pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符
text = pattern.sub('', text)
pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符
text = pattern.sub('', text)
tmp_ocr_results[key] = [bbox, text]
# 先根据 key_string 找到 key 的位置所在, 再判断该位置是否包含 value
......@@ -141,8 +141,8 @@ class Finder:
if len(words) == 0:
# 将 bbox 右移一个单位
x0, y0, x1, y1, x2, y2, x3, y3 = bbox
rw = abs(x0 - x1)
anchor = [x0 + rw, y0, x1 + rw, y1, x2 + rw, y2, x3 + rw, y3]
rw = abs(x0-x1)
anchor = [x0+rw, y0, x1+rw, y1, x2+rw, y2, x3+rw, y3]
iou, key = self.get_top_iou(ocr_results, anchor)
if ratio > 0.3:
bbox, text = ocr_results[key]
......@@ -223,7 +223,7 @@ class Finder:
bbox, text = self.ocr_results[pno][key]
all_texts += text
searchObj = re.search(r'保证人\[(.*?)\]与甲方', all_texts)
searchObj = re.search( r'保证人\[(.*?)\]与甲方', all_texts)
if searchObj:
words = f'[{searchObj.group(1)}]'
words = words.replace('【', '[').replace('】', ']').replace(',', ',').replace('(', '(').replace(')', ')')
......@@ -256,7 +256,9 @@ class Finder:
if score > 0.5:
if len(self.ocr_results[pno]) > 0:
# 根据关键词,找这一行字符
lines = self.get_line(self.ocr_results[pno], 'RMB')
lines = ''
for i in ['RMB', 'CNY']:
lines += self.get_line(self.ocr_results[pno], i)
# searchObj = re.search( r'RMB(.*?)in', lines)
searchObj = re.search(r'[0-9,.]+', lines)
if searchObj:
......@@ -264,10 +266,10 @@ class Finder:
amount_eng = words
lines = self.get_line(self.ocr_results[pno], '人民币')
searchObj = re.search(r'大写(.*?)综合', lines)
searchObj = re.search( r'大写(.*?)综合', lines)
if searchObj:
words = searchObj.group(1)
pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符
pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符
words = pattern.sub('', words)
words = words.replace("仔", "仟").replace("任", "仟")
words = words.replace("值", "佰")
......@@ -276,15 +278,15 @@ class Finder:
words = words.replace("政", "玖")
words = words.replace("垒", "叁")
amount_chn = words
lines = self.get_line(self.ocr_results[pno], 'ending')
if len(lines) > 0:
start, end = lines.split('ending')
searchStart = re.search(r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', start)
searchStart = re.search( r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', start)
if searchStart:
words = searchStart.group()
term_start_eng = words
searchEnd = re.search(r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', end)
searchEnd = re.search( r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', end)
if searchEnd:
words = searchEnd.group()
term_end_eng = words
......@@ -292,29 +294,29 @@ class Finder:
lines = self.get_line(self.ocr_results[pno], '至')
if len(lines) > 0:
start, end = lines.split('至')
searchStart = re.search(r'[0-9]{4}-[0-9]+-[0-9]+', start)
searchStart = re.search( r'[0-9]{4}-[0-9]+-[0-9]+', start)
if searchStart:
words = searchStart.group()
term_start_chn = words
searchEnd = re.search(r'[0-9]{4}-[0-9]+-[0-9]+', end)
searchEnd = re.search( r'[0-9]{4}-[0-9]+-[0-9]+', end)
if searchEnd:
words = searchEnd.group()
term_end_chn = words
lines = self.get_line(self.ocr_results[pno], 'above')
searchObj = re.search(r'aboveto([0-9]+)', lines.replace('O', '0'))
searchObj = re.search( r'aboveto([0-9]+)', lines.replace('O', '0').replace('too', 'to0'))
if searchObj:
words = searchObj.group(1)
deposit_eng = f'{words}%'
lines = self.get_line(self.ocr_results[pno], '授信额度的')
searchObj = re.search(r'授信额度的([0-9]+)', lines.replace('O', '0'))
searchObj = re.search( r'授信额度的([0-9]+)', lines.replace('O', '0'))
if searchObj:
words = searchObj.group(1)
deposit_chn = f'{words}%'
return amount_eng, amount_chn, term_start_eng, term_end_eng, \
term_start_chn, term_end_chn, deposit_eng, deposit_chn
term_start_chn, term_end_chn, deposit_eng, deposit_chn
def get_other_arrangements_and_conditions(self):
"""获取其它约定与条件文本段落
......@@ -330,7 +332,7 @@ class Finder:
searchObj = re.search(r'Conditions:(.*?)其他约定与条件', all_texts, re.I)
if searchObj:
words = searchObj.group(1)
pattern = re.compile("[\u4e00-\u9fa5]") # 去除中文字符
pattern = re.compile("[\u4e00-\u9fa5]") # 去除中文字符
words = pattern.sub('', words)
other_arrangements_and_conditions_eng = words
......@@ -356,7 +358,7 @@ class Finder:
self.init_result["保证人"] = guarantor
amount_eng, amount_chn, term_start_eng, term_end_eng, \
term_start_chn, term_end_chn, deposit_eng, deposit_chn = self.get_info_in_page_39()
term_start_chn, term_end_chn, deposit_eng, deposit_chn = self.get_info_in_page_39()
self.init_result["综合授信额度金额英文"] = amount_eng
self.init_result["综合授信额度金额中文"] = amount_chn
self.init_result["综合授信额度期限开始日期英文"] = term_start_eng
......@@ -371,7 +373,6 @@ class Finder:
self.init_result["其他约定与条件中文"] = words_chn
return self.init_result
class TIFFHandler:
def __init__(self, path, img_save_path):
......@@ -409,6 +410,7 @@ class Command(BaseCommand, LoggerMixin):
self.input_dir = conf.WSC_DIR
# ocr相关
self.go_ocr_url = conf.WSC_GO_URL
self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
# 优雅退出信号:15
signal.signal(signal.SIGTERM, self.signal_handler)
......@@ -435,6 +437,19 @@ class Command(BaseCommand, LoggerMixin):
excel_path = os.path.join(wb_output_dir, excel_name)
return img_save_path, excel_path, pdf_save_path
@staticmethod
def get_mode_code(code_list):
result_dict = {}
for code in code_list:
if code in result_dict:
result_dict[code] += 1
else:
result_dict[code] = 1
if len(result_dict) == 1:
return None
else:
return sorted(result_dict.items(), key=lambda x:x[1], reverse=True)[0][0]
def res_process(self, all_res, excel_path):
try:
self.finder.ocr_results = all_res
......@@ -442,11 +457,26 @@ class Command(BaseCommand, LoggerMixin):
wb = BSWorkbook(set(), set(), set(), set(), set())
ws = wb.create_sheet(self.sheet_name)
row_idx = 1
code_idx = 1
mode_code = None
for write_field, field_value in results.items():
row_idx += 1
if isinstance(field_value, list):
if write_field == '合同编号列表':
code_idx = row_idx
mode_code = self.get_mode_code(field_value)
ws.append((write_field, *field_value))
else:
ws.append((write_field, field_value))
if isinstance(mode_code, str):
for cell in ws[code_idx]:
if cell.value == '合同编号列表':
continue
if cell.value != mode_code:
cell.fill = self.amount_fill
wb.remove_base_sheet()
wb.save(excel_path)
except Exception as e:
......
......@@ -257,19 +257,19 @@ class PDFHandler:
self.page_to_png(page)
def check_ebank(self, pdf):
page_text_list = []
# page_text_list = []
text_item_sum = 0
for pno in range(pdf.pageCount):
page = pdf.loadPage(pno)
if page.rotation is None:
rotation = 0
elif isinstance(page.rotation, int):
divisor, remainder = divmod(page.rotation, 90)
if remainder != 0:
return
rotation = divmod(divisor, 4)[1]
else:
return
# if page.rotation is None:
# rotation = 0
# elif isinstance(page.rotation, int):
# divisor, remainder = divmod(page.rotation, 90)
# if remainder != 0:
# return
# rotation = divmod(divisor, 4)[1]
# else:
# return
textpage = page.getTextPage()
text = textpage.extractDICT()
text_list = []
......@@ -284,17 +284,17 @@ class PDFHandler:
text_item_sum += len(text_list)
if text_item_sum < (pno + 1) * 5:
return
else:
page_text_list.append(
{
'width': text.get('width'),
'height': text.get('height'),
'rotation': rotation,
'text': text_list
}
)
# else:
# page_text_list.append(
# {
# 'width': text.get('width'),
# 'height': text.get('height'),
# 'rotation': rotation,
# 'text': text_list
# }
# )
self.is_ebank = True
self.page_text_list = page_text_list
# self.page_text_list = page_text_list
def extract_image(self, max_img_count=None):
self.img_path_list = []
......@@ -310,7 +310,7 @@ class PDFHandler:
if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
self.img_count = pdf.pageCount
return
# self.check_ebank(pdf)
self.check_ebank(pdf)
for pno in range(pdf.pageCount):
il = pdf.getPageImageList(pno) # 获取页面图片对象
# (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!