93c7cc0a by 周伟奇

KWOM_July

1 parent b10ff66a
......@@ -25,6 +25,15 @@ def extract_info(ocr_results):
def predict(pdf_info, is_qrs=False, is_fsm=False):
pop_seceond_page_info = {}
if not is_fsm and not is_qrs and len(pdf_info) == 9:
pop_seceond_page_info = pdf_info.pop('1', {})
for pno in range(8):
if pno == 0:
pdf_info[str(pno)]['blocks'].extend(pop_seceond_page_info.get('blocks', []))
else:
pdf_info[str(pno)] = pdf_info.pop(str(pno+1))
ocr_results = {}
for pno in pdf_info:
ocr_results[pno] = {}
......
......@@ -13,6 +13,7 @@ class Finder:
self.item = {"words": None,
"position": None,
}
self.cn_re = re.compile(u'[\u4e00-\u9fa5]')
def gen_init_result(self, is_asp):
# 格式化算法输出
......@@ -187,6 +188,11 @@ class Finder:
vin['position'] = location
return vin
def cn_char_filter(self, src_str):
    """Return only the characters of ``src_str`` matched by ``self.cn_re``.

    Every non-overlapping match of the compiled pattern held in
    ``self.cn_re`` (set elsewhere on the instance; in this file it is the
    single-character class U+4E00..U+9FA5) is collected in order of
    appearance and concatenated. Anything the pattern does not match is
    dropped. An input with no matches yields an empty string.
    """
    # findall on a one-character class yields a list of single characters;
    # joining them rebuilds the filtered text in its original order.
    return ''.join(self.cn_re.findall(src_str))
def get_loan_principal(self, page_num='0'):
chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
'佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
......@@ -201,7 +207,7 @@ class Finder:
for line in block['lines']:
for span in line['spans']:
bbox, text = span['bbox'], span['text']
if fuzz.ratio(''.join(chinese_keywords), text) > 15:
if fuzz.ratio(''.join(chinese_keywords), self.cn_char_filter(text)) >= 10:
text = text.split(':')[-1].strip()
upper['position'] = bbox
upper['words'] = text
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!