643c2e5d by 冯轩

Merge remote-tracking branch 'origin/feature/KWOM_July' into feature/uat-tmp

2 parents 48a9134c 93c7cc0a
...@@ -25,6 +25,15 @@ def extract_info(ocr_results): ...@@ -25,6 +25,15 @@ def extract_info(ocr_results):
25 25
26 26
27 def predict(pdf_info, is_qrs=False, is_fsm=False): 27 def predict(pdf_info, is_qrs=False, is_fsm=False):
28 pop_seceond_page_info = {}
29 if not is_fsm and not is_qrs and len(pdf_info) == 9:
30 pop_seceond_page_info = pdf_info.pop('1', {})
31 for pno in range(8):
32 if pno == 0:
33 pdf_info[str(pno)]['blocks'].extend(pop_seceond_page_info.get('blocks', []))
34 else:
35 pdf_info[str(pno)] = pdf_info.pop(str(pno+1))
36
28 ocr_results = {} 37 ocr_results = {}
29 for pno in pdf_info: 38 for pno in pdf_info:
30 ocr_results[pno] = {} 39 ocr_results[pno] = {}
......
...@@ -13,6 +13,7 @@ class Finder: ...@@ -13,6 +13,7 @@ class Finder:
13 self.item = {"words": None, 13 self.item = {"words": None,
14 "position": None, 14 "position": None,
15 } 15 }
16 self.cn_re = re.compile(u'[\u4e00-\u9fa5]')
16 17
17 def gen_init_result(self, is_asp): 18 def gen_init_result(self, is_asp):
18 # 格式化算法输出 19 # 格式化算法输出
...@@ -187,6 +188,11 @@ class Finder: ...@@ -187,6 +188,11 @@ class Finder:
187 vin['position'] = location 188 vin['position'] = location
188 return vin 189 return vin
189 190
# Return a string made of only those characters of src_str that match self.cn_re,
# kept in their original order. In this diff self.cn_re is compiled from
# u'[\u4e00-\u9fa5]', so everything outside that character range (digits, Latin
# letters, punctuation, whitespace) is dropped; an input with no match yields ''.
# NOTE(review): src_str is presumably always a str (OCR span text) - confirm at callers.
191 def cn_char_filter(self, src_str):
# findall returns every non-overlapping match, scanning left to right.
192 cn_chars = re.findall(self.cn_re, src_str)
# Each match is a single character, so joining with '' rebuilds a compact string.
193 cn_str = ''.join(cn_chars)
194 return cn_str
195
190 def get_loan_principal(self, page_num='0'): 196 def get_loan_principal(self, page_num='0'):
191 chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', 197 chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
192 '佰', '仟', '万', '亿', '元', '角', '分', '零', '整'] 198 '佰', '仟', '万', '亿', '元', '角', '分', '零', '整']
...@@ -201,7 +207,7 @@ class Finder: ...@@ -201,7 +207,7 @@ class Finder:
201 for line in block['lines']: 207 for line in block['lines']:
202 for span in line['spans']: 208 for span in line['spans']:
203 bbox, text = span['bbox'], span['text'] 209 bbox, text = span['bbox'], span['text']
204 if fuzz.ratio(''.join(chinese_keywords), text) > 15: 210 if fuzz.ratio(''.join(chinese_keywords), self.cn_char_filter(text)) >= 10:
205 text = text.split(':')[-1].strip() 211 text = text.split(':')[-1].strip()
206 upper['position'] = bbox 212 upper['position'] = bbox
207 upper['words'] = text 213 upper['words'] = text
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!