Merge remote-tracking branch 'origin/feature/KWOM_July' into feature/uat-tmp
Showing 3 changed files with 16 additions and 1 deletion
... | @@ -25,6 +25,15 @@ def extract_info(ocr_results): | ... | @@ -25,6 +25,15 @@ def extract_info(ocr_results): |
25 | 25 | ||
26 | 26 | ||
27 | def predict(pdf_info, is_qrs=False, is_fsm=False): | 27 | def predict(pdf_info, is_qrs=False, is_fsm=False): |
28 | pop_seceond_page_info = {} | ||
29 | if not is_fsm and not is_qrs and len(pdf_info) == 9: | ||
30 | pop_seceond_page_info = pdf_info.pop('1', {}) | ||
31 | for pno in range(8): | ||
32 | if pno == 0: | ||
33 | pdf_info[str(pno)]['blocks'].extend(pop_seceond_page_info.get('blocks', [])) | ||
34 | else: | ||
35 | pdf_info[str(pno)] = pdf_info.pop(str(pno+1)) | ||
36 | |||
28 | ocr_results = {} | 37 | ocr_results = {} |
29 | for pno in pdf_info: | 38 | for pno in pdf_info: |
30 | ocr_results[pno] = {} | 39 | ocr_results[pno] = {} | ... | ... |
... | @@ -13,6 +13,7 @@ class Finder: | ... | @@ -13,6 +13,7 @@ class Finder: |
13 | self.item = {"words": None, | 13 | self.item = {"words": None, |
14 | "position": None, | 14 | "position": None, |
15 | } | 15 | } |
16 | self.cn_re = re.compile(u'[\u4e00-\u9fa5]') | ||
16 | 17 | ||
17 | def gen_init_result(self, is_asp): | 18 | def gen_init_result(self, is_asp): |
18 | # 格式化算法输出 | 19 | # 格式化算法输出 |
... | @@ -187,6 +188,11 @@ class Finder: | ... | @@ -187,6 +188,11 @@ class Finder: |
187 | vin['position'] = location | 188 | vin['position'] = location |
188 | return vin | 189 | return vin |
189 | 190 | ||
191 | def cn_char_filter(self, src_str): | ||
192 | cn_chars = re.findall(self.cn_re, src_str) | ||
193 | cn_str = ''.join(cn_chars) | ||
194 | return cn_str | ||
195 | |||
190 | def get_loan_principal(self, page_num='0'): | 196 | def get_loan_principal(self, page_num='0'): |
191 | chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', | 197 | chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', |
192 | '佰', '仟', '万', '亿', '元', '角', '分', '零', '整'] | 198 | '佰', '仟', '万', '亿', '元', '角', '分', '零', '整'] |
... | @@ -201,7 +207,7 @@ class Finder: | ... | @@ -201,7 +207,7 @@ class Finder: |
201 | for line in block['lines']: | 207 | for line in block['lines']: |
202 | for span in line['spans']: | 208 | for span in line['spans']: |
203 | bbox, text = span['bbox'], span['text'] | 209 | bbox, text = span['bbox'], span['text'] |
204 | if fuzz.ratio(''.join(chinese_keywords), text) > 15: | 210 | if fuzz.ratio(''.join(chinese_keywords), self.cn_char_filter(text)) >= 10: |
205 | text = text.split(':')[-1].strip() | 211 | text = text.split(':')[-1].strip() |
206 | upper['position'] = bbox | 212 | upper['position'] = bbox |
207 | upper['words'] = text | 213 | upper['words'] = text | ... | ... |
This diff is collapsed.
Click to expand it.
-
Please register or sign in to post a comment