KWOM_July
Showing
3 changed files
with
16 additions
and
1 deletion
| ... | @@ -25,6 +25,15 @@ def extract_info(ocr_results): | ... | @@ -25,6 +25,15 @@ def extract_info(ocr_results): |
| 25 | 25 | ||
| 26 | 26 | ||
| 27 | def predict(pdf_info, is_qrs=False, is_fsm=False): | 27 | def predict(pdf_info, is_qrs=False, is_fsm=False): |
| 28 | pop_seceond_page_info = {} | ||
| 29 | if not is_fsm and not is_qrs and len(pdf_info) == 9: | ||
| 30 | pop_seceond_page_info = pdf_info.pop('1', {}) | ||
| 31 | for pno in range(8): | ||
| 32 | if pno == 0: | ||
| 33 | pdf_info[str(pno)]['blocks'].extend(pop_seceond_page_info.get('blocks', [])) | ||
| 34 | else: | ||
| 35 | pdf_info[str(pno)] = pdf_info.pop(str(pno+1)) | ||
| 36 | |||
| 28 | ocr_results = {} | 37 | ocr_results = {} |
| 29 | for pno in pdf_info: | 38 | for pno in pdf_info: |
| 30 | ocr_results[pno] = {} | 39 | ocr_results[pno] = {} | ... | ... |
| ... | @@ -13,6 +13,7 @@ class Finder: | ... | @@ -13,6 +13,7 @@ class Finder: |
| 13 | self.item = {"words": None, | 13 | self.item = {"words": None, |
| 14 | "position": None, | 14 | "position": None, |
| 15 | } | 15 | } |
| 16 | self.cn_re = re.compile(u'[\u4e00-\u9fa5]') | ||
| 16 | 17 | ||
| 17 | def gen_init_result(self, is_asp): | 18 | def gen_init_result(self, is_asp): |
| 18 | # 格式化算法输出 | 19 | # 格式化算法输出 |
| ... | @@ -187,6 +188,11 @@ class Finder: | ... | @@ -187,6 +188,11 @@ class Finder: |
| 187 | vin['position'] = location | 188 | vin['position'] = location |
| 188 | return vin | 189 | return vin |
| 189 | 190 | ||
| 191 | def cn_char_filter(self, src_str): | ||
| 192 | cn_chars = re.findall(self.cn_re, src_str) | ||
| 193 | cn_str = ''.join(cn_chars) | ||
| 194 | return cn_str | ||
| 195 | |||
| 190 | def get_loan_principal(self, page_num='0'): | 196 | def get_loan_principal(self, page_num='0'): |
| 191 | chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', | 197 | chinese_keywords = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', |
| 192 | '佰', '仟', '万', '亿', '元', '角', '分', '零', '整'] | 198 | '佰', '仟', '万', '亿', '元', '角', '分', '零', '整'] |
| ... | @@ -201,7 +207,7 @@ class Finder: | ... | @@ -201,7 +207,7 @@ class Finder: |
| 201 | for line in block['lines']: | 207 | for line in block['lines']: |
| 202 | for span in line['spans']: | 208 | for span in line['spans']: |
| 203 | bbox, text = span['bbox'], span['text'] | 209 | bbox, text = span['bbox'], span['text'] |
| 204 | if fuzz.ratio(''.join(chinese_keywords), text) > 15: | 210 | if fuzz.ratio(''.join(chinese_keywords), self.cn_char_filter(text)) >= 10: |
| 205 | text = text.split(':')[-1].strip() | 211 | text = text.split(':')[-1].strip() |
| 206 | upper['position'] = bbox | 212 | upper['position'] = bbox |
| 207 | upper['words'] = text | 213 | upper['words'] = text | ... | ... |
This diff is collapsed.
Click to expand it.
-
Please register or sign in to post a comment