fix registeredCapital
Showing 2 changed files with 121 additions and 22 deletions
| 1 | import re | 1 | import re |
| 2 | from datetime import datetime | 2 | from datetime import datetime |
| 3 | from .rmb_upper import to_rmb_upper | 3 | from .rmb_lower import rmb_handler |
| 4 | 4 | ||
| 5 | 5 | ||
| 6 | class Comparison: | 6 | class Comparison: |
| ... | @@ -36,26 +36,26 @@ class Comparison: | ... | @@ -36,26 +36,26 @@ class Comparison: |
| 36 | return self.RESULT_N | 36 | return self.RESULT_N |
| 37 | 37 | ||
def common_compare(self, input_str, ocr_str, idx, **kwargs):
    """Check two plain strings for exact equality.

    Args:
        input_str: Value supplied by the user.
        ocr_str: Value recognised by OCR.
        idx: Unused here; kept so every comparer shares one signature.
        **kwargs: Unused here.

    Returns:
        A ``(result, ocr_value)`` tuple. ``result`` is ``self.RESULT_NA``
        when either operand is not a ``str`` or the OCR text is blank;
        otherwise it is whatever ``self.build_res`` returns for the
        equality test.
    """
    both_are_text = isinstance(ocr_str, str) and isinstance(input_str, str)
    if not both_are_text:
        # Nothing comparable: hand the OCR value back untouched.
        return self.RESULT_NA, ocr_str
    if not ocr_str.strip():
        # Empty or whitespace-only OCR text carries no value to report.
        return self.RESULT_NA, None
    is_same = input_str == ocr_str
    return self.build_res(is_same), ocr_str
| 44 | 44 | ||
def company_compare(self, input_str, ocr_str, idx, **kwargs):
    """Compare two company names after removing ``self.re_obj`` matches.

    Args:
        input_str: Company name supplied by the user.
        ocr_str: Company name recognised by OCR.
        idx: Unused here; kept so every comparer shares one signature.
        **kwargs: Unused here.

    Returns:
        A ``(result, ocr_value)`` tuple. ``result`` is ``self.RESULT_NA``
        when either operand is not a ``str`` or the OCR text is blank;
        otherwise it is ``self.build_res`` applied to the equality of the
        two normalised names. The OCR text is returned as recognised, not
        in its normalised form.
    """
    if not (isinstance(ocr_str, str) and isinstance(input_str, str)):
        return self.RESULT_NA, ocr_str
    if not ocr_str.strip():
        return self.RESULT_NA, None
    # Drop everything self.re_obj matches, then trim surrounding whitespace.
    cleaned_input, cleaned_ocr = (
        re.sub(self.re_obj, '', text).strip() for text in (input_str, ocr_str)
    )
    return self.build_res(cleaned_input == cleaned_ocr), ocr_str
| 53 | 53 | ||
| 54 | def name_compare(self, input_str, ocr_str, idx, **kwargs): | 54 | def name_compare(self, input_str, ocr_str, idx, **kwargs): |
| 55 | if ocr_str == '': | 55 | if not isinstance(ocr_str, str) or not isinstance(input_str, str): |
| 56 | return self.RESULT_NA, None | ||
| 57 | if not isinstance(input_str, str) or not isinstance(ocr_str, str): | ||
| 58 | return self.RESULT_NA, ocr_str | 56 | return self.RESULT_NA, ocr_str |
| 57 | if ocr_str == '' or ocr_str.strip() == '': | ||
| 58 | return self.RESULT_NA, None | ||
| 59 | if kwargs.get('is_passport'): | 59 | if kwargs.get('is_passport'): |
| 60 | input_tmp = input_str.upper().replace(' ', '') | 60 | input_tmp = input_str.upper().replace(' ', '') |
| 61 | ocr_tmp = ocr_str.upper().replace(' ', '') | 61 | ocr_tmp = ocr_str.upper().replace(' ', '') |
| ... | @@ -71,11 +71,12 @@ class Comparison: | ... | @@ -71,11 +71,12 @@ class Comparison: |
| 71 | return self.build_res(input_s == ocr_s), ocr_str | 71 | return self.build_res(input_s == ocr_s), ocr_str |
| 72 | 72 | ||
| 73 | def date_compare(self, input_str, ocr_str, idx, **kwargs): | 73 | def date_compare(self, input_str, ocr_str, idx, **kwargs): |
| 74 | if ocr_str == '': | 74 | if not isinstance(ocr_str, str) or not isinstance(input_str, str): |
| 75 | return self.RESULT_NA, None | ||
| 76 | if not isinstance(input_str, str) or not isinstance(ocr_str, str): | ||
| 77 | return self.RESULT_NA, ocr_str | 75 | return self.RESULT_NA, ocr_str |
| 78 | if kwargs.get('long', False) and '长期' in ocr_str: | 76 | if ocr_str == '' or ocr_str.strip() == '': |
| 77 | return self.RESULT_NA, None | ||
| 78 | if kwargs.get('long', False): | ||
| 79 | if '长期' in ocr_str or '永久' in ocr_str: | ||
| 79 | if input_str == '2099-12-31' or input_str == '2099-01-01': | 80 | if input_str == '2099-12-31' or input_str == '2099-01-01': |
| 80 | return self.RESULT_Y, '2099-12-31' | 81 | return self.RESULT_Y, '2099-12-31' |
| 81 | else: | 82 | else: |
| ... | @@ -102,26 +103,26 @@ class Comparison: | ... | @@ -102,26 +103,26 @@ class Comparison: |
| 102 | return self.build_res(input_str == ocr_str), ocr_output | 103 | return self.build_res(input_str == ocr_str), ocr_output |
| 103 | 104 | ||
def rmb_compare(self, input_str, ocr_str, idx, **kwargs):
    """Compare a numeric RMB amount with an amount recognised by OCR.

    The OCR text is converted to a number with
    ``rmb_handler.to_rmb_lower`` and compared with ``float(input_str)``.

    Args:
        input_str: Amount supplied by the user, as a numeric string.
        ocr_str: Amount recognised by OCR (Chinese numerals or digits).
        idx: Unused here; kept so every comparer shares one signature.
        **kwargs: Unused here.

    Returns:
        A ``(result, value)`` tuple:

        * ``(self.RESULT_NA, None)`` when either operand is not a ``str``
          or the OCR text is blank;
        * ``(self.RESULT_N, None)`` when ``input_str`` is not a number;
        * ``(res, input_str)`` when ``self.build_res`` reports
          ``self.RESULT_Y``;
        * ``(res, ocr_lower)`` otherwise, where ``ocr_lower`` is the
          converted OCR amount (``None`` if it could not be converted).
    """
    if not isinstance(ocr_str, str) or not isinstance(input_str, str):
        return self.RESULT_NA, None
    if not ocr_str.strip():
        return self.RESULT_NA, None

    # Only the float() parse is expected to fail. Keeping the try this
    # narrow stops programming errors from being reported as a mismatch.
    try:
        input_value = float(input_str)
    except ValueError:
        return self.RESULT_N, None

    # The OCR text must be passed in: calling to_rmb_lower() without it
    # raised TypeError, which the former blanket except turned into
    # RESULT_N for every comparison.
    ocr_lower = rmb_handler.to_rmb_lower(ocr_str)
    res = self.build_res(input_value == ocr_lower)
    if res == self.RESULT_Y:
        return res, input_str
    return res, ocr_lower
| 119 | 120 | ||
| 120 | def type_compare(self, input_str, ocr_str, idx, **kwargs): | 121 | def type_compare(self, input_str, ocr_str, idx, **kwargs): |
| 121 | if ocr_str == '': | 122 | if not isinstance(ocr_str, str) or not isinstance(input_str, str): |
| 122 | return self.RESULT_NA, None | ||
| 123 | if not isinstance(input_str, str) or not isinstance(ocr_str, str): | ||
| 124 | return self.RESULT_NA, ocr_str | 123 | return self.RESULT_NA, ocr_str |
| 124 | if ocr_str == '' or ocr_str.strip() == '': | ||
| 125 | return self.RESULT_NA, None | ||
| 125 | for map_tuple in self.TYPE_MAPPING: | 126 | for map_tuple in self.TYPE_MAPPING: |
| 126 | if re.search(map_tuple[0], ocr_str) is not None: | 127 | if re.search(map_tuple[0], ocr_str) is not None: |
| 127 | compare_str = map_tuple[1] | 128 | compare_str = map_tuple[1] | ... | ... |
src/common/tools/rmb_lower.py
0 → 100644
| 1 | import re | ||
| 2 | |||
| 3 | |||
class RMBHandler:
    """Convert an RMB amount written as text into a number of yuan.

    Two notations are handled: Chinese financial ("upper-case") numerals
    such as ``壹仟万元`` and Arabic digits with an optional ``万``/``亿``
    suffix such as ``30.0000万元``.
    """

    # Ranks used in ``unit_mapping``; named so the parser does not rely on
    # bare numbers.
    _RANK_YUAN = 3
    _RANK_WAN = 7
    _RANK_YI = 8

    def __init__(self):
        # Financial digit characters and their values.
        self.num_mapping = {
            '零': 0,
            '壹': 1,
            '贰': 2,
            '叁': 3,
            '肆': 4,
            '伍': 5,
            '陆': 6,
            '柒': 7,
            '捌': 8,
            '玖': 9
        }
        # unit character -> (rank, value in yuan); a higher rank means a
        # larger unit.
        self.unit_mapping = {
            '厘': (0, 0.001),
            '分': (1, 0.01),
            '角': (2, 0.1),
            '圆': (3, 1),
            # Same unit as 圆. Without this entry a 元 in the middle of an
            # amount was treated as an unknown character and corrupted the
            # total.
            '元': (3, 1),
            '拾': (4, 10),
            '佰': (5, 100),
            '仟': (6, 1000),
            '万': (7, 10000),
            '亿': (8, 100000000)
        }

    def upper_to_lower(self, price):
        """Parse an amount written in Chinese financial numerals.

        The amount is accumulated in integer parts so that ``亿`` followed
        by ``万`` (e.g. ``拾壹亿壹仟万``) and the fractional units do not
        disturb each other.

        Args:
            price: String of numeral and unit characters. Characters found
                in neither mapping (``整``, ``人民币`` ...) are ignored.

        Returns:
            The amount in yuan: an ``int`` when there is no fractional
            part, otherwise a ``float``.
        """
        total = 0            # value already closed off by 万 or 亿
        section = 0          # value built from 拾/佰/仟 since the last 万/亿
        milli = 0            # fractional part, counted in 厘 (1/1000 yuan)
        pending = None       # digit read but not yet attached to a unit
        in_fraction = False  # set once 圆/元 or a fractional unit is seen

        for idx, char in enumerate(price):
            if char in self.num_mapping:
                pending = self.num_mapping[char]
                continue
            if char not in self.unit_mapping:
                continue
            rank, multiplier = self.unit_mapping[char]
            if pending is None:
                # A unit that opens the string implies "one" (拾伍万 = 15万).
                digit = 1 if idx == 0 else 0
            else:
                digit = pending
            pending = None

            if rank >= self._RANK_YI:
                # 亿 scales everything read so far (壹万亿 = 10**12).
                total = (total + section + digit) * multiplier
                section = 0
            elif rank == self._RANK_WAN:
                # 万 scales only the current section, never an earlier 亿.
                total += (section + digit) * multiplier
                section = 0
            elif rank > self._RANK_YUAN:
                section += digit * multiplier
            elif rank == self._RANK_YUAN:
                section += digit
                in_fraction = True
            else:
                milli += digit * int(round(multiplier * 1000))
                in_fraction = True

        if pending is not None and not in_fraction:
            # A trailing digit with no unit is the ones place (壹佰零伍).
            section += pending

        whole = total + section
        if milli:
            # A single division keeps the float as accurate as possible.
            return (whole * 1000 + milli) / 1000
        return whole

    def pre_process(self, price, upper=True):
        """Trim text surrounding the numeral part of ``price``.

        Args:
            price: Raw text, e.g. ``人民币伍佰万圆整``.
            upper: When true, trim around financial numerals. No processing
                is defined for ``upper=False``.

        Returns:
            The substring from the first to the last character that is a
            known digit or unit, or ``None`` when there is no such
            character or ``upper`` is false.
        """
        if not upper:
            return None
        known = [
            pos for pos, char in enumerate(price)
            if char in self.num_mapping or char in self.unit_mapping
        ]
        if not known:
            return None
        return price[known[0]:known[-1] + 1]

    def to_rmb_lower(self, price):
        """Convert ``price`` to a float amount in yuan.

        Text without any digit is parsed as financial numerals and rounded
        to three decimals. Text with digits is read as a decimal number
        followed by an optional ``万``/``亿`` multiplier; thousands
        separators and whitespace before the multiplier are tolerated.

        Args:
            price: Amount text.

        Returns:
            The amount as a ``float``, or ``None`` when ``price`` is not a
            string or contains no recognisable amount.
        """
        if not isinstance(price, str):
            return None

        if re.search(r'\d', price) is None:
            trimmed = self.pre_process(price)
            if not trimmed:
                return None
            return float(round(self.upper_to_lower(trimmed), 3))

        # Remove thousands separators so "1,000万" is not read as 1.
        compact = price.replace(',', '').replace('，', '')
        found = re.search(r'(\d+\.?\d*)\s*([万亿]?)', compact)
        if found is None:
            return None
        try:
            amount = float(found.group(1))
        except ValueError:
            return None
        unit = found.group(2)
        if unit in self.unit_mapping:
            amount = amount * self.unit_mapping[unit][1]
        return amount
| 89 | |||
| 90 | |||
# Module-level converter instance; the comparison module imports this name.
rmb_handler = RMBHandler()

if __name__ == '__main__':
    # Manual smoke check: print the converted value of each sample amount.
    test_2 = [
        '壹万伍仟肆佰壹拾圆叁角伍分肆厘',
        '捌万陆仟肆佰壹拾圆整',
        '壹万伍仟肆佰壹拾元贰角捌分肆厘',
        '拾壹亿壹仟万伍仟肆佰壹拾元贰角捌分肆厘',
        '拾伍万圆',
    ]
    test_1 = [
        'sfdds',
        '柒佰玖拾万元整',
        '100万元整',
        '人民币伍佰万圆整',
        '人民币壹仟万元',
        '100万元',
        '贰佰壹拾捌万圆整',
        '(人民币)壹仟万元',
        '壹佰壹拾万圆整',
        '人民币30.0000万元整',
        '伍拾万元人民币',
    ]
    # Only test_1 is exercised; test_2 is kept for ad-hoc runs.
    input_list = test_1
    for sample in input_list:
        converted = rmb_handler.to_rmb_lower(sample)
        print('{0}={1}'.format(sample, converted))
-
Please register or sign in to post a comment