a473474e by 周伟奇

fix registeredCapital

1 parent ad575f50
1 import re 1 import re
2 from datetime import datetime 2 from datetime import datetime
3 from .rmb_upper import to_rmb_upper 3 from .rmb_lower import rmb_handler
4 4
5 5
6 class Comparison: 6 class Comparison:
...@@ -36,26 +36,26 @@ class Comparison: ...@@ -36,26 +36,26 @@ class Comparison:
36 return self.RESULT_N 36 return self.RESULT_N
37 37
def common_compare(self, input_str, ocr_str, idx, **kwargs):
    """Compare the entered value with the OCR value by exact string equality.

    Returns a ``(result, ocr_value)`` tuple:

    * ``(self.RESULT_NA, ocr_str)`` when either argument is not a ``str``;
    * ``(self.RESULT_NA, None)`` when the OCR text is empty or whitespace only;
    * ``(self.build_res(input_str == ocr_str), ocr_str)`` otherwise.

    ``idx`` and ``kwargs`` are accepted but not used by this method.
    """
    both_are_text = isinstance(ocr_str, str) and isinstance(input_str, str)
    if not both_are_text:
        return self.RESULT_NA, ocr_str

    # A blank OCR reading carries no information to compare against.
    if not ocr_str.strip():
        return self.RESULT_NA, None

    is_same = input_str == ocr_str
    return self.build_res(is_same), ocr_str
44 44
def company_compare(self, input_str, ocr_str, idx, **kwargs):
    """Compare two company names after removing ``self.re_obj`` matches.

    Both strings have every match of ``self.re_obj`` deleted and are then
    stripped of surrounding whitespace before the equality test.

    Returns a ``(result, ocr_value)`` tuple:

    * ``(self.RESULT_NA, ocr_str)`` when either argument is not a ``str``;
    * ``(self.RESULT_NA, None)`` when the OCR text is empty or whitespace only;
    * ``(self.build_res(<normalised strings equal>), ocr_str)`` otherwise;
      the second element is the raw OCR text, not the normalised form.

    ``idx`` and ``kwargs`` are accepted but not used by this method.
    """
    both_are_text = isinstance(ocr_str, str) and isinstance(input_str, str)
    if not both_are_text:
        return self.RESULT_NA, ocr_str

    # A blank OCR reading carries no information to compare against.
    if not ocr_str.strip():
        return self.RESULT_NA, None

    normalised_input, normalised_ocr = (
        re.sub(self.re_obj, '', text).strip() for text in (input_str, ocr_str)
    )
    return self.build_res(normalised_input == normalised_ocr), ocr_str
53 53
54 def name_compare(self, input_str, ocr_str, idx, **kwargs): 54 def name_compare(self, input_str, ocr_str, idx, **kwargs):
55 if ocr_str == '': 55 if not isinstance(ocr_str, str) or not isinstance(input_str, str):
56 return self.RESULT_NA, None
57 if not isinstance(input_str, str) or not isinstance(ocr_str, str):
58 return self.RESULT_NA, ocr_str 56 return self.RESULT_NA, ocr_str
57 if ocr_str == '' or ocr_str.strip() == '':
58 return self.RESULT_NA, None
59 if kwargs.get('is_passport'): 59 if kwargs.get('is_passport'):
60 input_tmp = input_str.upper().replace(' ', '') 60 input_tmp = input_str.upper().replace(' ', '')
61 ocr_tmp = ocr_str.upper().replace(' ', '') 61 ocr_tmp = ocr_str.upper().replace(' ', '')
...@@ -71,11 +71,12 @@ class Comparison: ...@@ -71,11 +71,12 @@ class Comparison:
71 return self.build_res(input_s == ocr_s), ocr_str 71 return self.build_res(input_s == ocr_s), ocr_str
72 72
73 def date_compare(self, input_str, ocr_str, idx, **kwargs): 73 def date_compare(self, input_str, ocr_str, idx, **kwargs):
74 if ocr_str == '': 74 if not isinstance(ocr_str, str) or not isinstance(input_str, str):
75 return self.RESULT_NA, None
76 if not isinstance(input_str, str) or not isinstance(ocr_str, str):
77 return self.RESULT_NA, ocr_str 75 return self.RESULT_NA, ocr_str
78 if kwargs.get('long', False) and '长期' in ocr_str: 76 if ocr_str == '' or ocr_str.strip() == '':
77 return self.RESULT_NA, None
78 if kwargs.get('long', False):
79 if '长期' in ocr_str or '永久' in ocr_str:
79 if input_str == '2099-12-31' or input_str == '2099-01-01': 80 if input_str == '2099-12-31' or input_str == '2099-01-01':
80 return self.RESULT_Y, '2099-12-31' 81 return self.RESULT_Y, '2099-12-31'
81 else: 82 else:
...@@ -102,26 +103,26 @@ class Comparison: ...@@ -102,26 +103,26 @@ class Comparison:
102 return self.build_res(input_str == ocr_str), ocr_output 103 return self.build_res(input_str == ocr_str), ocr_output
103 104
def rmb_compare(self, input_str, ocr_str, idx, **kwargs):
    """Compare an entered numeric amount with an OCR-read RMB amount.

    The OCR text is converted to a number with
    ``rmb_handler.to_rmb_lower(ocr_str)`` and compared with
    ``float(input_str)``.

    Returns a ``(result, value)`` tuple:

    * ``(self.RESULT_NA, None)`` when either argument is not a ``str`` or
      the OCR text is empty / whitespace only;
    * ``(self.RESULT_N, None)`` when the conversion or comparison raises
      (for example ``input_str`` is not a valid float literal);
    * ``(res, input_str)`` when ``res`` equals ``self.RESULT_Y``;
    * ``(res, ocr_lower)`` otherwise, where ``ocr_lower`` is the converted
      OCR amount (``None`` if the OCR text could not be parsed).

    ``idx`` and ``kwargs`` are accepted but not used by this method.
    """
    if not isinstance(ocr_str, str) or not isinstance(input_str, str):
        return self.RESULT_NA, None
    if not ocr_str.strip():
        return self.RESULT_NA, None
    try:
        # The OCR text must be passed in: calling to_rmb_lower() without
        # it raises TypeError, which the handler below would silently turn
        # into RESULT_N for every single comparison.
        ocr_lower = rmb_handler.to_rmb_lower(ocr_str)
        # An unparseable OCR amount yields None here, which never equals a
        # float, so the comparison is simply False.
        res = self.build_res(float(input_str) == ocr_lower)
    except Exception:
        # Best effort: any failure counts as a mismatch.
        return self.RESULT_N, None
    if res == self.RESULT_Y:
        return res, input_str
    return res, ocr_lower
119 120
120 def type_compare(self, input_str, ocr_str, idx, **kwargs): 121 def type_compare(self, input_str, ocr_str, idx, **kwargs):
121 if ocr_str == '': 122 if not isinstance(ocr_str, str) or not isinstance(input_str, str):
122 return self.RESULT_NA, None
123 if not isinstance(input_str, str) or not isinstance(ocr_str, str):
124 return self.RESULT_NA, ocr_str 123 return self.RESULT_NA, ocr_str
124 if ocr_str == '' or ocr_str.strip() == '':
125 return self.RESULT_NA, None
125 for map_tuple in self.TYPE_MAPPING: 126 for map_tuple in self.TYPE_MAPPING:
126 if re.search(map_tuple[0], ocr_str) is not None: 127 if re.search(map_tuple[0], ocr_str) is not None:
127 compare_str = map_tuple[1] 128 compare_str = map_tuple[1]
......
1 import re
2
3
class RMBHandler:
    """Convert RMB amount strings to a numeric value in yuan.

    Two spellings are handled by :meth:`to_rmb_lower`:

    * amounts written entirely in upper-case Chinese numerals
      (for example ``'壹万伍仟肆佰壹拾圆叁角伍分肆厘'``);
    * amounts that contain digits, optionally followed by ``万`` or ``亿``
      (for example ``'100万元整'``).
    """

    def __init__(self):
        # Upper-case Chinese digit -> integer value.
        self.num_mapping = {
            '零': 0,
            '壹': 1,
            '贰': 2,
            '叁': 3,
            '肆': 4,
            '伍': 5,
            '陆': 6,
            '柒': 7,
            '捌': 8,
            '玖': 9
        }
        # Unit character -> (rank, multiplier). '元' is listed next to '圆'
        # with the same rank and multiplier so both spellings of "yuan"
        # are recognised.
        self.unit_mapping = {
            '厘': (0, 0.001),
            '分': (1, 0.01),
            '角': (2, 0.1),
            '圆': (3, 1),
            '元': (3, 1),
            '拾': (4, 10),
            '佰': (5, 100),
            '仟': (6, 1000),
            '万': (7, 10000),
            '亿': (8, 100000000)
        }

    def upper_to_lower(self, price):
        """Evaluate an upper-case Chinese numeral amount.

        The string is read left to right. ``拾/佰/仟`` groups accumulate in
        a pending section; ``万`` and ``圆/元`` close the section into the
        running total, and ``亿`` scales everything read so far. ``角/分/厘``
        are summed separately as the fractional part.

        A unit in the very first position gets an implicit digit of 1
        (``'拾伍万圆'`` is 150000). Characters that are neither digits nor
        units are skipped. A digit that is never followed by a unit does
        not contribute to the result.

        Returns the amount as an ``int`` or ``float``.
        """
        total = 0      # value already closed off by 万 / 亿 / 圆
        section = 0    # 拾/佰/仟 groups waiting for the next closing unit
        fraction = 0   # 角 / 分 / 厘 part
        digit = 0      # most recent digit not yet consumed by a unit
        for idx, c in enumerate(price):
            if c in self.num_mapping:
                digit = self.num_mapping[c]
                continue
            if c not in self.unit_mapping:
                continue
            if idx == 0:
                digit = 1
            _, multiplier = self.unit_mapping[c]
            if multiplier < 1:
                fraction += digit * multiplier
            elif multiplier == 1:
                total += section + digit
                section = 0
            elif multiplier < 10000:
                section += digit * multiplier
            elif multiplier == 10000:
                # 万 scales only the current section, so '壹亿伍仟万'
                # is 1e8 + 5e7 rather than (1e8 + 5000) * 1e4.
                total += (section + digit) * multiplier
                section = 0
            else:
                # 亿 scales everything read so far ('壹万亿' is 1e12).
                total = (total + section + digit) * multiplier
                section = 0
            digit = 0
        return total + section + fraction

    def pre_process(self, price, upper=True):
        """Trim ``price`` to the span between its first and last known character.

        A known character is any key of ``num_mapping`` or ``unit_mapping``.
        Returns the trimmed substring, or ``None`` when ``price`` contains
        no known character or when ``upper`` is false (that mode is not
        implemented).
        """
        if not upper:
            return None
        known_positions = [
            idx for idx, c in enumerate(price)
            if c in self.num_mapping or c in self.unit_mapping
        ]
        if not known_positions:
            return None
        return price[known_positions[0]: known_positions[-1] + 1]

    def to_rmb_lower(self, price):
        """Convert an RMB amount string to a float number of yuan.

        Strings without any digit are treated as upper-case Chinese
        numerals; otherwise the first number in the string is read and
        multiplied by a directly following ``万`` or ``亿``. The result is
        rounded to 3 decimals.

        Returns ``None`` when the input cannot be interpreted (including
        non-string input); this method does not raise.
        """
        try:
            if re.search(r'[\d]', price) is None:
                price = self.pre_process(price)
                if not price:
                    return None
                return float(round(self.upper_to_lower(price), 3))
            re_obj = re.search(r'(\d+\.?\d*)([万亿]?)', price)
            digit = float(re_obj.group(1))
            unit = re_obj.group(2)
            if unit in self.unit_mapping:
                digit = digit * self.unit_mapping[unit][1]
            # Rounded like the upper-case branch so float noise from the
            # multiplication does not leak into equality checks.
            return round(digit, 3)
        except Exception:
            # Best-effort contract: any unparseable input maps to None.
            return None
89
90
# Module-level converter instance, created once at import time.
rmb_handler = RMBHandler()

if __name__ == '__main__':
    # Manual smoke run: print each sample next to its converted value.
    # Only the mixed samples are printed; the upper-case-only list is kept
    # for ad-hoc runs.
    upper_only_samples = ['壹万伍仟肆佰壹拾圆叁角伍分肆厘', '捌万陆仟肆佰壹拾圆整', '壹万伍仟肆佰壹拾元贰角捌分肆厘', '拾壹亿壹仟万伍仟肆佰壹拾元贰角捌分肆厘', '拾伍万圆']
    mixed_samples = ['sfdds', '柒佰玖拾万元整', '100万元整', '人民币伍佰万圆整', '人民币壹仟万元', '100万元', '贰佰壹拾捌万圆整', '(人民币)壹仟万元', '壹佰壹拾万圆整', '人民币30.0000万元整', '伍拾万元人民币']
    for sample in mixed_samples:
        print('{0}={1}'.format(sample, rmb_handler.to_rmb_lower(sample)))
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!