05e0f320 by 周伟奇

add simple word2vec

1 parent 3e58f6b0
@@ -55,7 +55,7 @@ solver:
   # name: 'CrossEntropyLoss'
   args:
     reduction: "mean"
-    alpha: 0.95
+    alpha: 0.8
 
 logger:
   log_root: '/Users/zhouweiqi/Downloads/test/logs'
......
@@ -7,34 +7,55 @@ import uuid
 import cv2
 import pandas as pd
 from tools import get_file_paths, load_json
+from word2vec import simple_word2vec
 
 
 def clean_go_res(go_res_dir):
-    max_seq_count = None
-    seq_sum = 0
-    file_count = 0
-
     go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
     for go_res_json_path in go_res_json_paths:
         print('Info: start {0}'.format(go_res_json_path))
 
-        remove_key_set = set()
-        go_res = load_json(go_res_json_path)
-        for key, (_, text) in go_res.items():
+        remove_idx_set = set()
+        src_go_res_list = load_json(go_res_json_path)
+        for idx, (_, text) in enumerate(src_go_res_list):
             if text.strip() == '':
-                remove_key_set.add(key)
+                remove_idx_set.add(idx)
                 print(text)
 
-        if len(remove_key_set) > 0:
-            for del_key in remove_key_set:
-                del go_res[del_key]
+        if len(remove_idx_set) > 0:
+            # delete from the largest index down so earlier indices stay valid
+            for del_idx in sorted(remove_idx_set, reverse=True):
+                del src_go_res_list[del_idx]
 
-        go_res_list = sorted(list(go_res.values()), key=lambda x: (x[0][1], x[0][0]), reverse=False)
+        go_res_list = sorted(src_go_res_list, key=lambda x: (x[0][1], x[0][0]), reverse=False)
 
         with open(go_res_json_path, 'w') as fp:
             json.dump(go_res_list, fp)
         print('Rewrite {0}'.format(go_res_json_path))
 
+
+def char_length_statistics(go_res_dir):
+    max_char_length = None
+    target_file_name = None
+    go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
+    for go_res_json_path in go_res_json_paths:
+        print('Info: start {0}'.format(go_res_json_path))
+        src_go_res_list = load_json(go_res_json_path)
+        for _, text in src_go_res_list:
+            if max_char_length is None or len(text.strip()) > max_char_length:
+                max_char_length = len(text.strip())
+                target_file_name = go_res_json_path
+    return max_char_length, target_file_name
+
+def bbox_statistics(go_res_dir):
+    max_seq_count = None
+    seq_sum = 0
+    file_count = 0
+
+    go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
+    for go_res_json_path in go_res_json_paths:
+        print('Info: start {0}'.format(go_res_json_path))
+
+        go_res_list = load_json(go_res_json_path)
         seq_sum += len(go_res_list)
         file_count += 1
         if max_seq_count is None or len(go_res_list) > max_seq_count:
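Each entry in the go_res JSON is a ([x0, y0, x1, y1, x2, y2, x3, y3], text) pair, so the sort key above orders boxes by the first corner's y, then x, i.e. top-to-bottom and left-to-right within a row. A minimal sketch with made-up boxes (coordinates and strings are hypothetical, only the key matters):

entries = [
    ([120, 30, 200, 30, 200, 50, 120, 50], '编号'),
    ([10, 30, 90, 30, 90, 50, 10, 50], '姓名'),
    ([10, 80, 90, 80, 90, 100, 10, 100], '金额'),
]

# Same key as clean_go_res: (y0, x0) of the first corner.
ordered = sorted(entries, key=lambda x: (x[0][1], x[0][0]))
print([text for _, text in ordered])  # ['姓名', '编号', '金额']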
@@ -168,21 +189,35 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
     y_true = list()
     for i in range(160):
         if i >= valid_lens:
-            X.append([0., 0., 0., 0., 0., 0., 0., 0., 0.])
+            X.append([0. for _ in range(14)])
             y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+
         elif i in top_text_idx_set:
-            (x0, y0, x1, y1, x2, y2, x3, y3), _ = go_res_list[i]
-            X.append([1., x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
+            (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
+            feature_vec = [1.]
+            feature_vec.extend(simple_word2vec(text))
+            feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
+            X.append(feature_vec)
+
             y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+
         elif i in label_idx_dict:
-            (x0, y0, x1, y1, x2, y2, x3, y3), _ = go_res_list[i]
-            X.append([0., x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
+            (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
+            feature_vec = [0.]
+            feature_vec.extend(simple_word2vec(text))
+            feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
+            X.append(feature_vec)
+
             base_label_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
             base_label_list[label_idx_dict[i]] = 1
             y_true.append(base_label_list)
         else:
-            (x0, y0, x1, y1, x2, y2, x3, y3), _ = go_res_list[i]
-            X.append([0., x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
+            (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
+            feature_vec = [0.]
+            feature_vec.extend(simple_word2vec(text))
+            feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
+            X.append(feature_vec)
+
             y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 
     all_data = [X, y_true, valid_lens]
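With this change every sequence element becomes a 14-dimensional vector: one top-text flag, the five character statistics from simple_word2vec, and the eight corner coordinates normalised by image width and height, which is why the padding rows above grow from 9 to 14 zeros. A small sketch of the layout (the box, text and image size below are made up):

from word2vec import simple_word2vec

# Hypothetical OCR entry and image size, just to show the vector layout.
(x0, y0, x1, y1, x2, y2, x3, y3), text = ([10, 20, 110, 20, 110, 40, 10, 40], '发票号码 No.12345678')
w, h = 800, 600

feature_vec = [1.]                            # 1. if the text is in top_text_list, else 0.
feature_vec.extend(simple_word2vec(text))     # 5 character-class ratios
feature_vec.extend([x0/w, y0/h, x1/w, y1/h,   # 8 normalised corner coordinates
                    x2/w, y2/h, x3/w, y3/h])

assert len(feature_vec) == 14                 # matches the all-zero padding rows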
@@ -222,11 +257,15 @@ if __name__ == '__main__':
     valid_dataset_dir = os.path.join(dataset_save_dir, 'valid')
     valid_anno_file_path = os.path.join(dataset_save_dir, 'valid.csv')
 
-    # max_seq_lens, seq_lens_mean, max_seq_file_name = clean_go_res(go_dir)
+    # max_seq_lens, seq_lens_mean, max_seq_file_name = bbox_statistics(go_dir)
     # print(max_seq_lens) # 152
-    # print(max_seq_file_name) # CH-B101805176_page_2_img_0.json
+    # print(max_seq_file_name) # train/CH-B101805176_page_2_img_0.json
     # print(seq_lens_mean) # 92
 
+    # max_char_lens, target_file_name = char_length_statistics(go_dir)
+    # print(max_char_lens) # 72
+    # print(target_file_name) # train/CH-B103053828-4.json
+
     # top_text_list = text_statistics(go_dir)
     # for t in top_text_list:
     #     print(t)
@@ -288,4 +327,6 @@ if __name__ == '__main__':
     build_dataset(valid_image_path, valid_go_path, valid_label_path, filter_from_top_text_list, skip_list_valid, valid_dataset_dir)
     build_anno_file(valid_dataset_dir, valid_anno_file_path)
 
+    # print(simple_word2vec(' fd2jk接口 额24;叁‘,。测ADF壹试!¥? '))
+
 
......
import re
# from gensim.models import word2vec

def simple_word2vec(text):
    clean_text = text.strip()
    text_len = len(clean_text)

    digit_num = 0
    en_num = 0
    cn_num = 0
    space_num = 0
    other_num = 0
    for char in clean_text:
        if char.isdigit():
            digit_num += 1
        elif re.match(r'[A-Za-z]', char):
            en_num += 1
        elif char.isspace():
            space_num += 1
        elif re.match(r'[\u4e00-\u9fa5]', char):
            cn_num += 1
        else:
            other_num += 1

    vec = [text_len/100,
           cn_num/text_len,
           en_num/text_len,
           digit_num/text_len,
           # space_num/text_len,
           other_num/text_len,
           ]

    # print(text)
    # print(clean_text)
    # print('-------------')
    # print(en_num)
    # print(cn_num)
    # print(digit_num)
    # print(space_num)
    # print(other_num)
    # print('-------------')

    return vec
\ No newline at end of file
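simple_word2vec returns five ratios: the stripped length scaled by 1/100, then the fractions of CJK, Latin, digit and other characters (the whitespace ratio is commented out, so the four fractions need not sum to 1). It divides by the stripped length, so it assumes non-empty text; in this pipeline empty strings are dropped by clean_go_res before the helper is called. A quick usage sketch with a made-up string:

from word2vec import simple_word2vec

vec = simple_word2vec('发票号码 No.12345678')
# [text_len/100, cn_ratio, en_ratio, digit_ratio, other_ratio]
print(vec)  # [0.16, 0.25, 0.125, 0.5, 0.0625]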