add simple word2vec
Showing
3 changed files
with
106 additions
and
22 deletions
... | @@ -55,7 +55,7 @@ solver: | ... | @@ -55,7 +55,7 @@ solver: |
55 | # name: 'CrossEntropyLoss' | 55 | # name: 'CrossEntropyLoss' |
56 | args: | 56 | args: |
57 | reduction: "mean" | 57 | reduction: "mean" |
58 | alpha: 0.95 | 58 | alpha: 0.8 |
59 | 59 | ||
60 | logger: | 60 | logger: |
61 | log_root: '/Users/zhouweiqi/Downloads/test/logs' | 61 | log_root: '/Users/zhouweiqi/Downloads/test/logs' | ... | ... |
... | @@ -7,34 +7,55 @@ import uuid | ... | @@ -7,34 +7,55 @@ import uuid |
7 | import cv2 | 7 | import cv2 |
8 | import pandas as pd | 8 | import pandas as pd |
9 | from tools import get_file_paths, load_json | 9 | from tools import get_file_paths, load_json |
10 | from word2vec import simple_word2vec | ||
10 | 11 | ||
11 | 12 | ||
def clean_go_res(go_res_dir):
    """Clean every OCR-result JSON file under *go_res_dir* in place.

    For each ``.json`` file: drop entries whose text is empty/whitespace-only,
    sort the remaining entries by top-left corner (y0, then x0), and rewrite
    the file as a JSON list of ``[bbox, text]`` pairs.

    Each entry is assumed to be a ``[bbox, text]`` pair where ``bbox`` is
    ``[x0, y0, x1, y1, x2, y2, x3, y3]`` — TODO confirm against the writer.

    Bug fix: the previous version did ``del src_go_res_list[del_idx]`` while
    iterating a set of indices; after the first deletion the later indices
    shift, so the wrong elements were removed (or an IndexError was raised).
    We now rebuild a filtered list instead, which has no index-shift hazard.
    """
    go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
    for go_res_json_path in go_res_json_paths:
        print('Info: start {0}'.format(go_res_json_path))

        src_go_res_list = load_json(go_res_json_path)

        # Keep only entries with non-blank text; echo the dropped text for review.
        kept_list = []
        for bbox, text in src_go_res_list:
            if text.strip() == '':
                print(text)
            else:
                kept_list.append([bbox, text])

        # Sort top-to-bottom (y0), then left-to-right (x0).
        go_res_list = sorted(kept_list, key=lambda x: (x[0][1], x[0][0]), reverse=False)

        with open(go_res_json_path, 'w') as fp:
            json.dump(go_res_list, fp)
        print('Rewrite {0}'.format(go_res_json_path))
37 | 34 | ||
35 | |||
def char_length_statistics(go_res_dir):
    """Find the longest stripped OCR text across all JSON files in *go_res_dir*.

    Returns a tuple ``(max_char_length, target_file_name)`` — the length of the
    longest ``text.strip()`` seen and the path of the JSON file containing it.
    Returns ``(None, None)`` when no files/entries exist.
    """
    max_char_length = None
    target_file_name = None
    for json_path in get_file_paths(go_res_dir, ['.json', ]):
        print('Info: start {0}'.format(json_path))
        for _, text in load_json(json_path):
            stripped_len = len(text.strip())
            if max_char_length is None or stripped_len > max_char_length:
                max_char_length = stripped_len
                target_file_name = json_path
    return max_char_length, target_file_name
48 | |||
49 | def bbox_statistics(go_res_dir): | ||
50 | max_seq_count = None | ||
51 | seq_sum = 0 | ||
52 | file_count = 0 | ||
53 | |||
54 | go_res_json_paths = get_file_paths(go_res_dir, ['.json', ]) | ||
55 | for go_res_json_path in go_res_json_paths: | ||
56 | print('Info: start {0}'.format(go_res_json_path)) | ||
57 | |||
58 | go_res_list = load_json(go_res_json_path) | ||
38 | seq_sum += len(go_res_list) | 59 | seq_sum += len(go_res_list) |
39 | file_count += 1 | 60 | file_count += 1 |
40 | if max_seq_count is None or len(go_res_list) > max_seq_count: | 61 | if max_seq_count is None or len(go_res_list) > max_seq_count: |
... | @@ -168,21 +189,35 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save | ... | @@ -168,21 +189,35 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save |
168 | y_true = list() | 189 | y_true = list() |
169 | for i in range(160): | 190 | for i in range(160): |
170 | if i >= valid_lens: | 191 | if i >= valid_lens: |
171 | X.append([0., 0., 0., 0., 0., 0., 0., 0., 0.]) | 192 | X.append([0. for _ in range(14)]) |
172 | y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) | 193 | y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) |
194 | |||
173 | elif i in top_text_idx_set: | 195 | elif i in top_text_idx_set: |
174 | (x0, y0, x1, y1, x2, y2, x3, y3), _ = go_res_list[i] | 196 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] |
175 | X.append([1., x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) | 197 | feature_vec = [1.] |
198 | feature_vec.extend(simple_word2vec(text)) | ||
199 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) | ||
200 | X.append(feature_vec) | ||
201 | |||
176 | y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) | 202 | y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) |
203 | |||
177 | elif i in label_idx_dict: | 204 | elif i in label_idx_dict: |
178 | (x0, y0, x1, y1, x2, y2, x3, y3), _ = go_res_list[i] | 205 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] |
179 | X.append([0., x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) | 206 | feature_vec = [0.] |
207 | feature_vec.extend(simple_word2vec(text)) | ||
208 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) | ||
209 | X.append(feature_vec) | ||
210 | |||
180 | base_label_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | 211 | base_label_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] |
181 | base_label_list[label_idx_dict[i]] = 1 | 212 | base_label_list[label_idx_dict[i]] = 1 |
182 | y_true.append(base_label_list) | 213 | y_true.append(base_label_list) |
183 | else: | 214 | else: |
184 | (x0, y0, x1, y1, x2, y2, x3, y3), _ = go_res_list[i] | 215 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] |
185 | X.append([0., x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) | 216 | feature_vec = [0.] |
217 | feature_vec.extend(simple_word2vec(text)) | ||
218 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) | ||
219 | X.append(feature_vec) | ||
220 | |||
186 | y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) | 221 | y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) |
187 | 222 | ||
188 | all_data = [X, y_true, valid_lens] | 223 | all_data = [X, y_true, valid_lens] |
... | @@ -222,11 +257,15 @@ if __name__ == '__main__': | ... | @@ -222,11 +257,15 @@ if __name__ == '__main__': |
222 | valid_dataset_dir = os.path.join(dataset_save_dir, 'valid') | 257 | valid_dataset_dir = os.path.join(dataset_save_dir, 'valid') |
223 | valid_anno_file_path = os.path.join(dataset_save_dir, 'valid.csv') | 258 | valid_anno_file_path = os.path.join(dataset_save_dir, 'valid.csv') |
224 | 259 | ||
225 | # max_seq_lens, seq_lens_mean, max_seq_file_name = clean_go_res(go_dir) | 260 | # max_seq_lens, seq_lens_mean, max_seq_file_name = bbox_statistics(go_dir) |
226 | # print(max_seq_lens) # 152 | 261 | # print(max_seq_lens) # 152 |
227 | # print(max_seq_file_name) # CH-B101805176_page_2_img_0.json | 262 | # print(max_seq_file_name) # train/CH-B101805176_page_2_img_0.json |
228 | # print(seq_lens_mean) # 92 | 263 | # print(seq_lens_mean) # 92 |
229 | 264 | ||
265 | # max_char_lens, target_file_name = char_length_statistics(go_dir) | ||
266 | # print(max_char_lens) # 72 | ||
267 | # print(target_file_name) # train/CH-B103053828-4.json | ||
268 | |||
230 | # top_text_list = text_statistics(go_dir) | 269 | # top_text_list = text_statistics(go_dir) |
231 | # for t in top_text_list: | 270 | # for t in top_text_list: |
232 | # print(t) | 271 | # print(t) |
... | @@ -288,4 +327,6 @@ if __name__ == '__main__': | ... | @@ -288,4 +327,6 @@ if __name__ == '__main__': |
288 | build_dataset(valid_image_path, valid_go_path, valid_label_path, filter_from_top_text_list, skip_list_valid, valid_dataset_dir) | 327 | build_dataset(valid_image_path, valid_go_path, valid_label_path, filter_from_top_text_list, skip_list_valid, valid_dataset_dir) |
289 | build_anno_file(valid_dataset_dir, valid_anno_file_path) | 328 | build_anno_file(valid_dataset_dir, valid_anno_file_path) |
290 | 329 | ||
330 | # print(simple_word2vec(' fd2jk接口 额24;叁‘,。测ADF壹试!¥? ')) | ||
331 | |||
291 | 332 | ... | ... |
data/word2vec.py
0 → 100644
1 | import re | ||
2 | # from gensim.models import word2vec | ||
3 | |||
def simple_word2vec(text):
    """Encode *text* as a 5-dim hand-crafted character-statistics vector.

    Features, in order (each roughly in [0, 1]):
      [stripped_length / 100,
       CJK-char ratio,
       ASCII-letter ratio,
       digit ratio,
       other-char ratio]

    Ratios are taken over the stripped text. Interior whitespace is counted
    but deliberately excluded from the vector (the commented-out slot below).

    Bug fix: the original divided by ``len(clean_text)`` unconditionally and
    raised ZeroDivisionError on empty or whitespace-only input; we now return
    an all-zero vector in that case.
    """
    clean_text = text.strip()
    text_len = len(clean_text)
    if text_len == 0:
        # Nothing to classify — avoid division by zero below.
        return [0.0, 0.0, 0.0, 0.0, 0.0]

    digit_num = 0
    en_num = 0
    cn_num = 0
    space_num = 0
    other_num = 0
    for char in clean_text:
        if char.isdigit():
            digit_num += 1
        elif re.match(r'[A-Za-z]', char):
            en_num += 1
        elif char.isspace():
            # Interior whitespace (leading/trailing already stripped).
            space_num += 1
        elif re.match(r'[\u4e00-\u9fa5]', char):
            # Basic CJK Unified Ideographs range.
            cn_num += 1
        else:
            other_num += 1

    vec = [text_len/100,
           cn_num/text_len,
           en_num/text_len,
           digit_num/text_len,
           # space_num/text_len,  # intentionally excluded from the vector
           other_num/text_len,
           ]

    return vec
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or sign in to post a comment