add simple word2vec
Showing 3 changed files with 106 additions and 22 deletions
@@ -55,7 +55,7 @@ solver:
   #     name: 'CrossEntropyLoss'
   args:
     reduction: "mean"
-    alpha: 0.95
+    alpha: 0.8
 
 logger:
   log_root: '/Users/zhouweiqi/Downloads/test/logs'
@@ -7,34 +7,55 @@ import uuid
 import cv2
 import pandas as pd
 from tools import get_file_paths, load_json
+from word2vec import simple_word2vec
 
 
 def clean_go_res(go_res_dir):
-    max_seq_count = None
-    seq_sum = 0
-    file_count = 0
-
     go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
     for go_res_json_path in go_res_json_paths:
         print('Info: start {0}'.format(go_res_json_path))
 
-        remove_key_set = set()
-        go_res = load_json(go_res_json_path)
-        for key, (_, text) in go_res.items():
+        remove_idx_set = set()
+        src_go_res_list = load_json(go_res_json_path)
+        for idx, (_, text) in enumerate(src_go_res_list):
             if text.strip() == '':
-                remove_key_set.add(key)
+                remove_idx_set.add(idx)
                 print(text)
 
-        if len(remove_key_set) > 0:
-            for del_key in remove_key_set:
-                del go_res[del_key]
+        if len(remove_idx_set) > 0:
+            for del_idx in sorted(remove_idx_set, reverse=True):  # delete high-to-low so indices stay valid
+                del src_go_res_list[del_idx]
 
-        go_res_list = sorted(list(go_res.values()), key=lambda x: (x[0][1], x[0][0]), reverse=False)
+        go_res_list = sorted(src_go_res_list, key=lambda x: (x[0][1], x[0][0]), reverse=False)
 
         with open(go_res_json_path, 'w') as fp:
             json.dump(go_res_list, fp)
         print('Rewrite {0}'.format(go_res_json_path))
 
+
+def char_length_statistics(go_res_dir):
+    max_char_length = None
+    target_file_name = None
+    go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
+    for go_res_json_path in go_res_json_paths:
+        print('Info: start {0}'.format(go_res_json_path))
+        src_go_res_list = load_json(go_res_json_path)
+        for _, text in src_go_res_list:
+            if max_char_length is None or len(text.strip()) > max_char_length:
+                max_char_length = len(text.strip())
+                target_file_name = go_res_json_path
+    return max_char_length, target_file_name
+
+def bbox_statistics(go_res_dir):
+    max_seq_count = None
+    seq_sum = 0
+    file_count = 0
+
+    go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
+    for go_res_json_path in go_res_json_paths:
+        print('Info: start {0}'.format(go_res_json_path))
+
+        go_res_list = load_json(go_res_json_path)
         seq_sum += len(go_res_list)
         file_count += 1
         if max_seq_count is None or len(go_res_list) > max_seq_count:
@@ -168,21 +189,35 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
     y_true = list()
     for i in range(160):
         if i >= valid_lens:
-            X.append([0., 0., 0., 0., 0., 0., 0., 0., 0.])
+            X.append([0. for _ in range(14)])
             y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+
         elif i in top_text_idx_set:
-            (x0, y0, x1, y1, x2, y2, x3, y3), _ = go_res_list[i]
-            X.append([1., x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
+            (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
+            feature_vec = [1.]
+            feature_vec.extend(simple_word2vec(text))
+            feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
+            X.append(feature_vec)
+
             y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+
         elif i in label_idx_dict:
-            (x0, y0, x1, y1, x2, y2, x3, y3), _ = go_res_list[i]
-            X.append([0., x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
+            (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
+            feature_vec = [0.]
+            feature_vec.extend(simple_word2vec(text))
+            feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
+            X.append(feature_vec)
+
             base_label_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
             base_label_list[label_idx_dict[i]] = 1
             y_true.append(base_label_list)
         else:
-            (x0, y0, x1, y1, x2, y2, x3, y3), _ = go_res_list[i]
-            X.append([0., x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
+            (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
+            feature_vec = [0.]
+            feature_vec.extend(simple_word2vec(text))
+            feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
+            X.append(feature_vec)
+
             y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 
     all_data = [X, y_true, valid_lens]
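All three populated branches, like the zero padding, now emit 14-dimensional rows: a top-text flag (1 dim), the simple_word2vec text statistics (5 dims), and the normalized box corners (8 dims). A minimal sketch of one such row, with made-up values for the box, text, and image size:

from word2vec import simple_word2vec

# Hypothetical single GO-result entry: 4 corner points plus its text.
(x0, y0, x1, y1, x2, y2, x3, y3) = (10, 20, 110, 20, 110, 60, 10, 60)
text = '发票号码 No12345'
w, h = 1000, 800  # image width and height

flag = 0.                                  # 1. only for "top text" boxes
text_vec = simple_word2vec(text)           # 5 dims: length + char-class ratios
coords = [x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]  # 8 dims
feature_vec = [flag] + text_vec + coords
assert len(feature_vec) == 14              # matches the all-zero padding rows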
@@ -222,11 +257,15 @@ if __name__ == '__main__':
     valid_dataset_dir = os.path.join(dataset_save_dir, 'valid')
     valid_anno_file_path = os.path.join(dataset_save_dir, 'valid.csv')
 
-    # max_seq_lens, seq_lens_mean, max_seq_file_name = clean_go_res(go_dir)
+    # max_seq_lens, seq_lens_mean, max_seq_file_name = bbox_statistics(go_dir)
     # print(max_seq_lens)  # 152
-    # print(max_seq_file_name)  # CH-B101805176_page_2_img_0.json
+    # print(max_seq_file_name)  # train/CH-B101805176_page_2_img_0.json
     # print(seq_lens_mean)  # 92
 
+    # max_char_lens, target_file_name = char_length_statistics(go_dir)
+    # print(max_char_lens)  # 72
+    # print(target_file_name)  # train/CH-B103053828-4.json
+
     # top_text_list = text_statistics(go_dir)
     # for t in top_text_list:
     #     print(t)
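A side note on the recorded maxima (an inference from the commented output, not stated in the commit): the 152-box maximum and 92-box mean presumably motivate the fixed sequence length of 160 in build_dataset, and the 72-character maximum is consistent with simple_word2vec normalizing text length by 100.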
@@ -288,4 +327,6 @@ if __name__ == '__main__':
     build_dataset(valid_image_path, valid_go_path, valid_label_path, filter_from_top_text_list, skip_list_valid, valid_dataset_dir)
     build_anno_file(valid_dataset_dir, valid_anno_file_path)
 
+    # print(simple_word2vec(' fd2jk接口 额24;叁‘,。测ADF壹试!¥? '))
+
data/word2vec.py 0 → 100644
@@ -0,0 +1,43 @@
+import re
+# from gensim.models import word2vec
+
+def simple_word2vec(text):
+    clean_text = text.strip()
+    text_len = max(len(clean_text), 1)  # guard: empty text would divide by zero below
+
+    digit_num = 0
+    en_num = 0
+    cn_num = 0
+    space_num = 0
+    other_num = 0
+    for char in clean_text:
+        if char.isdigit():
+            digit_num += 1
+        elif re.match(r'[A-Za-z]', char):
+            en_num += 1
+        elif char.isspace():
+            space_num += 1
+        elif re.match(r'[\u4e00-\u9fa5]', char):
+            cn_num += 1
+        else:
+            other_num += 1
+
+    vec = [text_len/100,
+           cn_num/text_len,
+           en_num/text_len,
+           digit_num/text_len,
+           # space_num/text_len,
+           other_num/text_len,
+           ]
+
+    # print(text)
+    # print(clean_text)
+    # print('-------------')
+    # print(en_num)
+    # print(cn_num)
+    # print(digit_num)
+    # print(space_num)
+    # print(other_num)
+    # print('-------------')
+
+    return vec
\ No newline at end of file
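For reference, a quick check of what the new helper returns; the input string is arbitrary, and the ratios are taken over the stripped length in the order [len/100, Chinese, English, digit, other]:

from word2vec import simple_word2vec

# 7 stripped chars: 2 Chinese, 2 English, 2 digits, 1 other ('!')
vec = simple_word2vec('AB12测试!')
print(vec)  # [0.07, 0.2857..., 0.2857..., 0.2857..., 0.1428...]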