add jwq word2vec
Showing
2 changed files
with
42 additions
and
8 deletions
| ... | @@ -7,7 +7,7 @@ import uuid | ... | @@ -7,7 +7,7 @@ import uuid |
| 7 | import cv2 | 7 | import cv2 |
| 8 | import pandas as pd | 8 | import pandas as pd |
| 9 | from tools import get_file_paths, load_json | 9 | from tools import get_file_paths, load_json |
| 10 | from word2vec import simple_word2vec | 10 | from word2vec import simple_word2vec, jwq_word2vec |
| 11 | 11 | ||
| 12 | 12 | ||
| 13 | def clean_go_res(go_res_dir): | 13 | def clean_go_res(go_res_dir): |
| ... | @@ -187,28 +187,34 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save | ... | @@ -187,28 +187,34 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save |
| 187 | 187 | ||
| 188 | X = list() | 188 | X = list() |
| 189 | y_true = list() | 189 | y_true = list() |
| 190 | |||
| 191 | text_vec_max_lens = 15 * 50 | ||
| 192 | dim = 1 + 5 + 8 + text_vec_max_lens | ||
| 193 | num_classes = 10 | ||
| 190 | for i in range(160): | 194 | for i in range(160): |
| 191 | if i >= valid_lens: | 195 | if i >= valid_lens: |
| 192 | X.append([0. for _ in range(14)]) | 196 | X.append([0. for _ in range(dim)]) |
| 193 | y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) | 197 | y_true.append([0 for _ in range(num_classes)]) |
| 194 | 198 | ||
| 195 | elif i in top_text_idx_set: | 199 | elif i in top_text_idx_set: |
| 196 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] | 200 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] |
| 197 | feature_vec = [1.] | 201 | feature_vec = [1.] |
| 198 | feature_vec.extend(simple_word2vec(text)) | 202 | feature_vec.extend(simple_word2vec(text)) |
| 199 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) | 203 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) |
| 204 | feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) | ||
| 200 | X.append(feature_vec) | 205 | X.append(feature_vec) |
| 201 | 206 | ||
| 202 | y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) | 207 | y_true.append([0 for _ in range(num_classes)]) |
| 203 | 208 | ||
| 204 | elif i in label_idx_dict: | 209 | elif i in label_idx_dict: |
| 205 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] | 210 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] |
| 206 | feature_vec = [0.] | 211 | feature_vec = [0.] |
| 207 | feature_vec.extend(simple_word2vec(text)) | 212 | feature_vec.extend(simple_word2vec(text)) |
| 208 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) | 213 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) |
| 214 | feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) | ||
| 209 | X.append(feature_vec) | 215 | X.append(feature_vec) |
| 210 | 216 | ||
| 211 | base_label_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | 217 | base_label_list = [0 for _ in range(num_classes)] |
| 212 | base_label_list[label_idx_dict[i]] = 1 | 218 | base_label_list[label_idx_dict[i]] = 1 |
| 213 | y_true.append(base_label_list) | 219 | y_true.append(base_label_list) |
| 214 | else: | 220 | else: |
| ... | @@ -216,9 +222,10 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save | ... | @@ -216,9 +222,10 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save |
| 216 | feature_vec = [0.] | 222 | feature_vec = [0.] |
| 217 | feature_vec.extend(simple_word2vec(text)) | 223 | feature_vec.extend(simple_word2vec(text)) |
| 218 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) | 224 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) |
| 225 | feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) | ||
| 219 | X.append(feature_vec) | 226 | X.append(feature_vec) |
| 220 | 227 | ||
| 221 | y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) | 228 | y_true.append([0 for _ in range(num_classes)]) |
| 222 | 229 | ||
| 223 | all_data = [X, y_true, valid_lens] | 230 | all_data = [X, y_true, valid_lens] |
| 224 | 231 | ||
| ... | @@ -328,5 +335,6 @@ if __name__ == '__main__': | ... | @@ -328,5 +335,6 @@ if __name__ == '__main__': |
| 328 | build_anno_file(valid_dataset_dir, valid_anno_file_path) | 335 | build_anno_file(valid_dataset_dir, valid_anno_file_path) |
| 329 | 336 | ||
| 330 | # print(simple_word2vec(' fd2jk接口 额24;叁‘,。测ADF壹试!¥? ')) | 337 | # print(simple_word2vec(' fd2jk接口 额24;叁‘,。测ADF壹试!¥? ')) |
| 338 | # print(jwq_word2vec('发', 15*50)) | ||
| 331 | 339 | ||
| 332 | 340 | ... | ... |
| 1 | import re | 1 | import re |
| 2 | # from gensim.models import word2vec | 2 | import numpy as np |
| 3 | from gensim.models import word2vec | ||
| 4 | word2vec_model = word2vec.Word2Vec.load('/Users/zhouweiqi/Downloads/xgboost/models/word2vec_train_single.model') | ||
| 5 | |||
| 3 | 6 | ||
| 4 | def simple_word2vec(text): | 7 | def simple_word2vec(text): |
| 5 | clean_text = text.strip() | 8 | clean_text = text.strip() |
| ... | @@ -40,4 +43,27 @@ def simple_word2vec(text): | ... | @@ -40,4 +43,27 @@ def simple_word2vec(text): |
| 40 | # print(other_num) | 43 | # print(other_num) |
| 41 | # print('-------------') | 44 | # print('-------------') |
| 42 | 45 | ||
| 43 | return vec | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 46 | return vec | ||
| 47 | |||
def jwq_word2vec(text, text_vec_max_lens=1500):
    """Encode *text* as a flat, fixed-length character-embedding vector.

    Each character of the stripped input is looked up in the pre-loaded
    module-level ``word2vec_model``; characters missing from the
    vocabulary fall back to the 'unk' embedding.  The concatenated
    per-character vectors are truncated or zero-padded so the result has
    exactly ``text_vec_max_lens`` entries.

    Args:
        text: input string; surrounding whitespace is ignored.
        text_vec_max_lens: fixed output length (floats). Default 1500
            matches the caller's 15 chars * 50-dim embedding budget.

    Returns:
        numpy.ndarray of dtype float64 with length ``text_vec_max_lens``.
    """
    clean_text = text.strip()

    sentence_vec = list()
    for char in clean_text:
        # Only a missing-vocabulary lookup should be handled here; a bare
        # except would also hide real errors (e.g. a broken model object).
        try:
            word_vec = word2vec_model.wv[char]
        except KeyError:
            # Out-of-vocabulary character: use the unknown-token embedding.
            word_vec = word2vec_model.wv['unk']
        sentence_vec.extend(word_vec)

    if len(sentence_vec) > text_vec_max_lens:
        # Too long: keep only the leading text_vec_max_lens values.
        sentence_vec = sentence_vec[:text_vec_max_lens]
    else:
        # Too short: zero-pad to the fixed length in one step.
        sentence_vec.extend([0.] * (text_vec_max_lens - len(sentence_vec)))

    # np.float64 applied to a sequence yields an ndarray of float64.
    return np.float64(sentence_vec)
-
Please register or sign in to post a comment