add jwq word2vec
Showing
2 changed files
with
41 additions
and
7 deletions
... | @@ -7,7 +7,7 @@ import uuid | ... | @@ -7,7 +7,7 @@ import uuid |
7 | import cv2 | 7 | import cv2 |
8 | import pandas as pd | 8 | import pandas as pd |
9 | from tools import get_file_paths, load_json | 9 | from tools import get_file_paths, load_json |
10 | from word2vec import simple_word2vec | 10 | from word2vec import simple_word2vec, jwq_word2vec |
11 | 11 | ||
12 | 12 | ||
13 | def clean_go_res(go_res_dir): | 13 | def clean_go_res(go_res_dir): |
... | @@ -187,28 +187,34 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save | ... | @@ -187,28 +187,34 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save |
187 | 187 | ||
188 | X = list() | 188 | X = list() |
189 | y_true = list() | 189 | y_true = list() |
190 | |||
191 | text_vec_max_lens = 15 * 50 | ||
192 | dim = 1 + 5 + 8 + text_vec_max_lens | ||
193 | num_classes = 10 | ||
190 | for i in range(160): | 194 | for i in range(160): |
191 | if i >= valid_lens: | 195 | if i >= valid_lens: |
192 | X.append([0. for _ in range(14)]) | 196 | X.append([0. for _ in range(dim)]) |
193 | y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) | 197 | y_true.append([0 for _ in range(num_classes)]) |
194 | 198 | ||
195 | elif i in top_text_idx_set: | 199 | elif i in top_text_idx_set: |
196 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] | 200 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] |
197 | feature_vec = [1.] | 201 | feature_vec = [1.] |
198 | feature_vec.extend(simple_word2vec(text)) | 202 | feature_vec.extend(simple_word2vec(text)) |
199 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) | 203 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) |
204 | feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) | ||
200 | X.append(feature_vec) | 205 | X.append(feature_vec) |
201 | 206 | ||
202 | y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) | 207 | y_true.append([0 for _ in range(num_classes)]) |
203 | 208 | ||
204 | elif i in label_idx_dict: | 209 | elif i in label_idx_dict: |
205 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] | 210 | (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] |
206 | feature_vec = [0.] | 211 | feature_vec = [0.] |
207 | feature_vec.extend(simple_word2vec(text)) | 212 | feature_vec.extend(simple_word2vec(text)) |
208 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) | 213 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) |
214 | feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) | ||
209 | X.append(feature_vec) | 215 | X.append(feature_vec) |
210 | 216 | ||
211 | base_label_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | 217 | base_label_list = [0 for _ in range(num_classes)] |
212 | base_label_list[label_idx_dict[i]] = 1 | 218 | base_label_list[label_idx_dict[i]] = 1 |
213 | y_true.append(base_label_list) | 219 | y_true.append(base_label_list) |
214 | else: | 220 | else: |
... | @@ -216,9 +222,10 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save | ... | @@ -216,9 +222,10 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save |
216 | feature_vec = [0.] | 222 | feature_vec = [0.] |
217 | feature_vec.extend(simple_word2vec(text)) | 223 | feature_vec.extend(simple_word2vec(text)) |
218 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) | 224 | feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) |
225 | feature_vec.extend(jwq_word2vec(text, text_vec_max_lens)) | ||
219 | X.append(feature_vec) | 226 | X.append(feature_vec) |
220 | 227 | ||
221 | y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) | 228 | y_true.append([0 for _ in range(num_classes)]) |
222 | 229 | ||
223 | all_data = [X, y_true, valid_lens] | 230 | all_data = [X, y_true, valid_lens] |
224 | 231 | ||
... | @@ -328,5 +335,6 @@ if __name__ == '__main__': | ... | @@ -328,5 +335,6 @@ if __name__ == '__main__': |
328 | build_anno_file(valid_dataset_dir, valid_anno_file_path) | 335 | build_anno_file(valid_dataset_dir, valid_anno_file_path) |
329 | 336 | ||
330 | # print(simple_word2vec(' fd2jk接口 额24;叁‘,。测ADF壹试!¥? ')) | 337 | # print(simple_word2vec(' fd2jk接口 额24;叁‘,。测ADF壹试!¥? ')) |
338 | # print(jwq_word2vec('发', 15*50)) | ||
331 | 339 | ||
332 | 340 | ... | ... |
1 | import re | 1 | import re |
2 | # from gensim.models import word2vec | 2 | import numpy as np |
3 | from gensim.models import word2vec | ||
4 | word2vec_model = word2vec.Word2Vec.load('/Users/zhouweiqi/Downloads/xgboost/models/word2vec_train_single.model') | ||
5 | |||
3 | 6 | ||
4 | def simple_word2vec(text): | 7 | def simple_word2vec(text): |
5 | clean_text = text.strip() | 8 | clean_text = text.strip() |
... | @@ -41,3 +44,26 @@ def simple_word2vec(text): | ... | @@ -41,3 +44,26 @@ def simple_word2vec(text): |
41 | # print('-------------') | 44 | # print('-------------') |
42 | 45 | ||
43 | return vec | 46 | return vec |
47 | |||
48 | def jwq_word2vec(text, text_vec_max_lens=1500): | ||
49 | clean_text = text.strip() | ||
50 | |||
51 | sentence_vec = list() | ||
52 | for char in clean_text: | ||
53 | try: | ||
54 | word_vec = word2vec_model.wv[char] | ||
55 | sentence_vec.extend(word_vec) | ||
56 | except: | ||
57 | word_vec = word2vec_model.wv['unk'] | ||
58 | sentence_vec.extend(word_vec) | ||
59 | |||
60 | if len(sentence_vec) > text_vec_max_lens: | ||
61 | sentence_vec = sentence_vec[:text_vec_max_lens] | ||
62 | else: | ||
63 | padding_number = text_vec_max_lens - len(sentence_vec) | ||
64 | for _ in range(padding_number): | ||
65 | sentence_vec.append(0.) | ||
66 | |||
67 | sentence_vec = np.float64(sentence_vec) | ||
68 | # print(type(sentence_vec)) | ||
69 | return sentence_vec | ... | ... |
-
Please register or sign in to post a comment