890ea78a by 周伟奇

add jwg word2vec

1 parent c919b68e
...@@ -7,7 +7,7 @@ import uuid ...@@ -7,7 +7,7 @@ import uuid
7 import cv2 7 import cv2
8 import pandas as pd 8 import pandas as pd
9 from tools import get_file_paths, load_json 9 from tools import get_file_paths, load_json
10 from word2vec import simple_word2vec 10 from word2vec import simple_word2vec, jwq_word2vec
11 11
12 12
13 def clean_go_res(go_res_dir): 13 def clean_go_res(go_res_dir):
...@@ -187,28 +187,34 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save ...@@ -187,28 +187,34 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
187 187
188 X = list() 188 X = list()
189 y_true = list() 189 y_true = list()
190
191 text_vec_max_lens = 15 * 50
192 dim = 1 + 5 + 8 + text_vec_max_lens
193 num_classes = 10
190 for i in range(160): 194 for i in range(160):
191 if i >= valid_lens: 195 if i >= valid_lens:
192 X.append([0. for _ in range(14)]) 196 X.append([0. for _ in range(dim)])
193 y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) 197 y_true.append([0 for _ in range(num_classes)])
194 198
195 elif i in top_text_idx_set: 199 elif i in top_text_idx_set:
196 (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] 200 (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
197 feature_vec = [1.] 201 feature_vec = [1.]
198 feature_vec.extend(simple_word2vec(text)) 202 feature_vec.extend(simple_word2vec(text))
199 feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) 203 feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
204 feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
200 X.append(feature_vec) 205 X.append(feature_vec)
201 206
202 y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) 207 y_true.append([0 for _ in range(num_classes)])
203 208
204 elif i in label_idx_dict: 209 elif i in label_idx_dict:
205 (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i] 210 (x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
206 feature_vec = [0.] 211 feature_vec = [0.]
207 feature_vec.extend(simple_word2vec(text)) 212 feature_vec.extend(simple_word2vec(text))
208 feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) 213 feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
214 feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
209 X.append(feature_vec) 215 X.append(feature_vec)
210 216
211 base_label_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 217 base_label_list = [0 for _ in range(num_classes)]
212 base_label_list[label_idx_dict[i]] = 1 218 base_label_list[label_idx_dict[i]] = 1
213 y_true.append(base_label_list) 219 y_true.append(base_label_list)
214 else: 220 else:
...@@ -216,9 +222,10 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save ...@@ -216,9 +222,10 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
216 feature_vec = [0.] 222 feature_vec = [0.]
217 feature_vec.extend(simple_word2vec(text)) 223 feature_vec.extend(simple_word2vec(text))
218 feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h]) 224 feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
225 feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
219 X.append(feature_vec) 226 X.append(feature_vec)
220 227
221 y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) 228 y_true.append([0 for _ in range(num_classes)])
222 229
223 all_data = [X, y_true, valid_lens] 230 all_data = [X, y_true, valid_lens]
224 231
...@@ -328,5 +335,6 @@ if __name__ == '__main__': ...@@ -328,5 +335,6 @@ if __name__ == '__main__':
328 build_anno_file(valid_dataset_dir, valid_anno_file_path) 335 build_anno_file(valid_dataset_dir, valid_anno_file_path)
329 336
330 # print(simple_word2vec(' fd2jk接口 额24;叁‘,。测ADF壹试!¥? ')) 337 # print(simple_word2vec(' fd2jk接口 额24;叁‘,。测ADF壹试!¥? '))
338 # print(jwq_word2vec('发', 15*50))
331 339
332 340
......
1 import re 1 import re
2 # from gensim.models import word2vec 2 import numpy as np
3 from gensim.models import word2vec
4 word2vec_model = word2vec.Word2Vec.load('/Users/zhouweiqi/Downloads/xgboost/models/word2vec_train_single.model')
5
3 6
4 def simple_word2vec(text): 7 def simple_word2vec(text):
5 clean_text = text.strip() 8 clean_text = text.strip()
...@@ -41,3 +44,26 @@ def simple_word2vec(text): ...@@ -41,3 +44,26 @@ def simple_word2vec(text):
41 # print('-------------') 44 # print('-------------')
42 45
43 return vec 46 return vec
47
def jwq_word2vec(text, text_vec_max_lens=1500):
    """Encode *text* as a flat, fixed-length word2vec feature vector.

    Each character of the stripped input is looked up in the module-level
    ``word2vec_model``; characters missing from the vocabulary fall back to
    the model's ``'unk'`` embedding.  The concatenated per-character vectors
    are truncated or zero-padded to exactly ``text_vec_max_lens`` floats.

    Args:
        text: input string; surrounding whitespace is stripped first.
        text_vec_max_lens: target length of the returned vector.

    Returns:
        ``numpy.ndarray`` of dtype float64 with length ``text_vec_max_lens``.
    """
    clean_text = text.strip()

    sentence_vec = list()
    for char in clean_text:
        try:
            word_vec = word2vec_model.wv[char]
        except KeyError:
            # Out-of-vocabulary character -> shared 'unk' embedding.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and masked unrelated errors.)
            word_vec = word2vec_model.wv['unk']
        sentence_vec.extend(word_vec)

    if len(sentence_vec) > text_vec_max_lens:
        sentence_vec = sentence_vec[:text_vec_max_lens]
    else:
        # Zero-pad up to the fixed feature length.
        padding_number = text_vec_max_lens - len(sentence_vec)
        sentence_vec.extend([0.] * padding_number)

    # `np.float64(list)` relies on deprecated scalar-type array construction;
    # build the float64 array explicitly instead (same result, supported API).
    return np.array(sentence_vec, dtype=np.float64)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!