890ea78a by 周伟奇

add jwq word2vec

1 parent c919b68e
......@@ -7,7 +7,7 @@ import uuid
import cv2
import pandas as pd
from tools import get_file_paths, load_json
from word2vec import simple_word2vec
from word2vec import simple_word2vec, jwq_word2vec
def clean_go_res(go_res_dir):
......@@ -187,28 +187,34 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
X = list()
y_true = list()
text_vec_max_lens = 15 * 50
dim = 1 + 5 + 8 + text_vec_max_lens
num_classes = 10
for i in range(160):
if i >= valid_lens:
X.append([0. for _ in range(14)])
y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
X.append([0. for _ in range(dim)])
y_true.append([0 for _ in range(num_classes)])
elif i in top_text_idx_set:
(x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
feature_vec = [1.]
feature_vec.extend(simple_word2vec(text))
feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
X.append(feature_vec)
y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
y_true.append([0 for _ in range(num_classes)])
elif i in label_idx_dict:
(x0, y0, x1, y1, x2, y2, x3, y3), text = go_res_list[i]
feature_vec = [0.]
feature_vec.extend(simple_word2vec(text))
feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
X.append(feature_vec)
base_label_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
base_label_list = [0 for _ in range(num_classes)]
base_label_list[label_idx_dict[i]] = 1
y_true.append(base_label_list)
else:
......@@ -216,9 +222,10 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
feature_vec = [0.]
feature_vec.extend(simple_word2vec(text))
feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
X.append(feature_vec)
y_true.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
y_true.append([0 for _ in range(num_classes)])
all_data = [X, y_true, valid_lens]
......@@ -328,5 +335,6 @@ if __name__ == '__main__':
build_anno_file(valid_dataset_dir, valid_anno_file_path)
# print(simple_word2vec(' fd2jk接口 额24;叁‘,。测ADF壹试!¥? '))
# print(jwq_word2vec('发', 15*50))
......
import re
# from gensim.models import word2vec
import numpy as np
from gensim.models import word2vec
# Shared Word2Vec model used by jwq_word2vec for per-character embedding lookups.
# NOTE(review): this loads at import time from a hard-coded, machine-specific
# absolute path, so importing this module fails on any machine where the file
# is absent -- consider making the path configurable.
word2vec_model = word2vec.Word2Vec.load('/Users/zhouweiqi/Downloads/xgboost/models/word2vec_train_single.model')
def simple_word2vec(text):
clean_text = text.strip()
......@@ -40,4 +43,27 @@ def simple_word2vec(text):
# print(other_num)
# print('-------------')
return vec
\ No newline at end of file
return vec
def jwq_word2vec(text, text_vec_max_lens=1500):
    """Encode *text* as a fixed-length vector of concatenated character embeddings.

    The text is stripped of surrounding whitespace, then each remaining
    character is looked up in the module-level ``word2vec_model``. Characters
    missing from the model's vocabulary are replaced by the embedding of the
    ``'unk'`` token. The per-character vectors are concatenated in order and
    the result is truncated or right-padded with ``0.`` to exactly
    ``text_vec_max_lens`` values.

    Args:
        text: Input string to encode.
        text_vec_max_lens: Length of the returned vector.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype ``float64`` with
        ``text_vec_max_lens`` elements.

    Raises:
        KeyError: If a character is out of vocabulary and the model has no
            ``'unk'`` entry either.
    """
    sentence_vec = list()
    for char in text.strip():
        try:
            word_vec = word2vec_model.wv[char]
        except KeyError:
            # Only an out-of-vocabulary lookup should trigger the fallback;
            # any other error (or Ctrl-C) must propagate instead of being
            # silently mapped to 'unk'.
            word_vec = word2vec_model.wv['unk']
        sentence_vec.extend(word_vec)

    if len(sentence_vec) > text_vec_max_lens:
        sentence_vec = sentence_vec[:text_vec_max_lens]
    else:
        # Right-pad with zeros up to the requested fixed length.
        sentence_vec.extend([0.] * (text_vec_max_lens - len(sentence_vec)))

    return np.asarray(sentence_vec, dtype=np.float64)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!