e573fab0 by 周伟奇

add tencent word2vec

1 parent f5b96f19
......@@ -5,11 +5,13 @@ import random
import uuid
import cv2
import re
import pandas as pd
import numpy as np
import jieba
from shapely.geometry import Polygon, MultiPoint
from tools import get_file_paths, load_json
from word2vec import jwq_word2vec, simple_word2vec
from word2vec import jwq_word2vec, simple_word2vec, jieba_and_tencent_word2vec
def bbox_iou(go_bbox, label_bbox, mode='iou'):
# Minimal convex representation of all the points: a quadrilateral object whose four corner points are computed automatically, ordered top-left, bottom-left, bottom-right, top-right, back to top-left
......@@ -48,8 +50,6 @@ def bbox_iou(go_bbox, label_bbox, mode='iou'):
# else:
# return inter / union
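# Sketch of the convex-hull IoU described above (the function body is collapsed
# in this diff; assumes shapely as imported, boxes given as 8 flat coordinates,
# and a hypothetical helper name):
def bbox_iou_sketch(go_bbox, label_bbox):
    go_pts = np.array(go_bbox, dtype=np.float64).reshape(4, 2)
    label_pts = np.array(label_bbox, dtype=np.float64).reshape(4, 2)
    go_poly = Polygon(go_pts).convex_hull
    label_poly = Polygon(label_pts).convex_hull
    if not go_poly.intersects(label_poly):
        return 0.0
    inter = go_poly.intersection(label_poly).area
    # union taken as the convex hull of all eight points, per the comment above
    union = MultiPoint(np.vstack((go_pts, label_pts))).convex_hull.area
    return inter / union if union else 0.0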
def clean_go_res(go_res_dir):
go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
for go_res_json_path in go_res_json_paths:
......@@ -85,6 +85,28 @@ def char_length_statistics(go_res_dir):
target_file_name = go_res_json_path
return max_char_length, target_file_name
def char_length_statistics_jieba(go_res_dir):
max_char_length = None
target_file_name = None
target_jieba_char_list = None
statistics_dict = {}
go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
for go_res_json_path in go_res_json_paths:
print('Info: start {0}'.format(go_res_json_path))
src_go_res_list = load_json(go_res_json_path)
for _, text in src_go_res_list:
jieba_char_list = list(filter(lambda x:re.match(r'[\u4e00-\u9fa5]', x), jieba.lcut(text.strip())))
length = len(jieba_char_list)
if length in statistics_dict:
statistics_dict[length] += 1
else:
statistics_dict[length] = 1
if max_char_length is None or length > max_char_length:
target_file_name = go_res_json_path
target_jieba_char_list = jieba_char_list
max_char_length = length
return max_char_length, target_file_name, target_jieba_char_list, statistics_dict
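# A minimal sketch (helper name is hypothetical) of how the statistics_dict
# printed in __main__ below can motivate max_jieba_char = 8 in build_dataset:
def jieba_length_coverage(statistics_dict, max_jieba_char=8):
    total = sum(statistics_dict.values())
    covered = sum(count for length, count in statistics_dict.items() if length <= max_jieba_char)
    return covered / total if total else 0.0
# with the dict shown below: 45972 / 46230 ≈ 99.4% of texts have at most 8 Chinese tokens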
def bbox_statistics(go_res_dir):
max_seq_count = None
seq_sum = 0
......@@ -223,9 +245,15 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
X = list()
y_true = list()
# dim = 1 + 5 + 8
# text_vec_max_lens = 15 * 50
# dim = 1 + 5 + 8 + text_vec_max_lens
dim = 1 + 5 + 8
max_jieba_char = 8
text_vec_max_lens = max_jieba_char * 100
dim = 1 + 5 + 8 + text_vec_max_lens
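# 1 + 5 + 8 = 14 handcrafted dims plus max_jieba_char * 100 = 800 Tencent embedding dims, i.e. 814 features per box (matching dataset160x814 below)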
num_classes = 10
for i in range(160):
if i >= valid_lens:
......@@ -238,6 +266,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
feature_vec.extend(simple_word2vec(text))
feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
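# the 8 dims above: the four box corners normalized by the image width w and height h (per the variable names)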
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
X.append(feature_vec)
y_true.append([0 for _ in range(num_classes)])
......@@ -248,6 +277,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
feature_vec.extend(simple_word2vec(text))
feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
X.append(feature_vec)
base_label_list = [0 for _ in range(num_classes)]
......@@ -259,6 +289,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
feature_vec.extend(simple_word2vec(text))
feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
# feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
X.append(feature_vec)
y_true.append([0 for _ in range(num_classes)])
......@@ -276,6 +307,8 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
'find_value': {go_res_list[k][-1]: group_cn_list[v] for k, v in label_idx_dict.items()}
}
# break
# print(create_map)
# print(is_create_map)
if create_map:
......@@ -300,7 +333,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
if __name__ == '__main__':
base_dir = '/Users/zhouweiqi/Downloads/gcfp/data'
go_dir = os.path.join(base_dir, 'go_res')
dataset_save_dir = os.path.join(base_dir, 'dataset160x14-pro-all-valid')
dataset_save_dir = os.path.join(base_dir, 'dataset160x814')
label_dir = os.path.join(base_dir, 'labeled')
train_go_path = os.path.join(go_dir, 'train')
......@@ -324,6 +357,12 @@ if __name__ == '__main__':
# print(max_char_lens) # 72
# print(target_file_name) # train/CH-B103053828-4.json
# max_char_length, target_file_name, target_jieba_char_list, statistics_dict = char_length_statistics_jieba(go_dir)
# print(max_char_length) # 24
# print(target_file_name) # train/CH-B102551568-6.json
# print(target_jieba_char_list)
# print(statistics_dict) # {2: 12077, 1: 12751, 0: 13073, 3: 4423, 4: 1212, 5: 969, 6: 744, 7: 524, 8: 199, 10: 45, 12: 9, 18: 44, 9: 109, 11: 19, 13: 4, 16: 4, 21: 2, 19: 2, 15: 8, 17: 7, 14: 3, 20: 1, 24: 1}
# top_text_list = text_statistics(go_dir)
# for t in top_text_list:
# print(t)
......
import re
import numpy as np
from gensim.models import word2vec
import jieba
from gensim.models import word2vec, KeyedVectors
word2vec_model = word2vec.Word2Vec.load('/Users/zhouweiqi/Downloads/xgboost/models/word2vec_train_single.model')
wv_from_text = KeyedVectors.load_word2vec_format('/Users/zhouweiqi/Downloads/tencent-ailab-embedding-zh-d100-v0.2.0-s/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt', binary=False)
def simple_word2vec(text):
......@@ -67,3 +69,24 @@ def jwq_word2vec(text, text_vec_max_lens=1500):
sentence_vec = np.float64(sentence_vec)
# print(type(sentence_vec))
return sentence_vec
def jieba_and_tencent_word2vec(text, max_jieba_char):
done_char_count = 0
sentence_vec = []
for cn_char in filter(lambda x:re.match(r'[\u4e00-\u9fa5]', x.strip()), jieba.lcut(text.strip())):
if done_char_count >= max_jieba_char:
break
try:
vec = wv_from_text.word_vec(cn_char.strip())
except Exception:
# skip tokens missing from the Tencent embedding vocabulary
pass
else:
sentence_vec = np.append(sentence_vec, vec)
done_char_count += 1
if done_char_count < max_jieba_char:
sentence_vec = np.append(sentence_vec, np.zeros(((max_jieba_char-done_char_count)*100, ), dtype=np.float32))
return sentence_vec
\ No newline at end of file
......
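# Usage sketch for jieba_and_tencent_word2vec (the example string is arbitrary;
# assumes the Tencent embedding file loaded above is available):
if __name__ == '__main__':
    vec = jieba_and_tencent_word2vec('增值税专用发票', max_jieba_char=8)
    print(vec.shape)  # always (max_jieba_char * 100,), zero-padded for short or OOV texts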