e573fab0 by 周伟奇

add tencent word2vec
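Jieba-segment each OCR text and append Tencent AILab 100-dim word embeddings for up to 8 tokens (zero-padded), on top of the existing simple_word2vec and normalized bbox features, growing the per-box feature vector from 14 to 814 dims. Also adds char_length_statistics_jieba to measure the jieba token-count distribution that motivated max_jieba_char = 8.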

1 parent f5b96f19
@@ -5,11 +5,13 @@ import random
 import uuid
 
 import cv2
+import re
 import pandas as pd
 import numpy as np
+import jieba
 from shapely.geometry import Polygon, MultiPoint
 from tools import get_file_paths, load_json
-from word2vec import jwq_word2vec, simple_word2vec
+from word2vec import jwq_word2vec, simple_word2vec, jieba_and_tencent_word2vec
 
 def bbox_iou(go_bbox, label_bbox, mode='iou'):
     # Minimal convex hull of all the points, as a quadrilateral object; the four corner points are computed automatically, in the order: top-left, bottom-left, bottom-right, top-right, top-left
@@ -48,8 +50,6 @@ def bbox_iou(go_bbox, label_bbox, mode='iou'):
     # else:
     #     return inter / union
 
-
-
 def clean_go_res(go_res_dir):
     go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
     for go_res_json_path in go_res_json_paths:
@@ -85,6 +85,28 @@ def char_length_statistics(go_res_dir):
                 target_file_name = go_res_json_path
     return max_char_length, target_file_name
 
+def char_length_statistics_jieba(go_res_dir):
+    max_char_length = None
+    target_file_name = None
+    target_jieba_char_list = None
+    statistics_dict = {}
+    go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
+    for go_res_json_path in go_res_json_paths:
+        print('Info: start {0}'.format(go_res_json_path))
+        src_go_res_list = load_json(go_res_json_path)
+        for _, text in src_go_res_list:
+            jieba_char_list = list(filter(lambda x: re.match(r'[\u4e00-\u9fa5]', x), jieba.lcut(text.strip())))
+            length = len(jieba_char_list)
+            if length in statistics_dict:
+                statistics_dict[length] += 1
+            else:
+                statistics_dict[length] = 1
+            if max_char_length is None or length > max_char_length:
+                target_file_name = go_res_json_path
+                target_jieba_char_list = jieba_char_list
+                max_char_length = length
+    return max_char_length, target_file_name, target_jieba_char_list, statistics_dict
+
 def bbox_statistics(go_res_dir):
     max_seq_count = None
     seq_sum = 0
@@ -223,9 +245,15 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
     X = list()
     y_true = list()
 
+    # dim = 1 + 5 + 8
+
     # text_vec_max_lens = 15 * 50
     # dim = 1 + 5 + 8 + text_vec_max_lens
-    dim = 1 + 5 + 8
+
+    max_jieba_char = 8
+    text_vec_max_lens = max_jieba_char * 100
+    dim = 1 + 5 + 8 + text_vec_max_lens
+
     num_classes = 10
     for i in range(160):
         if i >= valid_lens:
@@ -238,6 +266,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             feature_vec.extend(simple_word2vec(text))
             feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
             # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+            feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
             X.append(feature_vec)
 
             y_true.append([0 for _ in range(num_classes)])
@@ -248,6 +277,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             feature_vec.extend(simple_word2vec(text))
             feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
             # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+            feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
             X.append(feature_vec)
 
             base_label_list = [0 for _ in range(num_classes)]
@@ -259,6 +289,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             feature_vec.extend(simple_word2vec(text))
             feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
             # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+            feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
             X.append(feature_vec)
 
             y_true.append([0 for _ in range(num_classes)])
@@ -276,6 +307,8 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             'find_value': {go_res_list[k][-1]: group_cn_list[v] for k, v in label_idx_dict.items()}
         }
 
+        # break
+
         # print(create_map)
         # print(is_create_map)
         if create_map:
@@ -300,7 +333,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
 if __name__ == '__main__':
     base_dir = '/Users/zhouweiqi/Downloads/gcfp/data'
     go_dir = os.path.join(base_dir, 'go_res')
-    dataset_save_dir = os.path.join(base_dir, 'dataset160x14-pro-all-valid')
+    dataset_save_dir = os.path.join(base_dir, 'dataset160x814')
     label_dir = os.path.join(base_dir, 'labeled')
 
     train_go_path = os.path.join(go_dir, 'train')
@@ -324,6 +357,12 @@ if __name__ == '__main__':
     # print(max_char_lens)  # 72
     # print(target_file_name)  # train/CH-B103053828-4.json
 
+    # max_char_length, target_file_name, target_jieba_char_list, statistics_dict = char_length_statistics_jieba(go_dir)
+    # print(max_char_length)  # 24
+    # print(target_file_name)  # train/CH-B102551568-6.json
+    # print(target_jieba_char_list)
+    # print(statistics_dict)  # {2: 12077, 1: 12751, 0: 13073, 3: 4423, 4: 1212, 5: 969, 6: 744, 7: 524, 8: 199, 10: 45, 12: 9, 18: 44, 9: 109, 11: 19, 13: 4, 16: 4, 21: 2, 19: 2, 15: 8, 17: 7, 14: 3, 20: 1, 24: 1}
+
     # top_text_list = text_statistics(go_dir)
     # for t in top_text_list:
     #     print(t)
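The commented-out statistics_dict output above is what motivates max_jieba_char = 8: token counts fall off sharply past 8 (199 texts at 8 tokens, then 109, 45, ...). A quick sanity check of the resulting feature dimension follows; this is a sketch, not part of the commit, and the meaning of the 1 and 5 terms is inferred from the earlier dim = 1 + 5 + 8:

# Sketch: the feature-vector arithmetic implied by the diff above.
max_jieba_char = 8    # jieba tokens kept per text
tencent_dim = 100     # Tencent AILab embedding size (the d100 model)
dim = 1 + 5 + 8 + max_jieba_char * tencent_dim
# 1: per-box flag (assumption; not shown in this diff)
# 5: simple_word2vec(text) features (inferred from dim = 1 + 5 + 8)
# 8: normalized bbox coordinates x0/w ... y3/h
assert dim == 814     # hence the rename to dataset160x814 (160 boxes x 814 features)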
@@ -1,7 +1,9 @@
 import re
 import numpy as np
-from gensim.models import word2vec
+import jieba
+from gensim.models import word2vec, KeyedVectors
 word2vec_model = word2vec.Word2Vec.load('/Users/zhouweiqi/Downloads/xgboost/models/word2vec_train_single.model')
+wv_from_text = KeyedVectors.load_word2vec_format('/Users/zhouweiqi/Downloads/tencent-ailab-embedding-zh-d100-v0.2.0-s/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt', binary=False)
 
 
 def simple_word2vec(text):
@@ -67,3 +69,26 @@ def jwq_word2vec(text, text_vec_max_lens=1500):
     sentence_vec = np.float64(sentence_vec)
     # print(type(sentence_vec))
     return sentence_vec
+
+def jieba_and_tencent_word2vec(text, max_jieba_char):
+    done_char_count = 0
+    sentence_vec = []
+    for cn_char in filter(lambda x: re.match(r'[\u4e00-\u9fa5]', x.strip()), jieba.lcut(text.strip())):
+        if done_char_count >= max_jieba_char:
+            break
+
+        try:
+            vec = wv_from_text.word_vec(cn_char.strip())
+        except KeyError:
+            # token is not in the Tencent vocabulary; skip it
+            pass
+        else:
+            sentence_vec = np.append(sentence_vec, vec)
+            done_char_count += 1
+
+    if done_char_count < max_jieba_char:
+        # zero-pad so the output is always max_jieba_char * 100 floats
+        sentence_vec = np.append(sentence_vec, np.zeros(((max_jieba_char - done_char_count) * 100, ), dtype=np.float32))
+
+    return sentence_vec
\ No newline at end of file
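A minimal usage sketch of the new helper (not part of the commit; the sample text is invented and assumes its segmented tokens are in the Tencent vocabulary):

from word2vec import jieba_and_tencent_word2vec

vec = jieba_and_tencent_word2vec('货物贸易外汇业务', max_jieba_char=8)
print(vec.shape)  # (800,): always max_jieba_char * 100 floats, zero-padded
                  # for short texts, truncated after max_jieba_char tokens

Out-of-vocabulary tokens are silently skipped and replaced by padding, so an all-OOV text maps to the all-zero vector.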