Project: 周伟奇 / test_on_pytorch

Commit e573fab0
Authored by 周伟奇, 2022-12-23 11:11:02 +0800
1 parent f5b96f19

    add tencent word2vec

Showing 2 changed files with 68 additions and 6 deletions:
  data/create_dataset2.py
  data/word2vec.py

data/create_dataset2.py  (view file @ e573fab)
@@ -5,11 +5,13 @@ import random
 import uuid
 import cv2
+import re
 import pandas as pd
 import numpy as np
+import jieba
 from shapely.geometry import Polygon, MultiPoint
 from tools import get_file_paths, load_json
-from word2vec import jwq_word2vec, simple_word2vec
+from word2vec import jwq_word2vec, simple_word2vec, jieba_and_tencent_word2vec


 def bbox_iou(go_bbox, label_bbox, mode='iou'):
     # Minimal convex representation of all the points: a quadrilateral object whose
     # four vertices are computed automatically, in the final order
     # top-left, bottom-left, bottom-right, top-right, top-left
@@ -48,8 +50,6 @@ def bbox_iou(go_bbox, label_bbox, mode='iou'):
-    # else:
-    #     return inter / union


 def clean_go_res(go_res_dir):
     go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
     for go_res_json_path in go_res_json_paths:
@@ -85,6 +85,28 @@ def char_length_statistics(go_res_dir):
             target_file_name = go_res_json_path
     return max_char_length, target_file_name


+def char_length_statistics_jieba(go_res_dir):
+    max_char_length = None
+    target_file_name = None
+    target_jieba_char_list = None
+    statistics_dict = {}
+    go_res_json_paths = get_file_paths(go_res_dir, ['.json', ])
+    for go_res_json_path in go_res_json_paths:
+        print('Info: start {0}'.format(go_res_json_path))
+        src_go_res_list = load_json(go_res_json_path)
+        for _, text in src_go_res_list:
+            # keep only the CJK tokens of the jieba segmentation
+            jieba_char_list = list(filter(lambda x: re.match(r'[\u4e00-\u9fa5]', x), jieba.lcut(text.strip())))
+            length = len(jieba_char_list)
+            if length in statistics_dict:
+                statistics_dict[length] += 1
+            else:
+                statistics_dict[length] = 1
+            if max_char_length is None or length > max_char_length:
+                target_file_name = go_res_json_path
+                target_jieba_char_list = jieba_char_list
+                max_char_length = length
+    return max_char_length, target_file_name, target_jieba_char_list, statistics_dict


 def bbox_statistics(go_res_dir):
     max_seq_count = None
     seq_sum = 0
@@ -223,9 +245,15 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
     X = list()
     y_true = list()
     # dim = 1 + 5 + 8
     # text_vec_max_lens = 15 * 50
     # dim = 1 + 5 + 8 + text_vec_max_lens
-    dim = 1 + 5 + 8
+    max_jieba_char = 8
+    text_vec_max_lens = max_jieba_char * 100
+    dim = 1 + 5 + 8 + text_vec_max_lens
     num_classes = 10
     for i in range(160):
         if i >= valid_lens:
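For reference, the new row dimension works out to 814, which matches the renamed dataset directory dataset160x814 (160 boxes per page, 814 features per box). A minimal sketch of the arithmetic; the meaning of the 1 + 5 + 8 slots is inferred from the surrounding code and is an assumption, not something this diff states:

    # Assumed slot layout (inferred, not confirmed by the diff):
    #   1   -> per-box flag (set elsewhere, not shown in this hunk)
    #   5   -> values from simple_word2vec(text)
    #   8   -> x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h
    #   800 -> jieba_and_tencent_word2vec: 8 jieba tokens x 100-d Tencent vectors
    max_jieba_char = 8
    text_vec_max_lens = max_jieba_char * 100   # 800
    dim = 1 + 5 + 8 + text_vec_max_lens        # 814, hence 'dataset160x814'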
@@ -238,6 +266,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             feature_vec.extend(simple_word2vec(text))
             feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
             # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+            feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
             X.append(feature_vec)
             y_true.append([0 for _ in range(num_classes)])
@@ -248,6 +277,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             feature_vec.extend(simple_word2vec(text))
             feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
             # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+            feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
             X.append(feature_vec)
             base_label_list = [0 for _ in range(num_classes)]
@@ -259,6 +289,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
             feature_vec.extend(simple_word2vec(text))
             feature_vec.extend([x0/w, y0/h, x1/w, y1/h, x2/w, y2/h, x3/w, y3/h])
             # feature_vec.extend(jwq_word2vec(text, text_vec_max_lens))
+            feature_vec.extend(jieba_and_tencent_word2vec(text, max_jieba_char))
             X.append(feature_vec)
             y_true.append([0 for _ in range(num_classes)])
@@ -276,6 +307,8 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
                 'find_value': {go_res_list[k][-1]: group_cn_list[v] for k, v in label_idx_dict.items()}
             }
             # break
+            # print(create_map)
+            # print(is_create_map)
             if create_map:
@@ -300,7 +333,7 @@ def build_dataset(img_dir, go_res_dir, label_dir, top_text_list, skip_list, save
 if __name__ == '__main__':
     base_dir = '/Users/zhouweiqi/Downloads/gcfp/data'
     go_dir = os.path.join(base_dir, 'go_res')
-    dataset_save_dir = os.path.join(base_dir, 'dataset160x14-pro-all-valid')
+    dataset_save_dir = os.path.join(base_dir, 'dataset160x814')
     label_dir = os.path.join(base_dir, 'labeled')
     train_go_path = os.path.join(go_dir, 'train')
@@ -324,6 +357,12 @@ if __name__ == '__main__':
     # print(max_char_lens)  # 72
     # print(target_file_name)  # train/CH-B103053828-4.json
+    # max_char_length, target_file_name, target_jieba_char_list, statistics_dict = char_length_statistics_jieba(go_dir)
+    # print(max_char_length)  # 24
+    # print(target_file_name)  # train/CH-B102551568-6.json
+    # print(target_jieba_char_list)
+    # print(statistics_dict)  # {2: 12077, 1: 12751, 0: 13073, 3: 4423, 4: 1212, 5: 969, 6: 744, 7: 524, 8: 199, 10: 45, 12: 9, 18: 44, 9: 109, 11: 19, 13: 4, 16: 4, 21: 2, 19: 2, 15: 8, 17: 7, 14: 3, 20: 1, 24: 1}
+
     # top_text_list = text_statistics(go_dir)
     # for t in top_text_list:
     #     print(t)
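The statistics_dict in the comment above is what motivates max_jieba_char = 8 in build_dataset. A small sketch that recomputes the coverage from those numbers (snippets of 8 or fewer jieba tokens account for roughly 99.4% of the corpus):

    statistics_dict = {0: 13073, 1: 12751, 2: 12077, 3: 4423, 4: 1212, 5: 969,
                       6: 744, 7: 524, 8: 199, 9: 109, 10: 45, 11: 19, 12: 9,
                       13: 4, 14: 3, 15: 8, 16: 4, 17: 7, 18: 44, 19: 2,
                       20: 1, 21: 2, 24: 1}
    total = sum(statistics_dict.values())
    covered = sum(n for length, n in statistics_dict.items() if length <= 8)
    print(covered / total)  # ~0.994: an 8-token cutoff loses almost nothing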
data/word2vec.py  (view file @ e573fab)
 import re
 import numpy as np
-from gensim.models import word2vec
+import jieba
+from gensim.models import word2vec, KeyedVectors

 word2vec_model = word2vec.Word2Vec.load('/Users/zhouweiqi/Downloads/xgboost/models/word2vec_train_single.model')
+wv_from_text = KeyedVectors.load_word2vec_format('/Users/zhouweiqi/Downloads/tencent-ailab-embedding-zh-d100-v0.2.0-s/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt', binary=False)
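The padding logic in the new function below hard-codes 100 dimensions per token, which relies on this being the d100 variant of the Tencent AI Lab embeddings. A one-line sanity check against the loaded vectors, as a hedge:

    assert wv_from_text.vector_size == 100  # d100 variant: 100-d token vectors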
 def simple_word2vec(text):
@@ -67,3 +69,24 @@ def jwq_word2vec(text, text_vec_max_lens=1500):
     sentence_vec = np.float64(sentence_vec)
     # print(type(sentence_vec))
     return sentence_vec
+
+
+def jieba_and_tencent_word2vec(text, max_jieba_char):
+    done_char_count = 0
+    sentence_vec = []
+    # keep only the CJK tokens of the jieba segmentation
+    for cn_char in filter(lambda x: re.match(r'[\u4e00-\u9fa5]', x.strip()), jieba.lcut(text.strip())):
+        if done_char_count >= max_jieba_char:
+            break
+        try:
+            vec = wv_from_text.word_vec(cn_char.strip())
+        except Exception:
+            # token missing from the Tencent vocabulary: skip it
+            pass
+        else:
+            sentence_vec = np.append(sentence_vec, vec)
+            done_char_count += 1
+    if done_char_count < max_jieba_char:
+        # zero-pad so the result is always max_jieba_char * 100 values long
+        sentence_vec = np.append(sentence_vec, np.zeros(((max_jieba_char - done_char_count) * 100, ), dtype=np.float32))
+    return sentence_vec
\ No newline at end of file
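A minimal usage sketch of the new function, assuming both model files above are available; the sample string here is arbitrary. Whatever the input, the result is a flat array of max_jieba_char * 100 values, with the found token vectors first and zero padding after:

    from word2vec import jieba_and_tencent_word2vec

    vec = jieba_and_tencent_word2vec('增值税电子普通发票', max_jieba_char=8)
    print(vec.shape)  # (800,): 8 tokens x 100 dims, zero-padded when fewer tokens match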