# Word_2_Vector.py
import os
from os.path import exists, join, split

import numpy as np
from gensim.models import word2vec

import Data_Preprocess


def train_word2vec(sentence_matrix, vocabulary_inv,
                   num_features=300, min_word_count=1, context=10):
    """
    Train a Word2Vec model, or load a previously saved one, and return the
    initial weights for an embedding layer.

    Inputs:
    sentence_matrix  # int matrix: num_sentences x max_sentence_len
    vocabulary_inv   # maps word index -> word (list of words or dict {int: str})
    num_features     # word vector dimensionality
    min_word_count   # minimum word count for a word to be kept
    context          # context window size
    """
    model_dir = 'word2vec_models'
    model_name = "{0}features_{1}minwords_{2}context".format(
        num_features, min_word_count, context)
    model_name = join(model_dir, model_name)

    if exists(model_name):
        print("Loading existing Word2Vec model '%s'" % split(model_name)[-1])
        embedding_model = word2vec.Word2Vec.load(model_name)
    else:
        # Training hyper-parameters
        num_workers = 2      # number of worker threads to run in parallel
        downsampling = 1e-3  # downsample setting for frequent words

        # Initialize and train the model
        print("Training Word2Vec model...")
        sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
        embedding_model = word2vec.Word2Vec(sentences,
                                            workers=num_workers,
                                            vector_size=num_features,  # `size=` in gensim < 4.0
                                            min_count=min_word_count,
                                            window=context,
                                            sample=downsampling)
        # init_sims(replace=True) used to make the model more memory-efficient
        # here, but it is deprecated in gensim 4; unit-normalized vectors are
        # requested at lookup time below instead.

        # Save the model for later use; it can be reloaded with Word2Vec.load().
        os.makedirs(model_dir, exist_ok=True)
        print("Saving Word2Vec model '%s'" % split(model_name)[-1])
        embedding_model.save(model_name)
    # Build the embedding matrix; words the Word2Vec model never saw (e.g.
    # ones filtered out by min_word_count) get small random vectors instead.
    wv = embedding_model.wv  # gensim 4: vectors live on .wv, not on the model
    embedding_weights = [np.array([wv.get_vector(w, norm=True) if w in wv
                                   else np.random.uniform(-0.25, 0.25, wv.vector_size)
                                   for w in vocabulary_inv])]
    return embedding_weights
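

# A minimal usage sketch, not part of this module: the one-element list that
# train_word2vec returns matches the Keras-style `weights` argument of an
# Embedding layer. The import path and `sequence_length` are assumptions for
# illustration, not taken from this repository.
#
#   from tensorflow.keras.layers import Embedding
#
#   weights = train_word2vec(X, vocabulary_inv)
#   embedding_layer = Embedding(input_dim=weights[0].shape[0],   # vocabulary size
#                               output_dim=weights[0].shape[1],  # num_features
#                               weights=weights,                 # hypothetical wiring
#                               input_length=sequence_length)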


if __name__ == '__main__':
    X, Y, vocabulary_inv, vocabulary = Data_Preprocess.load_data()
    word_embeddings = train_word2vec(X, vocabulary_inv)
    print(len(word_embeddings[0]))  # number of embedding rows = vocabulary size
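
    # Optional sanity check, a sketch assuming the model saved above with the
    # default parameters: reload it and print the nearest neighbours of the
    # first vocabulary word. most_similar is a standard KeyedVectors method.
    model_path = join('word2vec_models', '300features_1minwords_10context')
    if exists(model_path):
        wv = word2vec.Word2Vec.load(model_path).wv
        probe = vocabulary_inv[0]
        if probe in wv:
            print(probe, '->', wv.most_similar(probe, topn=5))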