from keras import backend as K
from keras.models import load_model
import keras
import tensorflow as tf
import numpy as np
import sklearn as sk
import sklearn.preprocessing
from sklearn import model_selection
import time

import helpers as HL
import glove_module as GM


def run_k_fold(models, X, Y, epochs, n_folds, patience):
    """
    Runs k-fold cross-validation on the neural net models given as parameter models.
    Input:
        models: A list of neural net model constructors defined in neural_nets.py
        X: Training set
        Y: Labels
        epochs: Max number of epochs to run each model in each fold
        n_folds: Number of folds for the k-fold algorithm
        patience: Patience for the early stopping callback used in the model fitting
    Output:
        model_scores: A list of [mean, std] pairs for each model run
        through the k-fold cross-validation.
    """
    # Need to configure the Keras session in order to keep results reproducible
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    session = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    K.set_session(session)
    model_scores = []
    for neural_model in models:
        model_name = neural_model.__name__
        input_dimensions = X.shape[1]
        kfold = model_selection.StratifiedKFold(n_splits=n_folds)
        cv_scores = []
        for train, test in kfold.split(X, Y):
            # Defining callbacks to be used during the fitting process
            early_stopping = early_stopping_callback(patience_=patience, verbose_=1)
            model_checkpoint = model_checkpoint_callback("best_neural_model_save.hdf5", verbose_=1)
            # Re-instantiate the model so every fold starts from fresh weights
            model = neural_model(input_dimensions)
            model.fit(
                X[train],
                Y[train],
                epochs=epochs,
                batch_size=1024,
                verbose=1,
                callbacks=[early_stopping, model_checkpoint],
                validation_data=(X[test], Y[test])
            )
            # Load the best model stored during fitting
            model = load_model('best_neural_model_save.hdf5')
            score = model.evaluate(X[test], Y[test], verbose=1)[1]
            cv_scores.append(score)
        model_mean = np.mean(cv_scores)
        model_std = np.std(cv_scores)
        print("Model: ", model_name)
        print("%.2f%% (+/- %.2f%%)" % (model_mean * 100, model_std * 100))
        model_scores.append([model_mean, model_std])
    return model_scores
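
# Example usage of run_k_fold -- a minimal sketch. It assumes that
# neural_nets.py defines a model constructor such as `build_simple_nn`
# (hypothetical name) that takes the input dimension and returns a compiled
# Keras model:
#
#     import neural_nets as NN
#     X = np.random.rand(1000, 40)           # document vectors
#     Y = np.random.randint(0, 2, 1000)      # binary labels
#     scores = run_k_fold([NN.build_simple_nn], X, Y,
#                         epochs=10, n_folds=5, patience=3)
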

def classify_with_neural_networks(neural_nets_functions, global_vectors, processed_corpus,
                                  total_training_tweets, nr_pos_tweets, epochs, n_folds, patience=3):
    """
    Runs k-fold and returns model scores for the neural nets given as a parameter.
    Input:
        neural_nets_functions: The neural net constructors to run
        global_vectors: Global vectors created from the gensim .txt files.
        processed_corpus: The pre-processed corpus
        total_training_tweets: (int) Number of tweets that are training tweets. Assumes that the first
            portion of the corpus is training tweets and the second part is the unseen test set.
        nr_pos_tweets: (int) Number of training tweets that are positive
        epochs: Max number of epochs for the neural model
        n_folds: Number of folds in the k-fold
        patience: Patience for the early stopping callback used in the model fitting
    Output:
        model_scores: A list of [mean, std] pairs for each model run
        through the k-fold cross-validation.
    """
    num_of_dim = global_vectors.syn0.shape[1]
    # Keep only the training portion of the corpus
    train_corpus = processed_corpus[:total_training_tweets]
    # Build a vector representation of all documents in the training corpus
    vectors = np.zeros(len(train_corpus), dtype=object)
    for i, doc in enumerate(train_corpus):
        if (i % 50000) == 0:
            print("tweets processed: %.0f of total number of tweets: %.0f" % (i, len(train_corpus)))
        vectors[i] = GM.buildDocumentVector(doc, num_of_dim, global_vectors)
    train_document_vecs = np.concatenate(vectors)
    train_document_vecs = sklearn.preprocessing.scale(train_document_vecs)
    labels = HL.create_labels(nr_pos_tweets, nr_pos_tweets)
    model_scores = run_k_fold(neural_nets_functions, train_document_vecs, labels, epochs, n_folds, patience)
    return model_scores
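
# Example call -- a sketch assuming a corpus whose first 200000 tweets are
# labelled training data (100000 positive followed by 100000 negative) and
# whose remainder is the unseen test set; NN.build_simple_nn is the same
# hypothetical constructor as above:
#
#     scores = classify_with_neural_networks(
#         [NN.build_simple_nn], global_vectors, processed_corpus,
#         total_training_tweets=200000, nr_pos_tweets=100000,
#         epochs=10, n_folds=5, patience=3)
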

def model_checkpoint_callback(save_filename, verbose_):
    """Returns a ModelCheckpoint callback that saves the model with the lowest val_loss to save_filename."""
    return keras.callbacks.ModelCheckpoint(
        save_filename,
        monitor='val_loss',
        verbose=verbose_,
        save_best_only=True,
        save_weights_only=False,
        mode='auto'
    )


def early_stopping_callback(patience_, verbose_):
    """Returns an EarlyStopping callback that halts training once val_loss stops improving."""
    return keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=patience_,
        verbose=verbose_
    )


def get_prediction(neural_net, global_vectors, full_corpus, total_training_tweets,
                   nr_pos_tweets, kaggle_name, epochs, patience, split=0.8):
    """
    Creates a csv file with Kaggle predictions and returns the predictions.
    Input:
        neural_net: A neural net model constructor
        global_vectors: Global vectors created from the gensim .txt files.
        full_corpus: The pre-processed corpus, training tweets first, then the unseen test set
        total_training_tweets: (int) Number of tweets that are training tweets. Assumes that the first
            portion of the corpus is training tweets and the second part is the unseen test set.
        nr_pos_tweets: (int) Number of training tweets that are positive
        kaggle_name: Name for the csv file, must end in '.csv'.
        epochs: Max number of epochs for the neural model
        patience: Patience for the early stopping callback used in the model fitting
        split: The train/validation split ratio
    Output:
        prediction: The predictions (1 or -1)
        A .csv file with name kaggle_name
    """
    num_of_dim = global_vectors.syn0.shape[1]
    # Separate training data and test data
    train_corpus = full_corpus[:total_training_tweets]
    predict_corpus = full_corpus[total_training_tweets:]
    # Build a document vector for each training tweet
    train_document_vecs = np.concatenate([GM.buildDocumentVector(doc, num_of_dim, global_vectors) for doc in train_corpus])
    train_document_vecs = sk.preprocessing.scale(train_document_vecs)
    labels = HL.create_labels(nr_pos_tweets, nr_pos_tweets, kaggle=False)
    train_document_vecs, labels = HL.shuffle_data(train_document_vecs, labels)
    train_x, val_x, train_y, val_y = HL.split_data(train_document_vecs, labels, split)
    # Build a document vector for each test tweet
    test_document_vecs = np.concatenate([GM.buildDocumentVector(doc, num_of_dim, global_vectors) for doc in predict_corpus])
    test_document_vecs = sk.preprocessing.scale(test_document_vecs)
    model = neural_net(num_of_dim)
    # Defining callbacks to be used during the fitting process
    early_stopping = early_stopping_callback(patience_=patience, verbose_=1)
    model_checkpoint = model_checkpoint_callback("neural_model_prediction.hdf5", verbose_=1)
    history = model.fit(
        train_x,
        train_y,
        epochs=epochs,
        batch_size=1024,
        verbose=1,
        callbacks=[early_stopping, model_checkpoint],
        validation_data=(val_x, val_y)
    )
    # Loading the best model found during training
    model = load_model('neural_model_prediction.hdf5')
    # Map the sigmoid outputs to the {1, -1} labels expected by Kaggle
    prediction = model.predict(test_document_vecs)
    prediction = [1 if p > 0.5 else -1 for p in prediction]
    # Creating the submission file
    ids = list(range(1, len(prediction) + 1))
    HL.create_csv_submission(ids, prediction, kaggle_name)
    return prediction
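
# Example call -- a sketch with the same hypothetical corpus layout as above;
# it writes submission.csv and returns the {1, -1} predictions:
#
#     preds = get_prediction(NN.build_simple_nn, global_vectors, full_corpus,
#                            total_training_tweets=200000, nr_pos_tweets=100000,
#                            kaggle_name="submission.csv", epochs=10, patience=3)
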

def train_NN(model, allX, allY, patience_, epochs=100000, split=0.8):
    """
    Method used when exploring different topologies for neural networks. Not utilized in the final deliveries.
    Input:
        - model: Neural net model
        - allX: Complete corpus
        - allY: Labels
        - patience_: Patience for the early stopping callback used in the model fitting
        - epochs: Max number of epochs for the neural model
        - split: The train/validation split ratio
    Output:
        - model: The trained neural network model
        - history: The history log of the model training
    """
    # Shuffling data in-place, using the same seed for both arrays so the
    # feature/label correspondence is preserved
    np.random.seed(1337)
    np.random.shuffle(allY)
    np.random.seed(1337)
    np.random.shuffle(allX)
    # Defining the split index of the data
    split_size = int(allX.shape[0] * split)
    # Defining callbacks to be used during the fitting process
    early_stopping = early_stopping_callback(patience_=patience_, verbose_=1)
    model_checkpoint = model_checkpoint_callback("train_NN_dynamic_model.hdf5", verbose_=1)
    history = []
    start = time.time()
    try:
        history = model.fit(
            allX[:split_size],
            allY[:split_size],
            epochs=epochs,
            batch_size=1024,
            verbose=1,
            callbacks=[early_stopping, model_checkpoint],
            validation_data=(allX[split_size:], allY[split_size:])
        )
    except (KeyboardInterrupt, SystemExit):
        # Allow training to be interrupted manually and still return the model
        print('\n\ntime spent training:', (time.time() - start))
    return model, history
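
# Example usage -- a sketch; `model` is any compiled Keras model whose input
# dimension matches allX.shape[1]:
#
#     model = NN.build_simple_nn(allX.shape[1])   # hypothetical constructor
#     trained_model, history = train_NN(model, allX, allY, patience_=3)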