# coding: utf-8
# # Extracting the data
#
# The data is read from a JSON file and loaded into a pandas DataFrame; the headline text and category labels are then extracted.
# In[1]:
import json
import pandas as pd
import csv
from pandas.io.json import json_normalize
import os
import numpy as np
from keras.layers import Activation, Conv1D, Dense, Embedding, Flatten, Input, MaxPooling1D
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
with open('C:/Users/admin/Desktop/New_News_Data.json', 'r') as f:
    data = json.load(f)
df = pd.DataFrame(data)
print(df.shape)
headline = df['headline']  # extract text
category = df['category']  # extract target
print(headline[1], category[1])
unique_categories = df.category.unique()
unique_categories_dictionary = dict(zip(unique_categories, range(0, len(unique_categories))))
labels = df['category'].map(unique_categories_dictionary, na_action='ignore')
print(len(labels))
### The code snippet below was used to convert the original (malformed) JSON into valid JSON and load it into a dataframe.
# count = 0
# with open("C:/Users/admin/Desktop/News_Category_Dataset.json", "rt") as fin:
#     with open("C:/Users/admin/Desktop/New_News_Data.json", "wt") as fout:
#         for line in fin:
#             count = count + 1
#             print(count, line)
#             fout.write(line.replace('}', '},'))
# df = pd.DataFrame(data)
# flatten_json = json_normalize(data)
# headline = flatten_json['headline']
# category = flatten_json['category']
# print(df['headline'][:5], df['category'][:5])
# print(headline.head())
# print(category.head())
# ## Preprocessing the data
#
# We tokenize the text before feeding it into a neural network. Tokenization also discards some features of the original text, such as punctuation and less common words.
#
# We have to specify the vocabulary size: words that occur less frequently are dropped. Here we keep the 20,000 most common words.
# In[2]:
vocab_size = 20000
tokenizer = Tokenizer(num_words=vocab_size) # Setup tokenizer
tokenizer.fit_on_texts(headline)
sequences = tokenizer.texts_to_sequences(headline) # Generate sequences
# In[3]:
print (len(sequences)) ## Testing the output of the tokenizer functions.
print (len(sequences[0]))
print (sequences[0])
# ##### Word_index
# Our text is now converted to sequences of numbers. It makes sense to convert some of those sequences back into text to check what the tokenization did to our text. To this end we create an inverse index that maps numbers to words while the tokenizer maps words to numbers.
# In[4]:
word_index = tokenizer.word_index
print('Found {:,} unique words.'.format(len(word_index)))
# In[5]:
#Create inverse index mapping numbers to words
inv_index = {v: k for k, v in tokenizer.word_index.items()}
# Print out text again
for w in sequences[0]:
    x = inv_index.get(w)
    print(x, end=' ')
# #### Measuring the text length
#
# To pad all sequences to the same length, we first measure how long the texts are.
# In[6]:
# Get the average length of a text
avg = sum(map(len, sequences)) / len(sequences)
# Get the standard deviation of the sequence length
std = np.sqrt(sum(map(lambda x: (len(x) - avg)**2, sequences)) / len(sequences))
print(avg, std)
# #### Using Average Length and SD of the text
# As you can see, the average text is about 9 words long and the standard deviation is around 3, so we set the maximum text length to 12 (i.e. 9 + 3).
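# A minimal sketch: derive the maximum length from the statistics above rather than
# hard-coding it; int(round(avg + std)) gives roughly 9 + 3 = 12 here.
print('Suggested max_length:', int(round(avg + std)))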
# In[7]:
print(pad_sequences([[1, 2, 3]], maxlen=5))  # testing the pad_sequences function
# In[8]:
max_length = 12
data = pad_sequences(sequences, maxlen=max_length)
# ### Turning labels into one-hot encodings
#
# Labels can quickly be encoded into one-hot vectors with Keras:
# In[9]:
from keras.utils import to_categorical
labels = to_categorical(np.asarray(labels))
print('Shape of data:', data.shape)
print('Shape of labels:', labels.shape)
# ### Loading GloVe embeddings
#
# A word embedding is a way of representing words and documents as dense vectors. The position of a word within the vector space is learned from text and is based on the words that surround it when it is used. Word embeddings can be trained on the input corpus itself or taken from pre-trained embeddings such as GloVe, FastText, and Word2Vec.
# In[10]:
glove_dir = 'C:/Users/admin/Desktop'  # This is the folder with the dataset/glove embeddings
embeddings_index = {}  # We create a dictionary of word -> embedding
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]  # The first value is the word, the rest are the values of the embedding
        embedding = np.asarray(values[1:], dtype='float32')  # Load embedding
        embeddings_index[word] = embedding  # Add embedding to our embedding dictionary
print('Found {:,} word vectors in GloVe.'.format(len(embeddings_index)))
# In[11]:
print (embeddings_index['frog'])
print (len(embeddings_index['frog']))
# In[12]:
print (np.linalg.norm(embeddings_index['frog'] - embeddings_index['toad']))
print (np.linalg.norm(embeddings_index['frog'] - embeddings_index['man']))
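# The two distances above are Euclidean; cosine similarity is the more common way to
# compare GloVe vectors. A minimal sketch, assuming embeddings_index as loaded above
# (higher values mean more similar words):
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(cosine_similarity(embeddings_index['frog'], embeddings_index['toad']))
print(cosine_similarity(embeddings_index['frog'], embeddings_index['man']))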
# ### Using GloVe Embeddings
#
# The following snippet shows how to use pre-trained word embeddings in the model. There are four essential steps:
#
# 1. Loading the pretrained word embeddings.<br/>
# 2. Creating a tokenizer object (already done in the code above).<br/>
# 3. Transforming text documents to sequences of tokens and padding them (already done in the code above).<br/>
# 4. Creating a mapping of tokens to their respective embeddings.<br/>
# In[13]:
embedding_dim = 100  # We use 100-dimensional GloVe vectors
word_index = tokenizer.word_index
nb_words = min(vocab_size, len(word_index))  # How many words there actually are
embedding_matrix = np.zeros((nb_words, embedding_dim))
# The vectors need to be in the same position as their index.
# Meaning a word with token 1 needs to be in the second row (rows start with zero) and so on.
# Loop over all words in the word index
for word, i in word_index.items():
    # If we are above the number of words we want to use, do nothing
    if i >= vocab_size:
        continue
    # Get the embedding vector for the word
    embedding_vector = embeddings_index.get(word)
    # If there is an embedding vector, put it in the embedding matrix
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# ### Architecture of the model :
#
# The architecture is comprised of three key pieces:<br/>
#
# 1.Word Embedding: A distributed representation of words where different words that have a similar meaning (based on their usage) also have a similar representation.<br/>
# 2.Convolutional Model: A feature extraction model that learns to extract salient features from documents represented using a word embedding.<br/>
# 3.Fully Connected Model: The interpretation of extracted features in terms of a predictive output.<br/>
# In[14]:
model = Sequential()
model.add(Embedding(vocab_size,
                    embedding_dim,
                    input_length=max_length,
                    weights=[embedding_matrix],
                    trainable=False))
model.add(Conv1D(128, 3, activation='relu'))  # https://arxiv.org/abs/1408.5882 (Convolutional Neural Networks for Sentence Classification, Yoon Kim)
model.add(MaxPooling1D(3))  # https://arxiv.org/abs/1510.03820 (A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification)
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(31, activation='softmax'))  # one output unit per news category
model.summary()
# ### Training the model
# In[17]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',  # suits one-hot multi-class labels; see https://stackoverflow.com/questions/42081257/keras-binary-crossentropy-vs-categorical-crossentropy-performance
              metrics=['accuracy'])
model_history = model.fit(data, labels, validation_split=0.2, epochs=2)
# ### Prediction
# In[18]:
index = 124980
example = data[index]  # get the tokens
# Print tokens as text
for w in example:
    x = inv_index.get(w)
    print(x, end=' ')
# In[19]:
# Get prediction
pred = model.predict(example.reshape(1,12))
# In[20]:
# Output predicted category
print("Predicted category: "+unique_categories[np.argmax(pred)])
print("Original category: "+category[index])
# In[21]:
import matplotlib.pyplot as plt
acc = model_history.history['acc']
val_acc = model_history.history['val_acc']
loss = model_history.history['loss']
val_loss = model_history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.title('Training and validation accuracy')
plt.plot(epochs, acc, 'red', label='Training acc')
plt.plot(epochs, val_acc, 'blue', label='Validation acc')
plt.legend()
plt.figure()
plt.title('Training and validation loss')
plt.plot(epochs, loss, 'red', label='Training loss')
plt.plot(epochs, val_loss, 'blue', label='Validation loss')
plt.legend()
plt.show()
# ### Further Improvements
#
# 1. We can also use each article's sub-heading, concatenating it with the headline to capture more features from the text. In that case, we would have to increase max_length.<br/>
#
# 2. We can also increase the vocabulary size if computation is not a bottleneck.<br/>
#
# 3. We can use convolutional layers with multiple filter widths and feature maps (see the sketch below).<br/>
#
#
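# A rough sketch of improvement 3 above: parallel Conv1D branches with different filter
# widths (in the spirit of Kim, 2014), merged before the dense layers. It reuses
# vocab_size, embedding_dim, max_length, embedding_matrix and labels from above and is
# only an illustration of the architecture, not the model trained earlier.
from keras.layers import GlobalMaxPooling1D, concatenate
from keras.models import Model

inp = Input(shape=(max_length,))
emb = Embedding(vocab_size, embedding_dim, input_length=max_length,
                weights=[embedding_matrix], trainable=False)(inp)
branches = []
for width in (2, 3, 4):  # multiple filter widths
    conv = Conv1D(128, width, activation='relu')(emb)
    branches.append(GlobalMaxPooling1D()(conv))
merged = concatenate(branches)
dense = Dense(128, activation='relu')(merged)
out = Dense(labels.shape[1], activation='softmax')(dense)
multi_filter_model = Model(inputs=inp, outputs=out)
multi_filter_model.summary()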