# -*- coding: utf-8 -*-
"""ISY503 Assessment 3.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1jfqJTB66xVUg2c7NpGojQjMAj7r5IEMQ
## Process Flow
1. Load Datasets
2. **Pre-process dataset** by removing special characters, stopwords, and other noise, and convert **sentiment labels** positive & negative to numbers 1 & 0, respectively
3. **Import GloVe Word Embedding** to build Embedding Dictionary + Use this to build Embedding Matrix for our Corpus
4. Model Training using **Deep Learning in Keras** using **LSTM Model** and analyse model performance and results
## **Setup**
"""
import string
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer # Import RegexpTokenizer to remove punctuation and tokenize
from nltk.stem import WordNetLemmatizer
from numpy import array
from keras import models
from keras.preprocessing.text import one_hot, Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense, Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM
from sklearn.model_selection import train_test_split
plt.style.use('ggplot')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')
"""## **Load Data**"""
books_positive = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/books/positive.xml")
books_negative = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/books/negative.xml")
dvd_positive = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/dvd/positive.xml")
dvd_negative = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/dvd/negative.xml")
electronics_positive = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/electronics/positive.xml")
electronics_negative = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/electronics/negative.xml")
kitchen_positive = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/kitchen_%26_housewares/positive.xml")
kitchen_negative = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/kitchen_%26_housewares/negative.xml")
print(kitchen_negative['review_text'].head(20))
"""## **Quick Data Eval**"""
def show_rating_graph(collection, title):
    # Collection must have a 'rating' column
    ax = collection['rating'].value_counts().sort_index().plot(kind='bar', title=title)
    ax.set_xlabel('Rating')
    plt.show()
#@title Books: Positive
show_rating_graph(books_positive, "Ratings of Books by Positive Reviews")
#@title Books: Negative
show_rating_graph(books_negative, "Ratings of Books by Negative Reviews")
#@title DVDs: Positive
show_rating_graph(dvd_positive, "Ratings of DVDs by Positive Reviews")
#@title DVDs: Negative
show_rating_graph(dvd_negative, "Ratings of DVDs by Negative Reviews")
#@title Electronics: Positive
show_rating_graph(electronics_positive, "Ratings of Electronics by Positive Reviews")
#@title Electronics: Negative
show_rating_graph(electronics_negative, "Ratings of Electronics by Negative Reviews")
#@title Kitchen and Housewares: Positive
show_rating_graph(kitchen_positive, "Ratings of Kitchen and Housewares by Positive Reviews")
#@title Kitchen and Housewares: Negative
show_rating_graph(kitchen_negative, "Ratings of Kitchen and Housewares by Negative Reviews")
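#@title Optional: Review counts per category
# A minimal sanity-check sketch: it only prints how many reviews each dataframe
# loaded above contains, so class balance across categories is visible at a glance.
for name, df in [
    ("books_positive", books_positive), ("books_negative", books_negative),
    ("dvd_positive", dvd_positive), ("dvd_negative", dvd_negative),
    ("electronics_positive", electronics_positive), ("electronics_negative", electronics_negative),
    ("kitchen_positive", kitchen_positive), ("kitchen_negative", kitchen_negative),
]:
    print(f"{name}: {len(df)} reviews")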
"""# **Training**
## **Step 1: Cleaning Data**
"""
#@title Cleaning Data Functions
# Remove punctuation and tokenize
def remove_punctuations_and_tokenize(input_string):
    translator = str.maketrans("", "", string.punctuation)
    formatted = input_string.translate(translator)
    # Remove undesired underscores and slashes
    formatted = formatted.replace("_", " ").replace("/", "")
    # Remove numbers
    formatted = re.sub(r"\b[0-9]+\b\s*", "", formatted)
    # Remove hyperlinks
    formatted = re.sub(r"https?://\S+", "", formatted)
    # Remove HTML tags but keep their contents
    formatted = re.sub(r"<.*?>", " ", formatted)
    # Remove alphanumeric tokens like a389794njfhj because they don't add any value
    formatted = re.sub(r'\w*\d\w*', '', formatted)
    # Remove any remaining punctuation and tokenize
    return RegexpTokenizer(r'\w+').tokenize(formatted)
def tokenize_reviews(reviews):
    return [remove_punctuations_and_tokenize(review.lower()) for review in reviews]
# Remove Stopwords
def remove_stopwords(token_collection):
    # Get the set of English stopwords
    english_stopwords = set(stopwords.words('english'))
    # Remove stopwords
    return [[word for word in tokens if word.lower() not in english_stopwords] for tokens in token_collection]
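#@title Optional: Quick check of the cleaning functions
# A minimal illustrative sketch: it runs the two helpers above on one made-up
# review string and prints the resulting stopword-free tokens.
_sample_review = "This blender is AMAZING!!! It crushed 2 lbs of ice in 30 seconds."
print(remove_stopwords(tokenize_reviews([_sample_review])))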
#@title Setting up Positive Reviews
positive_reviews = pd.concat([books_positive, dvd_positive, electronics_positive, kitchen_positive])
positive_tokenized_collection = tokenize_reviews(positive_reviews['review_text'])
positive_filtered_tokens = remove_stopwords(positive_tokenized_collection)
#@title Setting up Negative Reviews
negative_reviews = pd.concat([books_negative, dvd_negative, electronics_negative, kitchen_negative])
negative_tokenized_collection = tokenize_reviews(negative_reviews['review_text'])
negative_filtered_tokens = remove_stopwords(negative_tokenized_collection)
"""## **Step 2 (Optional): Preparing Data**"""
#@title POS Tagging function
def pos_tag_collection(token_collection):
    return [nltk.pos_tag(tokens) for tokens in token_collection]
#@title Chunking function
def chunk_collection(tagged_reviews):
    # Define chunk grammar rules
    chunk_grammar = r"""
        NP: {<DT|JJ|NN.*>+}   # Noun Phrase
        VP: {<VB.*><NP|PP>*}  # Verb Phrase
    """
    chunk_parser = nltk.RegexpParser(chunk_grammar)
    return [chunk_parser.parse(tagged_review) for tagged_review in tagged_reviews]
    # return [nltk.chunk.ne_chunk(tagged_review) for tagged_review in tagged_reviews]
#@title Lemmatization function
def lemmatize_collection(tagged_collection):
    lemmatizer = WordNetLemmatizer()
    lemmatized_collection = []
    for tagged_phrase in tagged_collection:
        phrase = []
        for token, tag in tagged_phrase:
            pos = 'n'
            if tag.startswith("NN"):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            elif tag.startswith('RB'):
                pos = 'r'
            elif tag.startswith('JJ'):
                pos = 'a'
            phrase.append(lemmatizer.lemmatize(token, pos))
        lemmatized_collection.append(phrase)
    return lemmatized_collection
#@title POS Tagging for positive reviews
tagged_positives = pos_tag_collection(positive_filtered_tokens)
for tagged_positive in tagged_positives[:10]:
    for token, tag in tagged_positive:
        print(f"{token}: {tag}")
#@title Chunking for positive reviews
chunked_positives = chunk_collection(tagged_positives)
!pip install svgling
import IPython.display
for chunked_positive in chunked_positives[:10]:
    IPython.display.display(chunked_positive)
#@title Lemmatization for positive reviews
lemmatized_positives = lemmatize_collection(tagged_positives)
lemmatized_positives[0]
#@title POS Tagging and Chunking for negative reviews
tagged_negatives = pos_tag_collection(negative_filtered_tokens)
chunked_negatives = chunk_collection(tagged_negatives)
for tagged_negative in tagged_negatives[:10]:
    for token, tag in tagged_negative:
        print(f"{token}: {tag}")
"""## **Step 3: Embedding Data**"""
#@title Combine positive and negative reviews with labels
reviews = positive_filtered_tokens + negative_filtered_tokens
labels = np.array([1] * len(positive_filtered_tokens) + [0] * len(negative_filtered_tokens))
#@title Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=42)
#@title Embedding layer expects the words to be in numeric form
# Using Tokenizer function from keras.preprocessing.text library
# Method fit_on_texts trains the tokenizer
# Method texts_to_sequences converts sentences to their numeric form
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)
X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)
#@title Add 1 because Tokenizer indices start at 1 (index 0 is reserved for padding)
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length
#@title Padding all reviews to fixed length 100
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
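#@title Optional: Shape check after padding
# A quick sanity-check sketch: both sets should now be 2-D arrays whose second
# dimension equals maxlen (100).
print(X_train.shape, X_test.shape)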
#@title Load GloVe word embeddings and create an Embeddings Dictionary
from numpy import asarray
from numpy import zeros
embeddings_dictionary = dict()
glove_file = open('/content/drive/MyDrive/Colab Notebooks/dataset/glove_word_embeddings.txt', encoding="utf8")
for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()
#@title Create Embedding Matrix having 100 columns
# Containing 100-dimensional GloVe word embeddings for all words in our corpus.
embedding_matrix = zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
embedding_matrix.shape
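#@title Optional: Embedding coverage check
# A small illustrative sketch: it reports how many vocabulary words received a
# pretrained GloVe vector; words without one keep the all-zero row created above.
covered = sum(1 for word in word_tokenizer.word_index if word in embeddings_dictionary)
print(f"{covered} of {len(word_tokenizer.word_index)} vocabulary words have GloVe vectors")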
"""## **Step 4: Model Training**"""
#@title Import LSTM from keras
from keras.layers import LSTM
#@title Use Sequential for Neural Network architecture
lstm_model = Sequential()
embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False)
lstm_model.add(embedding_layer)
lstm_model.add(LSTM(128))
lstm_model.add(Dense(1, activation='sigmoid'))
#@title Model compiling
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(lstm_model.summary())
#@title Model Training with 128 batch size and 12 epochs
lstm_model_history = lstm_model.fit(X_train, y_train, batch_size=128, epochs=12, verbose=1, validation_split=0.2)
#@title Predictions on the Test Set
score = lstm_model.evaluate(X_test, y_test, verbose=1)
#@title Model Performance
print("Test Score:", score[0])
print("Test Accuracy:", score[1])
#@title Model Performance Charts
import matplotlib.pyplot as plt
plt.plot(lstm_model_history.history['acc'])
plt.plot(lstm_model_history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
plt.plot(lstm_model_history.history['loss'])
plt.plot(lstm_model_history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
round(score[1], 3)
#@title Saving the model as a keras file for possible use later
lstm_model.save("lstm_model.keras")
"""## **Step 5: Test Model**"""
#@title Load Model
loaded_model = models.load_model("lstm_model.keras")
# List of sample reviews
sample_reviews = [
    "This movie was amazing! I loved every moment of it.",
    "The product arrived damaged and the quality is terrible.",
    "The book was okay, but not as good as I expected.",
    "This restaurant has the best food ever. Highly recommended!",
    "I'm extremely disappointed with this purchase. Waste of money."
]
#@title Tokenize and pad sequences for the sample reviews
untested_tokenized_collection = tokenize_reviews(sample_reviews)
untested_filtered_tokens = remove_stopwords(untested_tokenized_collection)
#@title Tokenising instance with earlier trained tokeniser
unseen_tokenized = word_tokenizer.texts_to_sequences(untested_filtered_tokens)
#@title Padding instances to a max length of 100 tokens
unseen_padded = pad_sequences(unseen_tokenized, padding='post', maxlen=maxlen)
#@title Passing tokenised instances to the loaded LSTM model for predictions
unseen_sentiments = loaded_model.predict(unseen_padded)
unseen_sentiments
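#@title Optional: Interpreting the predictions
# A minimal sketch: the sigmoid output is a probability of the positive class,
# so a 0.5 threshold maps each score to a positive/negative label, printed next
# to its sample review.
for review, score in zip(sample_reviews, unseen_sentiments.flatten()):
    label = "positive" if score >= 0.5 else "negative"
    print(f"{score:.3f} ({label}): {review}")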