# -*- coding: utf-8 -*-
"""ISY503 Assessment 3.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1jfqJTB66xVUg2c7NpGojQjMAj7r5IEMQ
## Process Flow
1. Load Datasets
2. **Pre-process dataset** by removing special characters, stopwords, and other noise, and convert **sentiment labels** positive & negative to numbers 1 & 0, respectively
3. **Import GloVe Word Embedding** to build Embedding Dictionary + Use this to build Embedding Matrix for our Corpus
4. Model Training using **Deep Learning in Keras** using **LSTM Model** and analyse model performance and results
## **Setup**
"""
import string
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer # Import RegexpTokenizer to remove punctuation and tokenize
from nltk.stem import WordNetLemmatizer
from numpy import array
from keras import models
from keras.preprocessing.text import one_hot, Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense, Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM
from sklearn.model_selection import train_test_split
plt.style.use('ggplot')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')
"""## **Load Data**"""
books_positive = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/books/positive.xml")
books_negative = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/books/negative.xml")
dvd_positive = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/dvd/positive.xml")
dvd_negative = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/dvd/negative.xml")
electronics_positive = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/electronics/positive.xml")
electronics_negative = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/electronics/negative.xml")
kitchen_positive = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/kitchen_%26_housewares/positive.xml")
kitchen_negative = pd.read_xml("https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset/kitchen_%26_housewares/negative.xml")
print(kitchen_negative['review_text'].head(20))
"""## **Quick Data Eval**"""
def show_rating_graph(collection, title):
    # Collection must have a 'rating' column
    ax = collection['rating'].value_counts().sort_index().plot(kind='bar', title=title)
    ax.set_xlabel('Rating')
    plt.show()
#@title Books: Positive
show_rating_graph(books_positive, "Ratings of Books by Positive Reviews")
#@title Books: Negative
show_rating_graph(books_negative, "Ratings of Books by Negative Reviews")
#@title DVDs: Positive
show_rating_graph(dvd_positive, "Ratings of DVDs by Positive Reviews")
#@title DVDs: Negative
show_rating_graph(dvd_negative, "Ratings of DVDs by Negative Reviews")
#@title Electronics: Positive
show_rating_graph(electronics_positive, "Ratings of Electronics by Positive Reviews")
#@title Electronics: Negative
show_rating_graph(electronics_negative, "Ratings of Electronics by Negative Reviews")
#@title Kitchen and Housewares: Positive
show_rating_graph(kitchen_positive, "Ratings of Kitchen and Housewares by Positive Reviews")
#@title Kitchen and Housewares: Negative
show_rating_graph(kitchen_negative, "Ratings of Kitchen and Housewares by Negative Reviews")
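#@title Optional: Review counts per category
# A minimal sanity-check sketch: it only prints how many reviews each dataframe
# loaded above contains, so class balance across categories is visible at a glance.
for name, df in [
    ("books_positive", books_positive), ("books_negative", books_negative),
    ("dvd_positive", dvd_positive), ("dvd_negative", dvd_negative),
    ("electronics_positive", electronics_positive), ("electronics_negative", electronics_negative),
    ("kitchen_positive", kitchen_positive), ("kitchen_negative", kitchen_negative),
]:
    print(f"{name}: {len(df)} reviews")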
"""# **Training**
## **Step 1: Cleaning Data**
"""
#@title Cleaning Data Functions
# Remove punctuation and tokenize
def remove_punctuations_and_tokenize(input_string):
    translator = str.maketrans("", "", string.punctuation)
    formatted = input_string.translate(translator)
    # Remove undesired underscores and slashes
    formatted = formatted.replace("_", " ").replace("/", "")
    # Remove numbers
    formatted = re.sub(r"\b[0-9]+\b\s*", "", formatted)
    # Remove hyperlinks
    formatted = re.sub(r"https?://\S+", "", formatted)
    # Remove HTML tags but keep their contents
    formatted = re.sub(r"<.*?>", " ", formatted)
    # Remove alphanumeric tokens like a389794njfhj because they don't add any value
    formatted = re.sub(r'\w*\d\w*', '', formatted)
    # Remove any remaining punctuation and tokenize
    return RegexpTokenizer(r'\w+').tokenize(formatted)
def tokenize_reviews(reviews):
    return [remove_punctuations_and_tokenize(review.lower()) for review in reviews]
# Remove Stopwords
def remove_stopwords(token_collection):
    # Get the set of English stopwords
    english_stopwords = set(stopwords.words('english'))
    # Remove stopwords
    return [[word for word in tokens if word.lower() not in english_stopwords] for tokens in token_collection]
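#@title Optional: Quick check of the cleaning functions
# A minimal illustrative sketch: it runs the two helpers above on one made-up
# review string and prints the resulting stopword-free tokens.
_sample_review = "This blender is AMAZING!!! It crushed 2 lbs of ice in 30 seconds."
print(remove_stopwords(tokenize_reviews([_sample_review])))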
#@title Setting up Positive Reviews
positive_reviews = pd.concat([books_positive, dvd_positive, electronics_positive, kitchen_positive])
positive_tokenized_collection = tokenize_reviews(positive_reviews['review_text'])
positive_filtered_tokens = remove_stopwords(positive_tokenized_collection)
#@title Setting up Negative Reviews
negative_reviews = pd.concat([books_negative, dvd_negative, electronics_negative, kitchen_negative])
negative_tokenized_collection = tokenize_reviews(negative_reviews['review_text'])
negative_filtered_tokens = remove_stopwords(negative_tokenized_collection)
"""## **Step 2 (Optional): Preparing Data**"""
#@title POS Tagging function
def pos_tag_collection(token_collection):
    return [nltk.pos_tag(tokens) for tokens in token_collection]
#@title Chunking function
def chunk_collection(tagged_reviews):
    # Define chunk grammar rules
    chunk_grammar = r"""
        NP: {<DT|JJ|NN.*>+}   # Noun Phrase
        VP: {<VB.*><NP|PP>*}  # Verb Phrase
    """
    chunk_parser = nltk.RegexpParser(chunk_grammar)
    return [chunk_parser.parse(tagged_review) for tagged_review in tagged_reviews]
    # return [nltk.chunk.ne_chunk(tagged_review) for tagged_review in tagged_reviews]
#@title Lemmatization function
def lemmatize_collection(tagged_collection):
    lemmatizer = WordNetLemmatizer()
    lemmatized_collection = []
    for tagged_phrase in tagged_collection:
        phrase = []
        for token, tag in tagged_phrase:
            pos = 'n'
            if tag.startswith("NN"):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            elif tag.startswith('RB'):
                pos = 'r'
            elif tag.startswith('JJ'):
                pos = 'a'
            phrase.append(lemmatizer.lemmatize(token, pos))
        lemmatized_collection.append(phrase)
    return lemmatized_collection
#@title POS Tagging for positive reviews
tagged_positives = pos_tag_collection(positive_filtered_tokens)
for tagged_positive in tagged_positives[:10]:
    for token, tag in tagged_positive:
        print(f"{token}: {tag}")
#@title Chunking for positive reviews
chunked_positives = chunk_collection(tagged_positives)
!pip install svgling
import IPython.display
for chunked_positive in chunked_positives[:10]:
    IPython.display.display(chunked_positive)
#@title Lemmatization for positive reviews
lemmatized_positives = lemmatize_collection(tagged_positives)
lemmatized_positives[0]
#@title POS Tagging and Chunking for negative reviews
tagged_negatives = pos_tag_collection(negative_filtered_tokens)
chunked_negatives = chunk_collection(tagged_negatives)
for tagged_negative in tagged_negatives[:10]:
    for token, tag in tagged_negative:
        print(f"{token}: {tag}")
"""## **Step 3: Embedding Data**"""
#@title Combine positive and negative reviews with labels
reviews = positive_filtered_tokens + negative_filtered_tokens
labels = np.array([1] * len(positive_filtered_tokens) + [0] * len(negative_filtered_tokens))
#@title Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=42)
#@title Embedding layer expects the words to be in numeric form
# Using Tokenizer function from keras.preprocessing.text library
# Method fit_on_texts trains the tokenizer
# Method texts_to_sequences converts sentences to their numeric form
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)
X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)
#@title Add 1 because Tokenizer indices start at 1 (index 0 is reserved for padding)
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length
#@title Padding all reviews to fixed length 100
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
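#@title Optional: Shape check after padding
# A quick sanity-check sketch: both sets should now be 2-D arrays whose second
# dimension equals maxlen (100).
print(X_train.shape, X_test.shape)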
#@title Load GloVe word embeddings and create an Embeddings Dictionary
from numpy import asarray
from numpy import zeros
embeddings_dictionary = dict()
glove_file = open('/content/drive/MyDrive/Colab Notebooks/dataset/glove_word_embeddings.txt', encoding="utf8")
for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()
#@title Create Embedding Matrix having 100 columns
# Containing 100-dimensional GloVe word embeddings for all words in our corpus.
embedding_matrix = zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
embedding_matrix.shape
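#@title Optional: Embedding coverage check
# A small illustrative sketch: it reports how many vocabulary words received a
# pretrained GloVe vector; words without one keep the all-zero row created above.
covered = sum(1 for word in word_tokenizer.word_index if word in embeddings_dictionary)
print(f"{covered} of {len(word_tokenizer.word_index)} vocabulary words have GloVe vectors")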
"""## **Step 4: Model Training**"""
#@title Import LSTM from keras
from keras.layers import LSTM
#@title Use Sequential for Neural Network architecture
lstm_model = Sequential()
embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False)
lstm_model.add(embedding_layer)
lstm_model.add(LSTM(128))
lstm_model.add(Dense(1, activation='sigmoid'))
#@title Model compiling
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(lstm_model.summary())
#@title Model Training with 128 batch size and 12 epochs
lstm_model_history = lstm_model.fit(X_train, y_train, batch_size=128, epochs=12, verbose=1, validation_split=0.2)
#@title Predictions on the Test Set
score = lstm_model.evaluate(X_test, y_test, verbose=1)
#@title Model Performance
print("Test Score:", score[0])
print("Test Accuracy:", score[1])
#@title Model Performance Charts
import matplotlib.pyplot as plt
plt.plot(lstm_model_history.history['acc'])
plt.plot(lstm_model_history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
plt.plot(lstm_model_history.history['loss'])
plt.plot(lstm_model_history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
round(score[1], 3)
#@title Saving the model as a keras file for possible use later
lstm_model.save("lstm_model.keras")
"""## **Step 5: Test Model**"""
#@title Load Model
loaded_model = models.load_model("lstm_model.keras")
# List of sample reviews
sample_reviews = [
    "This movie was amazing! I loved every moment of it.",
    "The product arrived damaged and the quality is terrible.",
    "The book was okay, but not as good as I expected.",
    "This restaurant has the best food ever. Highly recommended!",
    "I'm extremely disappointed with this purchase. Waste of money."
]
#@title Tokenize and pad sequences for the sample reviews
untested_tokenized_collection = tokenize_reviews(sample_reviews)
untested_filtered_tokens = remove_stopwords(untested_tokenized_collection)
#@title Tokenising instance with earlier trained tokeniser
unseen_tokenized = word_tokenizer.texts_to_sequences(untested_filtered_tokens)
#@title Padding instances to a max length of 100 tokens
unseen_padded = pad_sequences(unseen_tokenized, padding='post', maxlen=maxlen)
#@title Passing tokenised instances to the loaded LSTM model for predictions
unseen_sentiments = loaded_model.predict(unseen_padded)
unseen_sentiments
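#@title Optional: Interpreting the predictions
# A minimal sketch: the sigmoid output is a probability of the positive class,
# so a 0.5 threshold maps each score to a positive/negative label, printed next
# to its sample review.
for review, score in zip(sample_reviews, unseen_sentiments.flatten()):
    label = "positive" if score >= 0.5 else "negative"
    print(f"{score:.3f} ({label}): {review}")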