-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtraining.py
476 lines (328 loc) · 17.4 KB
/
training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
# -*- coding: utf-8 -*-
"""Copy of ISY503 Assessment 3.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1svFN9to-LvGEfCiadi995derG_uDl_Mb
"""
import string
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer # ImportRegexpTokenizer to remove punctuation and tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, GlobalMaxPooling1D
# Use the ggplot look for every chart drawn by this script.
plt.style.use('ggplot')

# Download each NLTK resource this script requests, in the original order.
for nltk_resource in (
    'stopwords',
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'wordnet',
):
    nltk.download(nltk_resource)

# Colab-only step: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
"""## **Load Data**"""
# All review files sit under one repository folder; only the product category
# and the sentiment polarity differ between them.
_DATASET_ROOT = "https://raw.githubusercontent.com/jaspervanbrian/sentiment-analysis/main/dataset"


def _load_reviews(category, polarity):
    """Read the `<category>/<polarity>.xml` review file into a DataFrame."""
    return pd.read_xml(f"{_DATASET_ROOT}/{category}/{polarity}.xml")


books_positive = _load_reviews("books", "positive")
books_negative = _load_reviews("books", "negative")
dvd_positive = _load_reviews("dvd", "positive")
dvd_negative = _load_reviews("dvd", "negative")
electronics_positive = _load_reviews("electronics", "positive")
electronics_negative = _load_reviews("electronics", "negative")
# "%26" is the URL-encoded "&" in the folder name.
kitchen_positive = _load_reviews("kitchen_%26_housewares", "positive")
kitchen_negative = _load_reviews("kitchen_%26_housewares", "negative")

# Quick sanity check: peek at the first 20 negative kitchen review texts.
print(kitchen_negative['review_text'].head(20))
"""## **Quick Data Eval**"""
def show_rating_graph(collection, title):
    """Draw a bar chart of how often each rating value occurs.

    Args:
        collection: Table-like object with a 'rating' column.
        title: Title placed above the chart.
    """
    # Count each rating value and order the bars by rating.
    rating_counts = collection['rating'].value_counts().sort_index()
    axes = rating_counts.plot(kind='bar', title=title)
    axes.set_xlabel('Rating')
    plt.show()
#@title Rating distribution per data set
# One chart per (category, polarity) frame, drawn in the original order.
for rating_frame, chart_title in (
    (books_positive, "Ratings of Books by Positive Reviews"),
    (books_negative, "Ratings of Books by Negative Reviews"),
    (dvd_positive, "Ratings of DVDs by Positive Reviews"),
    (dvd_negative, "Ratings of DVDs by Negative Reviews"),
    (electronics_positive, "Ratings of Electronics by Positive Reviews"),
    (electronics_negative, "Ratings of Electronics by Negative Reviews"),
    (kitchen_positive, "Ratings of kitchen and housewares by Positive Reviews"),
    (kitchen_negative, "Ratings of kitchen and housewares by Negative Reviews"),
):
    show_rating_graph(rating_frame, chart_title)
"""# **Training**
## **Step 1: Cleaning Data**
"""
#@title Cleaning Data Functions
# Remove punctuations and Tokenize
def remove_punctuations_and_tokenize(input_string):
    """Clean one review string and split it into word tokens.

    HTML tags, hyperlinks and underscores are handled *before* punctuation is
    stripped. Those steps match on characters ('<', '>', ':', '/', '_') that
    the punctuation pass deletes, so running them afterwards (as this function
    previously did) meant they could never match anything.

    Args:
        input_string: Review text. Case is left untouched; callers lower-case
            beforehand if they need to.

    Returns:
        A list of the remaining word tokens, in order of appearance. Empty
        input yields an empty list.
    """
    # Replace HTML tags with a space but keep the text they wrap. Tags go
    # first so a URL inside an attribute disappears together with its tag.
    formatted = re.sub(r"<.*?>", " ", input_string)
    # Remove hyperlinks while "://" is still present to anchor the match.
    formatted = re.sub(r"https?://\S+", "", formatted)
    # Underscores become word separators; slashes are dropped.
    formatted = formatted.replace("_", " ").replace("/", "")
    # Strip all remaining ASCII punctuation.
    formatted = formatted.translate(str.maketrans("", "", string.punctuation))
    # Remove standalone numbers together with the whitespace that follows.
    formatted = re.sub(r"\b[0-9]+\b\s*", "", formatted)
    # Remove alphanumerics like a389794njfhj because they don't add any value.
    formatted = re.sub(r"\w*\d\w*", "", formatted)
    # Split into runs of word characters; this is the same pattern the
    # previous RegexpTokenizer(r'\w+') call applied.
    return re.findall(r"\w+", formatted)
def tokenize_reviews(reviews):
    """Lower-case each review and turn it into a list of cleaned tokens.

    Args:
        reviews: Iterable of review strings.

    Returns:
        A list with one token list per review, in input order.
    """
    tokenized = []
    for raw_review in reviews:
        tokenized.append(remove_punctuations_and_tokenize(raw_review.lower()))
    return tokenized
# Remove Stopwords
def remove_stopwords(token_collection):
    """Drop English stopwords from every token list.

    Args:
        token_collection: Iterable of token lists (one list per review).

    Returns:
        A new list of token lists with stopwords removed. Matching is
        case-insensitive; the surviving tokens keep their original case.
    """
    # Build the lookup once as a set so each membership test is a hash lookup
    # instead of a scan over the whole stopword sequence for every token.
    english_stopwords = set(stopwords.words('english'))
    return [
        [word for word in tokens if word.lower() not in english_stopwords]
        for tokens in token_collection
    ]
#@title Setting up Positive Reviews
# Stack the positive reviews of all four product categories into one frame.
positive_reviews = pd.concat([books_positive, dvd_positive, electronics_positive, kitchen_positive])
# Lower-case, clean and tokenize the text of every positive review
# (one token list per review).
positive_tokenized_collection = tokenize_reviews(positive_reviews['review_text'])
# Drop English stopwords from each token list.
positive_filtered_tokens = remove_stopwords(positive_tokenized_collection)
#@title Setting up Negative Reviews
# Same three steps for the negative reviews.
negative_reviews = pd.concat([books_negative, dvd_negative, electronics_negative, kitchen_negative])
negative_tokenized_collection = tokenize_reviews(negative_reviews['review_text'])
negative_filtered_tokens = remove_stopwords(negative_tokenized_collection)
"""## **Step 2: Preparing Data**"""
#@title POS Tagging function
def pos_tag_collection(token_collection):
    """Apply `nltk.pos_tag` to every token list.

    Args:
        token_collection: Iterable of token lists.

    Returns:
        A list holding the `nltk.pos_tag` result for each token list, in
        input order.
    """
    tagged = []
    for tokens in token_collection:
        tagged.append(nltk.pos_tag(tokens))
    return tagged
#@title Chunking function
def chunk_collection(tagged_reviews):
    """Parse each tagged review with a small noun-phrase / verb-phrase grammar.

    Args:
        tagged_reviews: Iterable of tagged reviews, as produced by
            `pos_tag_collection`.

    Returns:
        A list with the `RegexpParser.parse` result for each review, in
        input order.
    """
    # Grammar rules, one per line: NP groups determiners, adjectives and
    # nouns; VP is a verb followed by any number of NP/PP chunks.
    chunk_grammar = (
        "\n"
        r"NP: {<DT|JJ|NN.*>+} # Noun Phrase" "\n"
        r"VP: {<VB.*><NP|PP>*} # Verb Phrase" "\n"
    )
    parser = nltk.RegexpParser(chunk_grammar)

    parsed_reviews = []
    for tagged_review in tagged_reviews:
        parsed_reviews.append(parser.parse(tagged_review))
    # Alternative kept from the original for reference (named-entity chunking):
    # [nltk.chunk.ne_chunk(tagged_review) for tagged_review in tagged_reviews]
    return parsed_reviews
#@title Lemmatization function
def lemmatize_collection(tagged_collection):
    """Lemmatize every (token, tag) pair using a POS hint derived from the tag.

    Args:
        tagged_collection: Iterable of tagged phrases, each an iterable of
            (token, tag) pairs.

    Returns:
        A list of lemma lists, one per tagged phrase, in input order.
    """
    # Tag prefix -> POS letter handed to the lemmatizer. Checked in this
    # order; tags matching none of the prefixes are treated as nouns ('n').
    prefix_to_pos = (("NN", 'n'), ('VB', 'v'), ('RB', 'r'), ('JJ', 'a'))
    lemmatizer = WordNetLemmatizer()

    def pos_for(tag):
        for prefix, pos in prefix_to_pos:
            if tag.startswith(prefix):
                return pos
        return 'n'

    return [
        [lemmatizer.lemmatize(token, pos_for(tag)) for token, tag in tagged_phrase]
        for tagged_phrase in tagged_collection
    ]
#@title POS Tagging for positive reviews
tagged_positives = pos_tag_collection(positive_filtered_tokens)
# Show the tags of the first ten positive reviews, one "token: tag" per line.
for tagged_positive in tagged_positives[:10]:
    for token, tag in tagged_positive:
        print(f"{token}: {tag}")

#@title Chunking for positive reviews
chunked_positives = chunk_collection(tagged_positives)

# The notebook used `!pip install svgling`, which is IPython shell syntax and
# a SyntaxError in a plain .py file. Invoking pip through the current
# interpreter performs the same install from a script as well.
# NOTE(review): svgling is presumably what renders the parse trees below as
# SVG — confirm before removing.
import subprocess
import sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "svgling"])

import IPython.display
for chunked_positive in chunked_positives[:10]:
    IPython.display.display(chunked_positive)

#@title Lemmatization for positive reviews
lemmatized_positives = lemmatize_collection(tagged_positives)
# A bare expression only echoes inside a notebook cell; print it explicitly so
# the first lemmatized review is shown when run as a script too.
print(lemmatized_positives[0])
#@title POS Tag and Chunking for negative reviews
# POS tagging for negative reviews
tagged_negatives = pos_tag_collection(negative_filtered_tokens)
# Display POS tags for the first few negative reviews
for tagged_negative in tagged_negatives[:10]:
    print(tagged_negative)

# Chunking for negative reviews
chunked_negatives = chunk_collection(tagged_negatives)
# Display chunked phrases for the first few negative reviews
for chunked_negative in chunked_negatives[:10]:
    print(chunked_negative)

# Lemmatize all negative reviews
lemmatized_negatives = lemmatize_collection(tagged_negatives)
# Display the first few lemmatized negative reviews
for lemmatized_negative in lemmatized_negatives[:10]:
    print(lemmatized_negative)
# The stray expression `([1] * 3 + [0] * 9)` that followed was a leftover
# notebook scratch cell with no effect, and has been removed.
#@title Feature Extraction
"""Convert the preprocessed text data into numerical features that can be used as input to a machine learning model. Common techniques include TF-IDF (Term Frequency-Inverse Document Frequency) and word embeddings (like Word2Vec or GloVe)."""
# Build one corpus: positive reviews first (label 1), then negative (label 0).
reviews = positive_filtered_tokens + negative_filtered_tokens
labels = np.array(
    [1] * len(positive_filtered_tokens) + [0] * len(negative_filtered_tokens)
)

# Hold out 20% of the reviews for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=42,
)

# TF-IDF vectorizer (max_features can be adjusted).
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# The vectorizer takes strings, so join each token list back into one
# space-separated document first.
train_documents = [' '.join(tokens) for tokens in X_train]
test_documents = [' '.join(tokens) for tokens in X_test]

# Fit on the training documents only; the test documents are just transformed.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_documents)
X_test_tfidf = tfidf_vectorizer.transform(test_documents)
"""## Traditional Machine Learning Approach: Support Vector Machines (SVM)
Support Vector Machines (SVM) are often a good choice for text classification tasks like sentiment analysis. They work well when the data is not too large and the features (TF-IDF in this case) are well-defined.
"""
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
# Linear-kernel SVM classifier with a fixed seed.
svm_classifier = SVC(kernel='linear', random_state=42)
# Train on the TF-IDF features of the training split.
svm_classifier.fit(X_train_tfidf, y_train)
# Predict labels (1 = positive, 0 = negative) for the held-out test split.
y_pred = svm_classifier.predict(X_test_tfidf)
# Compare predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_rep)
"""
- Precision: The percentage of correctly predicted positive or negative instances among the predicted instances of that class.
- Recall: The percentage of correctly predicted positive or negative instances among the actual instances of that class.
- F1-score: The harmonic mean of precision and recall, providing a balanced measure of a model's performance.
- Support: The number of actual instances in each class.
## Analysis of SVM model
Precision for class 0 (negative sentiment) is 0.83, which means 83% of predicted negative reviews were actually negative.
Recall for class 0 is 0.81, which means 81% of actual negative reviews were correctly predicted as negative.
F1-score for class 0 is 0.82, which is the balanced measure of precision and recall for negative sentiment.
Precision for class 1 (positive sentiment) is 0.81, which means 81% of predicted positive reviews were actually positive.
Recall for class 1 is 0.84, which means 84% of actual positive reviews were correctly predicted as positive.
F1-score for class 1 is 0.83, which is the balanced measure of precision and recall for positive sentiment."""
#@title Modern, Deep Learning Approach: LSTM (Long Short-Term Memory)
#@title Data splitting and batching
from sklearn.model_selection import train_test_split
# Combine positive and negative reviews with labels (1 = positive, 0 = negative).
all_reviews = positive_filtered_tokens + negative_filtered_tokens
all_labels = [1] * len(positive_filtered_tokens) + [0] * len(negative_filtered_tokens)
# Split data into training, validation, and test sets.
# First cut: 70% training, 30% held out. This rebinds the X_train/y_train
# names that were used for the TF-IDF/SVM split earlier in the script.
X_train, X_temp, y_train, y_temp = train_test_split(all_reviews, all_labels, test_size=0.3, random_state=42)
# Second cut: halve the held-out 30% into validation and test (15% each).
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
#@title Neural Network Architecture and Model Class
"""Here, we'll use a simple LSTM-based neural network."""
# Define the vocabulary and maximum sequence length over the whole corpus.
all_token_lists = positive_filtered_tokens + negative_filtered_tokens
vocabulary = set(word for tokens in all_token_lists for word in tokens)
max_sequence_length = max(len(tokens) for tokens in all_token_lists)
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, GlobalMaxPooling1D
# Define the model architecture
model = Sequential()
# The Keras Tokenizer used further down numbers words starting at 1, leaves 0
# for padding and, because an oov_token is set, gives that token index 1. The
# real words therefore occupy 2 .. len(vocabulary) + 1. Embedding needs
# input_dim to exceed the largest index, hence the "+ 2"; the previous
# input_dim=len(vocabulary) left the two highest ids out of range.
model.add(Embedding(input_dim=len(vocabulary) + 2, output_dim=128, input_length=max_sequence_length))
# return_sequences=True keeps the per-timestep outputs so the pooling layer
# can take the maximum over the sequence.
model.add(LSTM(128, return_sequences=True))
model.add(GlobalMaxPooling1D())
# Single sigmoid unit: probability that the review is positive (label 1).
model.add(Dense(1, activation='sigmoid'))
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Print the model summary
model.summary()
#@title Model Training and Evaluation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# train_test_split returned plain Python lists for the labels; convert them
# to NumPy arrays before they are passed to model.fit.
y_train = np.array(y_train)
y_val = np.array(y_val)
def tokenization_and_padding(texts, tokenizer, max_sequence_length):
    """Encode texts as integer sequences of one fixed length.

    Args:
        texts: The texts to encode, passed to `tokenizer.texts_to_sequences`.
        tokenizer: An already-fitted tokenizer.
        max_sequence_length: Length every output sequence is brought to.

    Returns:
        The result of `pad_sequences`: sequences padded at the end, or cut
        at the end, to exactly `max_sequence_length` entries.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_sequence_length,
        padding='post',
        truncating='post',
    )
# Create a tokenizer and fit on the combined positive and negative reviews
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(positive_filtered_tokens + negative_filtered_tokens)

# Encode all three splits through the same helper and the same tokenizer so
# they share one word -> index mapping. The test split was previously encoded
# twice with inline copies of the helper's body; once is enough.
X_train_sequences = tokenization_and_padding(X_train, tokenizer, max_sequence_length)
X_val_sequences = tokenization_and_padding(X_val, tokenizer, max_sequence_length)
X_test_sequences = tokenization_and_padding(X_test, tokenizer, max_sequence_length)

# Train the model
history = model.fit(X_train_sequences, y_train, validation_data=(X_val_sequences, y_val), epochs=10, batch_size=32)

# Diagnostics: the largest token index must stay below the Embedding layer's
# input_dim, otherwise the lookup is out of range.
print("Vocabulary size:", len(tokenizer.word_index))
print("Max token index in training data:", np.max(X_train_sequences))
print("Max token index in testing data:", np.max(X_test_sequences))
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Convert y_test to a NumPy array
y_test = np.array(y_test)

# Encode the test split with the SAME tokenizer the model was trained with.
# The previous code fitted a second Tokenizer (num_words=5000) and re-encoded
# the test set with it. That tokenizer maps every word outside its top
# entries to the OOV index and can assign different ids to the remaining
# words, so the model was being scored on inputs encoded differently from
# its training data.
X_test_sequences = tokenization_and_padding(X_test, tokenizer, max_sequence_length)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_sequences, y_test)
print(f"Test Accuracy: {accuracy:.2f}")
"""LSTM model achieved a test accuracy of approximately 66%. This indicates that the model is able to correctly classify sentiment in the test dataset about 66% of the time."""
from sklearn.metrics import classification_report
# Make predictions on the test set (the model ends in a sigmoid unit, so
# these are values between 0 and 1).
y_pred = model.predict(X_test_sequences)
# Threshold at 0.5: above means class 1 (positive), otherwise class 0 (negative).
y_pred_binary = (y_pred > 0.5).astype(int) # Convert probabilities to binary predictions
# Generate the classification report (per-class precision, recall, F1, support).
class_report = classification_report(y_test, y_pred_binary)
print("Classification Report:\n", class_report)
"""For Class 0 (Negative Sentiment):
Precision: 0.70
Recall: 0.56
F1-score: 0.62
For Class 1 (Positive Sentiment):
Precision: 0.64
Recall: 0.76
F1-score: 0.69
These metrics provide insights into the model's performance for both positive and negative sentiment classifications. It looks like the model has a higher recall for positive sentiment (Class 1), meaning it correctly identifies more actual positive reviews. However, its precision is slightly higher for negative sentiment (Class 0), indicating that when it predicts negative sentiment, it is more likely to be correct.
The macro average F1-score is 0.66, which is the average of the F1-scores for both classes. The weighted average F1-score takes into account the class distribution and is also 0.66.
"""
#@title Save model
# After training your model, save it to a file
model.save('sentiment_model.h5')
import numpy as np
# Reload from disk to check that the saved file is usable on its own.
loaded_model = tf.keras.models.load_model('sentiment_model.h5')
# List of sample reviews
sample_reviews = [
    "This movie was amazing! I loved every moment of it.",
    "The product arrived damaged and the quality is terrible.",
    "The book was okay, but not as good as I expected.",
    "This restaurant has the best food ever. Highly recommended!",
    "I'm extremely disappointed with this purchase. Waste of money."
]
# Run the samples through the same cleaning pipeline as the training data
# (lower-casing, punctuation/number removal, stopword removal). Feeding the
# raw strings to the tokenizer, as before, produced tokens such as "i'm" or
# stopwords that the model never saw during training.
sample_tokens = remove_stopwords(tokenize_reviews(sample_reviews))
# Tokenize and pad sequences for the sample reviews
sample_sequences = tokenization_and_padding(sample_tokens, tokenizer, max_sequence_length)
# Convert the sequences to a NumPy array
sample_sequences = np.array(sample_sequences)
# Use the loaded model to make predictions
predictions = loaded_model.predict(sample_sequences)
# Interpret the predictions: index 0 -> "Negative", index 1 -> "Positive".
sentiment_labels = ["Negative", "Positive"]
for review, prediction in zip(sample_reviews, predictions):
    # Each prediction row holds one sigmoid output; round it to 0 or 1.
    sentiment = sentiment_labels[int(round(prediction[0]))]
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {sentiment}")
    print("=" * 50)