# Import all the relevant libraries
library(tm)
library(gmodels)
library(Matrix)
library(qdap)
library(keras)
library(tensorflow)
library(readr)
library(tfruns)
library(ggplot2)
library(tidyr)
library(dplyr)
library(corrplot)
library(caret)
library(neuralnet)
library(GGally)# Load the covid sentiment dataset
# Read the raw tweet data set.
covid <- read.csv("Corona_NLP_train.csv")

# Clean the tweet text in one pass: drop English stop words first, then stem
# whatever is left.
covid$OriginalTweet <- covid$OriginalTweet %>%
  rm_stopwords(
    stopwords = tm::stopwords("english"),
    separate = FALSE,
    strip = TRUE
  ) %>%
  stemmer(warn = FALSE)

# Shuffle the rows under a fixed seed so the positional split below is
# reproducible.
set.seed(123)
covid <- covid[sample(nrow(covid)), ]

# Collapse the sentiment labels into three classes. Any label that is neither
# a positive nor a negative variant is treated as "Neutral".
covid$Sentiment <- dplyr::case_when(
  covid$Sentiment %in% c("Positive", "Extremely Positive") ~ "Positive",
  covid$Sentiment %in% c("Negative", "Extremely Negative") ~ "Negative",
  TRUE ~ "Neutral"
)
covid$Sentiment <- factor(
  covid$Sentiment,
  levels = c("Positive", "Neutral", "Negative")
)

# Zero-based numeric class codes, following the factor level order:
# Positive = 0, Neutral = 1, Negative = 2.
covid$sentiment_encoded <- as.numeric(covid$Sentiment) - 1

# Fixed row ranges for the train, validation and test sets, as given in the
# question statement.
train_idx <- 1:26340
val_idx <- 26341:32925
test_idx <- 32926:41157

# Slice the shuffled data frame into the three sets.
covid_train <- covid[train_idx, ]
covid_val <- covid[val_idx, ]
covid_test <- covid[test_idx, ]
# Load the Keras package
library(keras)
# Text vectorization layer producing TF-IDF weighted features, with n-grams
# enabled (ngrams = 2) and the vocabulary capped at 5000 tokens.
text_vectorizer <- keras::layer_text_vectorization(
  max_tokens = 5000,
  ngrams = 2,
  output_mode = "tf-idf"
)

# Fit the layer on the training tweets only, so the validation and test sets
# do not influence the vocabulary.
adapt(text_vectorizer, covid_train$OriginalTweet)

# Apply the fitted layer to each split to obtain the document-term matrices.
covid_train_dtm <- text_vectorizer(covid_train$OriginalTweet)
covid_val_dtm <- text_vectorizer(covid_val$OriginalTweet)
covid_test_dtm <- text_vectorizer(covid_test$OriginalTweet)
# Define the model architecture
# Feed-forward classifier: two hidden ReLU layers (128 and 64 units) followed
# by a 3-unit softmax output layer, one unit per sentiment class.
nn_model <- keras_model_sequential() %>%
  layer_dense(
    units = 128,
    activation = "relu",
    input_shape = dim(covid_train_dtm)[2]
  ) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 3, activation = "softmax")

# Compile the model. This is a multiclass problem, so sparse categorical
# cross-entropy is used as the loss.
compile(
  nn_model,
  optimizer = "adam",
  loss = "sparse_categorical_crossentropy",
  metrics = "accuracy"
)

summary(nn_model)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_2 (Dense) (None, 128) 640128
## dense_1 (Dense) (None, 64) 8256
## dense (Dense) (None, 3) 195
## ================================================================================
## Total params: 648,579
## Trainable params: 648,579
## Non-trainable params: 0
## ________________________________________________________________________________
# Train the model for 10 epochs with mini-batches of 32, passing the
# validation set so validation metrics are reported alongside training ones.
history <- nn_model %>% fit(
  covid_train_dtm,
  covid_train$sentiment_encoded,
  epochs = 10,
  batch_size = 32,
  validation_data = list(covid_val_dtm, covid_val$sentiment_encoded)
)

# Predicted class code for every test tweet: the index of the largest
# predicted probability (0, 1 or 2, matching sentiment_encoded).
predicted_labels <- as.numeric(
  nn_model %>% predict(covid_test_dtm) %>% k_argmax()
)

# The seed call was fused onto the previous statement, which made the line
# unparseable; it now stands on its own line.
set.seed(101)

# Confusion matrix: rows are the actual labels, columns are the predicted
# labels. Each cell reports the count and its share of the column total.
CrossTable(
  covid_test$sentiment_encoded,
  predicted_labels,
  prop.chisq = FALSE,
  prop.t = FALSE,
  prop.r = FALSE
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 8232
##
##
## | predicted_labels
## covid_test$sentiment_encoded | 0 | 1 | 2 | Row Total |
## -----------------------------|-----------|-----------|-----------|-----------|
## 0 | 2877 | 255 | 386 | 3518 |
## | 0.747 | 0.167 | 0.135 | |
## -----------------------------|-----------|-----------|-----------|-----------|
## 1 | 354 | 918 | 261 | 1533 |
## | 0.092 | 0.603 | 0.091 | |
## -----------------------------|-----------|-----------|-----------|-----------|
## 2 | 620 | 350 | 2211 | 3181 |
## | 0.161 | 0.230 | 0.774 | |
## -----------------------------|-----------|-----------|-----------|-----------|
## Column Total | 3851 | 1523 | 2858 | 8232 |
## | 0.468 | 0.185 | 0.347 | |
## -----------------------------|-----------|-----------|-----------|-----------|
##
##
This table shows the actual labels (rows) versus the predicted labels (columns) of a neural network model on the test set. The three classes are represented by the numbers 0 (positive), 1 (neutral), and 2 (negative). Each cell shows the number of times that an actual label was assigned a given predicted label, together with that count's share of the column total.
For example, in the first row, when the actual label was 0 (positive), the model predicted the label 0 (positive) 2877 times, the label 1 (neutral) 255 times, and the label 2 (negative) 386 times.
Similarly, in the second row, when the actual label was 1 (neutral), the model predicted the label 0 (positive) 354 times, the label 1 (neutral) 918 times, and the label 2 (negative) 261 times.
And in the third row, when the actual label was 2 (negative), the model predicted the label 0 (positive) 620 times, the label 1 (neutral) 350 times, and the label 2 (negative) 2211 times.
set.seed(100)
# Hyperparameter search: run the training script once for each sampled
# combination of the flag values listed below.
runs_results <- tuning_run(
  file = "problem1_nn.R", # script containing the model architecture and training code
  flags = list(
    # candidate values for each hyperparameter
    nodes = c(64, 128, 256),
    learning_rate = c(0.1, 0.01, 0.001, 0.0001),
    batch_size = c(100, 150, 200),
    epochs = c(30, 50, 100),
    activation = c("relu", "sigmoid", "tanh")
  ),
  # Fraction of all flag combinations to run (a random subset of the grid),
  # not a fraction of the data set.
  sample = 0.02
)
##
## > FLAGS <- flags(flag_numeric("nodes", 128), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = 128, activation = "relu",
## + input_shape = dim(covid_train_dtm)[2]) %>% layer_dense(units = FLAGS$nodes,
## + ac .... [TRUNCATED]
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "sparse_categorical_crossentropy", metrics = c("acc ..." ... [TRUNCATED]
##
## > model %>% fit(covid_train_dtm, covid_train$sentiment_encoded,
## + epochs = FLAGS$epochs, batch_size = FLAGS$batch_size, validation_data = list(co .... [TRUNCATED]
##
## > FLAGS <- flags(flag_numeric("nodes", 128), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = 128, activation = "relu",
## + input_shape = dim(covid_train_dtm)[2]) %>% layer_dense(units = FLAGS$nodes,
## + ac .... [TRUNCATED]
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "sparse_categorical_crossentropy", metrics = c("acc ..." ... [TRUNCATED]
##
## > model %>% fit(covid_train_dtm, covid_train$sentiment_encoded,
## + epochs = FLAGS$epochs, batch_size = FLAGS$batch_size, validation_data = list(co .... [TRUNCATED]
##
## > FLAGS <- flags(flag_numeric("nodes", 128), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = 128, activation = "relu",
## + input_shape = dim(covid_train_dtm)[2]) %>% layer_dense(units = FLAGS$nodes,
## + ac .... [TRUNCATED]
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "sparse_categorical_crossentropy", metrics = c("acc ..." ... [TRUNCATED]
##
## > model %>% fit(covid_train_dtm, covid_train$sentiment_encoded,
## + epochs = FLAGS$epochs, batch_size = FLAGS$batch_size, validation_data = list(co .... [TRUNCATED]
##
## > FLAGS <- flags(flag_numeric("nodes", 128), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = 128, activation = "relu",
## + input_shape = dim(covid_train_dtm)[2]) %>% layer_dense(units = FLAGS$nodes,
## + ac .... [TRUNCATED]
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "sparse_categorical_crossentropy", metrics = c("acc ..." ... [TRUNCATED]
##
## > model %>% fit(covid_train_dtm, covid_train$sentiment_encoded,
## + epochs = FLAGS$epochs, batch_size = FLAGS$batch_size, validation_data = list(co .... [TRUNCATED]
##
## > FLAGS <- flags(flag_numeric("nodes", 128), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = 128, activation = "relu",
## + input_shape = dim(covid_train_dtm)[2]) %>% layer_dense(units = FLAGS$nodes,
## + ac .... [TRUNCATED]
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "sparse_categorical_crossentropy", metrics = c("acc ..." ... [TRUNCATED]
##
## > model %>% fit(covid_train_dtm, covid_train$sentiment_encoded,
## + epochs = FLAGS$epochs, batch_size = FLAGS$batch_size, validation_data = list(co .... [TRUNCATED]
##
## > FLAGS <- flags(flag_numeric("nodes", 128), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = 128, activation = "relu",
## + input_shape = dim(covid_train_dtm)[2]) %>% layer_dense(units = FLAGS$nodes,
## + ac .... [TRUNCATED]
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "sparse_categorical_crossentropy", metrics = c("acc ..." ... [TRUNCATED]
##
## > model %>% fit(covid_train_dtm, covid_train$sentiment_encoded,
## + epochs = FLAGS$epochs, batch_size = FLAGS$batch_size, validation_data = list(co .... [TRUNCATED]
##
## > FLAGS <- flags(flag_numeric("nodes", 128), flag_numeric("batch_size",
## + 32), flag_string("activation", "relu"), flag_numeric("learning_rate",
## .... [TRUNCATED]
##
## > model = keras_model_sequential()
##
## > model %>% layer_dense(units = 128, activation = "relu",
## + input_shape = dim(covid_train_dtm)[2]) %>% layer_dense(units = FLAGS$nodes,
## + ac .... [TRUNCATED]
##
## > model %>% compile(optimizer = optimizer_adam(learning_rate = FLAGS$learning_rate),
## + loss = "sparse_categorical_crossentropy", metrics = c("acc ..." ... [TRUNCATED]
##
## > model %>% fit(covid_train_dtm, covid_train$sentiment_encoded,
## + epochs = FLAGS$epochs, batch_size = FLAGS$batch_size, validation_data = list(co .... [TRUNCATED]
# Print the summary of all tuning runs.
runs_results
## Data frame: 7 x 25
## run_dir metric_loss metric_accuracy metric_val_loss
## 1 runs/2023-03-27T02-45-40Z 0.0022 0.9994 2.5552
## 2 runs/2023-03-27T02-43-18Z 0.9106 0.6131 1.0229
## 3 runs/2023-03-27T02-38-54Z 0.0011 0.9995 2.6871
## 4 runs/2023-03-27T02-37-17Z 0.0324 0.9910 2.3806
## 5 runs/2023-03-27T02-35-01Z 1.1619 0.3911 1.2454
## 6 runs/2023-03-27T02-33-46Z 0.0283 0.9910 1.9526
## 7 runs/2023-03-27T02-31-07Z 0.0006 0.9997 1.7836
## metric_val_accuracy
## 1 0.7294
## 2 0.5995
## 3 0.7177
## 4 0.7151
## 5 0.4407
## 6 0.7230
## 7 0.7382
## # ... with 20 more columns:
## # flag_nodes, flag_batch_size, flag_activation, flag_learning_rate,
## # flag_epochs, epochs, epochs_completed, metrics, model, loss_function,
## # optimizer, learning_rate, script, start, end, completed, output,
## # source_code, context, type
# Open the run with the highest validation accuracy. The rows above are listed
# newest first, so the first row is the most recent run, not necessarily the
# best one; select the best run explicitly instead of relying on row order.
view_run(runs_results$run_dir[which.max(runs_results$metric_val_accuracy)])
1- Our best model specifications are:
nodes: 128
batch size: 150
activation function: sigmoid
learning rate: 0.001
epochs: 30
2- Yes, our best model over-fits.
3- We could not see any decrease in the validation loss.
# Convert the vectorized train and validation sets to plain R matrices so
# they can be stacked with rbind().
train_matrix <- as.matrix(covid_train_dtm)
val_matrix <- as.matrix(covid_val_dtm)

# Stack the train rows on top of the validation rows.
train_combined <- rbind(train_matrix, val_matrix)

# Concatenate the labels in the same order (train first, then validation) so
# each row of train_combined keeps its own label.
train_labels <- c(covid_train$sentiment_encoded, covid_val$sentiment_encoded)

# The seed call was fused onto the previous statement, which made the line
# unparseable; it now stands on its own line.
set.seed(100)
# Retrain the best model on the combined train + validation data.
#
# The layers follow the tuning script echoed in the tuning output (a fixed
# 128-unit ReLU layer followed by the tuned hidden layer) together with the
# best hyperparameters reported for the tuning runs: 128 nodes, sigmoid
# activation, learning rate 0.001, batch size 150, 30 epochs.
# NOTE(review): the tuned layer's arguments are truncated in the echoed
# script; it presumably uses FLAGS$activation -- confirm against problem1_nn.R.
best_model <- keras_model_sequential()
best_model %>%
  layer_dense(
    units = 128,
    activation = "relu",
    input_shape = dim(train_combined)[2]
  ) %>%
  layer_dense(units = 128, activation = "sigmoid") %>%
  # The three classes are mutually exclusive, so the output layer uses softmax
  # (as in nn_model) rather than sigmoid; the outputs then form a probability
  # distribution over the classes.
  layer_dense(units = 3, activation = "softmax")

# Compile the best model.
best_model %>% compile(
  loss = "sparse_categorical_crossentropy",
  optimizer = optimizer_adam(learning_rate = 0.001),
  metrics = "accuracy"
)

# No validation_data here: the validation rows are now part of the training
# data.
best_model %>% fit(
  train_combined, train_labels,
  epochs = 30,
  batch_size = 150
)

# Evaluate on the held-out test set.
# NOTE: the outputs recorded below come from the original run, made before the
# architecture fixes above; re-run to refresh them.
result <- best_model %>% evaluate(covid_test_dtm, covid_test$sentiment_encoded)
result
## loss accuracy
## 2.0142694 0.7349368

# Class probabilities for the test tweets (one row per tweet, one column per
# class). Show the first five rows; predictions[1:5] would index the matrix
# linearly and return five values from the first column only.
predictions <- best_model %>% predict(covid_test_dtm)
head(predictions, 5)
## [1] 9.946686e-01 5.336996e-01 3.838835e-06 3.241563e-03 8.218616e-03
The neural network model reaches a test accuracy of 0.7349368 (see the evaluation output above), whereas the naïve Bayes model's accuracy was 0.599. So, the neural network model performed better than the naïve Bayes model.