Install / Load Libraries

# Install / Load Libraries
#install.packages("remotes")
#remotes::install_github("rstudio/tensorflow")
#install.packages("tensorflow")
#install.packages("keras")
#install.packages("stringi")
#install.packages("tm")
#install.packages("tokenizers")
#install.packages("word2vec")
#install.packages("knitr")
library(word2vec)
## Warning: package 'word2vec' was built under R version 4.2.3
library(tokenizers)
## Warning: package 'tokenizers' was built under R version 4.2.3
library(tm)
## Warning: package 'tm' was built under R version 4.2.3
## Loading required package: NLP
library(stringi)
## Warning: package 'stringi' was built under R version 4.2.2
library(keras)
## Warning: package 'keras' was built under R version 4.2.3
library(tensorflow)
## Warning: package 'tensorflow' was built under R version 4.2.3
install_tensorflow(envname = "r-tensorflow")
## Virtual environment "r-tensorflow" removed.
## Using Python: C:/Users/panic/AppData/Local/Programs/Python/Python39/python.exe
## Creating virtual environment "r-tensorflow" ...
## + "C:/Users/panic/AppData/Local/Programs/Python/Python39/python.exe" -m venv "C:/Users/panic/Documents/.virtualenvs/r-tensorflow"
## Done!
## Installing packages: pip, wheel, setuptools
## + "C:/Users/panic/Documents/.virtualenvs/r-tensorflow/Scripts/python.exe" -m pip install --upgrade --no-user pip wheel setuptools
## Virtual environment 'r-tensorflow' successfully created.
## Using virtual environment "r-tensorflow" ...
## + "C:/Users/panic/Documents/.virtualenvs/r-tensorflow/Scripts/python.exe" -m pip install --upgrade --no-user "tensorflow==2.13.*"
## 
## Installation complete.
library(knitr)
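
As a quick sanity check (a minimal sketch, not part of the original pipeline), the TensorFlow installation can be verified before moving on: tf_config() reports the Python build being used, and creating a trivial tensor confirms the backend loads.

# Optional sanity check that the "r-tensorflow" environment is picked up
tf_config()              # Reports the discovered Python and TensorFlow versions
tf$constant("Hello, TF") # Should return a tf.Tensor without raising an error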

Create Dataframe Objects and Read in Data

# Create empty dataframes for "yelp", "imdb", and "amazon"; then read in all of the lines from their respective text files
yelp_df <- data.frame()
imdb_df <- data.frame()
amazon_df <- data.frame()
yelp_data <- readLines("yelp_labelled.txt", encoding = "UTF-8")
imdb_data <- readLines("imdb_labelled.txt", encoding = "UTF-8")
amazon_data <- readLines("amazon_cells_labelled.txt", encoding = "UTF-8")

For Loops to Extract Data and Place in the Dataframes

# For loops to split each line into its sentence and integer label, then append both to the corresponding dataframe
for (line in yelp_data) {
  split_data <- unlist(strsplit(line, "\t"))
  yelp_df <- rbind(yelp_df, data.frame(split_data[1], as.integer(split_data[2])))
}
# Clean up for next run
line <- ""
split_data <- ""

for (line in imdb_data) {
  split_data <- unlist(strsplit(line, "\t"))
  imdb_df <- rbind(imdb_df, data.frame(split_data[1], as.integer(split_data[2])))
}
# Clean up for next run
line <- ""
split_data <- ""

for (line in amazon_data) {
  split_data <- unlist(strsplit(line, "\t"))
  amazon_df <- rbind(amazon_df, data.frame(split_data[1], as.integer(split_data[2])))
}
# Clean up as no longer needed
rm(line)
rm(split_data)
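
As an aside, the same dataframes could be built without explicit loops. A sketch using read.delim (assuming the same two-column, tab-separated layout) is shown below; it is not used in the remainder of the analysis.

# Alternative (sketch only): read a labelled file directly as a tab-separated table.
# quote = "" keeps quotation marks inside reviews from being treated as delimiters.
yelp_df_alt <- read.delim("yelp_labelled.txt", header = FALSE, sep = "\t",
                          quote = "", col.names = c("Sentence", "Score"),
                          stringsAsFactors = FALSE)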

Combine the Three Dataframes and Preliminary EDA

# Combine all dataframes into a single dataframe and clean up the column names
merged_df <- rbind(yelp_df, imdb_df, amazon_df)
colnames(merged_df) <- c("Sentence", "Score")

# Count duplicate entries; they will be retained if found
sum(duplicated(merged_df))
## [1] 17
# Extract the duplicate entries to visualize
duplicates <- duplicated(merged_df)
duplicate_rows <- merged_df[duplicates, ]
print(duplicate_rows)
##                                                                                        Sentence
## 815                                                                          I love this place.
## 817                                                                      The food was terrible.
## 844                                                                            I won't be back.
## 847                                                           I would not recommend this place.
## 1364                                                           Definitely worth checking out.  
## 1586                                                                         Not recommended.  
## 1789                                                                                    10/10  
## 2286                                                                              Great phone!.
## 2408                                                                               Works great.
## 2525                                                                              Works great!.
## 2544                                                                    Don't buy this product.
## 2745 If you like a loud buzzing to override all your conversations, then this phone is for you!
## 2749                                                                              Does not fit.
## 2779                                                                      This is a great deal.
## 2793                                                                               Great Phone.
## 2893                                                           Excellent product for the price.
## 2897                                                                               Great phone.
##      Score
## 815      1
## 817      0
## 844      0
## 847      0
## 1364     1
## 1586     0
## 1789     1
## 2286     1
## 2408     1
## 2525     1
## 2544     0
## 2745     0
## 2749     0
## 2779     1
## 2793     1
## 2893     1
## 2897     1
# Identify missing (NA) values
colSums(is.na(merged_df))
## Sentence    Score 
##        0        0
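
The duplicates are retained for modelling; if they were to be dropped instead, a one-line filter such as the following (a sketch, not applied here) would suffice.

# Optional (not applied): keep only the first occurrence of each duplicated row
deduped_df <- merged_df[!duplicated(merged_df), ]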

Loop to Clean the Sentences and Add Them to the Dataframe

# Create a vector for storing the cleaned sentences
cleaned_text <- character(length = nrow(merged_df))

# Loop for cleaning each row
for (i in 1:nrow(merged_df)) {
  text <- merged_df$Sentence[i] # Get current sentence
  text <- tolower(text) # Make it lowercase
  text <- removeWords(text, stopwords("en")) # Remove stopwords now, as they add little to no value
  text <- gsub("[^a-z ]", "", text) # Keep only lowercase letters and spaces; drop digits and punctuation
  text <- stri_trans_general(text, "Latin-ASCII") # Transliterate accented Latin characters to plain ASCII
  text <- iconv(text, "latin1", "ASCII", sub = "") # Drop any remaining non-ASCII characters (such as emojis)
  text <- stripWhitespace(text) # Collapse runs of whitespace into single spaces
  cleaned_text[i] <- text # Store in the vector from above
}

# Add the cleaned text back to the dataframe as a new column
merged_df$Cleaned_Text <- cleaned_text
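
A quick spot check (a sketch) of the first few rows shows the raw and cleaned sentences side by side, confirming the cleaning steps behaved as intended.

# Compare the original and cleaned text for the first three rows
head(merged_df[, c("Sentence", "Cleaned_Text")], 3)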

Preliminary Tokenization, Vocabulary Size, and Top 20 Tokens/Words

# Tokenize the text data
tokens <- unlist(strsplit(tolower(merged_df$Cleaned_Text), "\\s+"))
tokens <- tokens[tokens != ""] # Remove empty token(s)

# Calculate the vocabulary size
vocab_size <- length(unique(tokens))
cat("Vocabulary Size:", vocab_size, "\n")
## Vocabulary Size: 5074
# Create a frequency table and dataframe of the tokens
token_freq <- table(tokens)
token_freq_df <- data.frame(Token = names(token_freq), Frequency = as.numeric(token_freq))

# Sort the dataframe by frequency in descending order
token_freq_df <- token_freq_df[order(-token_freq_df$Frequency), ]

# Plot the top 20 tokens
top_tokens <- token_freq_df[1:20, ]
barplot(top_tokens$Frequency, names.arg = top_tokens$Token, horiz = TRUE, 
        main = "Top Token Frequencies", xlab = "Frequency", ylab = "", las = 1)

Calculate Sentence Length Distributions, Plot, and Find the Longest Sentence for the Padded Sequence Length

# Calculate sentence lengths and plot results
sentence_lengths <- sapply(strsplit(merged_df$Cleaned_Text, "\\s+"), length)
summary(sentence_lengths)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   6.000   6.701   9.000  42.000
hist(sentence_lengths, main = "Distribution of Sentence Lengths", xlab = "Sentence Length")

# Find the longest sentence
index_of_longest_sentence <- which.max(sentence_lengths)
longest_sentence <- merged_df$Cleaned_Text[index_of_longest_sentence]

# Print the longest sentence and its length
cat("Longest Sentence:", longest_sentence, "\nLength of Longest Sentence:", sentence_lengths[index_of_longest_sentence], "tokens\n")
## Longest Sentence:  masterful piece filmmaking many themes simmering occasionally boiling warts study poets bohemian selfindulgent wartime years span aerial bombardments london outward tranquillity welsh coastal retreat borderlines friendship lust love dedication art experience versus practical concerns jealousy rivalry cowardice egotism versus heroism selfsacrifice  
## Length of Longest Sentence: 42 tokens
# Heuristic embedding dimension and maximum sequence length
embedding_dim <- as.integer(round(sqrt(sqrt(vocab_size)), 0)) # Fourth root of the vocab size (the square root of the square root)
max_sequence_length <- max(sentence_lengths) # Longest cleaned sentence in the corpus, used as the padded sequence length
cat("Estimated Embedding Dimension:", embedding_dim, "\nMaximum Sequence Length:", max_sequence_length, "\n")
## Estimated Embedding Dimension: 8 
## Maximum Sequence Length: 42
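
For reference, the fourth-root heuristic works out to roughly 8.4 for this vocabulary, which rounds to the dimension of 8 reported above.

# Worked check of the heuristic: fourth root of the vocabulary size
vocab_size^(1/4) # approximately 8.4, rounded to 8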

Tokenize and Pad Sequences

# Tokenize the cleaned sentences
tokenizer <- text_tokenizer(num_words = vocab_size)
tokenizer$fit_on_texts(merged_df$Cleaned_Text)
sequences <- texts_to_sequences(tokenizer, merged_df$Cleaned_Text)
word_index <- tokenizer$word_index

# Pad sequences to the maximum sequence length
padded_sequences <- pad_sequences(sequences, maxlen = max_sequence_length, padding = "post", truncating = "post")
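
A quick dimension check (a sketch) confirms one row per sentence and one column per padded position, up to the 42-token maximum.

# Expected: number of sentences x max_sequence_length (42)
dim(padded_sequences)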

Displaying a Randomly-Selected Sequence and Rebuilding It From the Word Index

# Randomly selecting a sequence to display
k <- 8
sequences[k]
## [[1]]
## [1]  933    8 2057  258   26 2058   11  318 2059
padded_sequences[k, ]
##  [1]  933    8 2057  258   26 2058   11  318 2059    0    0    0    0    0    0
## [16]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [31]    0    0    0    0    0    0    0    0    0    0    0    0
# Look up each integer in the sequence in the tokenizer's word_index (positional indexing works here because word_index is ordered by frequency rank)
for(i in sequences[k]){
  print(word_index[i])
}
## $potatoes
## [1] 933
## 
## $like
## [1] 8
## 
## $rubber
## [1] 2057
## 
## $tell
## [1] 258
## 
## $made
## [1] 26
## 
## $ahead
## [1] 2058
## 
## $time
## [1] 11
## 
## $kept
## [1] 318
## 
## $warmer
## [1] 2059
# Print the corresponding sentence in the cleaned dataframe column to double-check/confirm
merged_df$Cleaned_Text[k]
## [1] " potatoes like rubber tell made ahead time kept warmer"

Build the Model and Its Layers, then Display Summary

# Create a Keras sequential model
model <- keras_model_sequential()

# Add input and hidden layers; the 42 padded token ids are used directly as the numeric input vector
model %>%
  layer_dense(units = 50, activation = "relu", input_shape = c(max_sequence_length)) %>%
  layer_dense(units = 25, activation = "relu")

# Add the output layer: a single unit with sigmoid activation for binary sentiment
model %>% layer_dense(units = 1, activation = "sigmoid")

# Compile the model
model %>% compile(
  loss = "binary_crossentropy",
  optimizer = optimizer_adam(),
  metrics = c("accuracy")
)

# Display model summary
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  dense_1 (Dense)                    (None, 50)                      2150        
##  dense (Dense)                      (None, 25)                      1275        
##  dense_2 (Dense)                    (None, 1)                       26          
## ================================================================================
## Total params: 3451 (13.48 KB)
## Trainable params: 3451 (13.48 KB)
## Non-trainable params: 0 (0.00 Byte)
## ________________________________________________________________________________
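
The parameter counts in the summary can be verified by hand: each dense layer contributes (inputs x units) weights plus one bias per unit.

# 42 -> 50, 50 -> 25, 25 -> 1: weights plus biases for each layer
(42 * 50 + 50) + (50 * 25 + 25) + (25 * 1 + 1) # 2150 + 1275 + 26 = 3451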

Split Data (50% Training, 25% Test, and 25% Validation)

# Set seed for reproducibility
set.seed(123)

# Vector of row indices
n <- nrow(merged_df)
indices <- sample(1:n)

# Obtain size of each: Training = 50%, Testing = 25%, Validation = 25%
train_size <- floor(0.5 * n)
test_size <- floor(0.25 * n)

# Split the data into training, test, and validation sets
training_indices <- indices[1:train_size]
test_indices <- indices[(train_size + 1):(train_size + test_size)]
validation_indices <- indices[(train_size + test_size + 1):n]
training_data <- merged_df[training_indices, ]
test_data <- merged_df[test_indices, ]
validation_data <- merged_df[validation_indices, ]

# Output to csv
write.csv(training_data, "task2_train.csv")
write.csv(test_data, "task2_test.csv")
write.csv(validation_data, "task2_validation.csv")
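
A quick count (a sketch) confirms the three index vectors cover all rows in roughly 50/25/25 proportions.

# Number of rows in each split
c(train = length(training_indices), test = length(test_indices),
  validation = length(validation_indices))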

Add Stopping Criteria, Train the Model, and Evaluate on the Test and Validation Sets

# Stopping criterion (validation loss)
callback <- callback_early_stopping(
  monitor = "val_loss",
  patience = 10,
  restore_best_weights = TRUE
)

# Train the model on the training data, using the validation data for early stopping
history <- model %>% fit(
  x = padded_sequences[training_indices, ],
  y = training_data$Score,
  epochs = 100,
  validation_data = list(padded_sequences[validation_indices, ], validation_data$Score),
  callbacks = list(callback)
)
## Epoch 1/100
## 47/47 - 1s - loss: 52.3194 - accuracy: 0.5060 - val_loss: 34.8181 - val_accuracy: 0.5040 - 767ms/epoch - 16ms/step
## Epoch 2/100
## 47/47 - 0s - loss: 24.5642 - accuracy: 0.4987 - val_loss: 25.3565 - val_accuracy: 0.5040 - 115ms/epoch - 2ms/step
## Epoch 3/100
## 47/47 - 0s - loss: 15.5026 - accuracy: 0.5080 - val_loss: 19.8455 - val_accuracy: 0.5160 - 81ms/epoch - 2ms/step
## Epoch 4/100
## 47/47 - 0s - loss: 11.4664 - accuracy: 0.5447 - val_loss: 19.1791 - val_accuracy: 0.4693 - 82ms/epoch - 2ms/step
## Epoch 5/100
## 47/47 - 0s - loss: 9.5178 - accuracy: 0.5527 - val_loss: 16.3097 - val_accuracy: 0.4893 - 83ms/epoch - 2ms/step
## Epoch 6/100
## 47/47 - 0s - loss: 7.6397 - accuracy: 0.5560 - val_loss: 16.4038 - val_accuracy: 0.4800 - 95ms/epoch - 2ms/step
## Epoch 7/100
## 47/47 - 0s - loss: 6.9239 - accuracy: 0.5667 - val_loss: 14.9376 - val_accuracy: 0.4880 - 95ms/epoch - 2ms/step
## Epoch 8/100
## 47/47 - 0s - loss: 5.9349 - accuracy: 0.5780 - val_loss: 14.2694 - val_accuracy: 0.4707 - 88ms/epoch - 2ms/step
## Epoch 9/100
## 47/47 - 0s - loss: 5.6040 - accuracy: 0.5840 - val_loss: 12.9651 - val_accuracy: 0.5053 - 84ms/epoch - 2ms/step
## Epoch 10/100
## 47/47 - 0s - loss: 5.8966 - accuracy: 0.5920 - val_loss: 14.7346 - val_accuracy: 0.5187 - 89ms/epoch - 2ms/step
## Epoch 11/100
## 47/47 - 0s - loss: 4.4429 - accuracy: 0.6153 - val_loss: 12.3508 - val_accuracy: 0.4893 - 86ms/epoch - 2ms/step
## Epoch 12/100
## 47/47 - 0s - loss: 4.5397 - accuracy: 0.6193 - val_loss: 15.3783 - val_accuracy: 0.5187 - 83ms/epoch - 2ms/step
## Epoch 13/100
## 47/47 - 0s - loss: 4.7440 - accuracy: 0.5973 - val_loss: 12.3078 - val_accuracy: 0.5107 - 83ms/epoch - 2ms/step
## Epoch 14/100
## 47/47 - 0s - loss: 4.0009 - accuracy: 0.6000 - val_loss: 11.7986 - val_accuracy: 0.4947 - 92ms/epoch - 2ms/step
## Epoch 15/100
## 47/47 - 0s - loss: 3.7832 - accuracy: 0.6127 - val_loss: 13.0101 - val_accuracy: 0.5013 - 80ms/epoch - 2ms/step
## Epoch 16/100
## 47/47 - 0s - loss: 3.5182 - accuracy: 0.6073 - val_loss: 11.1019 - val_accuracy: 0.5013 - 81ms/epoch - 2ms/step
## Epoch 17/100
## 47/47 - 0s - loss: 3.1967 - accuracy: 0.6000 - val_loss: 12.6674 - val_accuracy: 0.5027 - 88ms/epoch - 2ms/step
## Epoch 18/100
## 47/47 - 0s - loss: 3.5379 - accuracy: 0.6080 - val_loss: 11.4331 - val_accuracy: 0.5053 - 90ms/epoch - 2ms/step
## Epoch 19/100
## 47/47 - 0s - loss: 3.0387 - accuracy: 0.6233 - val_loss: 11.5984 - val_accuracy: 0.5053 - 90ms/epoch - 2ms/step
## Epoch 20/100
## 47/47 - 0s - loss: 2.8419 - accuracy: 0.6207 - val_loss: 10.6635 - val_accuracy: 0.4867 - 85ms/epoch - 2ms/step
## Epoch 21/100
## 47/47 - 0s - loss: 3.9931 - accuracy: 0.6093 - val_loss: 10.4876 - val_accuracy: 0.5067 - 89ms/epoch - 2ms/step
## Epoch 22/100
## 47/47 - 0s - loss: 3.4120 - accuracy: 0.6000 - val_loss: 10.4521 - val_accuracy: 0.5347 - 94ms/epoch - 2ms/step
## Epoch 23/100
## 47/47 - 0s - loss: 2.7001 - accuracy: 0.6367 - val_loss: 13.4666 - val_accuracy: 0.5120 - 88ms/epoch - 2ms/step
## Epoch 24/100
## 47/47 - 0s - loss: 2.9629 - accuracy: 0.6060 - val_loss: 9.9912 - val_accuracy: 0.5227 - 85ms/epoch - 2ms/step
## Epoch 25/100
## 47/47 - 0s - loss: 2.2884 - accuracy: 0.6453 - val_loss: 9.9458 - val_accuracy: 0.5240 - 83ms/epoch - 2ms/step
## Epoch 26/100
## 47/47 - 0s - loss: 2.3572 - accuracy: 0.6407 - val_loss: 10.9240 - val_accuracy: 0.5013 - 80ms/epoch - 2ms/step
## Epoch 27/100
## 47/47 - 0s - loss: 2.3190 - accuracy: 0.6420 - val_loss: 10.3131 - val_accuracy: 0.5067 - 80ms/epoch - 2ms/step
## Epoch 28/100
## 47/47 - 0s - loss: 2.4596 - accuracy: 0.6540 - val_loss: 10.2341 - val_accuracy: 0.5040 - 90ms/epoch - 2ms/step
## Epoch 29/100
## 47/47 - 0s - loss: 2.4783 - accuracy: 0.6320 - val_loss: 9.7086 - val_accuracy: 0.5307 - 92ms/epoch - 2ms/step
## Epoch 30/100
## 47/47 - 0s - loss: 2.5239 - accuracy: 0.6460 - val_loss: 10.3411 - val_accuracy: 0.5213 - 102ms/epoch - 2ms/step
## Epoch 31/100
## 47/47 - 0s - loss: 2.7517 - accuracy: 0.6260 - val_loss: 11.0580 - val_accuracy: 0.5320 - 94ms/epoch - 2ms/step
## Epoch 32/100
## 47/47 - 0s - loss: 2.2263 - accuracy: 0.6300 - val_loss: 10.6270 - val_accuracy: 0.5120 - 91ms/epoch - 2ms/step
## Epoch 33/100
## 47/47 - 0s - loss: 1.9377 - accuracy: 0.6607 - val_loss: 10.8650 - val_accuracy: 0.5120 - 89ms/epoch - 2ms/step
## Epoch 34/100
## 47/47 - 0s - loss: 2.2074 - accuracy: 0.6440 - val_loss: 9.8891 - val_accuracy: 0.4987 - 82ms/epoch - 2ms/step
## Epoch 35/100
## 47/47 - 0s - loss: 1.8255 - accuracy: 0.6513 - val_loss: 9.3729 - val_accuracy: 0.5213 - 89ms/epoch - 2ms/step
## Epoch 36/100
## 47/47 - 0s - loss: 1.7513 - accuracy: 0.6540 - val_loss: 9.7910 - val_accuracy: 0.5027 - 82ms/epoch - 2ms/step
## Epoch 37/100
## 47/47 - 0s - loss: 1.7425 - accuracy: 0.6733 - val_loss: 10.6169 - val_accuracy: 0.5200 - 82ms/epoch - 2ms/step
## Epoch 38/100
## 47/47 - 0s - loss: 2.0313 - accuracy: 0.6407 - val_loss: 10.1732 - val_accuracy: 0.4960 - 81ms/epoch - 2ms/step
## Epoch 39/100
## 47/47 - 0s - loss: 1.8696 - accuracy: 0.6573 - val_loss: 9.9096 - val_accuracy: 0.5120 - 85ms/epoch - 2ms/step
## Epoch 40/100
## 47/47 - 0s - loss: 2.1738 - accuracy: 0.6500 - val_loss: 10.0697 - val_accuracy: 0.5267 - 86ms/epoch - 2ms/step
## Epoch 41/100
## 47/47 - 0s - loss: 2.0580 - accuracy: 0.6573 - val_loss: 10.3047 - val_accuracy: 0.5000 - 91ms/epoch - 2ms/step
## Epoch 42/100
## 47/47 - 0s - loss: 2.0123 - accuracy: 0.6473 - val_loss: 9.8234 - val_accuracy: 0.5000 - 93ms/epoch - 2ms/step
## Epoch 43/100
## 47/47 - 0s - loss: 1.4698 - accuracy: 0.6620 - val_loss: 10.1582 - val_accuracy: 0.5200 - 86ms/epoch - 2ms/step
## Epoch 44/100
## 47/47 - 0s - loss: 1.9205 - accuracy: 0.6593 - val_loss: 9.1838 - val_accuracy: 0.5307 - 87ms/epoch - 2ms/step
## Epoch 45/100
## 47/47 - 0s - loss: 1.8189 - accuracy: 0.6753 - val_loss: 9.6248 - val_accuracy: 0.5120 - 90ms/epoch - 2ms/step
## Epoch 46/100
## 47/47 - 0s - loss: 1.7115 - accuracy: 0.6647 - val_loss: 9.7746 - val_accuracy: 0.5160 - 83ms/epoch - 2ms/step
## Epoch 47/100
## 47/47 - 0s - loss: 1.5914 - accuracy: 0.6593 - val_loss: 10.0012 - val_accuracy: 0.4893 - 85ms/epoch - 2ms/step
## Epoch 48/100
## 47/47 - 0s - loss: 2.0462 - accuracy: 0.6533 - val_loss: 10.1705 - val_accuracy: 0.4907 - 84ms/epoch - 2ms/step
## Epoch 49/100
## 47/47 - 0s - loss: 1.7375 - accuracy: 0.6487 - val_loss: 9.8876 - val_accuracy: 0.5187 - 80ms/epoch - 2ms/step
## Epoch 50/100
## 47/47 - 0s - loss: 1.8975 - accuracy: 0.6620 - val_loss: 9.7474 - val_accuracy: 0.5373 - 82ms/epoch - 2ms/step
## Epoch 51/100
## 47/47 - 0s - loss: 1.5605 - accuracy: 0.6847 - val_loss: 9.8519 - val_accuracy: 0.5027 - 91ms/epoch - 2ms/step
## Epoch 52/100
## 47/47 - 0s - loss: 1.4901 - accuracy: 0.6680 - val_loss: 10.2555 - val_accuracy: 0.5173 - 89ms/epoch - 2ms/step
## Epoch 53/100
## 47/47 - 0s - loss: 1.4502 - accuracy: 0.6700 - val_loss: 9.7645 - val_accuracy: 0.4947 - 97ms/epoch - 2ms/step
## Epoch 54/100
## 47/47 - 0s - loss: 1.5067 - accuracy: 0.6560 - val_loss: 9.4868 - val_accuracy: 0.5213 - 95ms/epoch - 2ms/step
# Evaluate the model on the test data
test_results <- model %>% evaluate(
  x = padded_sequences[test_indices, ],
  y = test_data$Score
)
## 24/24 - 0s - loss: 13.0331 - accuracy: 0.4853 - 34ms/epoch - 1ms/step
# Print test results
test_results
##       loss   accuracy 
## 13.0330811  0.4853333
# Evaluate the model on the validation data
validation_results <- model %>% evaluate(
  x = padded_sequences[validation_indices, ],
  y = validation_data$Score
)
## 24/24 - 0s - loss: 9.1838 - accuracy: 0.5307 - 34ms/epoch - 1ms/step
# Print validation results
validation_results
##      loss  accuracy 
## 9.1837606 0.5306666
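
Beyond the single accuracy figure, a cross-tabulation of predicted versus actual labels on the test set (a sketch, using a 0.5 threshold) shows how the errors are distributed across the two classes.

# Predicted class labels on the test set versus the true scores
test_probs <- model %>% predict(padded_sequences[test_indices, ])
test_preds <- as.integer(test_probs >= 0.5)
table(Predicted = test_preds, Actual = test_data$Score)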

Predict a Randomly-Selected Sentiment

# Select a single test-set sequence and predict its sentiment
j <- 8
selected_sequence <- padded_sequences[test_indices[j], , drop = FALSE] # Row of padded_sequences matching test_data[j, ]

# Make predictions on the selected sequence
predictions <- model %>% predict(selected_sequence)
## 1/1 - 0s - 67ms/epoch - 67ms/step
# Interpret the predictions (assuming 0.5 as the threshold for positive/negative sentiment)
if (predictions >= 0.5) {
  sentiment <- "Positive"
} else {
  sentiment <- "Negative"
}

# Print the selected sequence and its sentiment prediction
cat("Selected Sequence:", test_data$Cleaned_Text[j], "\nPredicted Sentiment:", sentiment, "\nActual Sentiment:", ifelse(test_data$Score[j] == 1, "Positive", "Negative"), "\n")
## Selected Sequence:  get absolutely horrible reception apartment phones problem 
## Predicted Sentiment: Negative 
## Actual Sentiment: Negative

Save Model

# Save the trained model (save_model_hdf5 writes HDF5 format regardless of the file extension)
model %>% save_model_hdf5("task2.keras")
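
To confirm the file round-trips cleanly, the saved model can be reloaded and its summary compared with the one above (a quick sketch).

# Reload the saved model and confirm the architecture matches
reloaded_model <- load_model_hdf5("task2.keras")
summary(reloaded_model)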