Load Libraries

library(tidyverse)
library(dplyr)
library(caret)
library(text)
library(tibble)
library(reticulate)
library(randomForest)

Import and Clean Data

# Import the data
# The column types are declared explicitly instead of being guessed: `is_real`
# holds only NA values in this file, so there is nothing for readr to infer
# its type from. The declared types match what the guess produced
# (two character columns and one logical column).
tweet_raw <- read_csv(
  "https://github.com/evanskaylie/DATA622/raw/refs/heads/main/tweets_dataset_new.csv",
  col_types = cols(
    name = col_character(),
    tweet = col_character(),
    is_real = col_logical()
  )
)
## Rows: 14 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, tweet
## lgl (1): is_real
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the data
head(tweet_raw)
## # A tibble: 6 × 3
##   name          tweet                                                    is_real
##   <chr>         <chr>                                                    <lgl>  
## 1 Billie Eilish "To celebrate Earth Day, Billie has teamed up with 3 ri… NA     
## 2 Billie Eilish "BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorro… NA     
## 3 Ryan Reynolds "Back to back to back is a metaphor for an unprecedente… NA     
## 4 Ryan Reynolds "Love these boys. @Wrexham_AFC"                          NA     
## 5 Elon Musk     "Starlink continuity test with Diablo Hardcore. \n\nAny… NA     
## 6 Elon Musk     "Using Tesla Autopilot self-driving massively improves … NA
summary(tweet_raw)
##      name              tweet           is_real       
##  Length:14          Length:14          Mode:logical  
##  Class :character   Class :character   NA's:14       
##  Mode  :character   Mode  :character
# Clean the data
# Normalise a character vector of tweets. The steps run in this order:
#   1. carriage returns / newlines become single spaces
#   2. literal backslash-x hex escapes written out as text (e.g. \xd5) are dropped
#   3. every character that is not alphanumeric, whitespace or punctuation
#      is dropped
#   4. Unicode "other symbol" (So) and "unassigned" (Cn) characters are dropped
#      (PCRE classes, hence perl = TRUE)
#   5. leading and trailing whitespace is trimmed
# Returns a character vector of the same length as `text`.
clean_text <- function(text) {
  without_breaks <- gsub("[\r\n]", " ", text)
  without_hex <- gsub("\\\\x[0-9A-Fa-f]{2}", "", without_breaks)
  basic_chars <- gsub("[^[:alnum:][:space:][:punct:]]", "", without_hex)
  without_symbols <- gsub("[\\p{So}\\p{Cn}]", "", basic_chars, perl = TRUE)
  trimws(without_symbols)
}

# The column types are correct, and the comma in "Tyler, The Creator" did not
# break the CSV parsing as had been feared.
tweet_df <- tweet_raw

# Re-encode to UTF-8, dropping any bytes that cannot be converted (sub = "").
# NOTE(review): from = "" means "the current locale's encoding" — confirm this
# is what is wanted when the script runs on a non-UTF-8 system.
tweet_df$tweet <- iconv(tweet_df$tweet, from = "", to = "UTF-8", sub = "")

# Update the raw data to the clean tweets.
# clean_text() is built from vectorised gsub()/trimws() calls, so it is applied
# to the whole column at once. Wrapping it in sapply() would attach each raw
# tweet as the name of its cleaned value, and those names end up as data-frame
# row names once the emotion scores are bound on.
tweet_df$tweet <- clean_text(tweet_df$tweet)

Feature Analysis + Splitting

Python interface (via reticulate) to run the neural network analyses

A neural network will be used to get new columns from the tweets.

Specifically, the new columns are a sentiment label and score (positive/negative) and emotion probabilities (anger, joy, etc.).

Use Neural Network to get Sentiment Predictors

# Create a Python environment and install transformers
# Installs the `transformers` and `torch` Python packages with pip through
# reticulate; both are imported by the classifier code that follows.
# NOTE(review): this call executes every time the script runs and no package
# versions are pinned — consider pinning versions or installing once outside
# the analysis script.
reticulate::py_install(c("transformers", "torch"), pip = TRUE)
## Using virtual environment '/Users/kaylieevans/.virtualenvs/r-reticulate' ...
## + /Users/kaylieevans/.virtualenvs/r-reticulate/bin/python -m pip install --upgrade --no-user transformers torch
# Load Python libraries
# `import()` comes from reticulate and returns a handle to the Python module.
transformers <- import("transformers")
torch <- import("torch")

# Load sentiment pipeline classifier
# Builds a "sentiment-analysis" pipeline from the named DistilBERT checkpoint.
# `revision` pins the model to a specific commit so results stay reproducible.
classifier <- transformers$pipeline(
  "sentiment-analysis",
  model = "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
  revision = "714eb0f"
)

# Run classifier on the tweet column
# Each tweet is classified on its own and the first element of the returned
# list is kept, as before. lapply()/vapply() over an unnamed vector are used
# instead of sapply() so that the tweet text is not attached as element names
# to the new columns, and so the result types are fixed.
sentiment_results <- lapply(unname(tweet_df$tweet), function(x) {
  classifier(x)[[1]]
})
sentiment_labels <- vapply(sentiment_results, function(r) r$label, character(1))
sentiment_scores <- vapply(sentiment_results, function(r) r$score, numeric(1))

# Combined "LABEL:score" string, kept so the column set is unchanged
tweet_df$sentiment_raw <- paste(sentiment_labels, sentiment_scores, sep = ":")

# Label and score are taken straight from the classifier output rather than
# parsed back out of the pasted string, which avoided neither the double
# strsplit() nor the rounding introduced by formatting the score as text.
tweet_df$sentiment_val <- sentiment_labels
tweet_df$sentiment_score <- sentiment_scores

# Binary encoding for the sentiment val column (1 = POSITIVE, 0 = otherwise)
tweet_df$sentiment_val_binary <- ifelse(tweet_df$sentiment_val == "POSITIVE", 1, 0)

# Preview Data
summary(tweet_df)
##      name              tweet           is_real        sentiment_raw     
##  Length:14          Length:14          Mode:logical   Length:14         
##  Class :character   Class :character   NA's:14        Class :character  
##  Mode  :character   Mode  :character                  Mode  :character  
##                                                                         
##                                                                         
##                                                                         
##  sentiment_val      sentiment_score  sentiment_val_binary
##  Length:14          Min.   :0.6992   Min.   :0.0000      
##  Class :character   1st Qu.:0.9798   1st Qu.:1.0000      
##  Mode  :character   Median :0.9978   Median :1.0000      
##                     Mean   :0.9495   Mean   :0.7857      
##                     3rd Qu.:0.9991   3rd Qu.:1.0000      
##                     Max.   :0.9998   Max.   :1.0000
head(tweet_df, 1)
## # A tibble: 1 × 7
##   name          tweet        is_real sentiment_raw sentiment_val sentiment_score
##   <chr>         <chr>        <lgl>   <chr>         <chr>                   <dbl>
## 1 Billie Eilish To celebrat… NA      POSITIVE:0.9… POSITIVE                0.999
## # ℹ 1 more variable: sentiment_val_binary <dbl>

Use Neural Network to get Emotional Predictors

# Load emotion classifier pipeline
# Builds a "text-classification" pipeline from the named DistilRoBERTa emotion
# checkpoint. `return_all_scores = TRUE` asks for a score for every label
# instead of only the top one; the code that consumes the result relies on that.
# NOTE(review): unlike the sentiment pipeline, no `revision` is pinned here, so
# the model weights may change between runs — consider pinning a commit.
# NOTE(review): `return_all_scores` may be deprecated in newer transformers
# releases — confirm against the installed version before upgrading.
emotion_classifier <- transformers$pipeline(
  "text-classification",
  model = "j-hartmann/emotion-english-distilroberta-base",
  return_all_scores = TRUE
)

# Emotion labels expected in the output, in the column order used for the
# combined matrix.
emotion_labels <- c("anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise")

# Run the classifier on tweets
# unname() keeps any names carried by the tweet column (for example the raw
# tweet text) from being inherited by the result list and, through rbind(),
# from becoming row names of the score matrix.
emotion_scores <- lapply(unname(tweet_df$tweet), function(x) {
  result <- emotion_classifier(x)[[1]]
  # Convert to a named vector of probabilities: one score per emotion label.
  # vapply() fixes the element types, unlike sapply().
  setNames(
    vapply(result, function(r) r$score, numeric(1)),
    vapply(result, function(r) r$label, character(1))
  )
})

# Combine results into one row per tweet and one column per emotion.
# Note that despite its name, `emotion_df` is a numeric matrix.
emotion_df <- do.call(rbind, lapply(emotion_scores, function(x) {
  # Any expected emotion the classifier did not return is filled with 0
  x[setdiff(emotion_labels, names(x))] <- 0
  x[emotion_labels]
}))

# Add the emotion probabilities to tweet_df
# cbind() on a data frame and a matrix takes the matrix's row names as the
# data frame's row names. Any row names on the score matrix (such as the tweet
# text) are dropped first so that tweet_df keeps plain 1..n row names.
rownames(emotion_df) <- NULL
tweet_df <- cbind(tweet_df, emotion_df)

# Preview Data
head(tweet_df)
##                                                                                                                                                                                                                                                                                                                                          name
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store Billie Eilish
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                       Billie Eilish
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                            Ryan Reynolds
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                   Ryan Reynolds
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                            Elon Musk
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                         Elon Musk
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           tweet
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                                                                                                                                                                                                             BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                                                                                                                                                                       Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     Love these boys. @Wrexham_AFC
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                                                                                                                                                                                                                 Starlink continuity test with Diablo Hardcore.   Any major lag or a brief loss of connection means permadeath, so it is the best test.
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                                                                                                                                                                                                                                                         Using Tesla Autopilot self-driving massively improves safety for you and others on the road
##                                                                                                                                                                                                                                                                                                                                 is_real
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store      NA
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                            NA
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                                 NA
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                        NA
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                             NA
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                          NA
##                                                                                                                                                                                                                                                                                                                                              sentiment_raw
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store POSITIVE:0.998986780643463
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                        POSITIVE:0.99777102470398
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                            POSITIVE:0.974160969257355
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                   POSITIVE:0.999803960323334
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                        POSITIVE:0.771799087524414
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                     POSITIVE:0.996823191642761
##                                                                                                                                                                                                                                                                                                                                 sentiment_val
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store      POSITIVE
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                            POSITIVE
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                                 POSITIVE
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                        POSITIVE
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                             POSITIVE
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                          POSITIVE
##                                                                                                                                                                                                                                                                                                                                 sentiment_score
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store       0.9989868
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                             0.9977710
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                                  0.9741610
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                         0.9998040
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                              0.7717991
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                           0.9968232
##                                                                                                                                                                                                                                                                                                                                 sentiment_val_binary
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store                    1
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                                          1
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                                               1
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                                      1
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                                           1
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                                        1
##                                                                                                                                                                                                                                                                                                                                       anger
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.009951324
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                       0.025626555
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                            0.014392977
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                   0.010342427
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                        0.003674293
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                     0.011271660
##                                                                                                                                                                                                                                                                                                                                     disgust
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.003928915
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                       0.001621294
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                            0.055173125
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                   0.001098096
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                        0.001680017
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                     0.004726662
##                                                                                                                                                                                                                                                                                                                                        fear
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.002165676
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                       0.005382390
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                            0.001207569
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                   0.001976798
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                        0.007144672
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                     0.139299005
##                                                                                                                                                                                                                                                                                                                                        joy
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.10378531
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                       0.07971518
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                            0.01492078
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                   0.95849103
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                        0.11069606
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                     0.03138582
##                                                                                                                                                                                                                                                                                                                                     neutral
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.839183569
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                       0.640678763
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                            0.898435831
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                   0.006809928
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                        0.794360280
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                     0.776941359
##                                                                                                                                                                                                                                                                                                                                     sadness
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.003338196
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                       0.012384440
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                            0.005729300
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                   0.018889830
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                        0.027738208
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                     0.010570211
##                                                                                                                                                                                                                                                                                                                                    surprise
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.037647042
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR                                                                                                                                                                                       0.234591424
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.                                                                                                                                            0.010140403
## Love these boys. @Wrexham_AFC                                                                                                                                                                                                                                                                                                   0.002391844
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test.                                                                                                                                                                                        0.054706544
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road                                                                                                                                                                                                                                     0.025805326

Export the data

# Save the tweet data frame as a CSV in the working directory,
# leaving out the row-name column so the file has data columns only
write.csv(
  x = tweet_df,
  file = "new_tweet_df.csv",
  row.names = FALSE
)