library(tidyverse)
library(dplyr)
library(caret)
library(text)
library(tibble)
library(reticulate)
library(randomForest)
# Read the tweets dataset straight from the GitHub-hosted CSV
tweet_raw <- read_csv(
  file = "https://github.com/evanskaylie/DATA622/raw/refs/heads/main/tweets_dataset_new.csv"
)
## Rows: 14 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, tweet
## lgl (1): is_real
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the imported data (head() shows six by default)
head(tweet_raw)
## # A tibble: 6 × 3
## name tweet is_real
## <chr> <chr> <lgl>
## 1 Billie Eilish "To celebrate Earth Day, Billie has teamed up with 3 ri… NA
## 2 Billie Eilish "BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorro… NA
## 3 Ryan Reynolds "Back to back to back is a metaphor for an unprecedente… NA
## 4 Ryan Reynolds "Love these boys. @Wrexham_AFC" NA
## 5 Elon Musk "Starlink continuity test with Diablo Hardcore. \n\nAny… NA
## 6 Elon Musk "Using Tesla Autopilot self-driving massively improves … NA
# Per-column summary of the raw data: length, class and number of NAs
summary(tweet_raw)
## name tweet is_real
## Length:14 Length:14 Mode:logical
## Class :character Class :character NA's:14
## Mode :character Mode :character
# Text-cleaning helper for tweets.
#
# Takes a character vector `text` and returns a character vector of the same
# length in which, step by step:
#   1. carriage returns / line feeds are each replaced by a single space,
#   2. literal hex escape sequences such as \xd5 are removed,
#   3. anything that is not alphanumeric, whitespace or punctuation is removed,
#   4. remaining "other symbol" / unassigned Unicode characters are removed,
#   5. leading and trailing whitespace is trimmed.
clean_text <- function(text) {
  no_breaks <- gsub("[\r\n]", " ", text)
  no_hex <- gsub("\\\\x[0-9A-Fa-f]{2}", "", no_breaks)
  printable <- gsub("[^[:alnum:][:space:][:punct:]]", "", no_hex)
  # \p{So} / \p{Cn} are Unicode property classes, hence perl = TRUE
  no_symbols <- gsub("[\\p{So}\\p{Cn}]", "", printable, perl = TRUE)
  trimws(no_symbols)
}
# Column types were parsed correctly, and the comma in "Tyler, The Creator"
# did not break the CSV delimiting as had been feared
tweet_df <- tweet_raw
# readr hands back UTF-8 strings, so state the source encoding explicitly
# rather than relying on the session locale (from = ""); sub = "" drops any
# byte sequence that is not valid in that encoding
tweet_df$tweet <- iconv(tweet_df$tweet, from = "UTF-8", to = "UTF-8", sub = "")
# Replace the raw tweets with their cleaned versions.
# vapply() with USE.NAMES = FALSE keeps the column a plain, unnamed character
# vector. sapply() would name every element with the full raw tweet, and
# those names would later surface as the row names of the combined data frame.
tweet_df$tweet <- vapply(
  tweet_df$tweet,
  clean_text,
  character(1),
  USE.NAMES = FALSE
)
# Python bridge (reticulate) for the neural network analyses
# Pre-trained neural networks are used to derive new columns from the tweets:
# specifically, a sentiment label and score (positive/negative) and emotion probabilities (anger, joy, etc.).
# Install the Python packages the classifiers need, using pip, into the
# Python environment reticulate selects
reticulate::py_install(
  packages = c("transformers", "torch"),
  pip = TRUE
)
## Using virtual environment '/Users/kaylieevans/.virtualenvs/r-reticulate' ...
## + /Users/kaylieevans/.virtualenvs/r-reticulate/bin/python -m pip install --upgrade --no-user transformers torch
# Bring the Python modules into the R session through reticulate
transformers <- reticulate::import(module = "transformers")
torch <- reticulate::import(module = "torch")
# Build the sentiment-analysis pipeline from the DistilBERT SST-2 checkpoint,
# pinned to a fixed model revision
classifier <- transformers$pipeline(
  task = "sentiment-analysis",
  model = "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
  revision = "714eb0f"
)
# Run the sentiment classifier once per tweet. Each call returns a list whose
# first element holds the predicted `label` and its `score`. unname() keeps
# the tweet text from being carried along as element names.
sentiment_results <- lapply(
  unname(tweet_df$tweet),
  function(x) classifier(x)[[1]]
)
# Pull label and score straight out of the results. vapply() enforces exactly
# one character label / one numeric score per tweet and fails loudly
# otherwise, whereas sapply() can silently change its return shape.
sentiment_labels <- vapply(sentiment_results, function(r) r$label, character(1))
sentiment_scores <- vapply(sentiment_results, function(r) r$score, numeric(1))
# Combined "LABEL:score" column, kept so the output keeps the same columns in
# the same order as before
tweet_df$sentiment_raw <- paste(sentiment_labels, sentiment_scores, sep = ":")
# Label and score as separate columns. They are taken directly from the
# classifier output instead of being parsed back out of sentiment_raw, so a
# label containing ":" cannot corrupt them and the score is not rounded by a
# trip through text.
tweet_df$sentiment_val <- sentiment_labels
tweet_df$sentiment_score <- sentiment_scores
# Binary encoding for the sentiment label: 1 for POSITIVE, 0 otherwise
tweet_df$sentiment_val_binary <- ifelse(sentiment_labels == "POSITIVE", 1, 0)
# Summarise every column now that the sentiment columns have been added
summary(tweet_df)
## name tweet is_real sentiment_raw
## Length:14 Length:14 Mode:logical Length:14
## Class :character Class :character NA's:14 Class :character
## Mode :character Mode :character Mode :character
##
##
##
## sentiment_val sentiment_score sentiment_val_binary
## Length:14 Min. :0.6992 Min. :0.0000
## Class :character 1st Qu.:0.9798 1st Qu.:1.0000
## Mode :character Median :0.9978 Median :1.0000
## Mean :0.9495 Mean :0.7857
## 3rd Qu.:0.9991 3rd Qu.:1.0000
## Max. :0.9998 Max. :1.0000
# Show only the first row to spot-check the new sentiment columns
head(tweet_df, 1)
## # A tibble: 1 × 7
## name tweet is_real sentiment_raw sentiment_val sentiment_score
## <chr> <chr> <lgl> <chr> <chr> <dbl>
## 1 Billie Eilish To celebrat… NA POSITIVE:0.9… POSITIVE 0.999
## # ℹ 1 more variable: sentiment_val_binary <dbl>
# Build the emotion text-classification pipeline; return_all_scores = TRUE
# asks for a score for every emotion label rather than only the top one.
# NOTE(review): recent transformers releases deprecate return_all_scores in
# favour of top_k, but switching changes the nesting of the result - confirm
# against the installed version before changing it.
emotion_classifier <- transformers$pipeline(
  task = "text-classification",
  model = "j-hartmann/emotion-english-distilroberta-base",
  return_all_scores = TRUE
)
# Run the emotion classifier on every tweet.
# The result is a list with one element per tweet; each element is a numeric
# vector of scores named by emotion label.
# unname() stops any names on the tweet vector (the tweet text itself) from
# becoming list names and, further down, matrix/data-frame row names.
emotion_scores <- lapply(unname(tweet_df$tweet), function(x) {
  result <- emotion_classifier(x)[[1]]
  # vapply() enforces one numeric score and one character label per entry,
  # unlike sapply(), whose return type depends on its input
  setNames(
    vapply(result, function(r) r$score, numeric(1)),
    vapply(result, function(r) r$label, character(1))
  )
})
# Stack the per-tweet score vectors row-wise: one row per tweet, one column
# per emotion, always in the same column order
emotion_df <- do.call(rbind, lapply(emotion_scores, function(scores) {
  emotions <- c("anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise")
  # Any emotion the classifier did not report is filled in with 0
  absent <- setdiff(emotions, names(scores))
  scores[absent] <- 0
  scores[emotions]
}))
# Add the emotion probabilities to tweet_df
tweet_df <- cbind(tweet_df, emotion_df)
# cbind() adopts any row names present on emotion_df, which can be the full
# tweet text; reset them so the data frame has plain integer row names and
# prints compactly
rownames(tweet_df) <- NULL
# Preview the first rows with the emotion probability columns attached
head(tweet_df)
## name
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store Billie Eilish
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR Billie Eilish
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. Ryan Reynolds
## Love these boys. @Wrexham_AFC Ryan Reynolds
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. Elon Musk
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road Elon Musk
## tweet
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that.
## Love these boys. @Wrexham_AFC Love these boys. @Wrexham_AFC
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. Starlink continuity test with Diablo Hardcore. Any major lag or a brief loss of connection means permadeath, so it is the best test.
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road Using Tesla Autopilot self-driving massively improves safety for you and others on the road
## is_real
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store NA
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR NA
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. NA
## Love these boys. @Wrexham_AFC NA
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. NA
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road NA
## sentiment_raw
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store POSITIVE:0.998986780643463
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR POSITIVE:0.99777102470398
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. POSITIVE:0.974160969257355
## Love these boys. @Wrexham_AFC POSITIVE:0.999803960323334
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. POSITIVE:0.771799087524414
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road POSITIVE:0.996823191642761
## sentiment_val
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store POSITIVE
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR POSITIVE
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. POSITIVE
## Love these boys. @Wrexham_AFC POSITIVE
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. POSITIVE
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road POSITIVE
## sentiment_score
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.9989868
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR 0.9977710
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. 0.9741610
## Love these boys. @Wrexham_AFC 0.9998040
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. 0.7717991
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road 0.9968232
## sentiment_val_binary
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 1
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR 1
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. 1
## Love these boys. @Wrexham_AFC 1
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. 1
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road 1
## anger
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.009951324
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR 0.025626555
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. 0.014392977
## Love these boys. @Wrexham_AFC 0.010342427
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. 0.003674293
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road 0.011271660
## disgust
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.003928915
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR 0.001621294
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. 0.055173125
## Love these boys. @Wrexham_AFC 0.001098096
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. 0.001680017
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road 0.004726662
## fear
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.002165676
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR 0.005382390
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. 0.001207569
## Love these boys. @Wrexham_AFC 0.001976798
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. 0.007144672
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road 0.139299005
## joy
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.10378531
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR 0.07971518
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. 0.01492078
## Love these boys. @Wrexham_AFC 0.95849103
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. 0.11069606
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road 0.03138582
## neutral
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.839183569
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR 0.640678763
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. 0.898435831
## Love these boys. @Wrexham_AFC 0.006809928
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. 0.794360280
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road 0.776941359
## sadness
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.003338196
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR 0.012384440
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. 0.005729300
## Love these boys. @Wrexham_AFC 0.018889830
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. 0.027738208
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road 0.010570211
## surprise
## To celebrate Earth Day, Billie has teamed up with 3 rising businesses that are leading the charge in the sustainable production space. These capsules showcase how style, innovation, and creativity can coexist in the things we make, while being mindful of the impact on our planet. Shop the capsules now on Billies store 0.037647042
## BERLIN: The HIT ME HARD AND SOFT Pop-Up begins tomorrow! Open May 9 + 10 from 12:00 UHR to 17:00 UHR & May 11 from 14:00 UHR to 20:00 UHR 0.234591424
## Back to back to back is a metaphor for an unprecedented run of promotions and not for drinking a bunch of delicious beverages in a row but one could be forgiven for inferring that. 0.010140403
## Love these boys. @Wrexham_AFC 0.002391844
## Starlink continuity test with Diablo Hardcore. \n\nAny major lag or a brief loss of connection means permadeath, so it is the best test. 0.054706544
## Using Tesla Autopilot self-driving massively improves safety for you and others on the road 0.025805326
# Save the enriched tweets to CSV in the working directory, without a
# row-name column
write.csv(
  x = tweet_df,
  file = "new_tweet_df.csv",
  row.names = FALSE
)