The following RMD contains the code supporting the final assignment for CUNY SPS DATA 622, Spring 2025. The dataset comes from a hackathon project called TweetLike and was later cleaned and restructured to support machine learning projects; that cleaned version is used in this project. The data is sourced from Kaggle (https://www.kaggle.com/datasets/abaghyangor/celebrity-tweets/data) and contains real and AI-generated tweets from celebrities on X/Twitter, which are to be used in a predictor algorithm for social media management.
library(tidyverse)
library(dplyr)
library(caret)
library(text)
library(tibble)
library(reticulate)
library(randomForest)
# Import the data with an explicit column specification so the parsed types
# (two character columns and a logical label) never depend on readr's guessing
tweet_raw <- read_csv(
  "https://raw.githubusercontent.com/evanskaylie/DATA622/refs/heads/main/tweets_dataset.csv",
  col_types = cols(
    name = col_character(),
    tweet = col_character(),
    is_real = col_logical()
  )
)
# Preview the first rows
head(tweet_raw)
## # A tibble: 6 × 3
## name tweet is_real
## <chr> <chr> <lgl>
## 1 Billie Eilish i’m fine. just floating in a hoodie-shaped cloud of exi… FALSE
## 2 Ryan Reynolds I watched Frozen without my two-year-old this morning. … TRUE
## 3 Billie Eilish people really be like “you’ve changed” like that’s not … FALSE
## 4 Billie Eilish people really be like “you’ve changed” like that’s not … FALSE
## 5 Elon Musk Nuke Mars! TRUE
## 6 Kanye West Have you ever thought you were in love with someone but… TRUE
# Summarise each column: length and class for the character columns, and the
# FALSE/TRUE counts for the logical is_real label
summary(tweet_raw)
## name tweet is_real
## Length:120 Length:120 Mode :logical
## Class :character Class :character FALSE:56
## Mode :character Mode :character TRUE :64
# The column types parsed as intended, and the comma inside
# "Tyler, The Creator" did not break the CSV delimiting for that row
tweet_df <- tweet_raw
# Count missing values across every cell of the data frame
tweet_df %>%
  is.na() %>%
  sum()
## [1] 0
Python, accessed from R through the reticulate package, is used to run the neural network analyses
A neural network will be used to get new columns from the tweets.
Specifically, sentiment score (positive/negative) and emotion probabilities (anger, joy, etc.)
# Create a Python environment and install transformers
# NOTE(review): this install runs on every execution, and the logged pip
# command below includes `--upgrade`, so package versions can drift between
# runs — consider pinning versions.
reticulate::py_install(c("transformers", "torch"), pip = TRUE)
## Using virtual environment '/Users/kaylieevans/.virtualenvs/r-reticulate' ...
## + /Users/kaylieevans/.virtualenvs/r-reticulate/bin/python -m pip install --upgrade --no-user transformers torch
# Load Python libraries
# `transformers` supplies the pipeline constructors used below.
transformers <- import("transformers")
# The `torch` handle is not referenced again in the visible code.
torch <- import("torch")
# Load the sentiment pipeline classifier (model and revision are both fixed)
classifier <- transformers$pipeline(
  "sentiment-analysis",
  model = "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
  revision = "714eb0f"
)
# Run the classifier on every tweet and store the result as "LABEL:score".
# vapply() guarantees exactly one string per tweet, and USE.NAMES = FALSE stops
# the full tweet text from being attached as element names (sapply() did that,
# and the names then propagated into the derived columns).
tweet_df$sentiment_raw <- vapply(
  tweet_df$tweet,
  function(x) {
    result <- classifier(x)[[1]]
    paste(result$label, result$score, sep = ":")
  },
  character(1),
  USE.NAMES = FALSE
)
# Split "LABEL:score" once, then pull the label and the score into columns
sentiment_parts <- strsplit(tweet_df$sentiment_raw, ":", fixed = TRUE)
tweet_df$sentiment_val <- vapply(sentiment_parts, `[`, character(1), 1)
tweet_df$sentiment_score <- as.numeric(
  vapply(sentiment_parts, `[`, character(1), 2)
)
# Binary encoding for the sentiment label: 1 = "POSITIVE", 0 = anything else
tweet_df$sentiment_val_binary <- ifelse(tweet_df$sentiment_val == "POSITIVE", 1, 0)
# Load the emotion classifier pipeline, asking for a score for every label
# NOTE(review): no `revision` is pinned for this model, unlike the sentiment
# pipeline; recent transformers releases also appear to flag
# `return_all_scores` as deprecated in favour of `top_k` — confirm before
# upgrading.
emotion_classifier <- transformers$pipeline(
  "text-classification",
  model = "j-hartmann/emotion-english-distilroberta-base",
  return_all_scores = TRUE
)
# Emotion labels to keep, in the column order used for the final data frame
# (defined once instead of on every row)
emotions <- c(
  "anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"
)
# Run the classifier on each tweet and collect a named vector of probabilities;
# vapply() enforces one numeric score and one character label per entry
emotion_scores <- lapply(tweet_df$tweet, function(x) {
  result <- emotion_classifier(x)[[1]]
  setNames(
    vapply(result, function(r) r$score, numeric(1)),
    vapply(result, function(r) r$label, character(1))
  )
})
# Stack the per-tweet vectors into a matrix with one column per emotion;
# any label missing from a result is filled with 0
emotion_df <- do.call(rbind, lapply(emotion_scores, function(x) {
  x[setdiff(emotions, names(x))] <- 0
  x[emotions]
}))
# Add the emotion probabilities to tweet_df
tweet_df <- cbind(tweet_df, emotion_df)
# Preview Data
# Summarise every column now that the sentiment and emotion features are added
summary(tweet_df)
## name tweet is_real sentiment_raw
## Length:120 Length:120 Mode :logical Length:120
## Class :character Class :character FALSE:56 Class :character
## Mode :character Mode :character TRUE :64 Mode :character
##
##
##
## sentiment_val sentiment_score sentiment_val_binary anger
## Length:120 Min. :0.6006 Min. :0.0000 Min. :0.0007579
## Class :character 1st Qu.:0.9768 1st Qu.:0.0000 1st Qu.:0.0042723
## Mode :character Median :0.9945 Median :0.0000 Median :0.0187627
## Mean :0.9511 Mean :0.3667 Mean :0.0922059
## 3rd Qu.:0.9988 3rd Qu.:1.0000 3rd Qu.:0.0587539
## Max. :0.9999 Max. :1.0000 Max. :0.6127625
## disgust fear joy neutral
## Min. :0.0003042 Min. :0.0005585 Min. :0.001415 Min. :0.001312
## 1st Qu.:0.0050158 1st Qu.:0.0026508 1st Qu.:0.003538 1st Qu.:0.035472
## Median :0.0165695 Median :0.0100859 Median :0.010174 Median :0.244099
## Mean :0.0417464 Mean :0.0912046 Mean :0.108731 Mean :0.355420
## 3rd Qu.:0.0639502 3rd Qu.:0.0300521 3rd Qu.:0.021943 3rd Qu.:0.604242
## Max. :0.3163918 Max. :0.9852785 Max. :0.982051 Max. :0.966248
## sadness surprise
## Min. :0.001523 Min. :0.0005985
## 1st Qu.:0.006061 1st Qu.:0.0078853
## Median :0.021070 Median :0.0414546
## Mean :0.144213 Mean :0.1664800
## 3rd Qu.:0.129918 3rd Qu.:0.1511820
## Max. :0.981419 Max. :0.9558494
# Show the first row in full to inspect one complete record
head(tweet_df, 1)
## name
## 1 Billie Eilish
## tweet
## 1 i’m fine. just floating in a hoodie-shaped cloud of existential dread.
## is_real sentiment_raw sentiment_val sentiment_score
## 1 FALSE POSITIVE:0.600636720657349 POSITIVE 0.6006367
## sentiment_val_binary anger disgust fear joy
## 1 1 0.001180525 0.000304157 0.9852785 0.005264891
## neutral sadness surprise
## 1 0.001312157 0.006061158 0.0005985099
# Set seed for reproducibility
set.seed(64)
# Drop repeated tweets before splitting. Every feature is derived from the
# tweet text alone, so a tweet that occurs more than once could otherwise land
# in both partitions and leak its label into the test set. (The preview of the
# raw data shows what looks like the same Billie Eilish row twice — TODO
# confirm the full text matches.)
# NOTE(review): metrics rendered before this change were computed on the
# un-deduplicated data and need to be regenerated.
tweet_model_df <- tweet_df[!duplicated(tweet_df[, c("tweet", "is_real")]), ]
# Create partition of 80-20; tweet_index holds row positions in tweet_model_df
tweet_index <- createDataPartition(tweet_model_df$is_real, p = .8, list = FALSE)
# Split X into training and testing (exclude 'is_real' column)
predictor_cols <- !(names(tweet_model_df) %in% "is_real")
X_train <- tweet_model_df[tweet_index, predictor_cols]
X_test <- tweet_model_df[-tweet_index, predictor_cols]
# Split y into training and testing (only the 'is_real' column)
y_train <- tweet_model_df[tweet_index, "is_real", drop = TRUE]
y_test <- tweet_model_df[-tweet_index, "is_real", drop = TRUE]
# Keep only the numeric model features: drop identifiers, raw text and the
# string forms of the sentiment result
exclude_cols <- c("name", "tweet", "sentiment_raw", "sentiment_val")
X_train_rf <- X_train[, !(names(X_train) %in% exclude_cols)]
X_test_rf <- X_test[, !(names(X_test) %in% exclude_cols)]
# Fix the RNG state so the fitted forest is repeatable
set.seed(64)
# Fit a 500-tree random forest on the training features, with the logical
# labels converted to a factor; importance = TRUE keeps the variable
# importance measures for later inspection
rf_model <- randomForest(
  x = X_train_rf,
  y = as.factor(y_train),
  ntree = 500,
  importance = TRUE
)
# Score the held-out rows
rf_preds <- predict(rf_model, newdata = X_test_rf)
# Compare predictions against the true test labels and print the statistics
test_labels <- as.factor(y_test)
conf_mat <- confusionMatrix(data = rf_preds, reference = test_labels)
print(conf_mat)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 11 0
## TRUE 0 12
##
## Accuracy : 1
## 95% CI : (0.8518, 1)
## No Information Rate : 0.5217
## P-Value [Acc > NIR] : 3.173e-07
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4783
## Detection Rate : 0.4783
## Detection Prevalence : 0.4783
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : FALSE
##
# Visualize variable importance
# (rf_model was fitted with importance = TRUE, so the per-variable importance
# measures are available to plot)
varImpPlot(rf_model, main = "Random Forest Feature Importance")
This is data I manually collected from each named celebrity’s Twitter account.
The first set of data comes from 2020, before ChatGPT popularized generative AI for common tasks. The second set of data comes from each celebrity’s most recent tweet.
# Import the manually collected tweets with an explicit column specification.
# is_real is entirely NA in this file, so its logical type would otherwise
# come only from readr's guessing; every column not named here is a numeric
# feature.
new_tweet_raw <- read_csv(
  "https://raw.githubusercontent.com/evanskaylie/DATA622/refs/heads/main/new_tweet_df.csv",
  col_types = cols(
    name = col_character(),
    tweet = col_character(),
    is_real = col_logical(),
    sentiment_raw = col_character(),
    sentiment_val = col_character(),
    .default = col_double()
  )
)
# Preview the first rows
head(new_tweet_raw)
## # A tibble: 6 × 14
## name tweet is_real sentiment_raw sentiment_val sentiment_score
## <chr> <chr> <lgl> <chr> <chr> <dbl>
## 1 Billie Eilish To celebrat… NA POSITIVE:0.9… POSITIVE 0.999
## 2 Billie Eilish BERLIN: The… NA POSITIVE:0.9… POSITIVE 0.998
## 3 Ryan Reynolds Back to bac… NA POSITIVE:0.9… POSITIVE 0.974
## 4 Ryan Reynolds Love these … NA POSITIVE:0.9… POSITIVE 1.00
## 5 Elon Musk Starlink co… NA POSITIVE:0.7… POSITIVE 0.772
## 6 Elon Musk Using Tesla… NA POSITIVE:0.9… POSITIVE 0.997
## # ℹ 8 more variables: sentiment_val_binary <dbl>, anger <dbl>, disgust <dbl>,
## # fear <dbl>, joy <dbl>, neutral <dbl>, sadness <dbl>, surprise <dbl>
# Column types parsed as intended and no embedded commas broke any rows
new_tweet_df <- new_tweet_raw
# Keep only the feature columns so the input matches what the model expects
# NOTE(review): exclude_cols and rf_preds reuse names assigned earlier in the
# script, so the earlier values are overwritten from here on.
exclude_cols <- c("name", "tweet", "sentiment_raw", "sentiment_val", "is_real")
feature_cols <- names(new_tweet_df)[!(names(new_tweet_df) %in% exclude_cols)]
X_new_rf <- new_tweet_df[feature_cols]
# Score the new tweets with the trained random forest
rf_preds <- predict(rf_model, newdata = X_new_rf)
# Attach the predictions to the new data frame
new_tweet_df[["predicted_is_real"]] <- rf_preds
# Show each prediction next to the celebrity's name
new_tweet_df[c("name", "predicted_is_real")]
## # A tibble: 14 × 2
## name predicted_is_real
## <chr> <fct>
## 1 Billie Eilish FALSE
## 2 Billie Eilish FALSE
## 3 Ryan Reynolds TRUE
## 4 Ryan Reynolds FALSE
## 5 Elon Musk FALSE
## 6 Elon Musk TRUE
## 7 Kanye West FALSE
## 8 Kanye West FALSE
## 9 Taylor Swift FALSE
## 10 Taylor Swift FALSE
## 11 Conan O'Brien FALSE
## 12 Conan O'Brien FALSE
## 13 Tyler, the Creator FALSE
## 14 Tyler, the Creator TRUE