Introduction

This R Markdown document contains the code supporting the final assignment for CUNY SPS DATA 622, Spring 2025. The dataset comes from a hackathon project called TweetLike and was later cleaned and restructured to support machine learning projects; the cleaned version is used here. The data is sourced from Kaggle (https://www.kaggle.com/datasets/abaghyangor/celebrity-tweets/data) and contains real and AI-generated celebrity tweets from X/Twitter, intended for use in a predictor algorithm for social media management.

Load Libraries

library(tidyverse)
library(dplyr)
library(caret)
library(text)
library(tibble)
library(reticulate)
library(randomForest)

Import and Clean Data

# Read the tweet dataset from the DATA622 GitHub repository
tweet_url <- "https://raw.githubusercontent.com/evanskaylie/DATA622/refs/heads/main/tweets_dataset.csv"
tweet_raw <- read_csv(tweet_url)

# Show the first rows
head(tweet_raw)
## # A tibble: 6 × 3
##   name          tweet                                                    is_real
##   <chr>         <chr>                                                    <lgl>  
## 1 Billie Eilish i’m fine. just floating in a hoodie-shaped cloud of exi… FALSE  
## 2 Ryan Reynolds I watched Frozen without my two-year-old this morning. … TRUE   
## 3 Billie Eilish people really be like “you’ve changed” like that’s not … FALSE  
## 4 Billie Eilish people really be like “you’ve changed” like that’s not … FALSE  
## 5 Elon Musk     Nuke Mars!                                               TRUE   
## 6 Kanye West    Have you ever thought you were in love with someone but… TRUE
# Summarise column classes and the FALSE/TRUE balance of is_real
summary(tweet_raw)
##      name              tweet            is_real       
##  Length:120         Length:120         Mode :logical  
##  Class :character   Class :character   FALSE:56       
##  Mode  :character   Mode  :character   TRUE :64
# Column types parsed as intended (see the summary above), and the comma in
# "Tyler, The Creator" did not break the CSV field delimiting.
# NOTE(review): rows 3 and 4 of the preview look like the same tweet. If the
# data contains exact duplicates they can land on both sides of the train/test
# split and inflate test accuracy — TODO confirm, e.g. with
# sum(duplicated(tweet_raw)).
tweet_df <- tweet_raw

# Count missing values across all columns
sum(is.na(tweet_df))
## [1] 0

Feature Analysis + Splitting

Python interface (reticulate) to run the neural network analyses

A neural network will be used to derive new feature columns from the tweets.

Specifically, these are a sentiment label (positive/negative) with its confidence score, and emotion probabilities (anger, joy, etc.).

Use Neural Network to get Sentiment Predictors

# Install the Python packages transformers and torch into the reticulate
# Python environment.
# NOTE(review): this runs on every knit, and the pip command it issues uses
# --upgrade (see the output below), so library versions can change between
# runs. Consider installing once outside the document or guarding the call
# with reticulate::py_module_available() — TODO confirm.
reticulate::py_install(c("transformers", "torch"), pip = TRUE)
## Using virtual environment '/Users/kaylieevans/.virtualenvs/r-reticulate' ...
## + /Users/kaylieevans/.virtualenvs/r-reticulate/bin/python -m pip install --upgrade --no-user transformers torch
# Bring the Python modules into the R session through reticulate
transformers <- reticulate::import("transformers")
torch <- reticulate::import("torch")

# Build the sentiment-analysis pipeline, pinned to a fixed model revision
sentiment_model <- "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
sentiment_revision <- "714eb0f"
classifier <- transformers$pipeline(
  "sentiment-analysis",
  model = sentiment_model,
  revision = sentiment_revision
)

# Run the sentiment classifier on each tweet. Each call passes one tweet, so
# the first element of the returned list holds that tweet's label and score.
sentiment_results <- lapply(tweet_df$tweet, function(x) classifier(x)[[1]])

# Read label and score directly from the structured results. vapply() fixes
# the result type and, unlike sapply() on a character vector, does not attach
# the full tweet text as element names.
sentiment_labels <- vapply(
  sentiment_results,
  function(r) as.character(r$label),
  character(1)
)
sentiment_scores <- vapply(
  sentiment_results,
  function(r) as.numeric(r$score),
  numeric(1)
)

# Keep the combined "LABEL:score" string column for reference
tweet_df$sentiment_raw <- paste(sentiment_labels, sentiment_scores, sep = ":")

# Label and score as separate columns. The score is taken from the classifier
# output rather than parsed back out of the pasted string.
tweet_df$sentiment_val <- sentiment_labels
tweet_df$sentiment_score <- sentiment_scores

# Binary encoding for the sentiment label: 1 for POSITIVE, 0 otherwise
tweet_df$sentiment_val_binary <- ifelse(sentiment_labels == "POSITIVE", 1, 0)

Use Neural Network to get Emotional Predictors

# Build the emotion text-classification pipeline. return_all_scores = TRUE is
# passed so that each tweet yields a score per label, which the code below
# turns into one column per emotion.
emotion_model <- "j-hartmann/emotion-english-distilroberta-base"
emotion_classifier <- transformers$pipeline(
  "text-classification",
  model = emotion_model,
  return_all_scores = TRUE
)

# Emotion columns expected downstream, in their final column order. Defined
# once here instead of being rebuilt for every tweet.
emotions <- c("anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise")

# Run the emotion classifier on each tweet and collect a named numeric vector
# of scores (names are the emotion labels). vapply() pins the element types so
# an unexpected result shape fails loudly instead of changing the return type.
emotion_scores <- lapply(tweet_df$tweet, function(x) {
  result <- emotion_classifier(x)[[1]]
  scores <- vapply(result, function(r) as.numeric(r$score), numeric(1))
  names(scores) <- vapply(result, function(r) as.character(r$label), character(1))
  scores
})

# Combine results into a matrix with one row per tweet and one column per
# emotion. Any emotion missing from a result is filled with 0.
emotion_df <- do.call(rbind, lapply(emotion_scores, function(x) {
  x[setdiff(emotions, names(x))] <- 0
  x[emotions]
}))

# Add the emotion probabilities to tweet_df
tweet_df <- cbind(tweet_df, emotion_df)

# Summarise every column, including the new sentiment and emotion features
summary(tweet_df)
##      name              tweet            is_real        sentiment_raw     
##  Length:120         Length:120         Mode :logical   Length:120        
##  Class :character   Class :character   FALSE:56        Class :character  
##  Mode  :character   Mode  :character   TRUE :64        Mode  :character  
##                                                                          
##                                                                          
##                                                                          
##  sentiment_val      sentiment_score  sentiment_val_binary     anger          
##  Length:120         Min.   :0.6006   Min.   :0.0000       Min.   :0.0007579  
##  Class :character   1st Qu.:0.9768   1st Qu.:0.0000       1st Qu.:0.0042723  
##  Mode  :character   Median :0.9945   Median :0.0000       Median :0.0187627  
##                     Mean   :0.9511   Mean   :0.3667       Mean   :0.0922059  
##                     3rd Qu.:0.9988   3rd Qu.:1.0000       3rd Qu.:0.0587539  
##                     Max.   :0.9999   Max.   :1.0000       Max.   :0.6127625  
##     disgust               fear                joy              neutral        
##  Min.   :0.0003042   Min.   :0.0005585   Min.   :0.001415   Min.   :0.001312  
##  1st Qu.:0.0050158   1st Qu.:0.0026508   1st Qu.:0.003538   1st Qu.:0.035472  
##  Median :0.0165695   Median :0.0100859   Median :0.010174   Median :0.244099  
##  Mean   :0.0417464   Mean   :0.0912046   Mean   :0.108731   Mean   :0.355420  
##  3rd Qu.:0.0639502   3rd Qu.:0.0300521   3rd Qu.:0.021943   3rd Qu.:0.604242  
##  Max.   :0.3163918   Max.   :0.9852785   Max.   :0.982051   Max.   :0.966248  
##     sadness            surprise        
##  Min.   :0.001523   Min.   :0.0005985  
##  1st Qu.:0.006061   1st Qu.:0.0078853  
##  Median :0.021070   Median :0.0414546  
##  Mean   :0.144213   Mean   :0.1664800  
##  3rd Qu.:0.129918   3rd Qu.:0.1511820  
##  Max.   :0.981419   Max.   :0.9558494
# Show the first row with all derived feature columns
head(tweet_df, 1)
##            name
## 1 Billie Eilish
##                                                                    tweet
## 1 i’m fine. just floating in a hoodie-shaped cloud of existential dread.
##   is_real              sentiment_raw sentiment_val sentiment_score
## 1   FALSE POSITIVE:0.600636720657349      POSITIVE       0.6006367
##   sentiment_val_binary       anger     disgust      fear         joy
## 1                    1 0.001180525 0.000304157 0.9852785 0.005264891
##       neutral     sadness     surprise
## 1 0.001312157 0.006061158 0.0005985099

Split Out Training and Testing Data 80-20

# Fix the RNG so the partition is reproducible
set.seed(64)

# Row indices for the 80% training partition, sampled on is_real
tweet_index <- createDataPartition(tweet_df$is_real, p = 0.8, list = FALSE)

# Predictors: every column except the target
predictor_cols <- setdiff(names(tweet_df), "is_real")
X_train <- tweet_df[tweet_index, predictor_cols]
X_test <- tweet_df[-tweet_index, predictor_cols]

# Target: the is_real column as a plain vector
y_train <- tweet_df[["is_real"]][tweet_index]
y_test <- tweet_df[["is_real"]][-tweet_index]

Random Forest Predictions

# Drop identifier and text columns so only the numeric features remain
exclude_cols <- c("name", "tweet", "sentiment_raw", "sentiment_val")
rf_features <- setdiff(names(X_train), exclude_cols)
X_train_rf <- X_train[, rf_features]
X_test_rf <- X_test[, rf_features]

# Fix the RNG so the forest is reproducible
set.seed(64)

# Fit a 500-tree random forest classifier on the training features,
# recording variable importance for later inspection
y_train_factor <- as.factor(y_train)
rf_model <- randomForest(
  x = X_train_rf,
  y = y_train_factor,
  ntree = 500,
  importance = TRUE
)

# Predict on the test set
rf_preds <- predict(rf_model, newdata = X_test_rf)

# Evaluate performance.
# The reference factor is built on the same levels as the predictions so the
# confusion matrix still lines up if a class happens to be absent from the
# test labels (as.factor() alone would drop that level).
# Note: caret treats the first level (FALSE) as the 'Positive' class unless
# the `positive` argument is supplied.
y_test_factor <- factor(y_test, levels = levels(rf_preds))
conf_mat <- confusionMatrix(rf_preds, y_test_factor)
print(conf_mat)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE    11    0
##      TRUE      0   12
##                                      
##                Accuracy : 1          
##                  95% CI : (0.8518, 1)
##     No Information Rate : 0.5217     
##     P-Value [Acc > NIR] : 3.173e-07  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4783     
##          Detection Rate : 0.4783     
##    Detection Prevalence : 0.4783     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : FALSE      
## 
# Visualize variable importance for the fitted forest (the model was trained
# with importance = TRUE)
varImpPlot(rf_model, main = "Random Forest Feature Importance")

Bonus Predictions

This is data I manually collected from each of the named celebrities’ Twitter accounts.

The first set of data comes from 2020, before ChatGPT popularized generative AI for common tasks. The second set of data comes from each celebrity’s most recent tweet.

Predict with the model on new data

# Read the manually collected tweets (features already computed) from the
# DATA622 GitHub repository
new_tweet_url <- "https://raw.githubusercontent.com/evanskaylie/DATA622/refs/heads/main/new_tweet_df.csv"
new_tweet_raw <- read_csv(new_tweet_url)

# Show the first rows
head(new_tweet_raw)
## # A tibble: 6 × 14
##   name          tweet        is_real sentiment_raw sentiment_val sentiment_score
##   <chr>         <chr>        <lgl>   <chr>         <chr>                   <dbl>
## 1 Billie Eilish To celebrat… NA      POSITIVE:0.9… POSITIVE                0.999
## 2 Billie Eilish BERLIN: The… NA      POSITIVE:0.9… POSITIVE                0.998
## 3 Ryan Reynolds Back to bac… NA      POSITIVE:0.9… POSITIVE                0.974
## 4 Ryan Reynolds Love these … NA      POSITIVE:0.9… POSITIVE                1.00 
## 5 Elon Musk     Starlink co… NA      POSITIVE:0.7… POSITIVE                0.772
## 6 Elon Musk     Using Tesla… NA      POSITIVE:0.9… POSITIVE                0.997
## # ℹ 8 more variables: sentiment_val_binary <dbl>, anger <dbl>, disgust <dbl>,
## #   fear <dbl>, joy <dbl>, neutral <dbl>, sadness <dbl>, surprise <dbl>
# Column types parsed as intended and no embedded commas split any rows.
# is_real is NA in the previewed rows; it is excluded from the model inputs
# before prediction.
new_tweet_df <- new_tweet_raw

Bonus prediction:

# Keep only the feature columns the model takes as input: drop identifiers,
# raw text, and the (unlabelled) target
exclude_cols <- c("name", "tweet", "sentiment_raw", "sentiment_val", "is_real")
feature_cols <- setdiff(names(new_tweet_df), exclude_cols)
X_new_rf <- new_tweet_df[feature_cols]

# Score the new tweets with the trained random forest
rf_preds <- predict(rf_model, newdata = X_new_rf)

# Attach the predicted class to each tweet
new_tweet_df[["predicted_is_real"]] <- rf_preds

# Show each celebrity alongside the prediction
new_tweet_df[c("name", "predicted_is_real")]
## # A tibble: 14 × 2
##    name               predicted_is_real
##    <chr>              <fct>            
##  1 Billie Eilish      FALSE            
##  2 Billie Eilish      FALSE            
##  3 Ryan Reynolds      TRUE             
##  4 Ryan Reynolds      FALSE            
##  5 Elon Musk          FALSE            
##  6 Elon Musk          TRUE             
##  7 Kanye West         FALSE            
##  8 Kanye West         FALSE            
##  9 Taylor Swift       FALSE            
## 10 Taylor Swift       FALSE            
## 11 Conan O'Brien      FALSE            
## 12 Conan O'Brien      FALSE            
## 13 Tyler, the Creator FALSE            
## 14 Tyler, the Creator TRUE