Viral Social Media Trends: Data Cleaning & EDA

Load Data

raw     <- read_csv("/Users/christinamac/Downloads/archive/Viral_Social_Media_Trends.csv")
cleaned <- read_csv("/Users/christinamac/Downloads/archive/Cleaned_Viral_Social_Media_Trends.csv")

Filter Both Datasets

filter_data <- function(df) {
  df %>%
    filter(Region %in% c("USA", "Canada")) %>%
    filter(str_to_lower(Hashtag) == "#education")
}

raw_filtered     <- filter_data(raw)
cleaned_filtered <- filter_data(cleaned)

# Binary engagement variable: 1 = High, 0 = Medium or Low
# Binary content type variable: 1 = Reel, 0 = all other types
cleaned_filtered <- cleaned_filtered %>%
  mutate(
    Engagement_Binary    = if_else(Engagement_Level == "High", 1, 0),
    Content_Type_Binary  = if_else(Content_Type == "Reel", 1, 0)
  )

cat("Engagement_Binary distribution:\n")

## Engagement_Binary distribution:

print(table(cleaned_filtered$Engagement_Binary))

## 
##  0  1 
## 95 47

cat("\nContent_Type_Binary distribution (1 = Reel):\n")

## 
## Content_Type_Binary distribution (1 = Reel):

print(table(cleaned_filtered$Content_Type_Binary))

## 
##   0   1 
## 120  22

EDA: Cleaned Dataset (USA/Canada, #Education)

eda_data <- cleaned_filtered
glimpse(eda_data)

## Rows: 142
## Columns: 13
## $ Post_ID             <chr> "Post_100", "Post_115", "Post_135", "Post_146", "P…
## $ Post_Date           <date> 2023-05-20, 2022-07-17, 2023-07-25, 2022-04-08, 2…
## $ Platform            <chr> "YouTube", "Instagram", "TikTok", "Instagram", "In…
## $ Hashtag             <chr> "#Education", "#Education", "#Education", "#Educat…
## $ Content_Type        <chr> "Post", "Tweet", "Live Stream", "Reel", "Live Stre…
## $ Region              <chr> "Canada", "Canada", "Canada", "Canada", "USA", "US…
## $ Views               <dbl> 3096420, 1599554, 707841, 964498, 4709770, 4221739…
## $ Likes               <dbl> 364521, 364748, 133933, 194879, 389553, 291223, 25…
## $ Shares              <dbl> 73308, 16705, 46058, 76029, 55435, 61415, 23167, 8…
## $ Comments            <dbl> 29082, 8949, 45912, 34998, 16799, 44293, 5670, 335…
## $ Engagement_Level    <chr> "Low", "High", "High", "High", "Medium", "Low", "L…
## $ Engagement_Binary   <dbl> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,…
## $ Content_Type_Binary <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…

Missing Values

cat("Total NAs:", sum(is.na(eda_data)))

## Total NAs: 0

print(colSums(is.na(eda_data)))

##             Post_ID           Post_Date            Platform             Hashtag 
##                   0                   0                   0                   0 
##        Content_Type              Region               Views               Likes 
##                   0                   0                   0                   0 
##              Shares            Comments    Engagement_Level   Engagement_Binary 
##                   0                   0                   0                   0 
## Content_Type_Binary 
##                   0

Categorical Distributions

cat("=== PLATFORM ===\n");       print(table(eda_data$Platform))

## === PLATFORM ===

## 
## Instagram    TikTok   Twitter   YouTube 
##        36        30        36        40

cat("\n=== REGION ===\n");       print(table(eda_data$Region))

## 
## === REGION ===

## 
## Canada    USA 
##     78     64

cat("\n=== CONTENT TYPE ===\n"); print(table(eda_data$Content_Type))

## 
## === CONTENT TYPE ===

## 
## Live Stream        Post        Reel      Shorts       Tweet       Video 
##          24          34          22          15          27          20

cat("\n=== ENGAGEMENT LEVEL ===\n"); print(table(eda_data$Engagement_Level))

## 
## === ENGAGEMENT LEVEL ===

## 
##   High    Low Medium 
##     47     49     46

Numeric Summary

print(summary(eda_data[, c("Views", "Likes", "Shares", "Comments")]))

##      Views             Likes            Shares         Comments    
##  Min.   : 112559   Min.   :  7372   Min.   :  435   Min.   :   32  
##  1st Qu.:1033473   1st Qu.:136294   1st Qu.:25961   1st Qu.:13785  
##  Median :2572378   Median :264404   Median :55326   Median :25024  
##  Mean   :2515903   Mean   :261113   Mean   :51941   Mean   :24411  
##  3rd Qu.:3922257   3rd Qu.:377607   3rd Qu.:76555   3rd Qu.:34514  
##  Max.   :4956515   Max.   :499312   Max.   :99857   Max.   :49993

Visualization 1: Engagement Level by Platform

ggplot(eda_data, aes(x = Platform, fill = Engagement_Level)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Engagement Level Distribution by Platform",
       subtitle = "USA/Canada — #Education Posts",
       x = "Platform", y = "Proportion", fill = "Engagement Level") +
  theme_minimal()

Visualization 2: Average Views by Content Type

eda_data %>%
  group_by(Content_Type) %>%
  summarise(Avg_Views = mean(Views)) %>%
  arrange(desc(Avg_Views)) %>%
  ggplot(aes(x = reorder(Content_Type, Avg_Views), y = Avg_Views, fill = Content_Type)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(title = "Average Views by Content Type",
       subtitle = "USA/Canada — #Education Posts",
       x = "Content Type", y = "Average Views") +
  theme_minimal()

Visualization 3: Views Distribution by Platform

ggplot(eda_data, aes(x = Platform, y = Views, fill = Platform)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Views Distribution by Platform",
       subtitle = "USA/Canada — #Education Posts",
       x = "Platform", y = "Views") +
  theme_minimal()

Correlation Matrix

eda_data %>%
  mutate(Content_Type_Num = as.numeric(factor(Content_Type))) %>%
  select(Views, Likes, Shares, Comments, Content_Type_Num) %>%
  cor() %>%
  round(3) %>%
  print()

##                   Views  Likes Shares Comments Content_Type_Num
## Views             1.000  0.142  0.141    0.004           -0.065
## Likes             0.142  1.000  0.010    0.094           -0.117
## Shares            0.141  0.010  1.000    0.115            0.014
## Comments          0.004  0.094  0.115    1.000           -0.052
## Content_Type_Num -0.065 -0.117  0.014   -0.052            1.000

Average Metrics by Region

eda_data %>%
  group_by(Region) %>%
  summarise(
    Avg_Views    = round(mean(Views), 0),
    Avg_Likes    = round(mean(Likes), 0),
    Avg_Shares   = round(mean(Shares), 0),
    Avg_Comments = round(mean(Comments), 0),
    Posts        = n()
  ) %>%
  arrange(desc(Avg_Views))

## # A tibble: 2 × 6
##   Region Avg_Views Avg_Likes Avg_Shares Avg_Comments Posts
##   <chr>      <dbl>     <dbl>      <dbl>        <dbl> <int>
## 1 USA      2739953    263244      49387        24455    64
## 2 Canada   2332068    259365      54036        24376    78

Visualization 4: Posts Over Time

eda_data %>%
  mutate(Month = floor_date(as.Date(Post_Date), "month")) %>%
  count(Month) %>%
  ggplot(aes(x = Month, y = n)) +
  geom_line(color = "steelblue", linewidth = 1) +
  geom_point(color = "steelblue") +
  labs(title = "Number of Education Posts Over Time",
       subtitle = "USA/Canada — #Education Posts",
       x = "Month", y = "Post Count") +
  theme_minimal()

YouTube Posts: Slice & EDA

youtube_data <- cleaned_filtered %>%
  filter(Platform == "YouTube")

cat("YouTube posts (USA/Canada, #Education):", nrow(youtube_data), "\n")

## YouTube posts (USA/Canada, #Education): 40

glimpse(youtube_data)

## Rows: 40
## Columns: 13
## $ Post_ID             <chr> "Post_100", "Post_191", "Post_234", "Post_332", "P…
## $ Post_Date           <date> 2023-05-20, 2022-01-27, 2022-07-11, 2023-08-20, 2…
## $ Platform            <chr> "YouTube", "YouTube", "YouTube", "YouTube", "YouTu…
## $ Hashtag             <chr> "#Education", "#Education", "#Education", "#Educat…
## $ Content_Type        <chr> "Post", "Reel", "Shorts", "Post", "Reel", "Video",…
## $ Region              <chr> "Canada", "USA", "USA", "USA", "USA", "USA", "Cana…
## $ Views               <dbl> 3096420, 4221739, 1465227, 4530257, 4378018, 13746…
## $ Likes               <dbl> 364521, 291223, 158579, 474798, 396292, 479271, 26…
## $ Shares              <dbl> 73308, 61415, 8809, 91950, 22112, 435, 11451, 9434…
## $ Comments            <dbl> 29082, 44293, 33511, 40024, 2274, 37576, 18145, 40…
## $ Engagement_Level    <chr> "Low", "Low", "High", "High", "Medium", "Medium", …
## $ Engagement_Binary   <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ Content_Type_Binary <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,…

Missing Values

cat("Total NAs:", sum(is.na(youtube_data)))

## Total NAs: 0

print(colSums(is.na(youtube_data)))

##             Post_ID           Post_Date            Platform             Hashtag 
##                   0                   0                   0                   0 
##        Content_Type              Region               Views               Likes 
##                   0                   0                   0                   0 
##              Shares            Comments    Engagement_Level   Engagement_Binary 
##                   0                   0                   0                   0 
## Content_Type_Binary 
##                   0

Categorical Distributions

cat("=== REGION ===\n");          print(table(youtube_data$Region))

## === REGION ===

## 
## Canada    USA 
##     19     21

cat("\n=== CONTENT TYPE ===\n");  print(table(youtube_data$Content_Type))

## 
## === CONTENT TYPE ===

## 
## Live Stream        Post        Reel      Shorts       Tweet       Video 
##           6          10           6           6           8           4

cat("\n=== ENGAGEMENT LEVEL ===\n"); print(table(youtube_data$Engagement_Level))

## 
## === ENGAGEMENT LEVEL ===

## 
##   High    Low Medium 
##     10     14     16

Numeric Summary

print(summary(youtube_data[, c("Views", "Likes", "Shares", "Comments")]))

##      Views             Likes            Shares         Comments    
##  Min.   : 218047   Min.   :  7372   Min.   :  435   Min.   :  349  
##  1st Qu.:1307542   1st Qu.:119807   1st Qu.:21562   1st Qu.:10942  
##  Median :2557428   Median :222460   Median :55592   Median :23334  
##  Mean   :2506645   Mean   :233537   Mean   :50211   Mean   :22489  
##  3rd Qu.:3784346   3rd Qu.:314760   3rd Qu.:74842   3rd Qu.:33477  
##  Max.   :4948346   Max.   :488199   Max.   :99857   Max.   :46321

Visualization 1: Engagement Level Distribution (YouTube)

ggplot(youtube_data, aes(x = Engagement_Level, fill = Engagement_Level)) +
  geom_bar(show.legend = FALSE) +
  labs(title = "Engagement Level Distribution — YouTube",
       subtitle = "USA/Canada — #Education Posts",
       x = "Engagement Level", y = "Count") +
  theme_minimal()

Visualization 2: Views by Content Type (YouTube)

ggplot(youtube_data, aes(x = Content_Type, y = Views, fill = Content_Type)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Views by Content Type — YouTube",
       subtitle = "USA/Canada — #Education Posts",
       x = "Content Type", y = "Views") +
  theme_minimal()

Visualization 3: Views by Region (YouTube)

ggplot(youtube_data, aes(x = Region, y = Views, fill = Region)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Views by Region — YouTube",
       subtitle = "USA/Canada — #Education Posts",
       x = "Region", y = "Views") +
  theme_minimal()

Visualization 4: Posts Over Time (YouTube)

youtube_data %>%
  mutate(Month = floor_date(as.Date(Post_Date), "month")) %>%
  count(Month) %>%
  ggplot(aes(x = Month, y = n)) +
  geom_line(color = "tomato", linewidth = 1) +
  geom_point(color = "tomato") +
  labs(title = "YouTube Education Posts Over Time",
       subtitle = "USA/Canada — #Education Posts",
       x = "Month", y = "Post Count") +
  theme_minimal()

Correlation Matrix (YouTube)

youtube_data %>%
  mutate(
    Content_Type_Num    = as.numeric(factor(Content_Type)),
    Content_Type_Binary = if_else(Content_Type == "Reel", 1, 0)
  ) %>%
  select(Views, Likes, Shares, Comments, Content_Type_Num, Content_Type_Binary) %>%
  cor() %>%
  round(3) %>%
  print()

##                      Views  Likes Shares Comments Content_Type_Num
## Views                1.000  0.268  0.289    0.144           -0.141
## Likes                0.268  1.000  0.003    0.190           -0.067
## Shares               0.289  0.003  1.000    0.051           -0.035
## Comments             0.144  0.190  0.051    1.000            0.208
## Content_Type_Num    -0.141 -0.067 -0.035    0.208            1.000
## Content_Type_Binary  0.108  0.058  0.040    0.025           -0.078
##                     Content_Type_Binary
## Views                             0.108
## Likes                             0.058
## Shares                            0.040
## Comments                          0.025
## Content_Type_Num                 -0.078
## Content_Type_Binary               1.000

Logistic Regression: Engagement_Binary ~ Month + Country + Content Type + Likes + Shares

Using the binary engagement variable (1 = High, 0 = Medium/Low) with logistic regression. Post_Date is decomposed into Month. Region is dummy-coded. Content_Type is binary (1 = Reel, 0 = Other). Likes and Shares are included as continuous predictors.

library(broom)

log_data <- youtube_data %>%
  mutate(
    Engagement_Binary   = if_else(Engagement_Level == "High", 1, 0),
    Content_Type_Binary = if_else(Content_Type == "Reel", 1, 0),
    Month               = as.numeric(format(as.Date(Post_Date), "%m")),
    Region              = factor(Region)
  )

cat("Engagement_Binary distribution (YouTube):\n")

## Engagement_Binary distribution (YouTube):

print(table(log_data$Engagement_Binary))

## 
##  0  1 
## 30 10

cat("\nContent_Type_Binary distribution (1 = Reel):\n")

## 
## Content_Type_Binary distribution (1 = Reel):

print(table(log_data$Content_Type_Binary))

## 
##  0  1 
## 34  6

Run the Logistic Regression

log_model <- glm(Engagement_Binary ~ Month + Region + Content_Type_Binary + Likes + Shares,
                 data   = log_data,
                 family = binomial(link = "logit"))
summary(log_model)

## 
## Call:
## glm(formula = Engagement_Binary ~ Month + Region + Content_Type_Binary + 
##     Likes + Shares, family = binomial(link = "logit"), data = log_data)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)  
## (Intercept)         -2.610e+00  1.368e+00  -1.907   0.0565 .
## Month               -2.092e-02  1.182e-01  -0.177   0.8595  
## RegionUSA           -2.251e-02  7.758e-01  -0.029   0.9768  
## Content_Type_Binary -7.394e-01  1.202e+00  -0.615   0.5383  
## Likes                4.083e-06  2.974e-06   1.373   0.1697  
## Shares               1.380e-05  1.261e-05   1.094   0.2738  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 44.987  on 39  degrees of freedom
## Residual deviance: 41.466  on 34  degrees of freedom
## AIC: 53.466
## 
## Number of Fisher Scoring iterations: 4

Tidy Coefficient Table (with Odds Ratios)

tidy(log_model) %>%
  mutate(
    odds_ratio = round(exp(estimate), 4),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  select(term, estimate, odds_ratio, std.error, statistic, p.value) %>%
  print()

## # A tibble: 6 × 6
##   term                estimate odds_ratio std.error statistic p.value
##   <chr>                  <dbl>      <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)          -2.61       0.0736     1.37     -1.91   0.0565
## 2 Month                -0.0209     0.979      0.118    -0.177  0.860 
## 3 RegionUSA            -0.0225     0.978      0.776    -0.029  0.977 
## 4 Content_Type_Binary  -0.739      0.477      1.20     -0.615  0.538 
## 5 Likes                 0          1          0         1.37   0.170 
## 6 Shares                0          1          0         1.09   0.274

Model Fit

glance(log_model) %>%
  select(null.deviance, deviance, AIC, BIC, df.null, df.residual) %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  print()

## # A tibble: 1 × 6
##   null.deviance deviance   AIC   BIC df.null df.residual
##           <dbl>    <dbl> <dbl> <dbl>   <dbl>       <dbl>
## 1          45.0     41.5  53.5  63.6      39          34

Predicted Probabilities vs. Actual

log_data %>%
  mutate(Predicted_Prob = predict(log_model, type = "response")) %>%
  ggplot(aes(x = Predicted_Prob, fill = factor(Engagement_Binary))) +
  geom_histogram(bins = 20, alpha = 0.7, position = "identity") +
  scale_fill_manual(values = c("0" = "steelblue", "1" = "tomato"),
                    labels = c("0" = "Not High", "1" = "High")) +
  labs(title = "Predicted Probability of High Engagement",
       subtitle = "YouTube — USA/Canada — #Education Posts",
       x = "Predicted Probability", y = "Count", fill = "Actual Engagement") +
  theme_minimal()

Classification Accuracy

log_data <- log_data %>%
  mutate(
    Predicted_Prob  = predict(log_model, type = "response"),
    Predicted_Class = if_else(Predicted_Prob >= 0.5, 1, 0)
  )

conf_matrix <- table(Actual = log_data$Engagement_Binary,
                     Predicted = log_data$Predicted_Class)
print(conf_matrix)

##       Predicted
## Actual  0  1
##      0 29  1
##      1  8  2

# Safely extract confusion matrix cells (handles missing rows/columns)
get_cell <- function(mat, r, c) {
  if (r %in% rownames(mat) && c %in% colnames(mat)) mat[r, c] else 0
}

TP <- get_cell(conf_matrix, "1", "1")
TN <- get_cell(conf_matrix, "0", "0")
FP <- get_cell(conf_matrix, "0", "1")
FN <- get_cell(conf_matrix, "1", "0")

accuracy    <- (TP + TN) / sum(conf_matrix)
sensitivity <- if ((TP + FN) > 0) TP / (TP + FN) else NA
specificity <- if ((TN + FP) > 0) TN / (TN + FP) else NA
precision   <- if ((TP + FP) > 0) TP / (TP + FP) else NA

cat("\nAccuracy   :", round(accuracy * 100, 2), "%\n")

## 
## Accuracy   : 77.5 %

cat("Sensitivity:", ifelse(is.na(sensitivity), "N/A (no High predicted)",
                           paste0(round(sensitivity * 100, 2), "% (caught this % of actual High posts)")), "\n")

## Sensitivity: 20% (caught this % of actual High posts)

cat("Specificity:", ifelse(is.na(specificity), "N/A",
                           paste0(round(specificity * 100, 2), "% (caught this % of actual Not High posts)")), "\n")

## Specificity: 96.67% (caught this % of actual Not High posts)

cat("Precision  :", ifelse(is.na(precision), "N/A (no High predicted)",
                           paste0(round(precision * 100, 2), "% (of predicted High, this % were correct)")), "\n")

## Precision  : 66.67% (of predicted High, this % were correct)

if (is.na(sensitivity))
  cat("\nNote: model predicted only one class — consider class imbalance or adding more predictors.\n")

Machine Learning Models

All models use the YouTube subset (USA/Canada, #Education) with Engagement_Binary (1 = High, 0 = Not High) as the outcome. Predictors: Month, Day_of_Week, Region, Content_Type_Binary (1 = Reel, 0 = Other).

Note on Content Type: An initial model run using the full multi-category Content_Type factor variable did not produce statistically significant results. Content type was therefore recoded as a binary variable (Content_Type_Binary) distinguishing Reel posts (1) from all other content types (0), which provided a more interpretable and parsimonious predictor.

library(caret)
library(randomForest)
library(gbm)

# Shared feature matrix
ml_data <- youtube_data %>%
  mutate(
    Engagement_Binary   = factor(if_else(Engagement_Level == "High", "High", "NotHigh"),
                                 levels = c("NotHigh", "High")),
    Content_Type_Binary = if_else(Content_Type == "Reel", 1, 0),
    Month               = as.numeric(format(as.Date(Post_Date), "%m")),
    Region              = factor(Region)
  ) %>%
  select(Engagement_Binary, Month, Region, Content_Type_Binary, Likes, Shares)

# 70/30 train-test split
set.seed(123)
train_idx  <- createDataPartition(ml_data$Engagement_Binary, p = 0.7, list = FALSE)
train_data <- ml_data[train_idx, ]
test_data  <- ml_data[-train_idx, ]

cat("Training rows:", nrow(train_data), "| Test rows:", nrow(test_data), "\n")

## Training rows: 28 | Test rows: 12

cat("Outcome distribution (train):\n")

## Outcome distribution (train):

print(table(train_data$Engagement_Binary))

## 
## NotHigh    High 
##      21       7

K-Nearest Neighbors (KNN)

set.seed(123)
knn_model <- train(
  Engagement_Binary ~ .,
  data      = train_data,
  method    = "knn",
  trControl = trainControl(method = "cv", number = 3),
  tuneLength = 5
)

cat("Best K:", knn_model$bestTune$k, "\n")

## Best K: 13

knn_pred <- predict(knn_model, test_data)
knn_cm   <- confusionMatrix(knn_pred, test_data$Engagement_Binary, positive = "High")
print(knn_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NotHigh High
##    NotHigh       9    3
##    High          0    0
##                                           
##                Accuracy : 0.75            
##                  95% CI : (0.4281, 0.9451)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.6488          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.2482          
##                                           
##             Sensitivity : 0.00            
##             Specificity : 1.00            
##          Pos Pred Value :  NaN            
##          Neg Pred Value : 0.75            
##              Prevalence : 0.25            
##          Detection Rate : 0.00            
##    Detection Prevalence : 0.00            
##       Balanced Accuracy : 0.50            
##                                           
##        'Positive' Class : High            
##

plot(knn_model, main = "KNN — Accuracy by K")

K-Means Clustering

K-Means is unsupervised (no outcome label). We cluster on numeric features then examine how well clusters align with Engagement_Binary.

set.seed(123)

# K-Means requires all-numeric input
kmeans_data <- ml_data %>%
  mutate(Region_Num = as.numeric(Region)) %>%
  select(Month, Region_Num, Content_Type_Binary, Likes, Shares) %>%
  scale()

km_model <- kmeans(kmeans_data, centers = 2, nstart = 25)

cat("Cluster sizes:\n")

## Cluster sizes:

print(table(km_model$cluster))

## 
##  1  2 
## 19 21

cat("\nCluster vs. Engagement_Binary cross-tab:\n")

## 
## Cluster vs. Engagement_Binary cross-tab:

print(table(Cluster = km_model$cluster, Engagement = ml_data$Engagement_Binary))

##        Engagement
## Cluster NotHigh High
##       1      14    5
##       2      16    5

ml_data %>%
  mutate(Cluster = factor(km_model$cluster)) %>%
  ggplot(aes(x = Month, y = Likes, color = Cluster, shape = Engagement_Binary)) +
  geom_jitter(size = 3, alpha = 0.7, width = 0.3, height = 0.3) +
  labs(title = "K-Means Clusters vs. Engagement",
       subtitle = "YouTube — USA/Canada — #Education Posts",
       x = "Month", y = "Likes") +
  theme_minimal()

Random Forest

set.seed(123)
rf_model <- train(
  Engagement_Binary ~ .,
  data      = train_data,
  method    = "rf",
  trControl = trainControl(method = "cv", number = 3),
  importance = TRUE
)

rf_pred <- predict(rf_model, test_data)
rf_cm   <- confusionMatrix(rf_pred, test_data$Engagement_Binary, positive = "High")
print(rf_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NotHigh High
##    NotHigh       6    1
##    High          3    2
##                                           
##                Accuracy : 0.6667          
##                  95% CI : (0.3489, 0.9008)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.8424          
##                                           
##                   Kappa : 0.2727          
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.6667          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.4000          
##          Neg Pred Value : 0.8571          
##              Prevalence : 0.2500          
##          Detection Rate : 0.1667          
##    Detection Prevalence : 0.4167          
##       Balanced Accuracy : 0.6667          
##                                           
##        'Positive' Class : High            
##

varImpPlot(rf_model$finalModel,
           main = "Random Forest — Variable Importance")

Gradient Boosting

library(gbm)
set.seed(123)

# Use gbm directly (avoids caret CV failures on small samples)
train_gbm <- train_data %>%
  mutate(
    Outcome    = if_else(Engagement_Binary == "High", 1, 0),
    Region_Num = as.numeric(Region)
  )

test_gbm <- test_data %>%
  mutate(
    Outcome    = if_else(Engagement_Binary == "High", 1, 0),
    Region_Num = as.numeric(Region)
  )

boost_fit <- gbm(
  Outcome ~ Month + Region_Num + Content_Type_Binary + Likes + Shares,
  data         = train_gbm,
  distribution = "bernoulli",
  n.trees      = 100,
  interaction.depth = 2,
  shrinkage    = 0.1,
  n.minobsinnode = 3,
  verbose      = FALSE
)

boost_probs <- predict(boost_fit, test_gbm, n.trees = 100, type = "response")
boost_pred  <- factor(if_else(boost_probs >= 0.5, "High", "NotHigh"),
                      levels = c("NotHigh", "High"))

boost_cm <- confusionMatrix(boost_pred, test_data$Engagement_Binary, positive = "High")
print(boost_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NotHigh High
##    NotHigh       6    1
##    High          3    2
##                                           
##                Accuracy : 0.6667          
##                  95% CI : (0.3489, 0.9008)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.8424          
##                                           
##                   Kappa : 0.2727          
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.6667          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.4000          
##          Neg Pred Value : 0.8571          
##              Prevalence : 0.2500          
##          Detection Rate : 0.1667          
##    Detection Prevalence : 0.4167          
##       Balanced Accuracy : 0.6667          
##                                           
##        'Positive' Class : High            
##

summary(boost_fit, cBars = 10, plotit = TRUE, main = "Gradient Boosting — Variable Importance")

##                                     var    rel.inf
## Likes                             Likes 42.0130175
## Shares                           Shares 39.7068961
## Month                             Month 12.6159951
## Region_Num                   Region_Num  5.5130733
## Content_Type_Binary Content_Type_Binary  0.1510179

Model Comparison

results <- resamples(list(
  KNN           = knn_model,
  Random_Forest = rf_model
))

summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: KNN, Random_Forest 
## Number of resamples: 3 
## 
## Accuracy 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## KNN            0.7 0.7388889 0.7777778 0.7518519 0.7777778 0.7777778    0
## Random_Forest  0.6 0.6333333 0.6666667 0.6814815 0.7222222 0.7777778    0
## 
## Kappa 
##                     Min.     1st Qu. Median        Mean    3rd Qu.      Max.
## KNN            0.0000000  0.00000000      0 0.000000000 0.00000000 0.0000000
## Random_Forest -0.1764706 -0.08823529      0 0.001782531 0.09090909 0.1818182
##               NA's
## KNN              0
## Random_Forest    0

# Boosting test-set accuracy reported separately
boost_acc <- mean(boost_pred == test_data$Engagement_Binary)
cat("\nBoosting test-set accuracy:", round(boost_acc * 100, 2), "%\n")

## 
## Boosting test-set accuracy: 66.67 %

bwplot(results, main = "Model Accuracy Comparison — KNN vs. Random Forest (3-Fold CV)")

Viral Social Media Trends: Data Cleaning & EDA

Filtered to USA/Canada — #Education Posts

Christina

2026-06-25

Load Data

Filter Both Datasets

EDA: Cleaned Dataset (USA/Canada, #Education)

Missing Values

Categorical Distributions

Numeric Summary

Visualization 1: Engagement Level by Platform

Visualization 2: Average Views by Content Type

Visualization 3: Views Distribution by Platform

Correlation Matrix

Average Metrics by Region

Visualization 4: Posts Over Time

YouTube Posts: Slice & EDA

Missing Values

Categorical Distributions

Numeric Summary

Visualization 1: Engagement Level Distribution (YouTube)

Visualization 2: Views by Content Type (YouTube)

Visualization 3: Views by Region (YouTube)

Visualization 4: Posts Over Time (YouTube)

Correlation Matrix (YouTube)

Logistic Regression: Engagement_Binary ~ Month + Country + Content Type + Likes + Shares

Run the Logistic Regression

Tidy Coefficient Table (with Odds Ratios)

Model Fit

Predicted Probabilities vs. Actual

Classification Accuracy

Machine Learning Models

K-Nearest Neighbors (KNN)

K-Means Clustering

Random Forest

Gradient Boosting

Model Comparison