raw <- read_csv("/Users/christinamac/Downloads/archive/Viral_Social_Media_Trends.csv")
cleaned <- read_csv("/Users/christinamac/Downloads/archive/Cleaned_Viral_Social_Media_Trends.csv")
filter_data <- function(df) {
df %>%
filter(Region %in% c("USA", "Canada")) %>%
filter(str_to_lower(Hashtag) == "#education")
}
raw_filtered <- filter_data(raw)
cleaned_filtered <- filter_data(cleaned)
# Binary engagement variable: 1 = High, 0 = Medium or Low
# Binary content type variable: 1 = Reel, 0 = all other types
cleaned_filtered <- cleaned_filtered %>%
mutate(
Engagement_Binary = if_else(Engagement_Level == "High", 1, 0),
Content_Type_Binary = if_else(Content_Type == "Reel", 1, 0)
)
cat("Engagement_Binary distribution:\n")
## Engagement_Binary distribution:
print(table(cleaned_filtered$Engagement_Binary))
##
## 0 1
## 95 47
cat("\nContent_Type_Binary distribution (1 = Reel):\n")
##
## Content_Type_Binary distribution (1 = Reel):
print(table(cleaned_filtered$Content_Type_Binary))
##
## 0 1
## 120 22
eda_data <- cleaned_filtered
glimpse(eda_data)
## Rows: 142
## Columns: 13
## $ Post_ID <chr> "Post_100", "Post_115", "Post_135", "Post_146", "P…
## $ Post_Date <date> 2023-05-20, 2022-07-17, 2023-07-25, 2022-04-08, 2…
## $ Platform <chr> "YouTube", "Instagram", "TikTok", "Instagram", "In…
## $ Hashtag <chr> "#Education", "#Education", "#Education", "#Educat…
## $ Content_Type <chr> "Post", "Tweet", "Live Stream", "Reel", "Live Stre…
## $ Region <chr> "Canada", "Canada", "Canada", "Canada", "USA", "US…
## $ Views <dbl> 3096420, 1599554, 707841, 964498, 4709770, 4221739…
## $ Likes <dbl> 364521, 364748, 133933, 194879, 389553, 291223, 25…
## $ Shares <dbl> 73308, 16705, 46058, 76029, 55435, 61415, 23167, 8…
## $ Comments <dbl> 29082, 8949, 45912, 34998, 16799, 44293, 5670, 335…
## $ Engagement_Level <chr> "Low", "High", "High", "High", "Medium", "Low", "L…
## $ Engagement_Binary <dbl> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,…
## $ Content_Type_Binary <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
cat("Total NAs:", sum(is.na(eda_data)))
## Total NAs: 0
print(colSums(is.na(eda_data)))
## Post_ID Post_Date Platform Hashtag
## 0 0 0 0
## Content_Type Region Views Likes
## 0 0 0 0
## Shares Comments Engagement_Level Engagement_Binary
## 0 0 0 0
## Content_Type_Binary
## 0
cat("=== PLATFORM ===\n"); print(table(eda_data$Platform))
## === PLATFORM ===
##
## Instagram TikTok Twitter YouTube
## 36 30 36 40
cat("\n=== REGION ===\n"); print(table(eda_data$Region))
##
## === REGION ===
##
## Canada USA
## 78 64
cat("\n=== CONTENT TYPE ===\n"); print(table(eda_data$Content_Type))
##
## === CONTENT TYPE ===
##
## Live Stream Post Reel Shorts Tweet Video
## 24 34 22 15 27 20
cat("\n=== ENGAGEMENT LEVEL ===\n"); print(table(eda_data$Engagement_Level))
##
## === ENGAGEMENT LEVEL ===
##
## High Low Medium
## 47 49 46
print(summary(eda_data[, c("Views", "Likes", "Shares", "Comments")]))
## Views Likes Shares Comments
## Min. : 112559 Min. : 7372 Min. : 435 Min. : 32
## 1st Qu.:1033473 1st Qu.:136294 1st Qu.:25961 1st Qu.:13785
## Median :2572378 Median :264404 Median :55326 Median :25024
## Mean :2515903 Mean :261113 Mean :51941 Mean :24411
## 3rd Qu.:3922257 3rd Qu.:377607 3rd Qu.:76555 3rd Qu.:34514
## Max. :4956515 Max. :499312 Max. :99857 Max. :49993
ggplot(eda_data, aes(x = Platform, fill = Engagement_Level)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Engagement Level Distribution by Platform",
subtitle = "USA/Canada — #Education Posts",
x = "Platform", y = "Proportion", fill = "Engagement Level") +
theme_minimal()
eda_data %>%
group_by(Content_Type) %>%
summarise(Avg_Views = mean(Views)) %>%
arrange(desc(Avg_Views)) %>%
ggplot(aes(x = reorder(Content_Type, Avg_Views), y = Avg_Views, fill = Content_Type)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(title = "Average Views by Content Type",
subtitle = "USA/Canada — #Education Posts",
x = "Content Type", y = "Average Views") +
theme_minimal()
ggplot(eda_data, aes(x = Platform, y = Views, fill = Platform)) +
geom_boxplot(show.legend = FALSE) +
labs(title = "Views Distribution by Platform",
subtitle = "USA/Canada — #Education Posts",
x = "Platform", y = "Views") +
theme_minimal()
eda_data %>%
mutate(Content_Type_Num = as.numeric(factor(Content_Type))) %>%
select(Views, Likes, Shares, Comments, Content_Type_Num) %>%
cor() %>%
round(3) %>%
print()
## Views Likes Shares Comments Content_Type_Num
## Views 1.000 0.142 0.141 0.004 -0.065
## Likes 0.142 1.000 0.010 0.094 -0.117
## Shares 0.141 0.010 1.000 0.115 0.014
## Comments 0.004 0.094 0.115 1.000 -0.052
## Content_Type_Num -0.065 -0.117 0.014 -0.052 1.000
eda_data %>%
group_by(Region) %>%
summarise(
Avg_Views = round(mean(Views), 0),
Avg_Likes = round(mean(Likes), 0),
Avg_Shares = round(mean(Shares), 0),
Avg_Comments = round(mean(Comments), 0),
Posts = n()
) %>%
arrange(desc(Avg_Views))
## # A tibble: 2 × 6
## Region Avg_Views Avg_Likes Avg_Shares Avg_Comments Posts
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 USA 2739953 263244 49387 24455 64
## 2 Canada 2332068 259365 54036 24376 78
eda_data %>%
mutate(Month = floor_date(as.Date(Post_Date), "month")) %>%
count(Month) %>%
ggplot(aes(x = Month, y = n)) +
geom_line(color = "steelblue", linewidth = 1) +
geom_point(color = "steelblue") +
labs(title = "Number of Education Posts Over Time",
subtitle = "USA/Canada — #Education Posts",
x = "Month", y = "Post Count") +
theme_minimal()
youtube_data <- cleaned_filtered %>%
filter(Platform == "YouTube")
cat("YouTube posts (USA/Canada, #Education):", nrow(youtube_data), "\n")
## YouTube posts (USA/Canada, #Education): 40
glimpse(youtube_data)
## Rows: 40
## Columns: 13
## $ Post_ID <chr> "Post_100", "Post_191", "Post_234", "Post_332", "P…
## $ Post_Date <date> 2023-05-20, 2022-01-27, 2022-07-11, 2023-08-20, 2…
## $ Platform <chr> "YouTube", "YouTube", "YouTube", "YouTube", "YouTu…
## $ Hashtag <chr> "#Education", "#Education", "#Education", "#Educat…
## $ Content_Type <chr> "Post", "Reel", "Shorts", "Post", "Reel", "Video",…
## $ Region <chr> "Canada", "USA", "USA", "USA", "USA", "USA", "Cana…
## $ Views <dbl> 3096420, 4221739, 1465227, 4530257, 4378018, 13746…
## $ Likes <dbl> 364521, 291223, 158579, 474798, 396292, 479271, 26…
## $ Shares <dbl> 73308, 61415, 8809, 91950, 22112, 435, 11451, 9434…
## $ Comments <dbl> 29082, 44293, 33511, 40024, 2274, 37576, 18145, 40…
## $ Engagement_Level <chr> "Low", "Low", "High", "High", "Medium", "Medium", …
## $ Engagement_Binary <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ Content_Type_Binary <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,…
cat("Total NAs:", sum(is.na(youtube_data)))
## Total NAs: 0
print(colSums(is.na(youtube_data)))
## Post_ID Post_Date Platform Hashtag
## 0 0 0 0
## Content_Type Region Views Likes
## 0 0 0 0
## Shares Comments Engagement_Level Engagement_Binary
## 0 0 0 0
## Content_Type_Binary
## 0
cat("=== REGION ===\n"); print(table(youtube_data$Region))
## === REGION ===
##
## Canada USA
## 19 21
cat("\n=== CONTENT TYPE ===\n"); print(table(youtube_data$Content_Type))
##
## === CONTENT TYPE ===
##
## Live Stream Post Reel Shorts Tweet Video
## 6 10 6 6 8 4
cat("\n=== ENGAGEMENT LEVEL ===\n"); print(table(youtube_data$Engagement_Level))
##
## === ENGAGEMENT LEVEL ===
##
## High Low Medium
## 10 14 16
print(summary(youtube_data[, c("Views", "Likes", "Shares", "Comments")]))
## Views Likes Shares Comments
## Min. : 218047 Min. : 7372 Min. : 435 Min. : 349
## 1st Qu.:1307542 1st Qu.:119807 1st Qu.:21562 1st Qu.:10942
## Median :2557428 Median :222460 Median :55592 Median :23334
## Mean :2506645 Mean :233537 Mean :50211 Mean :22489
## 3rd Qu.:3784346 3rd Qu.:314760 3rd Qu.:74842 3rd Qu.:33477
## Max. :4948346 Max. :488199 Max. :99857 Max. :46321
ggplot(youtube_data, aes(x = Engagement_Level, fill = Engagement_Level)) +
geom_bar(show.legend = FALSE) +
labs(title = "Engagement Level Distribution — YouTube",
subtitle = "USA/Canada — #Education Posts",
x = "Engagement Level", y = "Count") +
theme_minimal()
ggplot(youtube_data, aes(x = Content_Type, y = Views, fill = Content_Type)) +
geom_boxplot(show.legend = FALSE) +
labs(title = "Views by Content Type — YouTube",
subtitle = "USA/Canada — #Education Posts",
x = "Content Type", y = "Views") +
theme_minimal()
ggplot(youtube_data, aes(x = Region, y = Views, fill = Region)) +
geom_boxplot(show.legend = FALSE) +
labs(title = "Views by Region — YouTube",
subtitle = "USA/Canada — #Education Posts",
x = "Region", y = "Views") +
theme_minimal()
youtube_data %>%
mutate(Month = floor_date(as.Date(Post_Date), "month")) %>%
count(Month) %>%
ggplot(aes(x = Month, y = n)) +
geom_line(color = "tomato", linewidth = 1) +
geom_point(color = "tomato") +
labs(title = "YouTube Education Posts Over Time",
subtitle = "USA/Canada — #Education Posts",
x = "Month", y = "Post Count") +
theme_minimal()
youtube_data %>%
mutate(
Content_Type_Num = as.numeric(factor(Content_Type)),
Content_Type_Binary = if_else(Content_Type == "Reel", 1, 0)
) %>%
select(Views, Likes, Shares, Comments, Content_Type_Num, Content_Type_Binary) %>%
cor() %>%
round(3) %>%
print()
## Views Likes Shares Comments Content_Type_Num
## Views 1.000 0.268 0.289 0.144 -0.141
## Likes 0.268 1.000 0.003 0.190 -0.067
## Shares 0.289 0.003 1.000 0.051 -0.035
## Comments 0.144 0.190 0.051 1.000 0.208
## Content_Type_Num -0.141 -0.067 -0.035 0.208 1.000
## Content_Type_Binary 0.108 0.058 0.040 0.025 -0.078
## Content_Type_Binary
## Views 0.108
## Likes 0.058
## Shares 0.040
## Comments 0.025
## Content_Type_Num -0.078
## Content_Type_Binary 1.000
All models use the YouTube subset (USA/Canada, #Education) with
Engagement_Binary (1 = High, 0 = Not High) as the outcome.
Predictors: Month, Day_of_Week, Region, Content_Type_Binary (1 = Reel, 0
= Other).
Note on Content Type: An initial model run using the full multi-category
Content_Typefactor variable did not produce statistically significant results. Content type was therefore recoded as a binary variable (Content_Type_Binary) distinguishing Reel posts (1) from all other content types (0), which provided a more interpretable and parsimonious predictor.
library(caret)
library(randomForest)
library(gbm)
# Shared feature matrix
ml_data <- youtube_data %>%
mutate(
Engagement_Binary = factor(if_else(Engagement_Level == "High", "High", "NotHigh"),
levels = c("NotHigh", "High")),
Content_Type_Binary = if_else(Content_Type == "Reel", 1, 0),
Month = as.numeric(format(as.Date(Post_Date), "%m")),
Region = factor(Region)
) %>%
select(Engagement_Binary, Month, Region, Content_Type_Binary, Likes, Shares)
# 70/30 train-test split
set.seed(123)
train_idx <- createDataPartition(ml_data$Engagement_Binary, p = 0.7, list = FALSE)
train_data <- ml_data[train_idx, ]
test_data <- ml_data[-train_idx, ]
cat("Training rows:", nrow(train_data), "| Test rows:", nrow(test_data), "\n")
## Training rows: 28 | Test rows: 12
cat("Outcome distribution (train):\n")
## Outcome distribution (train):
print(table(train_data$Engagement_Binary))
##
## NotHigh High
## 21 7
set.seed(123)
knn_model <- train(
Engagement_Binary ~ .,
data = train_data,
method = "knn",
trControl = trainControl(method = "cv", number = 3),
tuneLength = 5
)
cat("Best K:", knn_model$bestTune$k, "\n")
## Best K: 13
knn_pred <- predict(knn_model, test_data)
knn_cm <- confusionMatrix(knn_pred, test_data$Engagement_Binary, positive = "High")
print(knn_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NotHigh High
## NotHigh 9 3
## High 0 0
##
## Accuracy : 0.75
## 95% CI : (0.4281, 0.9451)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : 0.6488
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 0.00
## Specificity : 1.00
## Pos Pred Value : NaN
## Neg Pred Value : 0.75
## Prevalence : 0.25
## Detection Rate : 0.00
## Detection Prevalence : 0.00
## Balanced Accuracy : 0.50
##
## 'Positive' Class : High
##
plot(knn_model, main = "KNN — Accuracy by K")
K-Means is unsupervised (no outcome label). We cluster on numeric features then examine how well clusters align with Engagement_Binary.
set.seed(123)
# K-Means requires all-numeric input
kmeans_data <- ml_data %>%
mutate(Region_Num = as.numeric(Region)) %>%
select(Month, Region_Num, Content_Type_Binary, Likes, Shares) %>%
scale()
km_model <- kmeans(kmeans_data, centers = 2, nstart = 25)
cat("Cluster sizes:\n")
## Cluster sizes:
print(table(km_model$cluster))
##
## 1 2
## 19 21
cat("\nCluster vs. Engagement_Binary cross-tab:\n")
##
## Cluster vs. Engagement_Binary cross-tab:
print(table(Cluster = km_model$cluster, Engagement = ml_data$Engagement_Binary))
## Engagement
## Cluster NotHigh High
## 1 14 5
## 2 16 5
ml_data %>%
mutate(Cluster = factor(km_model$cluster)) %>%
ggplot(aes(x = Month, y = Likes, color = Cluster, shape = Engagement_Binary)) +
geom_jitter(size = 3, alpha = 0.7, width = 0.3, height = 0.3) +
labs(title = "K-Means Clusters vs. Engagement",
subtitle = "YouTube — USA/Canada — #Education Posts",
x = "Month", y = "Likes") +
theme_minimal()
set.seed(123)
rf_model <- train(
Engagement_Binary ~ .,
data = train_data,
method = "rf",
trControl = trainControl(method = "cv", number = 3),
importance = TRUE
)
rf_pred <- predict(rf_model, test_data)
rf_cm <- confusionMatrix(rf_pred, test_data$Engagement_Binary, positive = "High")
print(rf_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NotHigh High
## NotHigh 6 1
## High 3 2
##
## Accuracy : 0.6667
## 95% CI : (0.3489, 0.9008)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : 0.8424
##
## Kappa : 0.2727
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.6667
## Specificity : 0.6667
## Pos Pred Value : 0.4000
## Neg Pred Value : 0.8571
## Prevalence : 0.2500
## Detection Rate : 0.1667
## Detection Prevalence : 0.4167
## Balanced Accuracy : 0.6667
##
## 'Positive' Class : High
##
varImpPlot(rf_model$finalModel,
main = "Random Forest — Variable Importance")
library(gbm)
set.seed(123)
# Use gbm directly (avoids caret CV failures on small samples)
train_gbm <- train_data %>%
mutate(
Outcome = if_else(Engagement_Binary == "High", 1, 0),
Region_Num = as.numeric(Region)
)
test_gbm <- test_data %>%
mutate(
Outcome = if_else(Engagement_Binary == "High", 1, 0),
Region_Num = as.numeric(Region)
)
boost_fit <- gbm(
Outcome ~ Month + Region_Num + Content_Type_Binary + Likes + Shares,
data = train_gbm,
distribution = "bernoulli",
n.trees = 100,
interaction.depth = 2,
shrinkage = 0.1,
n.minobsinnode = 3,
verbose = FALSE
)
boost_probs <- predict(boost_fit, test_gbm, n.trees = 100, type = "response")
boost_pred <- factor(if_else(boost_probs >= 0.5, "High", "NotHigh"),
levels = c("NotHigh", "High"))
boost_cm <- confusionMatrix(boost_pred, test_data$Engagement_Binary, positive = "High")
print(boost_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NotHigh High
## NotHigh 6 1
## High 3 2
##
## Accuracy : 0.6667
## 95% CI : (0.3489, 0.9008)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : 0.8424
##
## Kappa : 0.2727
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.6667
## Specificity : 0.6667
## Pos Pred Value : 0.4000
## Neg Pred Value : 0.8571
## Prevalence : 0.2500
## Detection Rate : 0.1667
## Detection Prevalence : 0.4167
## Balanced Accuracy : 0.6667
##
## 'Positive' Class : High
##
summary(boost_fit, cBars = 10, plotit = TRUE, main = "Gradient Boosting — Variable Importance")
## var rel.inf
## Likes Likes 42.0130175
## Shares Shares 39.7068961
## Month Month 12.6159951
## Region_Num Region_Num 5.5130733
## Content_Type_Binary Content_Type_Binary 0.1510179
results <- resamples(list(
KNN = knn_model,
Random_Forest = rf_model
))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: KNN, Random_Forest
## Number of resamples: 3
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## KNN 0.7 0.7388889 0.7777778 0.7518519 0.7777778 0.7777778 0
## Random_Forest 0.6 0.6333333 0.6666667 0.6814815 0.7222222 0.7777778 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## KNN 0.0000000 0.00000000 0 0.000000000 0.00000000 0.0000000
## Random_Forest -0.1764706 -0.08823529 0 0.001782531 0.09090909 0.1818182
## NA's
## KNN 0
## Random_Forest 0
# Boosting test-set accuracy reported separately
boost_acc <- mean(boost_pred == test_data$Engagement_Binary)
cat("\nBoosting test-set accuracy:", round(boost_acc * 100, 2), "%\n")
##
## Boosting test-set accuracy: 66.67 %
bwplot(results, main = "Model Accuracy Comparison — KNN vs. Random Forest (3-Fold CV)")