# COMP2501 Project – K-pop Comebacks (40 MVs)
library(tidyverse) # dplyr, ggplot2, tidyr, readr
## Warning: package 'ggplot2' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) # for parsing release_date
# 1. Import & Basic Cleaning
# Read CSV (update path if needed)
df <- read_csv("/Users/hyunwookkim/Desktop/kpop_comeback_full.csv")
## Rows: 40 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): group, song, agency, release_date, youtube_url
## dbl (1): comeback_id
## num (10): views_24h, views_7d, likes_7d, comments_7d, subs_at_release, tweet...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
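# A more portable alternative (sketch, not run here): keep the CSV next to the
# .Rmd so the path works on any machine, and quiet the column-spec message.
# df <- read_csv("kpop_comeback_full.csv", show_col_types = FALSE)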
# Quick structure check
glimpse(df)
## Rows: 40
## Columns: 16
## $ comeback_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ group <chr> "NewJeans", "NewJeans", "LE SSERAFIM", "LE SSERAFIM", …
## $ song <chr> "Super Shy", "Ditto", "UNFORGIVEN", "ANTIFRAGILE", "I …
## $ agency <chr> "ADOR", "ADOR", "Source Music", "Source Music", "Stars…
## $ release_date <chr> "07 July 2023", "19 December 2022", "01 May 2023", "17…
## $ youtube_url <chr> "https://www.youtube.com/watch?v=ArmDp-zijuc", "https:…
## $ views_24h <dbl> 12384221, 8944112, 16221991, 7122882, 9522311, 6744992…
## $ views_7d <dbl> 43912884, 38552911, 51740332, 27442199, 31882554, 2411…
## $ likes_7d <dbl> 1253882, 1108442, 1451099, 854221, 901122, 751221, 612…
## $ comments_7d <dbl> 84923, 72119, 109331, 60442, 64882, 47223, 41882, 3855…
## $ subs_at_release <dbl> 5912331, 5322884, 3912221, 3442110, 2822554, 2511882, …
## $ tweets_window <dbl> 182442, 159311, 132554, 98331, 143882, 121442, 92554, …
## $ retweets_total <dbl> 6493334, 5822442, 4923110, 3220882, 4221331, 3011554, …
## $ likes_total <dbl> 2981552, 2551119, 1981554, 1441110, 1882442, 1220991, …
## $ sent_pos <dbl> 41223, 36882, 31122, 22554, 28331, 21334, 16442, 15882…
## $ sent_neg <dbl> 8912, 7442, 7331, 5120, 6442, 4991, 3554, 3331, 19882,…
# Drop rows with a missing comeback_id (e.g., blank junk rows Excel leaves at the bottom)
df <- df %>% drop_na(comeback_id)
# Sanity check: now should be 40 rows
nrow(df) # you should see 40
## [1] 40
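# Optional guard (sketch): fail the knit early if the row count or the IDs are
# off, e.g. if a future export adds or duplicates rows.
stopifnot(nrow(df) == 40, !any(duplicated(df$comeback_id)))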
# Columns that should be numeric
numeric_cols <- c(
  "views_24h", "views_7d", "likes_7d", "comments_7d",
  "subs_at_release", "tweets_window", "likes_total",
  "retweets_total", "sent_pos", "sent_neg"
)
# Convert data types
df <- df %>%
  mutate(
    comeback_id  = as.integer(comeback_id),
    group        = as.factor(group),
    song         = as.factor(song),
    agency       = as.factor(agency),
    release_date = dmy(release_date), # e.g. "07 July 2023"
    across(all_of(numeric_cols), as.numeric)
  )
glimpse(df) # confirm types
## Rows: 40
## Columns: 16
## $ comeback_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ group <fct> NewJeans, NewJeans, LE SSERAFIM, LE SSERAFIM, IVE, IVE…
## $ song <fct> Super Shy, Ditto, UNFORGIVEN, ANTIFRAGILE, I AM, After…
## $ agency <fct> ADOR, ADOR, Source Music, Source Music, Starship, Star…
## $ release_date <date> 2023-07-07, 2022-12-19, 2023-05-01, 2022-10-17, 2023-…
## $ youtube_url <chr> "https://www.youtube.com/watch?v=ArmDp-zijuc", "https:…
## $ views_24h <dbl> 12384221, 8944112, 16221991, 7122882, 9522311, 6744992…
## $ views_7d <dbl> 43912884, 38552911, 51740332, 27442199, 31882554, 2411…
## $ likes_7d <dbl> 1253882, 1108442, 1451099, 854221, 901122, 751221, 612…
## $ comments_7d <dbl> 84923, 72119, 109331, 60442, 64882, 47223, 41882, 3855…
## $ subs_at_release <dbl> 5912331, 5322884, 3912221, 3442110, 2822554, 2511882, …
## $ tweets_window <dbl> 182442, 159311, 132554, 98331, 143882, 121442, 92554, …
## $ retweets_total <dbl> 6493334, 5822442, 4923110, 3220882, 4221331, 3011554, …
## $ likes_total <dbl> 2981552, 2551119, 1981554, 1441110, 1882442, 1220991, …
## $ sent_pos <dbl> 41223, 36882, 31122, 22554, 28331, 21334, 16442, 15882…
## $ sent_neg <dbl> 8912, 7442, 7331, 5120, 6442, 4991, 3554, 3331, 19882,…
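# Optional check (sketch): dmy() returns NA for any date string it cannot
# parse, so confirm no release_date was silently lost in the conversion.
sum(is.na(df$release_date)) # expect 0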
# 2. Feature Engineering
df <- df %>%
  mutate(
    # Log transforms (stabilize skewed metrics)
    log_views_24h = log10(views_24h + 1),
    log_views_7d  = log10(views_7d + 1),
    log_subs      = log10(subs_at_release + 1),
    log_tweets    = log10(tweets_window + 1),
    log_likes_twt = log10(likes_total + 1),
    # Engagement ratios
    like_view_ratio    = likes_7d / pmax(views_7d, 1),
    comment_view_ratio = comments_7d / pmax(views_7d, 1),
    # Sentiment ratios: positive vs negative word counts
    pos_ratio = sent_pos / pmax(sent_pos + sent_neg, 1),
    neg_ratio = sent_neg / pmax(sent_pos + sent_neg, 1)
  )
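# Quick sketch: the engineered ratios should be finite, and the sentiment
# shares should lie in [0, 1] (the pmax(..., 1) above guards the denominators).
df %>%
  summarise(across(c(like_view_ratio, comment_view_ratio, pos_ratio, neg_ratio),
                   list(min = min, max = max)))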
# 3. Exploratory Data Analysis (EDA)
# 3.1 Histogram of first-week views (log scale)
df %>%
  ggplot(aes(log_views_7d)) +
  geom_histogram(binwidth = 0.1, fill = "steelblue", alpha = 0.7) +
  labs(
    title = "Distribution of First-Week YouTube Views (log10)",
    x = "log10(views_7d + 1)",
    y = "Count of comebacks"
  ) +
  theme_minimal()

# 3.2 Agency-level performance (boxplot)
df %>%
  ggplot(aes(agency, log_views_7d, fill = agency)) +
  geom_boxplot(show.legend = FALSE, alpha = 0.7) +
  labs(
    title = "First-Week Performance by Agency (40 comebacks)",
    x = "Agency",
    y = "log10(views_7d + 1)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# 3.3 Agency counts (bar plot – counts are informative now that there are 40 MVs)
df %>%
  count(agency) %>%
  ggplot(aes(x = fct_reorder(agency, n), y = n, fill = agency)) +
  geom_col(show.legend = FALSE, alpha = 0.8) +
  coord_flip() +
  labs(
    title = "Number of Comebacks per Agency",
    x = "Agency",
    y = "Count of MVs"
  ) +
  theme_minimal()

# 3.4 Twitter vs YouTube performance
df %>%
  ggplot(aes(log_tweets, log_views_7d, color = agency)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "Twitter Engagement vs First-Week YouTube Views",
    x = "log10(Tweets Window + 1)",
    y = "log10(Views in First Week + 1)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
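# Companion statistic for the scatter above (sketch): the linear correlation
# between log tweets and log first-week views.
cor(df$log_tweets, df$log_views_7d)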

# 4. Correlation Analysis
corr_vars <- df %>%
  select(
    log_views_7d,
    log_views_24h,
    log_subs,
    log_tweets,
    log_likes_twt,
    like_view_ratio,
    comment_view_ratio,
    pos_ratio,
    neg_ratio
  )
corr_matrix <- cor(corr_vars, use = "complete.obs")
corr_df <- corr_matrix %>%
  as.data.frame() %>%
  rownames_to_column("var1") %>%
  pivot_longer(-var1, names_to = "var2", values_to = "corr")
ggplot(corr_df, aes(var1, var2, fill = corr)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  labs(
    title = "Correlation Heatmap of Key Features",
    x = "",
    y = ""
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
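# Sketch: the same information as a ranked list, sorted by the absolute
# correlation of each feature with the target log_views_7d.
corr_df %>%
  filter(var1 == "log_views_7d", var2 != "log_views_7d") %>%
  arrange(desc(abs(corr)))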

# 5. Regression Models
# Model A: Twitter-only model
model_twitter <- lm(log_views_7d ~ log_tweets, data = df)
summary(model_twitter)
##
## Call:
## lm(formula = log_views_7d ~ log_tweets, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33949 -0.13106 -0.02068 0.10801 0.32872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.8432 0.2887 20.244 < 2e-16 ***
## log_tweets 0.3509 0.0609 5.761 1.21e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1791 on 38 degrees of freedom
## Multiple R-squared: 0.4662, Adjusted R-squared: 0.4522
## F-statistic: 33.19 on 1 and 38 DF, p-value: 1.21e-06
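# Sketch: a 95% confidence interval for the Twitter slope, to report alongside
# the point estimate above.
confint(model_twitter, "log_tweets", level = 0.95)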
# Model B: Multi-feature model
model_multi <- lm(
  log_views_7d ~ log_views_24h + log_subs + log_tweets +
    like_view_ratio + comment_view_ratio + pos_ratio + neg_ratio,
  data = df
)
summary(model_multi)
##
## Call:
## lm(formula = log_views_7d ~ log_views_24h + log_subs + log_tweets +
## like_view_ratio + comment_view_ratio + pos_ratio + neg_ratio,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.056692 -0.022632 -0.004785 0.019502 0.082442
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.51166 0.25855 5.847 1.52e-06 ***
## log_views_24h 0.84701 0.04786 17.698 < 2e-16 ***
## log_subs 0.01065 0.02729 0.390 0.699
## log_tweets -0.02415 0.03579 -0.675 0.504
## like_view_ratio 1.03375 1.97683 0.523 0.605
## comment_view_ratio -26.57485 8.58693 -3.095 0.004 **
## pos_ratio 0.22334 0.37178 0.601 0.552
## neg_ratio NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03485 on 33 degrees of freedom
## Multiple R-squared: 0.9825, Adjusted R-squared: 0.9793
## F-statistic: 308 on 6 and 33 DF, p-value: < 2.2e-16
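# Note: neg_ratio is reported as NA above because pos_ratio + neg_ratio = 1 by
# construction, so the two are perfectly collinear and lm() drops one of them.
# Sketch of the same model with the redundant term removed (model_multi_b is a
# new name used only for this check):
model_multi_b <- lm(
  log_views_7d ~ log_views_24h + log_subs + log_tweets +
    like_view_ratio + comment_view_ratio + pos_ratio,
  data = df
)
# summary(model_multi_b) # same fit as model_multi, without the singularity note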
# Model C: Include agency as factor
model_agency <- lm(
  log_views_7d ~ log_views_24h + log_subs + log_tweets + agency,
  data = df
)
summary(model_agency)
##
## Call:
## lm(formula = log_views_7d ~ log_views_24h + log_subs + log_tweets +
## agency, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.05142 -0.01617 -0.00232 0.01682 0.07029
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.886555 0.171811 10.980 1.84e-11 ***
## log_views_24h 0.777003 0.045198 17.191 4.55e-16 ***
## log_subs -0.007792 0.032289 -0.241 0.811136
## log_tweets 0.060901 0.014479 4.206 0.000256 ***
## agencyCube 0.008222 0.029680 0.277 0.783877
## agencyHYBE -0.078195 0.024540 -3.186 0.003620 **
## agencyJYP -0.096745 0.024139 -4.008 0.000434 ***
## agencyPledis -0.033748 0.027302 -1.236 0.227072
## agencySM -0.072331 0.025328 -2.856 0.008156 **
## agencySource Music -0.062111 0.025007 -2.484 0.019503 *
## agencyStarship -0.067211 0.026740 -2.514 0.018222 *
## agencyWAKEONE -0.102433 0.028997 -3.533 0.001502 **
## agencyYG -0.081248 0.032261 -2.518 0.018017 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03244 on 27 degrees of freedom
## Multiple R-squared: 0.9876, Adjusted R-squared: 0.982
## F-statistic: 178.7 on 12 and 27 DF, p-value: < 2.2e-16
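# Sketch: put the three models side by side on adjusted R-squared and AIC
# (lower AIC = better fit after penalising the number of terms).
tibble(
  model  = c("model_twitter", "model_multi", "model_agency"),
  adj_r2 = c(summary(model_twitter)$adj.r.squared,
             summary(model_multi)$adj.r.squared,
             summary(model_agency)$adj.r.squared),
  AIC    = c(AIC(model_twitter), AIC(model_multi), AIC(model_agency))
)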
# 6. Residuals & Outlier Analysis
df <- df %>%
  mutate(
    pred_multi  = predict(model_multi, newdata = df),
    resid_multi = log_views_7d - pred_multi
  )
# Residuals vs predicted
df %>%
  ggplot(aes(pred_multi, resid_multi)) +
  geom_point(alpha = 0.8) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "Residuals vs Predicted log10(views_7d)",
    x = "Predicted log10(views_7d + 1)",
    y = "Residual"
  ) +
  theme_minimal()

# Top 5 overperformers
df %>%
  arrange(desc(resid_multi)) %>%
  select(comeback_id, group, song, agency,
         log_views_7d, pred_multi, resid_multi) %>%
  head(5)
## # A tibble: 5 × 7
## comeback_id group song agency log_views_7d pred_multi resid_multi
## <int> <fct> <fct> <fct> <dbl> <dbl> <dbl>
## 1 14 (G)I-DLE Queencard Cube 7.54 7.46 0.0824
## 2 2 NewJeans Ditto ADOR 7.59 7.51 0.0745
## 3 15 Seventeen Super Pledis 7.68 7.63 0.0562
## 4 38 fromis_9 DM Pledis 7.24 7.18 0.0515
## 5 27 ENHYPEN Bite Me HYBE 7.69 7.66 0.0289
# Top 5 underperformers
df %>%
  arrange(resid_multi) %>%
  select(comeback_id, group, song, agency,
         log_views_7d, pred_multi, resid_multi) %>%
  head(5)
## # A tibble: 5 × 7
## comeback_id group song agency log_views_7d pred_multi resid_multi
## <int> <fct> <fct> <fct> <dbl> <dbl> <dbl>
## 1 11 Stray Kids S-Class JYP 7.61 7.66 -0.0567
## 2 8 Aespa Drama SM 7.24 7.28 -0.0458
## 3 32 TXT Sugar Rush… HYBE 7.69 7.74 -0.0456
## 4 31 Kep1er Giddy WAKEO… 7.11 7.14 -0.0367
## 5 24 ZEROBASEONE In Bloom WAKEO… 7.19 7.22 -0.0326
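# Optional sketch: residuals flag over/underperformance, while Cook's distance
# flags comebacks that pull the fit itself; a common rule of thumb is D > 4/n.
df %>%
  mutate(cooks_d = cooks.distance(model_multi)) %>%
  filter(cooks_d > 4 / nrow(df)) %>%
  select(comeback_id, group, song, cooks_d)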
# 7. Train/Test Split (now meaningful with 40 rows)
set.seed(42)
n <- nrow(df) # should be 40
test_idx <- sample(seq_len(n), size = floor(0.3 * n)) # ≈ 12 test, 28 train
train <- df[-test_idx, ]
test <- df[test_idx, ]
# Train on training set
model_train <- lm(
  log_views_7d ~ log_views_24h + log_subs + log_tweets +
    like_view_ratio + comment_view_ratio + pos_ratio + neg_ratio,
  data = train
)
# Predict on test set
test <- test %>%
  mutate(pred = predict(model_train, newdata = test))
# RMSE
rmse <- sqrt(mean((test$log_views_7d - test$pred)^2))
rmse
## [1] 0.04143438
# Test-set R-squared
sst <- sum((test$log_views_7d - mean(test$log_views_7d))^2)
sse <- sum((test$log_views_7d - test$pred)^2)
rsq <- 1 - sse / sst
rsq
## [1] 0.9608263
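# Sketch: a naive baseline for context, predicting the training-set mean for
# every test comeback; with only 12 test rows both metrics are fairly noisy.
baseline_rmse <- sqrt(mean((test$log_views_7d - mean(train$log_views_7d))^2))
baseline_rmse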