# COMP2501 Project – K-pop Comebacks (40 MVs)

library(tidyverse)   # dplyr, ggplot2, tidyr, readr
## Warning: package 'ggplot2' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)   # for parsing release_date

# 1. Import & Basic Cleaning

# Read CSV (update path if needed)
df <- read_csv("/Users/hyunwookkim/Desktop/kpop_comeback_full.csv")
## Rows: 40 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): group, song, agency, release_date, youtube_url
## dbl  (1): comeback_id
## num (10): views_24h, views_7d, likes_7d, comments_7d, subs_at_release, tweet...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick structure check
glimpse(df)
## Rows: 40
## Columns: 16
## $ comeback_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ group           <chr> "NewJeans", "NewJeans", "LE SSERAFIM", "LE SSERAFIM", …
## $ song            <chr> "Super Shy", "Ditto", "UNFORGIVEN", "ANTIFRAGILE", "I …
## $ agency          <chr> "ADOR", "ADOR", "Source Music", "Source Music", "Stars…
## $ release_date    <chr> "07 July 2023", "19 December 2022", "01 May 2023", "17…
## $ youtube_url     <chr> "https://www.youtube.com/watch?v=ArmDp-zijuc", "https:…
## $ views_24h       <dbl> 12384221, 8944112, 16221991, 7122882, 9522311, 6744992…
## $ views_7d        <dbl> 43912884, 38552911, 51740332, 27442199, 31882554, 2411…
## $ likes_7d        <dbl> 1253882, 1108442, 1451099, 854221, 901122, 751221, 612…
## $ comments_7d     <dbl> 84923, 72119, 109331, 60442, 64882, 47223, 41882, 3855…
## $ subs_at_release <dbl> 5912331, 5322884, 3912221, 3442110, 2822554, 2511882, …
## $ tweets_window   <dbl> 182442, 159311, 132554, 98331, 143882, 121442, 92554, …
## $ retweets_total  <dbl> 6493334, 5822442, 4923110, 3220882, 4221331, 3011554, …
## $ likes_total     <dbl> 2981552, 2551119, 1981554, 1441110, 1882442, 1220991, …
## $ sent_pos        <dbl> 41223, 36882, 31122, 22554, 28331, 21334, 16442, 15882…
## $ sent_neg        <dbl> 8912, 7442, 7331, 5120, 6442, 4991, 3554, 3331, 19882,…
# Drop any completely blank rows (e.g., Excel junk at bottom)
df <- df %>% drop_na(comeback_id)

# Sanity check: now should be 40 rows
nrow(df)  # you should see 40
## [1] 40
# Columns that should be numeric
numeric_cols <- c(
  "views_24h","views_7d","likes_7d","comments_7d",
  "subs_at_release","tweets_window","likes_total",
  "retweets_total","sent_pos","sent_neg"
)

# Convert data types
df <- df %>%
  mutate(
    comeback_id   = as.integer(comeback_id),
    group         = as.factor(group),
    song          = as.factor(song),
    agency        = as.factor(agency),
    release_date  = dmy(release_date),       # e.g. "07 July 2023"
    across(all_of(numeric_cols), as.numeric)
  )

glimpse(df)   # confirm types
## Rows: 40
## Columns: 16
## $ comeback_id     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ group           <fct> NewJeans, NewJeans, LE SSERAFIM, LE SSERAFIM, IVE, IVE…
## $ song            <fct> Super Shy, Ditto, UNFORGIVEN, ANTIFRAGILE, I AM, After…
## $ agency          <fct> ADOR, ADOR, Source Music, Source Music, Starship, Star…
## $ release_date    <date> 2023-07-07, 2022-12-19, 2023-05-01, 2022-10-17, 2023-…
## $ youtube_url     <chr> "https://www.youtube.com/watch?v=ArmDp-zijuc", "https:…
## $ views_24h       <dbl> 12384221, 8944112, 16221991, 7122882, 9522311, 6744992…
## $ views_7d        <dbl> 43912884, 38552911, 51740332, 27442199, 31882554, 2411…
## $ likes_7d        <dbl> 1253882, 1108442, 1451099, 854221, 901122, 751221, 612…
## $ comments_7d     <dbl> 84923, 72119, 109331, 60442, 64882, 47223, 41882, 3855…
## $ subs_at_release <dbl> 5912331, 5322884, 3912221, 3442110, 2822554, 2511882, …
## $ tweets_window   <dbl> 182442, 159311, 132554, 98331, 143882, 121442, 92554, …
## $ retweets_total  <dbl> 6493334, 5822442, 4923110, 3220882, 4221331, 3011554, …
## $ likes_total     <dbl> 2981552, 2551119, 1981554, 1441110, 1882442, 1220991, …
## $ sent_pos        <dbl> 41223, 36882, 31122, 22554, 28331, 21334, 16442, 15882…
## $ sent_neg        <dbl> 8912, 7442, 7331, 5120, 6442, 4991, 3554, 3331, 19882,…
# 2. Feature Engineering

df <- df %>%
  mutate(
    # Log transforms (stabilize skewed metrics)
    log_views_24h = log10(views_24h + 1),
    log_views_7d  = log10(views_7d + 1),
    log_subs      = log10(subs_at_release + 1),
    log_tweets    = log10(tweets_window + 1),
    log_likes_twt = log10(likes_total + 1),
    
    # Engagement ratios
    like_view_ratio    = likes_7d    / pmax(views_7d, 1),
    comment_view_ratio = comments_7d / pmax(views_7d, 1),
    
    # Sentiment ratios: positive vs negative word counts
    pos_ratio = sent_pos / pmax(sent_pos + sent_neg, 1),
    neg_ratio = sent_neg / pmax(sent_pos + sent_neg, 1)
  )

# 3. Exploratory Data Analysis (EDA)

# 3.1 Histogram of first-week views (log scale)
df %>%
  ggplot(aes(log_views_7d)) +
  geom_histogram(binwidth = 0.1, fill = "steelblue", alpha = 0.7) +
  labs(
    title = "Distribution of First-Week YouTube Views (log10)",
    x = "log10(views_7d + 1)",
    y = "Count of comebacks"
  ) +
  theme_minimal()

# 3.2 Agency-level performance (boxplot)
df %>%
  ggplot(aes(agency, log_views_7d, fill = agency)) +
  geom_boxplot(show.legend = FALSE, alpha = 0.7) +
  labs(
    title = "First-Week Performance by Agency (40 comebacks)",
    x = "Agency",
    y = "log10(views_7d + 1)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# 3.3 Agency counts (bar plot – looks better with 40 rows)
df %>%
  count(agency) %>%
  ggplot(aes(x = fct_reorder(agency, n), y = n, fill = agency)) +
  geom_col(show.legend = FALSE, alpha = 0.8) +
  coord_flip() +
  labs(
    title = "Number of Comebacks per Agency",
    x = "Agency",
    y = "Count of MVs"
  ) +
  theme_minimal()

# 3.4 Twitter vs YouTube performance
df %>%
  ggplot(aes(log_tweets, log_views_7d, color = agency)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "Twitter Engagement vs First-Week YouTube Views",
    x = "log10(Tweets Window + 1)",
    y = "log10(Views in First Week + 1)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# 4. Correlation Analysis

corr_vars <- df %>%
  select(
    log_views_7d,
    log_views_24h,
    log_subs,
    log_tweets,
    log_likes_twt,
    like_view_ratio,
    comment_view_ratio,
    pos_ratio,
    neg_ratio
  )

corr_matrix <- cor(corr_vars, use = "complete.obs")

corr_df <- corr_matrix %>%
  as.data.frame() %>%
  rownames_to_column("var1") %>%
  pivot_longer(-var1, names_to = "var2", values_to = "corr")

ggplot(corr_df, aes(var1, var2, fill = corr)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  labs(
    title = "Correlation Heatmap of Key Features",
    x = "",
    y = ""
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# 5. Regression Models

# Model A: Twitter-only model
model_twitter <- lm(log_views_7d ~ log_tweets, data = df)
summary(model_twitter)
## 
## Call:
## lm(formula = log_views_7d ~ log_tweets, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33949 -0.13106 -0.02068  0.10801  0.32872 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.8432     0.2887  20.244  < 2e-16 ***
## log_tweets    0.3509     0.0609   5.761 1.21e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1791 on 38 degrees of freedom
## Multiple R-squared:  0.4662, Adjusted R-squared:  0.4522 
## F-statistic: 33.19 on 1 and 38 DF,  p-value: 1.21e-06
# Model B: Multi-feature model
model_multi <- lm(
  log_views_7d ~ log_views_24h + log_subs + log_tweets +
    like_view_ratio + comment_view_ratio + pos_ratio + neg_ratio,
  data = df
)
summary(model_multi)
## 
## Call:
## lm(formula = log_views_7d ~ log_views_24h + log_subs + log_tweets + 
##     like_view_ratio + comment_view_ratio + pos_ratio + neg_ratio, 
##     data = df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.056692 -0.022632 -0.004785  0.019502  0.082442 
## 
## Coefficients: (1 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.51166    0.25855   5.847 1.52e-06 ***
## log_views_24h        0.84701    0.04786  17.698  < 2e-16 ***
## log_subs             0.01065    0.02729   0.390    0.699    
## log_tweets          -0.02415    0.03579  -0.675    0.504    
## like_view_ratio      1.03375    1.97683   0.523    0.605    
## comment_view_ratio -26.57485    8.58693  -3.095    0.004 ** 
## pos_ratio            0.22334    0.37178   0.601    0.552    
## neg_ratio                 NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03485 on 33 degrees of freedom
## Multiple R-squared:  0.9825, Adjusted R-squared:  0.9793 
## F-statistic:   308 on 6 and 33 DF,  p-value: < 2.2e-16
# Model C: Include agency as factor
model_agency <- lm(
  log_views_7d ~ log_views_24h + log_subs + log_tweets + agency,
  data = df
)
summary(model_agency)
## 
## Call:
## lm(formula = log_views_7d ~ log_views_24h + log_subs + log_tweets + 
##     agency, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.05142 -0.01617 -0.00232  0.01682  0.07029 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.886555   0.171811  10.980 1.84e-11 ***
## log_views_24h       0.777003   0.045198  17.191 4.55e-16 ***
## log_subs           -0.007792   0.032289  -0.241 0.811136    
## log_tweets          0.060901   0.014479   4.206 0.000256 ***
## agencyCube          0.008222   0.029680   0.277 0.783877    
## agencyHYBE         -0.078195   0.024540  -3.186 0.003620 ** 
## agencyJYP          -0.096745   0.024139  -4.008 0.000434 ***
## agencyPledis       -0.033748   0.027302  -1.236 0.227072    
## agencySM           -0.072331   0.025328  -2.856 0.008156 ** 
## agencySource Music -0.062111   0.025007  -2.484 0.019503 *  
## agencyStarship     -0.067211   0.026740  -2.514 0.018222 *  
## agencyWAKEONE      -0.102433   0.028997  -3.533 0.001502 ** 
## agencyYG           -0.081248   0.032261  -2.518 0.018017 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03244 on 27 degrees of freedom
## Multiple R-squared:  0.9876, Adjusted R-squared:  0.982 
## F-statistic: 178.7 on 12 and 27 DF,  p-value: < 2.2e-16
# 6. Residuals & Outlier Analysis

df <- df %>%
  mutate(
    pred_multi  = predict(model_multi, newdata = df),
    resid_multi = log_views_7d - pred_multi
  )

# Residuals vs predicted
df %>%
  ggplot(aes(pred_multi, resid_multi)) +
  geom_point(alpha = 0.8) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "Residuals vs Predicted log10(views_7d)",
    x = "Predicted log10(views_7d + 1)",
    y = "Residual"
  ) +
  theme_minimal()

# Top 5 overperformers
df %>%
  arrange(desc(resid_multi)) %>%
  select(comeback_id, group, song, agency,
         log_views_7d, pred_multi, resid_multi) %>%
  head(5)
## # A tibble: 5 × 7
##   comeback_id group     song      agency log_views_7d pred_multi resid_multi
##         <int> <fct>     <fct>     <fct>         <dbl>      <dbl>       <dbl>
## 1          14 (G)I-DLE  Queencard Cube           7.54       7.46      0.0824
## 2           2 NewJeans  Ditto     ADOR           7.59       7.51      0.0745
## 3          15 Seventeen Super     Pledis         7.68       7.63      0.0562
## 4          38 fromis_9  DM        Pledis         7.24       7.18      0.0515
## 5          27 ENHYPEN   Bite Me   HYBE           7.69       7.66      0.0289
# Top 5 underperformers
df %>%
  arrange(resid_multi) %>%
  select(comeback_id, group, song, agency,
         log_views_7d, pred_multi, resid_multi) %>%
  head(5)
## # A tibble: 5 × 7
##   comeback_id group       song        agency log_views_7d pred_multi resid_multi
##         <int> <fct>       <fct>       <fct>         <dbl>      <dbl>       <dbl>
## 1          11 Stray Kids  S-Class     JYP            7.61       7.66     -0.0567
## 2           8 Aespa       Drama       SM             7.24       7.28     -0.0458
## 3          32 TXT         Sugar Rush… HYBE           7.69       7.74     -0.0456
## 4          31 Kep1er      Giddy       WAKEO…         7.11       7.14     -0.0367
## 5          24 ZEROBASEONE In Bloom    WAKEO…         7.19       7.22     -0.0326
# 7. Train/Test Split (now meaningful with 40 rows)

set.seed(42)

n <- nrow(df)           # should be 40
test_idx <- sample(seq_len(n), size = floor(0.3 * n))  # ≈ 12 test, 28 train

train <- df[-test_idx, ]
test  <- df[test_idx, ]

# Train on training set
model_train <- lm(
  log_views_7d ~ log_views_24h + log_subs + log_tweets +
    like_view_ratio + comment_view_ratio + pos_ratio + neg_ratio,
  data = train
)

# Predict on test set
test <- test %>%
  mutate(pred = predict(model_train, newdata = test))

# RMSE
rmse <- sqrt(mean((test$log_views_7d - test$pred)^2))
rmse
## [1] 0.04143438
# Test-set R-squared
sst <- sum((test$log_views_7d - mean(test$log_views_7d))^2)
sse <- sum((test$log_views_7d - test$pred)^2)
rsq <- 1 - sse / sst
rsq
## [1] 0.9608263