# 1. Load all packages needed
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(cluster)
library(corrplot)
## corrplot 0.95 loaded
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# 2. Load the data
youtube <- read_excel("FDS_A3_Data.xlsx", sheet = "Youtube engagement data")
spotify <- read_excel("FDS_A3_Data.xlsx", sheet = "Spotify streaming data")
geo <- read_excel("FDS_A3_Data.xlsx", sheet = "Geographic distribution")
demo <- read_excel("FDS_A3_Data.xlsx", sheet = "Fan demographics and engagement")
concert <- read_excel("FDS_A3_Data.xlsx", sheet = "Concert market analysis")

# 3. Compute the Fan Engagement Index
engagement <- spotify %>%
  inner_join(youtube, by = "song_title") %>%
  mutate(
    engagement_point = (completion_rate + like_rate + save_rate)/3
  )
print(engagement)
## # A tibble: 6 × 10
##   song_title   daily_streams_indone…¹ like_rate save_rate playlist_adds_indone…²
##   <chr>                         <dbl>     <dbl>     <dbl>                  <dbl>
## 1 FIGHT ME                      28500     0.128     0.094                  12400
## 2 BBB (Bitter…                  23400     0.142     0.108                   9800
## 3 George the …                  31200     0.119     0.087                  14500
## 4 Supernatural                  26700     0.131     0.098                  11200
## 5 Diamond                       29800     0.135     0.102                  13200
## 6 More than I…                  21800     0.148     0.113                   8700
## # ℹ abbreviated names: ¹​daily_streams_indonesia, ²​playlist_adds_indonesia
## # ℹ 5 more variables: views_indonesia <dbl>, completion_rate <dbl>,
## #   likes_indonesia <dbl>, comments_indonesia <dbl>, engagement_point <dbl>
# 4. Find the correlation between Spotify and YouTube metrics
corr_matrix <- engagement %>%
  select(completion_rate, like_rate, save_rate, daily_streams_indonesia, views_indonesia)
corrplot(cor(corr_matrix), method = "number")

# 5. Find the market segmentation with cluster analysis 
geo_clust <- geo %>%
  select(fan_engagement_score, youtube_views, spotify_streams)

geo_scaled <- scale(geo_clust)
hc <- hclust(dist(geo_scaled), method = "ward.D2")

# Create the hierarchial clustering dendrogram
plot(hc, labels = geo$city, main = "Hierarchical Clustering Dendrogram",
     xlab = "", sub = "", cex = 0.8)

# Cut tree into clusters 
cluster_cut <- cutree(hc, k = 3)
geo$cluster_hier <- as.factor(cluster_cut)

# Visualize clusters on a scatterplot
ggplot(geo, aes(x = youtube_views, y = spotify_streams, color = cluster_hier, label = city)) +
  geom_point(size = 3) +
  geom_text(nudge_y = 0.05, check_overlap = TRUE) +
  labs(title = "Cities Grouped by Hierarchical Cluster", x = "YouTube Views", y = "Spotify Streams")

# 6. Concert Viability Regression (updated for latest dataset)

# Load the data
geo <- read_excel("FDS_A3_Data.xlsx", sheet = "Geographic distribution")
concert <- read_excel("FDS_A3_Data.xlsx", sheet = "Concert market analysis")

# Check column names to confirm exact spellings
colnames(geo)
## [1] "city"                 "region"               "youtube_views"       
## [4] "spotify_streams"      "fan_engagement_score" "concert_viability"
colnames(concert)
## [1] "city"                 "estimated_fan_base"   "avg_ticket_price_idr"
## [4] "venue_capacity"       "projected_attendance"
# Create success_rate (attendance ÷ capacity)
concert <- concert %>%
  mutate(success_rate = projected_attendance / venue_capacity)

# Merge fan engagement data to concert dataset
concert <- left_join(concert, geo %>% select(city, fan_engagement_score), by = "city")

# Verify the new columns
colnames(concert)
## [1] "city"                 "estimated_fan_base"   "avg_ticket_price_idr"
## [4] "venue_capacity"       "projected_attendance" "success_rate"        
## [7] "fan_engagement_score"
# Run regression model using success rate as the dependent variable
model <- lm(success_rate ~ fan_engagement_score + estimated_fan_base, data = concert)

# Display summary
summary(model)
## 
## Call:
## lm(formula = success_rate ~ fan_engagement_score + estimated_fan_base, 
##     data = concert)
## 
## Residuals:
##         1         2         3         4 
## -0.015683 -0.006027  0.045276 -0.023566 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)          5.712e-01  7.275e-01   0.785    0.576
## fan_engagement_score 1.473e-03  1.204e-02   0.122    0.922
## estimated_fan_base   1.409e-06  1.991e-06   0.708    0.608
## 
## Residual standard error: 0.05374 on 1 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.8751, Adjusted R-squared:  0.6254 
## F-statistic: 3.505 on 2 and 1 DF,  p-value: 0.3533
# Visualize the relationship
ggplot(concert, aes(x = fan_engagement_score, y = success_rate,
                    size = estimated_fan_base, color = city)) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "Effect of Fan Engagement on Concert Success Rate",
       x = "Fan Engagement Score",
       y = "Concert Success Rate (Attendance ÷ Capacity)",
       size = "Estimated Fan Base")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation: size.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# 7. Visualization of Fan Engagement vs City Market Strength
ggplot(geo, aes(x = youtube_views, y = spotify_streams, color = city, size = fan_engagement_score)) +
  geom_point() +
  labs(title = "Fan Engagement vs City Market Strength")