# 1. Load all packages needed
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(cluster)
library(corrplot)
## corrplot 0.95 loaded
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# 2. Load the data
youtube <- read_excel("FDS_A3_Data.xlsx", sheet = "Youtube engagement data")
spotify <- read_excel("FDS_A3_Data.xlsx", sheet = "Spotify streaming data")
geo <- read_excel("FDS_A3_Data.xlsx", sheet = "Geographic distribution")
demo <- read_excel("FDS_A3_Data.xlsx", sheet = "Fan demographics and engagement")
concert <- read_excel("FDS_A3_Data.xlsx", sheet = "Concert market analysis")
# 3. Compute the Fan Engagement Index
engagement <- spotify %>%
inner_join(youtube, by = "song_title") %>%
mutate(
engagement_point = (completion_rate + like_rate + save_rate)/3
)
print(engagement)
## # A tibble: 6 × 10
## song_title daily_streams_indone…¹ like_rate save_rate playlist_adds_indone…²
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 FIGHT ME 28500 0.128 0.094 12400
## 2 BBB (Bitter… 23400 0.142 0.108 9800
## 3 George the … 31200 0.119 0.087 14500
## 4 Supernatural 26700 0.131 0.098 11200
## 5 Diamond 29800 0.135 0.102 13200
## 6 More than I… 21800 0.148 0.113 8700
## # ℹ abbreviated names: ¹​daily_streams_indonesia, ²​playlist_adds_indonesia
## # ℹ 5 more variables: views_indonesia <dbl>, completion_rate <dbl>,
## # likes_indonesia <dbl>, comments_indonesia <dbl>, engagement_point <dbl>
# 4. Find the correlation between Spotify and YouTube metrics
corr_matrix <- engagement %>%
select(completion_rate, like_rate, save_rate, daily_streams_indonesia, views_indonesia)
corrplot(cor(corr_matrix), method = "number")

# 5. Find the market segmentation with cluster analysis
geo_clust <- geo %>%
select(fan_engagement_score, youtube_views, spotify_streams)
geo_scaled <- scale(geo_clust)
hc <- hclust(dist(geo_scaled), method = "ward.D2")
# Create the hierarchial clustering dendrogram
plot(hc, labels = geo$city, main = "Hierarchical Clustering Dendrogram",
xlab = "", sub = "", cex = 0.8)

# Cut tree into clusters
cluster_cut <- cutree(hc, k = 3)
geo$cluster_hier <- as.factor(cluster_cut)
# Visualize clusters on a scatterplot
ggplot(geo, aes(x = youtube_views, y = spotify_streams, color = cluster_hier, label = city)) +
geom_point(size = 3) +
geom_text(nudge_y = 0.05, check_overlap = TRUE) +
labs(title = "Cities Grouped by Hierarchical Cluster", x = "YouTube Views", y = "Spotify Streams")

# 6. Concert Viability Regression (updated for latest dataset)
# Load the data
geo <- read_excel("FDS_A3_Data.xlsx", sheet = "Geographic distribution")
concert <- read_excel("FDS_A3_Data.xlsx", sheet = "Concert market analysis")
# Check column names to confirm exact spellings
colnames(geo)
## [1] "city" "region" "youtube_views"
## [4] "spotify_streams" "fan_engagement_score" "concert_viability"
colnames(concert)
## [1] "city" "estimated_fan_base" "avg_ticket_price_idr"
## [4] "venue_capacity" "projected_attendance"
# Create success_rate (attendance ÷ capacity)
concert <- concert %>%
mutate(success_rate = projected_attendance / venue_capacity)
# Merge fan engagement data to concert dataset
concert <- left_join(concert, geo %>% select(city, fan_engagement_score), by = "city")
# Verify the new columns
colnames(concert)
## [1] "city" "estimated_fan_base" "avg_ticket_price_idr"
## [4] "venue_capacity" "projected_attendance" "success_rate"
## [7] "fan_engagement_score"
# Run regression model using success rate as the dependent variable
model <- lm(success_rate ~ fan_engagement_score + estimated_fan_base, data = concert)
# Display summary
summary(model)
##
## Call:
## lm(formula = success_rate ~ fan_engagement_score + estimated_fan_base,
## data = concert)
##
## Residuals:
## 1 2 3 4
## -0.015683 -0.006027 0.045276 -0.023566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.712e-01 7.275e-01 0.785 0.576
## fan_engagement_score 1.473e-03 1.204e-02 0.122 0.922
## estimated_fan_base 1.409e-06 1.991e-06 0.708 0.608
##
## Residual standard error: 0.05374 on 1 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.8751, Adjusted R-squared: 0.6254
## F-statistic: 3.505 on 2 and 1 DF, p-value: 0.3533
# Visualize the relationship
ggplot(concert, aes(x = fan_engagement_score, y = success_rate,
size = estimated_fan_base, color = city)) +
geom_point(alpha = 0.8) +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = "Effect of Fan Engagement on Concert Success Rate",
x = "Fan Engagement Score",
y = "Concert Success Rate (Attendance ÷ Capacity)",
size = "Estimated Fan Base")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation: size.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# 7. Visualization of Fan Engagement vs City Market Strength
ggplot(geo, aes(x = youtube_views, y = spotify_streams, color = city, size = fan_engagement_score)) +
geom_point() +
labs(title = "Fan Engagement vs City Market Strength")
