This dataset contains data on 50,000 Spotify users. It covers the users’ general information, their listening activity, and their behaviour towards ads. The source of the data is the following link: Spotify User Behavior And Pattern
This dataset can help Spotify devise a company strategy, because it reveals trends and habits among users. The result, for example, could be:
signup_date: The date the user first registered (format: YYYY-MM-DD).
subscription_type: The tier of the user’s account. Categories include:
subscription_status: Whether the account is currently Active or Inactive.
ad_interaction: Indicates if the user interacts with advertisements (Yes/No).
ad_conversion_to_subscription: Indicates if an ad interaction specifically led the user to upgrade to a premium plan (Yes/No).
# Function for the Mode (as R doesn't have a built-in one; base `mode()`
# reports an object's storage mode, not the statistical mode).
#
# Args:
#   v:     An atomic vector or factor.
#   na.rm: If TRUE, missing values are dropped before counting. The default,
#          FALSE, counts NA like any other value, so NA itself can be
#          returned when it is the most frequent entry.
#
# Returns:
#   A length-one vector of the same type as `v` holding its most frequent
#   value. Ties are resolved in favour of the value that occurs first in `v`.
#   If nothing is left to count, the result is NA.
get_mode <- function(v, na.rm = FALSE) {
  if (isTRUE(na.rm)) {
    v <- v[!is.na(v)]
  }
  uniqv <- unique(v)
  # match() maps every element to the index of its unique value; tabulate()
  # then counts occurrences per index, and which.max() picks the first maximum.
  counts <- tabulate(match(v, uniqv))
  uniqv[which.max(counts)]
}
# Calculations for Age
# Missing ages are excluded from every statistic (including the mode) so that
# all six figures describe the same set of observations.
mean_age <- mean(df$age, na.rm = TRUE)
median_age <- median(df$age, na.rm = TRUE)
mode_age <- get_mode(df$age[!is.na(df$age)])
sd_age <- sd(df$age, na.rm = TRUE)
var_age <- var(df$age, na.rm = TRUE)
range_age <- diff(range(df$age, na.rm = TRUE))

# Presenting the results
# The `##` lines are the output recorded when the report was rendered.
print(paste("Mean Age:", round(mean_age, 2)))
## [1] "Mean Age: 38.01"
print(paste("Median Age:", median_age))
## [1] "Median Age: 38"
print(paste("Mode Age:", mode_age))
## [1] "Mode Age: 47"
print(paste("Standard Deviation (Age):", round(sd_age, 2)))
## [1] "Standard Deviation (Age): 12.98"
print(paste("Variance (Age):", round(var_age, 2)))
## [1] "Variance (Age): 168.61"
print(paste("Range (Age):", range_age))
## [1] "Range (Age): 44"
# Pearson correlation between user age and average weekly listening hours.
# Only rows where both values are present enter the calculation.
cor_value <- cor(
  x = df$age,
  y = df$avg_listening_hours_per_week,
  use = "complete.obs",
  method = "pearson"
)
print(
  paste(
    "Correlation Coefficient (Age vs Hours):",
    round(cor_value, digits = 4)
  )
)
## [1] "Correlation Coefficient (Age vs Hours): -0.0023"
# Histogram: distribution of user age, counted in bins two years wide
ggplot(data = df, mapping = aes(x = age)) +
  geom_histogram(binwidth = 2, color = "white", fill = "#1DB954") +
  theme_minimal() +
  labs(x = "Age", y = "Frequency", title = "Distribution of User Age")
# Boxplot
# One box per subscription type, showing the spread of weekly listening hours
ggplot(
  data = df,
  mapping = aes(
    x = subscription_type,
    y = avg_listening_hours_per_week,
    fill = subscription_type
  )
) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    x = "Subscription",
    y = "Hours/Week",
    title = "Listening Hours by Subscription Type"
  )
# Scatterplot
# Age against weekly listening hours: semi-transparent points with a linear
# regression line drawn on top
ggplot(data = df, mapping = aes(x = age, y = avg_listening_hours_per_week)) +
  geom_point(color = "blue", alpha = 0.1) +
  geom_smooth(color = "red", method = "lm") +
  theme_light() +
  labs(
    x = "Age",
    y = "Weekly Hours",
    title = "Relationship: Age vs. Weekly Hours"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Bar Chart: number of users per favourite genre, drawn as horizontal bars
ggplot(data = df, mapping = aes(x = favorite_genre, fill = favorite_genre)) +
  geom_bar() +
  labs(
    x = "Genre",
    y = "Number of Users",
    title = "Most Popular Music Genres"
  ) +
  coord_flip() +
  # theme_minimal() must come before theme(); a complete theme added later
  # would reset the legend setting.
  theme_minimal() +
  theme(legend.position = "none")
## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Europe/Athens
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_4.0.2 dplyr_1.2.0
##
## loaded via a namespace (and not attached):
## [1] Matrix_1.7-4 gtable_0.3.6 jsonlite_2.0.0 compiler_4.5.3
## [5] tidyselect_1.2.1 jquerylib_0.1.4 splines_4.5.3 scales_1.4.0
## [9] yaml_2.3.12 fastmap_1.2.0 lattice_0.22-9 R6_2.6.1
## [13] labeling_0.4.3 generics_0.1.4 knitr_1.51 tibble_3.3.1
## [17] bslib_0.10.0 pillar_1.11.1 RColorBrewer_1.1-3 rlang_1.1.7
## [21] cachem_1.1.0 xfun_0.57 sass_0.4.10 S7_0.2.1
## [25] cli_3.6.5 withr_3.0.2 magrittr_2.0.4 mgcv_1.9-4
## [29] digest_0.6.39 grid_4.5.3 lifecycle_1.0.5 nlme_3.1-168
## [33] vctrs_0.7.2 evaluate_1.0.5 glue_1.8.0 farver_2.1.2
## [37] rmarkdown_2.30 tools_4.5.3 pkgconfig_2.0.3 htmltools_0.5.9