#Task 1
#1.1
This Spotify churn dataset contains information about user behavior and engagement patterns on the music streaming platform. The variables include:
User Demographics: Information about user characteristics Usage Metrics: Various measures of how users interact with the platform Engagement Features: Metrics related to user activity and preferences Churn Indicator: Whether the user churned or remained active (if present)
Rows: 8,000 users
Columns (12):
user_id – unique identifier
gender – categorical (Male, Female, Other)
age – numeric
country – categorical (US, DE, UK, etc.)
subscription_type – categorical (Free, Premium, Family, Student)
listening_time – numeric (minutes per week/month?)
songs_played_per_day – numeric
skip_rate – numeric (0–1 proportion of skipped songs)
device_type – categorical (Desktop, Mobile, Web)
ads_listened_per_week – numeric
offline_listening – binary (0/1)
is_churned – binary target (0 = active, 1 = churned)
#General Setup
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(tidyverse) library(knitr) library(kableExtra) library(corrplot) library(ggplot2) library(dplyr) library(readr)
spotify_data <- read_csv(“C:/Users/frach/OneDrive/Desktop/IMB/R Bootcamp/Homework/spotify_churn_dataset.csv”)
cat(“Dataset Dimensions:”) cat(“Number of rows:”, nrow(spotify_data), “”) cat(“Number of columns:”, ncol(spotify_data), “”)
cat(“Variables in the dataset:”) str(spotify_data)
head(spotify_data) %>% kable(caption = “First 6 rows of Spotify Churn Dataset”) %>% kable_styling(bootstrap_options = c(“striped”, “hover”))
#1.2
missing_summary <- spotify_data %>% summarise_all(~sum(is.na(.))) %>% pivot_longer(everything(), names_to = “Variable”, values_to = “Missing_Count”) %>% mutate(Missing_Percentage = round(Missing_Count / nrow(spotify_data) * 100, 2))
missing_summary %>% filter(Missing_Count > 0) %>% kable(caption = “Variables with Missing Values”) %>% kable_styling(bootstrap_options = c(“striped”, “hover”))
spotify_clean <- spotify_data %>% drop_na()
if(nrow(spotify_clean) == 0) { spotify_clean <- spotify_data cat(“No missing values found - using complete dataset”) }
cat(“Original dataset rows:”, nrow(spotify_data), “”) cat(“Cleaned dataset rows:”, nrow(spotify_clean), “”) cat(“Rows removed:”, nrow(spotify_data) - nrow(spotify_clean), “”)
spotify_clean <- spotify_clean %>% mutate( # Categorize listening time listening_category = case_when( listening_time < 100 ~ “Low”, listening_time < 300 ~ “Medium”, TRUE ~ “High” ), # Categorize skip rate skip_behavior = case_when( skip_rate < 0.3 ~ “Low Skipper”, skip_rate < 0.6 ~ “Moderate Skipper”, TRUE ~ “High Skipper” ), # Engagement score (combination of listening time and songs played) engagement_score = (listening_time / max(listening_time)) * 0.5 + (songs_played_per_day / max(songs_played_per_day)) * 0.5 )
numeric_vars <- spotify_clean %>% select(age, listening_time, songs_played_per_day, skip_rate, ads_listened_per_week, offline_listening, is_churned)
cat(“variables selected for analysis:”, ncol(numeric_vars), “”) cat(“Column names:”, paste(names(numeric_vars), collapse = “,”), “”)
#2.3
desc_stats <- spotify_clean %>% select(age, listening_time, songs_played_per_day, skip_rate, ads_listened_per_week, offline_listening) %>% summarise_all(list( Mean = ~mean(., na.rm = TRUE), Median = ~median(., na.rm = TRUE), SD = ~sd(., na.rm = TRUE), Min = ~min(., na.rm = TRUE), Max = ~max(., na.rm = TRUE) ))
desc_stats_long <- desc_stats %>% pivot_longer(everything(), names_to = “stat”, values_to = “value”) %>% separate(stat, into = c(“variable”, “measure”), sep = “_“, extra =”merge”) %>% pivot_wider(names_from = measure, values_from = value)
print(desc_stats_long)
mean_listening <- mean(spotify_clean\(listening_time) sd_skip <- sd(spotify_clean\)skip_rate) median_songs <- median(spotify_clean$songs_played_per_day)
print(paste(“Mean Listening Time:”, round(mean_listening, 2))) print(paste(“SD of Skip Rate:”, round(sd_skip, 3))) print(paste(“Median Songs Per Day:”, median_songs))
quartiles <- quantile(spotify_clean$listening_time, probs = c(0.25, 0.5, 0.75)) print(“Listening Time Quartiles:”) print(quartiles)
Interpretation of Results:
Age (Mean: 37.7 years)
Users average around 38 years old This indicates a mature user base, not primarily young adults
Listening Time (Mean: 154.07 minutes)
Users listen for about 2.5 hours on average Quartiles (81, 154, 227) show:
25% listen less than 1.3 hours 50% listen less than 2.5 hours 25% listen more than 3.8 hours
Wide spread indicates diverse usage patterns
Songs Played Per Day (Median: 50)
Half of users play more than 50 songs daily, half play fewer This is quite high engagement (about 2-3 songs per hour if listening 2.5 hours)
Skip Rate (Mean: 0.30, SD: 0.174)
Users skip 30% of songs on average SD of 0.174 shows moderate variability This means most users skip between 13% and 47% of songs (mean ± 1 SD)
Ads Listened Per Week (Mean: 6.94)
Users hear about 7 ads weekly on average Likely indicates many free-tier users in the dataset
Three Sample Statistics Explained in Detail: 1. Mean (e.g., Listening Time = 154.07):
The mean is the arithmetic average - add all values and divide by count Good for understanding typical behavior Can be affected by extreme values (outliers)
Measures how spread out the data is from the mean Smaller SD = data clustered near mean Larger SD = data more spread out Skip rate SD of 0.174 means about 68% of users have skip rates between 12.6% and 47.4%
The middle value when data is sorted Not affected by outliers unlike mean Better than mean for skewed data Exactly half of users are above/below this value
#1.4
library(ggplot2)
ggplot(spotify_clean, aes(x = songs_played_per_day)) + geom_histogram(bins = 20, fill = “lawngreen”, color = “magenta2”, alpha = 0.8) + labs(title = “Distribution of Songs Played Per Day”, x = “Songs Played Per Day”, y = “Number of Users”) + theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14, face = “bold”))
ggplot(spotify_clean, aes(x = factor(is_churned), y = listening_time, fill = factor(is_churned))) + geom_boxplot(alpha = 0.7) + scale_fill_manual(values = c(“0” = “lightgreen”, “1” = “salmon”)) + scale_x_discrete(labels = c(“0” = “Active”, “1” = “Churned”)) + labs(title = “Listening Time by User Status”, x = “User Status”, y = “Listening Time (minutes)”) + theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14, face = “bold”), legend.position = “none”)
ggplot(spotify_clean, aes(x = listening_time, y = songs_played_per_day)) + geom_point(alpha = 0.5, color = “darkblue”, size = 1.5) + geom_smooth(method = “lm”, color = “slateblue4”, se = TRUE, alpha = 0.2) + labs(title = “Listening Time vs Songs Played Per Day”, x = “Listening Time (minutes)”, y = “Songs Played Per Day”) + theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14, face = “bold”))
#Task 2
#2.1
library(readxl) library(ggplot2)
mba_data <- read_excel(“C:/Users/frach/OneDrive/Desktop/IMB/R Bootcamp/Homework/Business School.xlsx”)
ggplot(mba_data, aes(x = Undergrad Degree)) +
geom_bar(fill = “darkgoldenrod3”, color = “gray0”) + labs(title =
“Distribution of Undergraduate Degrees”, x = “Undergraduate Degree”, y =
“Number of Students”) + theme_minimal() + theme(plot.title =
element_text(hjust = 0.5, size = 14, face = “bold”))
#2.2
library(dplyr)
summary_stats <- mba_data %>% summarise( count = n(), mean =
mean(Annual Salary, na.rm = TRUE), median =
median(Annual Salary, na.rm = TRUE), sd =
sd(Annual Salary, na.rm = TRUE), min =
min(Annual Salary, na.rm = TRUE), max =
max(Annual Salary, na.rm = TRUE) ) print(summary_stats)
ggplot(mba_data, aes(x = Annual Salary)) +
geom_histogram(bins = 20, fill = “steelblue”, color = “black”, alpha =
0.7) + labs(title = “Distribution of Annual Salary”, x = “Annual
Salary”, y = “Number of Students”) + theme_minimal() + theme(plot.title
= element_text(hjust = 0.5, size = 14, face = “bold”))
The mean (109,058) is higher than the median (103,500). This suggests the distribution is right-skewed (a few very high salaries pull the mean up).
The range is very wide (20,000 → 340,000), showing a big gap between lowest and highest salaries.
The relatively large standard deviation (41,501) confirms high variability in salaries.
Likely, most students cluster around ~100,000, with a long right tail due to some very high earners.
#2.3
t_test_result <- t.test(mba_data$MBA Grade, mu =
74)
print(t_test_result)
mean_mba <- mean(mba_data\(`MBA Grade`,
na.rm = TRUE)
sd_mba <- sd(mba_data\)MBA Grade, na.rm =
TRUE)
cohens_d <- (mean_mba - 74) / sd_mba cohens_d
One-sample t-test results
Sample mean: 76.04
Null hypothesis mean (µ₀): 74
t(99) = 2.66, p = 0.009
95% CI: [74.52, 77.56]
Since p < 0.05, we reject H₀. The mean MBA grade in our sample (76.04) is significantly higher than the previous year’s average (74).
Effect size (Cohen’s d)
Cohen’s d = 0.27
Interpretation (Cohen’s d benchmarks):
~0.2 = small effect
~0.5 = medium effect
~0.8 = large effect
The effect is statistically significant, but the practical effect is small.
This means that while students this year did score higher on average, the improvement over the historical average is modest in real-world terms. The current cohort of MBA students scored significantly above the historical mean of 74. However, the effect size is small, indicating the improvement is noticeable but not a large jump in performance.
#Task 3
apartments <- read_excel(“C:/Users/frach/OneDrive/Desktop/IMB/R Bootcamp/Homework/Apartments.xlsx”)
apartments\(Parking <- factor(apartments\)Parking, labels = c(“No”, “Yes”)) apartments\(Balcony <- factor(apartments\)Balcony, labels = c(“No”, “Yes”))
str(apartments)
library(readxl)
t_test_price <- t.test(apartments$Price, mu = 1900)
print(t_test_price)
mean_price <- mean(apartments\(Price, na.rm = TRUE) sd_price <- sd(apartments\)Price, na.rm = TRUE)
cohens_d_price <- (mean_price - 1900) / sd_price cohens_d_price
Since p < 0.05, we reject H₀. The average apartment price is significantly higher than €1900.
Cohen’s d = 0.31
So this is a small-to-moderate effect size. Prices are statistically higher than €1900, but the practical difference is modest (around €119 on average).
Conclusiom The data provides strong evidence that apartment prices are above €1900, but the effect size shows this difference is not very large in practical terms.
fit1 <- lm(Price ~ Age, data = apartments)
summary(fit1)
cor_coef <- cor(apartments\(Age, apartments\)Price, use = “complete.obs”)
cor_coef
Regression coefficient (β₁ = –8.975)
The slope is –8.98.
for every 1 year increase in apartment age, the expected price decreases by about €8.98 (holding all else constant).
Since the p-value = 0.034 (< 0.05), this effect is statistically significant.
Correlation coefficient (r = –0.230)
This measures the strength and direction of the linear relationship between Age and Price.
r = –0.23 → weak negative correlation. Older apartments tend to have lower prices, but the relationship is not very strong.
Coefficient of determination (R² = 0.053)
R² = 0.053 → 5.3% of the variation in apartment prices is explained by Age.
That means 94.7% of the price variation is due to other factors not captured in this simple regression (e.g., location, size, balcony, parking, etc.).
So while Age has a statistically significant effect, it is not a strong predictor of Price by itself.
Conclusion:
Apartment age has a small but significant negative effect on price.
The correlation is weak (–0.23), and only ~5% of price differences can be explained by age alone.
install.packages(“GGally”)
library(GGally)
df_sub <- apartments[, c(“Price”, “Age”, “Distance”)]
ggpairs(df_sub)
Correlation results from the matrix
Price – Age: r = –0.230 → weak negative correlation.
Price – Distance: r = –0.631 → strong negative correlation (apartments farther away tend to be cheaper).
Age – Distance: r = 0.043 → essentially no correlation.
Multicollinearity check
Multicollinearity is only a problem if the independent (predictor) variables are strongly correlated with each other.
Here the predictors are Age and Distance.
Their correlation is r = 0.043 → almost zero → no linear relationship. Conclusion: There is no evidence of multicollinearity between Age and Distance. We can safely include both as predictors in a multiple regression model without worrying about inflated standard errors or unstable estimates.
fit2 <- lm(Price ~ Age + Distance, data = apartments)
summary(fit2)
install.packages(“car”)
library(car)
vif(fit2)
A VIF = 1 means that the predictor is completely uncorrelated with the other predictors in the model.
The values are almost exactly 1, which indicates:
No multicollinearity at all between Age and Distance.
Each variable provides unique, independent information about apartment prices.
Conclusion
The regression model Price = f(Age, Distance) is free from multicollinearity problems.
We can interpret the coefficients of Age and Distance confidently, without worrying that they overlap or distort each other’s effects.
std_resid <- rstandard(fit2)
cooks_d <- cooks.distance(fit2)
diagnostics <- data.frame(std_resid, cooks_d)
n <- nrow(apartments) threshold_cooks <- 4/n
which(abs(std_resid) > 2 | cooks_d > threshold_cooks)
problematic <- which(abs(std_resid) > 2 | cooks_d > threshold_cooks)
apartments_clean <- apartments[-problematic, ]
fit2_clean <- lm(Price ~ Age + Distance, data = apartments_clean)
summary(fit2_clean)
library(ggplot2)
std_resid <- rstandard(fit2) std_fitted <- scale(fitted(fit2))
ggplot(data.frame(std_fitted, std_resid), aes(x = std_fitted, y = std_resid)) + geom_point(color = “steelblue”) + geom_hline(yintercept = 0, linetype = “dashed”, color = “red”) + labs(title = “Standardized Residuals vs. Standardized Fitted Values”, x = “Standardized Fitted Values”, y = “Standardized Residuals”) + theme_minimal()
Interpretation
Ideally, residuals should look like a random cloud with equal variance across all fitted values.
Here:
There’s no strong funnel shape, but there is a mild increase in variance as fitted values grow.
This suggests mild heteroskedasticity.
Implications
With heteroskedasticity, our coefficient estimates (β’s) remain unbiased, but the standard errors may be biased.
This can affect p-values and confidence intervals, making tests less reliable.
There is some indication of heteroskedasticity, but not severe.
std_resid <- rstandard(fit2)
hist(std_resid, breaks = 15, probability = TRUE, col = “lightblue”, border = “white”, main = “Histogram of Standardized Residuals”, xlab = “Standardized Residuals”) lines(density(std_resid), col = “red”, lwd = 2)
qqnorm(std_resid, main = “Q-Q Plot of Standardized Residuals”) qqline(std_resid, col = “red”, lwd = 2)
shapiro.test(std_resid)
Q-Q Plot
The points follow the straight line fairly well in the middle.
At the tails (both left and right ends), the points deviate away from the line. This indicates mild non-normality — the residuals have heavier tails than a perfect normal distribution.
W = 0.953, p = 0.0037.
Since p < 0.05, we reject the null hypothesis of normality. Statistically, the residuals are not normally distributed.
The assumption of normal residuals is not fully satisfied.
However:
With a sample size of ~85, regression estimates (coefficients) remain unbiased due to the Central Limit Theorem.
The main consequence is that p-values and confidence intervals may be less reliable, especially for small effects.
Conclusison The residuals deviate from normality (as confirmed by Shapiro–Wilk test, p < 0.01). This suggests that while the regression coefficients are unbiased, inference (tests and confidence intervals) should be interpreted with caution. Using robust standard errors or transforming the dependent variable could mitigate the issue.
std_resid <- rstandard(fit2) cooks_d <- cooks.distance(fit2)
n <- nrow(apartments) threshold_cooks <- 4/n
problematic <- which(abs(std_resid) > 2 | cooks_d > threshold_cooks)
apartments_clean <- apartments[-problematic, ]
fit2_clean <- lm(Price ~ Age + Distance, data = apartments_clean)
summary(fit2_clean)
. Intercept (2502.47)
Interpretation: Predicted apartment price is €2502 when both Age = 0 and Distance = 0.
This is a baseline, not very realistic (since an apartment with 0 years age and distance doesn’t exist), but it anchors the regression line.
For each additional year of Age, the apartment price decreases by about €8.67, holding Distance constant.
The effect is statistically significant (p < 0.01). Older apartments tend to be slightly cheaper.
For each additional unit of Distance (likely km from city center), the apartment price decreases by about €24.06, holding Age constant.
The effect is highly significant. Distance is a stronger predictor of price than Age.
About 53.6% of the variation in apartment prices is explained by Age and Distance together.
This is a substantial improvement compared to the earlier simple regression (~5% explained).
Removing outliers also reduced residual spread (standard error ~256.8, down from ~370).
Conclusion
The refined model shows that both Age and Distance significantly lower apartment prices.
Distance has a much stronger effect than Age.
The model explains over half of the price variation, which is quite good for only two predictors.
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = apartments_clean)
summary(fit3)
anova(fit2_clean, fit3)
Interpretation
Adding Parking and Balcony reduced residual variance (RSS dropped).
But the p-value = 0.1135 (> 0.05), so the improvement is not statistically significant.
In other words:
fit3 does explain more variance, but not enough to justify the extra complexity.
fit2_clean is more parsimonious (simpler, equally good in explanatory power).
Conclusion
fit3 does not fit the data significantly better than fit2_clean.
Distance and Age remain the strongest, reliable predictors.
Parking adds some improvement, but overall the joint effect of Parking + Balcony is too small to justify keeping both.
summary(fit3)
Call: lm(formula = Price ~ Age + Distance + Parking + Balcony, data = apartments_clean)
Residuals: Min 1Q Median 3Q Max -390.93 -198.19 -53.64 186.73 518.34
Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 2393.316 93.930 25.480 < 2e-16 Age -7.970
3.191 -2.498 0.0147
Distance -21.961 2.830 -7.762 3.39e-11 ParkingYes 128.700
60.801 2.117 0.0376
BalconyYes 6.032 57.307 0.105 0.9165
— Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05
‘.’ 0.1 ‘ ’ 1
Residual standard error: 252.7 on 75 degrees of freedom Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389 F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13
ParkingYes: Apartments with parking cost about €128.70 more than those without, holding other variables constant. This effect is significant (p = 0.038). BalconyYes: Apartments with a balcony cost about €6 more than those without, holding other variables constant. This effect is not significant (p = 0.917). F-statistic tests the joint null hypothesis that all slope coefficients are zero (Age, Distance, Parking, Balcony). Since p < 0.001, we reject H0 and conclude that at least one predictor significantly explains apartment price.
apartments$ID <- seq_len(nrow(apartments))
apartments_clean\(ID <- as.integer(rownames(apartments_clean)) # keep original row IDs apartments_clean\)Fitted <- fitted(fit3) apartments_clean$Residual <- resid(fit3)
if (2 %in% apartments_clean\(ID) { # If yes, extract fitted value and residual result_id2 <- apartments_clean[apartments_clean\)ID == 2, c(“ID”, “Price”, “Fitted”, “Residual”)] } else { # If ID = 2 was removed as outlier, compute prediction and residual manually row2 <- apartments[apartments\(ID == 2, ] fitted_id2 <- predict(fit3, newdata = row2) residual_id2 <- row2\)Price - fitted_id2 result_id2 <- data.frame(ID = 2, Price = row2$Price, Fitted = fitted_id2, Residual = residual_id2) }
print(result_id2)
print(result_id2) # A tibble: 1 × 4 ID Price Fitted Residual
1 2 2800 2357. 443.