#Task 1

#1.1

This Spotify churn dataset contains information about user behavior and engagement patterns on the music streaming platform. The variables include:

User Demographics: Information about user characteristics Usage Metrics: Various measures of how users interact with the platform Engagement Features: Metrics related to user activity and preferences Churn Indicator: Whether the user churned or remained active (if present)

Rows: 8,000 users

Columns (12):

user_id – unique identifier

gender – categorical (Male, Female, Other)

age – numeric

country – categorical (US, DE, UK, etc.)

subscription_type – categorical (Free, Premium, Family, Student)

listening_time – numeric (minutes per week/month?)

songs_played_per_day – numeric

skip_rate – numeric (0–1 proportion of skipped songs)

device_type – categorical (Desktop, Mobile, Web)

ads_listened_per_week – numeric

offline_listening – binary (0/1)

is_churned – binary target (0 = active, 1 = churned)

#General Setup

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

Load necessary libraries

library(tidyverse) library(knitr) library(kableExtra) library(corrplot) library(ggplot2) library(dplyr) library(readr)

Load the dataset

spotify_data <- read_csv(“C:/Users/frach/OneDrive/Desktop/IMB/R Bootcamp/Homework/spotify_churn_dataset.csv”)

basic information about the dataset

cat(“Dataset Dimensions:”) cat(“Number of rows:”, nrow(spotify_data), “”) cat(“Number of columns:”, ncol(spotify_data), “”)

Display column names and types

cat(“Variables in the dataset:”) str(spotify_data)

Display first few rows

head(spotify_data) %>% kable(caption = “First 6 rows of Spotify Churn Dataset”) %>% kable_styling(bootstrap_options = c(“striped”, “hover”))

#1.2

Check for missing values

missing_summary <- spotify_data %>% summarise_all(~sum(is.na(.))) %>% pivot_longer(everything(), names_to = “Variable”, values_to = “Missing_Count”) %>% mutate(Missing_Percentage = round(Missing_Count / nrow(spotify_data) * 100, 2))

missing_summary %>% filter(Missing_Count > 0) %>% kable(caption = “Variables with Missing Values”) %>% kable_styling(bootstrap_options = c(“striped”, “hover”))

Create spotify_clean by removing rows with missing values

spotify_clean <- spotify_data %>% drop_na()

if(nrow(spotify_clean) == 0) { spotify_clean <- spotify_data cat(“No missing values found - using complete dataset”) }

cat(“Original dataset rows:”, nrow(spotify_data), “”) cat(“Cleaned dataset rows:”, nrow(spotify_clean), “”) cat(“Rows removed:”, nrow(spotify_data) - nrow(spotify_clean), “”)

Create new variable

spotify_clean <- spotify_clean %>% mutate( # Categorize listening time listening_category = case_when( listening_time < 100 ~ “Low”, listening_time < 300 ~ “Medium”, TRUE ~ “High” ), # Categorize skip rate skip_behavior = case_when( skip_rate < 0.3 ~ “Low Skipper”, skip_rate < 0.6 ~ “Moderate Skipper”, TRUE ~ “High Skipper” ), # Engagement score (combination of listening time and songs played) engagement_score = (listening_time / max(listening_time)) * 0.5 + (songs_played_per_day / max(songs_played_per_day)) * 0.5 )

Subset for analyisis with only numeric values

numeric_vars <- spotify_clean %>% select(age, listening_time, songs_played_per_day, skip_rate, ads_listened_per_week, offline_listening, is_churned)

cat(“variables selected for analysis:”, ncol(numeric_vars), “”) cat(“Column names:”, paste(names(numeric_vars), collapse = “,”), “”)

#2.3

Calculate descriptive statistics for numeric variables

desc_stats <- spotify_clean %>% select(age, listening_time, songs_played_per_day, skip_rate, ads_listened_per_week, offline_listening) %>% summarise_all(list( Mean = ~mean(., na.rm = TRUE), Median = ~median(., na.rm = TRUE), SD = ~sd(., na.rm = TRUE), Min = ~min(., na.rm = TRUE), Max = ~max(., na.rm = TRUE) ))

Reshape for better display

desc_stats_long <- desc_stats %>% pivot_longer(everything(), names_to = “stat”, values_to = “value”) %>% separate(stat, into = c(“variable”, “measure”), sep = “_“, extra =”merge”) %>% pivot_wider(names_from = measure, values_from = value)

print(desc_stats_long)

Examples

mean_listening <- mean(spotify_clean\(listening_time) sd_skip <- sd(spotify_clean\)skip_rate) median_songs <- median(spotify_clean$songs_played_per_day)

print(paste(“Mean Listening Time:”, round(mean_listening, 2))) print(paste(“SD of Skip Rate:”, round(sd_skip, 3))) print(paste(“Median Songs Per Day:”, median_songs))

Quartiles for listening_time

quartiles <- quantile(spotify_clean$listening_time, probs = c(0.25, 0.5, 0.75)) print(“Listening Time Quartiles:”) print(quartiles)

Interpretation of Results:

Age (Mean: 37.7 years)

Users average around 38 years old This indicates a mature user base, not primarily young adults

Listening Time (Mean: 154.07 minutes)

Users listen for about 2.5 hours on average Quartiles (81, 154, 227) show:

25% listen less than 1.3 hours 50% listen less than 2.5 hours 25% listen more than 3.8 hours

Wide spread indicates diverse usage patterns

Songs Played Per Day (Median: 50)

Half of users play more than 50 songs daily, half play fewer This is quite high engagement (about 2-3 songs per hour if listening 2.5 hours)

Skip Rate (Mean: 0.30, SD: 0.174)

Users skip 30% of songs on average SD of 0.174 shows moderate variability This means most users skip between 13% and 47% of songs (mean ± 1 SD)

Ads Listened Per Week (Mean: 6.94)

Users hear about 7 ads weekly on average Likely indicates many free-tier users in the dataset

Three Sample Statistics Explained in Detail: 1. Mean (e.g., Listening Time = 154.07):

The mean is the arithmetic average - add all values and divide by count Good for understanding typical behavior Can be affected by extreme values (outliers)

  1. Standard Deviation (e.g., Skip Rate SD = 0.174):

Measures how spread out the data is from the mean Smaller SD = data clustered near mean Larger SD = data more spread out Skip rate SD of 0.174 means about 68% of users have skip rates between 12.6% and 47.4%

  1. Median (e.g., Songs Per Day = 50):

The middle value when data is sorted Not affected by outliers unlike mean Better than mean for skewed data Exactly half of users are above/below this value

#1.4

library(ggplot2)

These are the 6 variables from Task 1.3: age, listening_time, songs_played_per_day, skip_rate, ads_listened_per_week, offline_listening

1. HISTOGRAM - Songs played per day

ggplot(spotify_clean, aes(x = songs_played_per_day)) + geom_histogram(bins = 20, fill = “lawngreen”, color = “magenta2”, alpha = 0.8) + labs(title = “Distribution of Songs Played Per Day”, x = “Songs Played Per Day”, y = “Number of Users”) + theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14, face = “bold”))

2. BOXPLOT - Listening time by churned status

ggplot(spotify_clean, aes(x = factor(is_churned), y = listening_time, fill = factor(is_churned))) + geom_boxplot(alpha = 0.7) + scale_fill_manual(values = c(“0” = “lightgreen”, “1” = “salmon”)) + scale_x_discrete(labels = c(“0” = “Active”, “1” = “Churned”)) + labs(title = “Listening Time by User Status”, x = “User Status”, y = “Listening Time (minutes)”) + theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14, face = “bold”), legend.position = “none”)

3. SCATTERPLOT - Listening time vs Songs played

ggplot(spotify_clean, aes(x = listening_time, y = songs_played_per_day)) + geom_point(alpha = 0.5, color = “darkblue”, size = 1.5) + geom_smooth(method = “lm”, color = “slateblue4”, se = TRUE, alpha = 0.2) + labs(title = “Listening Time vs Songs Played Per Day”, x = “Listening Time (minutes)”, y = “Songs Played Per Day”) + theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14, face = “bold”))

#Task 2

#2.1

Load libraries

library(readxl) library(ggplot2)

Read Excel file

mba_data <- read_excel(“C:/Users/frach/OneDrive/Desktop/IMB/R Bootcamp/Homework/Business School.xlsx”)

Plot the distribution of undergrad degrees

ggplot(mba_data, aes(x = Undergrad Degree)) + geom_bar(fill = “darkgoldenrod3”, color = “gray0”) + labs(title = “Distribution of Undergraduate Degrees”, x = “Undergraduate Degree”, y = “Number of Students”) + theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14, face = “bold”))

#2.2

library(dplyr)

Descriptive statistics of Annual Salary

summary_stats <- mba_data %>% summarise( count = n(), mean = mean(Annual Salary, na.rm = TRUE), median = median(Annual Salary, na.rm = TRUE), sd = sd(Annual Salary, na.rm = TRUE), min = min(Annual Salary, na.rm = TRUE), max = max(Annual Salary, na.rm = TRUE) ) print(summary_stats)

Histogram of Annual Salary

ggplot(mba_data, aes(x = Annual Salary)) + geom_histogram(bins = 20, fill = “steelblue”, color = “black”, alpha = 0.7) + labs(title = “Distribution of Annual Salary”, x = “Annual Salary”, y = “Number of Students”) + theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14, face = “bold”))

The mean (109,058) is higher than the median (103,500). This suggests the distribution is right-skewed (a few very high salaries pull the mean up).

The range is very wide (20,000 → 340,000), showing a big gap between lowest and highest salaries.

The relatively large standard deviation (41,501) confirms high variability in salaries.

Likely, most students cluster around ~100,000, with a long right tail due to some very high earners.

#2.3

One-sample t-test against mu = 74

t_test_result <- t.test(mba_data$MBA Grade, mu = 74)

print(t_test_result)

Compute Cohen’s d

mean_mba <- mean(mba_data\(`MBA Grade`, na.rm = TRUE) sd_mba <- sd(mba_data\)MBA Grade, na.rm = TRUE)

cohens_d <- (mean_mba - 74) / sd_mba cohens_d

One-sample t-test results

Sample mean: 76.04

Null hypothesis mean (µ₀): 74

t(99) = 2.66, p = 0.009

95% CI: [74.52, 77.56]

Since p < 0.05, we reject H₀. The mean MBA grade in our sample (76.04) is significantly higher than the previous year’s average (74).

Effect size (Cohen’s d)

Cohen’s d = 0.27

Interpretation (Cohen’s d benchmarks):

~0.2 = small effect

~0.5 = medium effect

~0.8 = large effect

The effect is statistically significant, but the practical effect is small.

This means that while students this year did score higher on average, the improvement over the historical average is modest in real-world terms. The current cohort of MBA students scored significantly above the historical mean of 74. However, the effect size is small, indicating the improvement is noticeable but not a large jump in performance.

#Task 3

apartments <- read_excel(“C:/Users/frach/OneDrive/Desktop/IMB/R Bootcamp/Homework/Apartments.xlsx”)

Convert categorical variables (0/1) into factors

apartments\(Parking <- factor(apartments\)Parking, labels = c(“No”, “Yes”)) apartments\(Balcony <- factor(apartments\)Balcony, labels = c(“No”, “Yes”))

Check structure

str(apartments)

Change categorical variables into factors.

Load libraries

library(readxl)

Read the Excel file

Test the hypothesis H0: Mu_Price = 1900 eur. What can you conclude?

One-sample t-test for Price

t_test_price <- t.test(apartments$Price, mu = 1900)

print(t_test_price)

Effect size (Cohen’s d)

mean_price <- mean(apartments\(Price, na.rm = TRUE) sd_price <- sd(apartments\)Price, na.rm = TRUE)

cohens_d_price <- (mean_price - 1900) / sd_price cohens_d_price

Since p < 0.05, we reject H₀. The average apartment price is significantly higher than €1900.

Cohen’s d = 0.31

So this is a small-to-moderate effect size. Prices are statistically higher than €1900, but the practical difference is modest (around €119 on average).

Conclusiom The data provides strong evidence that apartment prices are above €1900, but the effect size shows this difference is not very large in practical terms.

Estimate the simple regression function: Price = f(Age). Save results in object fit1 and explain the estimate of regression coefficient, coefficient of correlation and coefficient of determination.

Simple linear regression: Price ~ Age

fit1 <- lm(Price ~ Age, data = apartments)

summary(fit1)

Correlation coefficient (r)

cor_coef <- cor(apartments\(Age, apartments\)Price, use = “complete.obs”)

cor_coef

Regression coefficient (β₁ = –8.975)

The slope is –8.98.

for every 1 year increase in apartment age, the expected price decreases by about €8.98 (holding all else constant).

Since the p-value = 0.034 (< 0.05), this effect is statistically significant.

Correlation coefficient (r = –0.230)

This measures the strength and direction of the linear relationship between Age and Price.

r = –0.23 → weak negative correlation. Older apartments tend to have lower prices, but the relationship is not very strong.

Coefficient of determination (R² = 0.053)

R² = 0.053 → 5.3% of the variation in apartment prices is explained by Age.

That means 94.7% of the price variation is due to other factors not captured in this simple regression (e.g., location, size, balcony, parking, etc.).

So while Age has a statistically significant effect, it is not a strong predictor of Price by itself.

Conclusion:

Apartment age has a small but significant negative effect on price.

The correlation is weak (–0.23), and only ~5% of price differences can be explained by age alone.

Show the scateerplot matrix between Price, Age and Distance. Based on the matrix determine if there is potential problem with multicolinearity.

Load libraries

install.packages(“GGally”)

library(GGally)

df_sub <- apartments[, c(“Price”, “Age”, “Distance”)]

Scatterplot matrix

ggpairs(df_sub)

Correlation results from the matrix

Price – Age: r = –0.230 → weak negative correlation.

Price – Distance: r = –0.631 → strong negative correlation (apartments farther away tend to be cheaper).

Age – Distance: r = 0.043 → essentially no correlation.

Multicollinearity check

Multicollinearity is only a problem if the independent (predictor) variables are strongly correlated with each other.

Here the predictors are Age and Distance.

Their correlation is r = 0.043 → almost zero → no linear relationship. Conclusion: There is no evidence of multicollinearity between Age and Distance. We can safely include both as predictors in a multiple regression model without worrying about inflated standard errors or unstable estimates.

Estimate the multiple regression function: Price = f(Age, Distance). Save it in object named fit2.

Multiple regression: Price ~ Age + Distance

fit2 <- lm(Price ~ Age + Distance, data = apartments)

Show summary of regression

summary(fit2)

Chech the multicolinearity with VIF statistics. Explain the findings.

Install if needed

install.packages(“car”)

Load library

library(car)

Compute VIF for the multiple regression model

vif(fit2)

A VIF = 1 means that the predictor is completely uncorrelated with the other predictors in the model.

The values are almost exactly 1, which indicates:

No multicollinearity at all between Age and Distance.

Each variable provides unique, independent information about apartment prices.

Conclusion

The regression model Price = f(Age, Distance) is free from multicollinearity problems.

We can interpret the coefficients of Age and Distance confidently, without worrying that they overlap or distort each other’s effects.

Calculate standardized residuals and Cooks Distances for model fit2. Remove any potentially problematic units (outliers or units with high influence).

Standardized residuals

std_resid <- rstandard(fit2)

Cook’s distance

cooks_d <- cooks.distance(fit2)

Combine into data frame for inspection

diagnostics <- data.frame(std_resid, cooks_d)

n <- nrow(apartments) threshold_cooks <- 4/n

which(abs(std_resid) > 2 | cooks_d > threshold_cooks)

Identify problematic units

problematic <- which(abs(std_resid) > 2 | cooks_d > threshold_cooks)

Remove them from dataset

apartments_clean <- apartments[-problematic, ]

Refit the model

fit2_clean <- lm(Price ~ Age + Distance, data = apartments_clean)

Check new model summary

summary(fit2_clean)

Check for potential heteroskedasticity with scatterplot between standarized residuals and standrdized fitted values. Explain the findings.

library(ggplot2)

Standardized residuals and fitted values

std_resid <- rstandard(fit2) std_fitted <- scale(fitted(fit2))

Scatterplot

ggplot(data.frame(std_fitted, std_resid), aes(x = std_fitted, y = std_resid)) + geom_point(color = “steelblue”) + geom_hline(yintercept = 0, linetype = “dashed”, color = “red”) + labs(title = “Standardized Residuals vs. Standardized Fitted Values”, x = “Standardized Fitted Values”, y = “Standardized Residuals”) + theme_minimal()

Interpretation

Ideally, residuals should look like a random cloud with equal variance across all fitted values.

Here:

There’s no strong funnel shape, but there is a mild increase in variance as fitted values grow.

This suggests mild heteroskedasticity.

Implications

With heteroskedasticity, our coefficient estimates (β’s) remain unbiased, but the standard errors may be biased.

This can affect p-values and confidence intervals, making tests less reliable.

There is some indication of heteroskedasticity, but not severe.

Are standardized residuals ditributed normally? Show the graph and formally test it. Explain the findings.

Standardized residuals

std_resid <- rstandard(fit2)

Histogram + density

hist(std_resid, breaks = 15, probability = TRUE, col = “lightblue”, border = “white”, main = “Histogram of Standardized Residuals”, xlab = “Standardized Residuals”) lines(density(std_resid), col = “red”, lwd = 2)

Q-Q plot

qqnorm(std_resid, main = “Q-Q Plot of Standardized Residuals”) qqline(std_resid, col = “red”, lwd = 2)

shapiro.test(std_resid)

Q-Q Plot

The points follow the straight line fairly well in the middle.

At the tails (both left and right ends), the points deviate away from the line. This indicates mild non-normality — the residuals have heavier tails than a perfect normal distribution.

  1. Shapiro–Wilk test

W = 0.953, p = 0.0037.

Since p < 0.05, we reject the null hypothesis of normality. Statistically, the residuals are not normally distributed.

  1. What this means

The assumption of normal residuals is not fully satisfied.

However:

With a sample size of ~85, regression estimates (coefficients) remain unbiased due to the Central Limit Theorem.

The main consequence is that p-values and confidence intervals may be less reliable, especially for small effects.

Conclusison The residuals deviate from normality (as confirmed by Shapiro–Wilk test, p < 0.01). This suggests that while the regression coefficients are unbiased, inference (tests and confidence intervals) should be interpreted with caution. Using robust standard errors or transforming the dependent variable could mitigate the issue.

Estimate the fit2 again without potentially excluded units and show the summary of the model. Explain all coefficients.

Standardized residuals and Cook’s Distance

std_resid <- rstandard(fit2) cooks_d <- cooks.distance(fit2)

n <- nrow(apartments) threshold_cooks <- 4/n

Identify problematic cases

problematic <- which(abs(std_resid) > 2 | cooks_d > threshold_cooks)

Remove them

apartments_clean <- apartments[-problematic, ]

Refit the model

fit2_clean <- lm(Price ~ Age + Distance, data = apartments_clean)

Show summary

summary(fit2_clean)

. Intercept (2502.47)

Interpretation: Predicted apartment price is €2502 when both Age = 0 and Distance = 0.

This is a baseline, not very realistic (since an apartment with 0 years age and distance doesn’t exist), but it anchors the regression line.

  1. Age coefficient (–8.67, p = 0.009)

For each additional year of Age, the apartment price decreases by about €8.67, holding Distance constant.

The effect is statistically significant (p < 0.01). Older apartments tend to be slightly cheaper.

  1. Distance coefficient (–24.06, p < 0.001)

For each additional unit of Distance (likely km from city center), the apartment price decreases by about €24.06, holding Age constant.

The effect is highly significant. Distance is a stronger predictor of price than Age.

  1. Model fit (R² = 0.536, Adjusted R² = 0.524)

About 53.6% of the variation in apartment prices is explained by Age and Distance together.

This is a substantial improvement compared to the earlier simple regression (~5% explained).

Removing outliers also reduced residual spread (standard error ~256.8, down from ~370).

Conclusion

The refined model shows that both Age and Distance significantly lower apartment prices.

Distance has a much stronger effect than Age.

The model explains over half of the price variation, which is quite good for only two predictors.

Estimate the linear regression function Price = f(Age, Distance, Parking and Balcony). Be careful to correctly include categorical variables. Save the object named fit3.

Multiple regression with categorical variables included

fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = apartments_clean)

Show summary

summary(fit3)

With function anova check if model fit3 fits data better than model fit2.

Compare fit2 (without Parking and Balcony) vs fit3 (with Parking and Balcony)

anova(fit2_clean, fit3)

Interpretation

Adding Parking and Balcony reduced residual variance (RSS dropped).

But the p-value = 0.1135 (> 0.05), so the improvement is not statistically significant.

In other words:

fit3 does explain more variance, but not enough to justify the extra complexity.

fit2_clean is more parsimonious (simpler, equally good in explanatory power).

Conclusion

fit3 does not fit the data significantly better than fit2_clean.

Distance and Age remain the strongest, reliable predictors.

Parking adds some improvement, but overall the joint effect of Parking + Balcony is too small to justify keeping both.

Show the results of fit3 and explain regression coefficient for both categorical variables. Can you write down the hypothesis which is being tested with F-statistics, shown at the bottom of the output?

summary(fit3)

Call: lm(formula = Price ~ Age + Distance + Parking + Balcony, data = apartments_clean)

Residuals: Min 1Q Median 3Q Max -390.93 -198.19 -53.64 186.73 518.34

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 2393.316 93.930 25.480 < 2e-16 Age -7.970 3.191 -2.498 0.0147
Distance -21.961 2.830 -7.762 3.39e-11
ParkingYes 128.700 60.801 2.117 0.0376
BalconyYes 6.032 57.307 0.105 0.9165
— Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 252.7 on 75 degrees of freedom Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389 F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13

ParkingYes: Apartments with parking cost about €128.70 more than those without, holding other variables constant. This effect is significant (p = 0.038). BalconyYes: Apartments with a balcony cost about €6 more than those without, holding other variables constant. This effect is not significant (p = 0.917). F-statistic tests the joint null hypothesis that all slope coefficients are zero (Age, Distance, Parking, Balcony). Since p < 0.001, we reject H0 and conclude that at least one predictor significantly explains apartment price.

Save fitted values and claculate the residual for apartment ID2.

Add row numbers as IDs

apartments$ID <- seq_len(nrow(apartments))

Add fitted values and residuals to the cleaned dataset

apartments_clean\(ID <- as.integer(rownames(apartments_clean)) # keep original row IDs apartments_clean\)Fitted <- fitted(fit3) apartments_clean$Residual <- resid(fit3)

Check if ID = 2 exists in cleaned dataset

if (2 %in% apartments_clean\(ID) { # If yes, extract fitted value and residual result_id2 <- apartments_clean[apartments_clean\)ID == 2, c(“ID”, “Price”, “Fitted”, “Residual”)] } else { # If ID = 2 was removed as outlier, compute prediction and residual manually row2 <- apartments[apartments\(ID == 2, ] fitted_id2 <- predict(fit3, newdata = row2) residual_id2 <- row2\)Price - fitted_id2 result_id2 <- data.frame(ID = 2, Price = row2$Price, Fitted = fitted_id2, Residual = residual_id2) }

print(result_id2)

print(result_id2) # A tibble: 1 × 4 ID Price Fitted Residual 1 2 2800 2357. 443.