# Load library for Excel
library(readxl)
library(dplyr) # for data manipulation
library(tidyr)
library(ggplot2)
library(forcats)
# Import the dataset
survey <- read_excel(
"ScreenTime vs MentalWellness.xlsx",
sheet = "Sheet 1 - ScreenTime vs MentalW",
skip = 1
)
# Keep only the 5 variables
survey_small <- survey %>%
transmute(
Age = as.numeric(age),
Gender = factor(gender),
Screen_time_hours = as.numeric(screen_time_hours),
Sleep_hours = as.numeric(sleep_hours),
Mental_wellness_index_0_100 = as.numeric(mental_wellness_index_0_100)
)
# Take a random sample of 20 units
set.seed(123)
survey20 <- dplyr::sample_n(survey_small, 20)
dim(survey)
## [1] 400 15
names(survey)
## [1] "user_id" "age"
## [3] "gender" "occupation"
## [5] "work_mode" "screen_time_hours"
## [7] "work_screen_hours" "leisure_screen_hours"
## [9] "sleep_hours" "sleep_quality_1_5"
## [11] "stress_level_0_10" "productivity_0_100"
## [13] "exercise_minutes_per_week" "social_hours_per_week"
## [15] "mental_wellness_index_0_100"
head(survey)
## # A tibble: 6 × 15
## user_id age gender occupation work_mode screen_time_hours work_screen_hours
## <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 U0001 33 Female Employed Remote 10.79 5.44
## 2 U0002 28 Female Employed In-person 7.4 0.37
## 3 U0003 35 Female Employed Hybrid 9.78 1.09
## 4 U0004 42 Male Employed Hybrid 11.13 0.56
## 5 U0005 28 Male Student Remote 13.22 4.09
## 6 U0006 28 Non-bi… Self-empl… Hybrid 9.83 0.53
## # ℹ 8 more variables: leisure_screen_hours <chr>, sleep_hours <chr>,
## # sleep_quality_1_5 <dbl>, stress_level_0_10 <chr>, productivity_0_100 <chr>,
## # exercise_minutes_per_week <dbl>, social_hours_per_week <chr>,
## # mental_wellness_index_0_100 <chr>
For Task 1, I use a subset of the Screen Time vs Mental Wellness Survey – 2025 dataset from Kaggle. The subset contains 20 respondents and focuses on five variables: Age (numeric, in years): the age of the respondent. Gender (categorical, factor): the gender of the respondent (Male, Female, Other). Screen_time_hours (numeric, hours/day): the average number of hours per day spent using screens (work and leisure combined). Sleep_hours (numeric, hours/day): the average number of hours of sleep per night. Mental_wellness_index_0_100 (numeric, index): a self-reported mental wellness score ranging from 0 (lowest well-being) to 100 (highest well-being).
# Create new variables
# - screen_sleep_ratio: screen time relative to sleep time
# - Age_group: categorical age bands
# - Wellness_cat: categorized wellness index
survey20 <- survey20 %>%
mutate(
screen_sleep_ratio = Screen_time_hours / Sleep_hours,
Age_group = cut(
Age,
breaks = c(-Inf, 25, 40, Inf),
labels = c("≤25", "26–40", "41+")
),
Wellness_cat = cut(
Mental_wellness_index_0_100,
breaks = c(-Inf, 40, 70, Inf),
labels = c("Low", "Medium", "High")
)
)
# Delete units with missing data in any of the key variables
survey20_clean <- survey20 %>%
drop_na(
Age, Gender, Screen_time_hours, Sleep_hours,
Mental_wellness_index_0_100, screen_sleep_ratio
)
# Rename variables (shorter names for later use)
survey20_clean <- survey20_clean %>%
rename(
ScreenH = Screen_time_hours,
SleepH = Sleep_hours,
Wellness = Mental_wellness_index_0_100
) %>%
mutate(
# Tidy up Gender factor levels and order
Gender = fct_relevel(Gender, "Male", "Female", "Other")
)
# Create new data.frames based on conditions
df_high_screen <- filter(survey20_clean, ScreenH >= 8) # high screen time (≥8 h/day)
df_good_sleep <- filter(survey20_clean, SleepH >= 7) # good sleep (≥7 h/day)
df_high_well <- filter(survey20_clean, Wellness >= 70) # high wellness (≥70/100)
# Quick checks (prints counts and sample rows)
list(
n_before = nrow(survey20),
n_after_clean = nrow(survey20_clean),
head_clean = head(survey20_clean)
)
## $n_before
## [1] 20
##
## $n_after_clean
## [1] 20
##
## $head_clean
## # A tibble: 6 × 8
## Age Gender ScreenH SleepH Wellness screen_sleep_ratio Age_group Wellness_cat
## <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <fct> <fct>
## 1 27 Female 8.16 5.7 30 1.43 26–40 Low
## 2 16 Female 7.86 7.59 40.9 1.04 ≤25 Medium
## 3 31 Male 10.8 7.87 19.3 1.37 26–40 Low
## 4 30 Female 12.6 5.69 10.7 2.22 26–40 Low
## 5 20 Female 8.7 6.16 2.7 1.41 ≤25 Low
## 6 36 Male 10.1 8.17 9 1.24 26–40 Low
summary(survey20_clean)
## Age Gender ScreenH SleepH
## Min. :16.00 Male : 8 Min. : 5.040 Min. :5.690
## 1st Qu.:23.75 Female :12 1st Qu.: 8.115 1st Qu.:6.200
## Median :29.00 Non-binary/Other: 0 Median : 9.325 Median :7.095
## Mean :28.65 Mean : 8.960 Mean :7.063
## 3rd Qu.:34.50 3rd Qu.: 9.935 3rd Qu.:7.758
## Max. :45.00 Max. :12.630 Max. :8.660
## Wellness screen_sleep_ratio Age_group Wellness_cat
## Min. : 0.000 Min. :0.6374 ≤25 : 7 Low :16
## 1st Qu.: 7.475 1st Qu.:1.1451 26–40:12 Medium: 4
## Median :10.650 Median :1.3515 41+ : 1 High : 0
## Mean :20.220 Mean :1.3018
## 3rd Qu.:32.025 3rd Qu.:1.4348
## Max. :68.400 Max. :2.2197
# Descriptive statistics for all variables
survey20_clean %>%
summarise(
Mean_Age = mean(Age, na.rm = TRUE),
Median_Age = median(Age, na.rm = TRUE),
SD_Age = sd(Age, na.rm = TRUE),
Mean_ScreenH = mean(ScreenH, na.rm = TRUE),
Median_Sleep = median(SleepH, na.rm = TRUE),
Mean_Well = mean(Wellness, na.rm = TRUE),
SD_Well = sd(Wellness, na.rm = TRUE)
)
## # A tibble: 1 × 7
## Mean_Age Median_Age SD_Age Mean_ScreenH Median_Sleep Mean_Well SD_Well
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 28.6 29 7.31 8.96 7.10 20.2 18.2
The mean is the arithmetic average of a variable. In this dataset, the mean age of respondents is 28.65 years, the mean screen time is 8.96 hours/day, the mean sleep duration is 7.06 hours/day, and the mean mental wellness index is 20.22 (out of 100). These values represent the central tendency of each variable.
The median is the middle value when all observations are ordered from lowest to highest. For example, the median age is 29 years, meaning half of the respondents are younger and half are older. The median screen time is 9.33 hours/day, the median sleep duration is 7.10 hours/day, and the median wellness index is 10.65, which shows that most respondents report quite low wellness compared to the maximum of 100.
The standard deviation (SD) measures how much the values vary around the mean. A larger SD means greater spread. In this sample, the SD for age is about 7.3 years, showing moderate variation in respondent ages. For wellness, the SD is 18.2 points, showing very high variability in well-being scores compared to the mean.
library(ggplot2)
# Histogram of Age
ggplot(survey20_clean, aes(x = Age)) +
geom_histogram(binwidth = 5, fill = "lightgreen", color = "black") +
labs(title = "Distribution of Age", x = "Age (years)", y = "Count")
# Histogram of Screen Time Hours
ggplot(survey20_clean, aes(x = ScreenH)) +
geom_histogram(binwidth = 1, fill = "yellow", color = "black") +
labs(title = "Distribution of Screen Time", x = "Screen time (hours/day)", y = "Count")
# Boxplot of Sleep Hours by Gender
ggplot(survey20_clean, aes(x = Gender, y = SleepH, fill = Gender)) +
geom_boxplot() +
labs(title = "Sleep Hours by Gender", x = "Gender", y = "Sleep (hours/day)") +
theme(legend.position = "none")
# Histogram of Wellness Index
ggplot(survey20_clean, aes(x = Wellness)) +
geom_histogram(binwidth = 10, fill = "orange", color = "black") +
labs(title = "Distribution of Wellness Index",
x = "Mental Wellness (0–100)", y = "Count")
# Scatterplot: Screen Time vs Mental Wellness
ggplot(survey20_clean, aes(x = ScreenH, y = Wellness)) +
geom_point(color = "purple", size = 3) +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Screen Time vs Mental Wellness",
x = "Screen time (hours/day)",
y = "Mental wellness index (0–100)")
The histograms show that the age distribution is concentrated around the
30s, with few respondents over 40. Screen time is centered around 9
hours/day, with most participants spending between 8 and 10 hours. The
boxplot of sleep hours by gender reveals a clear difference: on average,
males sleep about 8 hours per night, while females average closer to 6.5
hours, suggesting a potential gender gap in sleep duration. The wellness
index is heavily skewed, with most values clustered at the low end
(below 40). The scatterplot of screen time vs wellness indicates a
negative trend, suggesting that higher screen time may be associated
with lower mental wellness.
# Libraries
library(readxl)
library(dplyr)
library(ggplot2)
# Load data
df <- read_excel("Business School.xlsx")
# Inspect raw unique values
sort(unique(df$`Undergrad Degree`))
## [1] "Art" "Business" "Computer Science" "Engineering"
## [5] "Finance"
# Clean the raw text and categorize
df <- df %>%
mutate(
# Clean: trim, lowercase, collapse multiple spaces
deg_clean = gsub("\\s+", " ", tolower(trimws(`Undergrad Degree`))),
# Categorize using robust regex-style matches
UndergradCat = case_when(
# Business family
grepl("^business$", deg_clean) |
grepl("business", deg_clean) ~ "Business",
# Finance family
grepl("^finance$", deg_clean) |
grepl("finance", deg_clean) ~ "Finance",
# Computer Science family
grepl("^computer science$", deg_clean) |
grepl("computer science", deg_clean) ~ "Computer Science",
# Engineering family
grepl("engineering", deg_clean) ~ "Engineering",
# Art family
grepl("^art$", deg_clean) |
grepl("art", deg_clean) ~ "Art",
TRUE ~ "Other"
)
)
# Count JUST the five requested categories
target_levels <- c("Business","Finance","Computer Science","Engineering","Art")
deg_counts <- df %>%
filter(UndergradCat %in% target_levels) %>%
count(UndergradCat, name = "Count", sort = TRUE)
# Safety check: stop with a message if nothing matched
if (nrow(deg_counts) == 0) {
stop("No entries matched the five categories after cleaning. Check the 'deg_clean' values printed above and adjust patterns.")
}
# Print ONLY the name of the most common category (as required)
most_common_degree <- deg_counts$UndergradCat[1]
cat(most_common_degree, "\n")
## Business
# Sentence with the count:
cat("Most common undergraduate degree is:", most_common_degree,
"with", deg_counts$Count[1], "students.\n")
## Most common undergraduate degree is: Business with 35 students.
# Plot: Distribution of categories
ggplot(deg_counts, aes(x = reorder(UndergradCat, Count), y = Count)) +
geom_col() +
geom_text(aes(label = Count), hjust = -0.2, size = 3) +
coord_flip() +
expand_limits(y = max(deg_counts$Count) * 1.15) +
labs(
title = "Distribution of Undergraduate Degrees",
x = "Undergraduate Degree",
y = "Count"
) +
theme_minimal(base_size = 12)
library(dplyr)
library(ggplot2)
## Descriptive statistics for Annual Salary ----
salary_stats <- df %>%
summarise(
n = sum(!is.na(`Annual Salary`)),
mean = mean(`Annual Salary`, na.rm = TRUE),
median = median(`Annual Salary`, na.rm = TRUE),
sd = sd(`Annual Salary`, na.rm = TRUE),
min = min(`Annual Salary`, na.rm = TRUE),
max = max(`Annual Salary`, na.rm = TRUE),
IQR = IQR(`Annual Salary`, na.rm = TRUE)
)
salary_stats
## # A tibble: 1 × 7
## n mean median sd min max IQR
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 100 109058 103500 41501. 20000 340000 36875
## Histogram of Annual Salary
ggplot(df, aes(x = `Annual Salary`)) +
geom_histogram(bins = 20, fill = "pink", color = "black") +
labs(
title = "Distribution of Annual Salary",
x = "Annual Salary",
y = "Count"
) +
theme_minimal()
Description of distribution: The mean salary is higher than the median,
which suggests the distribution is right-skewed (a few students earn
much more than most). The standard deviation is relatively large
compared to the mean, indicating substantial variation in salaries. The
histogram shows that most salaries cluster in the lower range, while a
smaller number of students earn very high salaries (long right tail).
This type of distribution is common in income data because high outliers
(top earners) pull the average up.
library(dplyr)
## One-sample t-test: H0: mu = 74
t_res <- t.test(df$`MBA Grade`, mu = 74)
t_res # show t-test results
##
## One Sample t-test
##
## data: df$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
## Effect size: Cohen's d
mean_grade <- mean(df$`MBA Grade`, na.rm = TRUE)
sd_grade <- sd(df$`MBA Grade`, na.rm = TRUE)
n_grade <- sum(!is.na(df$`MBA Grade`))
cohens_d <- (mean_grade - 74) / sd_grade
cohens_d
## [1] 0.2658658
Explanation: The one-sample t-test showed that the average MBA grade was significantly different than 74, t(99) = 2.66, p = 0.009 (p < 0.05).
apartments <- read_excel("Apartments.xlsx") # reads the first sheet by default
Description:
library(readxl)
# Read
apartments <- read_excel("Apartments.xlsx")
# Helper: map many encodings to a "No"/"Yes" factor
to_yes_no_factor <- function(x) {
# Convert to trimmed lower-case character
xc <- trimws(as.character(x))
xlow <- tolower(xc)
# Try to pull out any numeric signal (handles "1-Yes", "0 - No", etc.)
num <- suppressWarnings(as.numeric(gsub("[^0-9.-]", "", xlow)))
out <- rep(NA_character_, length(xlow))
# Numeric logic: 0 -> No; >0 -> Yes
out[!is.na(num) & num == 0] <- "No"
out[!is.na(num) & num > 0] <- "Yes"
# Textual logic (English + a few common variants)
out[is.na(out) & grepl("^(yes|y|true|t|da|ja|with|has|present)$", xlow)] <- "Yes"
out[is.na(out) & grepl("^(no|n|false|f|ne|without|none|absent)$", xlow)] <- "No"
factor(out, levels = c("No","Yes"))
}
# Apply to the two categorical columns
apartments$Parking <- to_yes_no_factor(apartments$Parking)
apartments$Balcony <- to_yes_no_factor(apartments$Balcony)
# Quick verification
str(apartments[, c("Parking","Balcony")])
## tibble [85 × 2] (S3: tbl_df/tbl/data.frame)
## $ Parking: Factor w/ 2 levels "No","Yes": 1 2 1 1 2 1 1 2 2 2 ...
## $ Balcony: Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 1 ...
table(apartments$Parking, useNA = "ifany")
##
## No Yes
## 42 43
table(apartments$Balcony, useNA = "ifany")
##
## No Yes
## 48 37
# One-sample t-test for mean Price
t_test_result <- t.test(apartments$Price, mu = 1900)
t_test_result
##
## One Sample t-test
##
## data: apartments$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
Conclusion: The average price per m² in our sample is ≈ 2019 €. The p-value of the t-test is 0.0047, which is smaller than 0.05. Therefore, we reject the null hypothesis H₀ (μ = 1900). We can conclude that the true mean apartment price is significantly different from 1900 € at the 5% significance level.
# Estimate simple regression Price = f(Age)
fit1 <- lm(Price ~ Age, data = apartments)
# Regression summary
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
# Correlation coefficient
cor_coef <- cor(apartments$Price, apartments$Age)
# Coefficient of determination (R-squared)
r_squared <- summary(fit1)$r.squared
list(correlation = cor_coef,
r_squared = r_squared)
## $correlation
## [1] -0.230255
##
## $r_squared
## [1] 0.05301737
# Scatterplot matrix of Price, Age, and Distance
pairs(~ Price + Age + Distance, data = apartments,
main = "Scatterplot Matrix: Price, Age, Distance",
pch = 19, col = "purple")
Interpretation: There is a weak negative relationship between Price and
both Age and Distance, while Age and Distance are not strongly
correlated with each other, so multicollinearity does not appear to be a
problem.
fit2 <- lm(Price ~ Age + Distance, data = apartments)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
library(performance)
check_collinearity(fit2)
## # Check for Multicollinearity
##
## Low Correlation
##
## Term VIF VIF 95% CI adj. VIF Tolerance Tolerance 95% CI
## Age 1.00 [1.00, Inf] 1.00 1.00 [0.00, 1.00]
## Distance 1.00 [1.00, Inf] 1.00 1.00 [0.00, 1.00]
Explanation: The VIF values for both Age and Distance are equal to 1, which means there is no multicollinearity between the explanatory variables. This confirms that Age and Distance are independent enough to be included in the regression model without inflating the variances of their coefficients.
# Standardized residuals
std_resid <- rstandard(fit2)
# Cook's distances
cooks_d <- cooks.distance(fit2)
# Combine into a data frame for easier inspection
diagnostics <- data.frame(std_resid, cooks_d)
head(diagnostics)
## std_resid cooks_d
## 1 -0.6653487 0.007386569
## 2 1.7832876 0.030365432
## 3 -0.5937629 0.005882612
## 4 0.7543794 0.008299153
## 5 -1.0733987 0.005112584
## 6 -0.7775190 0.004900891
# Identify potentially problematic observations
# Rule of thumb:
# |standardized residual| > 2 → possible outlier
# Cook's distance > 4/n → influential (n = sample size)
n <- nrow(apartments)
problematic <- which(abs(std_resid) > 2 | cooks_d > 4/n)
problematic
## 22 33 38 53 55
## 22 33 38 53 55
# Remove problematic observations
apartments_clean <- apartments[-problematic, ]
# Re-estimate regression without them
fit2_clean <- lm(Price ~ Age + Distance, data = apartments_clean)
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
# Standardized residuals
std_resid <- rstandard(fit2_clean)
# Standardized fitted values
std_fitted <- scale(fitted(fit2_clean))
# Scatterplot
plot(std_fitted, std_resid,
xlab = "Standardized Fitted Values",
ylab = "Standardized Residuals",
main = "Residuals vs. Fitted (Check for Heteroskedasticity)",
pch = 19, col = "lightblue")
abline(h = 0, lty = 2, col = "red")
Explanation: The scatterplot of standardized residuals against
standardized fitted values shows a cone-shaped pattern, where the spread
of residuals increases as fitted values rise. This indicates the
presence of heteroskedasticity, meaning the variance of errors is not
constant across all levels of predicted prices.
# Standardized residuals from cleaned model
std_resid <- rstandard(fit2_clean)
# Histogram
hist(std_resid, breaks = 10, col = "purple",
main = "Histogram of Standardized Residuals",
xlab = "Standardized Residuals")
# Q-Q plot
qqnorm(std_resid, main = "Q-Q Plot of Standardized Residuals", col = "purple")
qqline(std_resid, col = "red", lwd = 2)
# Formal Shapiro-Wilk test
shapiro.test(std_resid)
##
## Shapiro-Wilk normality test
##
## data: std_resid
## W = 0.94156, p-value = 0.001168
Explanation: The histogram and Q-Q plot of standardized residuals show noticeable deviations from a normal distribution, especially in the tails. The Shapiro–Wilk test yields W = 0.94, p = 0.0012, which is below the 0.05 threshold. Therefore, we reject the null hypothesis of normality and conclude that the standardized residuals are not normally distributed.
# Estimate model again without problematic units
fit2_clean <- lm(Price ~ Age + Distance, data = apartments_clean)
# Show summary
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
The regression model Price = 2502.47 – 8.67 × Age – 24.06 × Distance shows that each additional year lowers the price per m² by about 9 €, and each kilometer farther from the city center reduces it by about 24 €. Both effects are statistically significant, with distance having the stronger impact. The model explains about 54% of the variation in prices, indicating a reasonably good fit.
# Estimate multiple regression including Age, Distance, Parking, and Balcony
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = apartments_clean)
# Show summary
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = apartments_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 2e-16 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 3.39e-11 ***
## ParkingYes 128.700 60.801 2.117 0.0376 *
## BalconyYes 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13
# Compare fit2 (Age + Distance) with fit3 (Age + Distance + Parking + Balcony)
anova(fit2_clean, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 77 5077362
## 2 75 4791128 2 286234 2.2403 0.1135
The ANOVA test compares the simpler model (fit2) with the extended model (fit3) that includes Parking and Balcony. The p-value is 0.1135, which is greater than 0.05, so we fail to reject the null hypothesis that the additional variables improve the model. This means that, although Parking was significant on its own, jointly Parking and Balcony do not significantly increase the overall explanatory power compared to using only Age and Distance.
# Show regression results for fit3
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = apartments_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 2e-16 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 3.39e-11 ***
## ParkingYes 128.700 60.801 2.117 0.0376 *
## BalconyYes 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13
Explanation: In fit3, apartments with parking cost on average about 129 € per m² more than those without, a significant effect, while the balcony adds only ≈6 €, which is not significant. At the bottom of the regression output, the F-statistic tests the null hypothesis that all slope coefficients are equal to zero (Age, Distance, Parking, and Balcony together have no effect on apartment prices). The alternative hypothesis is that at least one coefficient differs from zero. Since the F-test p-value is far below 0.001, we reject the null and conclude that the model as a whole is statistically significant.
# Save fitted values and residuals
fitted_values <- fitted(fit3)
residuals <- resid(fit3)
# Apartment ID2
id <- 2
cat("Apartment ID", id, "\n")
## Apartment ID 2
cat(" Fitted value (predicted price per m²):", round(fitted_values[id], 2), "€\n")
## Fitted value (predicted price per m²): 2356.6 €
cat(" Residual (observed - fitted):", round(residuals[id], 2), "€\n")
## Residual (observed - fitted): 443.4 €