Task 1 - Dataset analysis

1.1. Dataset - ScreenTime vs MentalWellness

# Load packages
library(readxl)
library(dplyr) # for data manipulation
library(tidyr)
library(ggplot2)
library(forcats)

# Import the dataset
survey <- read_excel(
  "ScreenTime vs MentalWellness.xlsx",
  sheet = "Sheet 1 - ScreenTime vs MentalW",
  skip = 1
)

# Keep only the five variables of interest
survey_small <- survey %>%
  transmute(
    Age  = as.numeric(age),
    Gender = factor(gender),
    Screen_time_hours = as.numeric(screen_time_hours),
    Sleep_hours = as.numeric(sleep_hours),
    Mental_wellness_index_0_100 = as.numeric(mental_wellness_index_0_100)
  )

# Take a random sample of 20 units
set.seed(123)               
survey20 <- dplyr::sample_n(survey_small, 20)


dim(survey)
## [1] 400  15
names(survey)
##  [1] "user_id"                     "age"                        
##  [3] "gender"                      "occupation"                 
##  [5] "work_mode"                   "screen_time_hours"          
##  [7] "work_screen_hours"           "leisure_screen_hours"       
##  [9] "sleep_hours"                 "sleep_quality_1_5"          
## [11] "stress_level_0_10"           "productivity_0_100"         
## [13] "exercise_minutes_per_week"   "social_hours_per_week"      
## [15] "mental_wellness_index_0_100"
head(survey)
## # A tibble: 6 × 15
##   user_id   age gender  occupation work_mode screen_time_hours work_screen_hours
##   <chr>   <dbl> <chr>   <chr>      <chr>     <chr>             <chr>            
## 1 U0001      33 Female  Employed   Remote    10.79             5.44             
## 2 U0002      28 Female  Employed   In-person 7.4               0.37             
## 3 U0003      35 Female  Employed   Hybrid    9.78              1.09             
## 4 U0004      42 Male    Employed   Hybrid    11.13             0.56             
## 5 U0005      28 Male    Student    Remote    13.22             4.09             
## 6 U0006      28 Non-bi… Self-empl… Hybrid    9.83              0.53             
## # ℹ 8 more variables: leisure_screen_hours <chr>, sleep_hours <chr>,
## #   sleep_quality_1_5 <dbl>, stress_level_0_10 <chr>, productivity_0_100 <chr>,
## #   exercise_minutes_per_week <dbl>, social_hours_per_week <chr>,
## #   mental_wellness_index_0_100 <chr>

For Task 1, I use a subset of the Screen Time vs Mental Wellness Survey – 2025 dataset from Kaggle. The subset contains 20 respondents and focuses on five variables:

  • Age (numeric, years): the age of the respondent
  • Gender (categorical, factor): the gender of the respondent (Male, Female, Non-binary/Other)
  • Screen_time_hours (numeric, hours/day): the average number of hours per day spent using screens (work and leisure combined)
  • Sleep_hours (numeric, hours/day): the average number of hours of sleep per night
  • Mental_wellness_index_0_100 (numeric, index): a self-reported mental wellness score ranging from 0 (lowest well-being) to 100 (highest well-being)

1.2. Data Manipulations

# Create new variables
#    - screen_sleep_ratio: screen time relative to sleep time
#    - Age_group: categorical age bands
#    - Wellness_cat: categorized wellness index
survey20 <- survey20 %>%
  mutate(
    screen_sleep_ratio = Screen_time_hours / Sleep_hours,
    Age_group = cut(
      Age,
      breaks = c(-Inf, 25, 40, Inf),
      labels = c("≤25", "26–40", "41+")
    ),
    Wellness_cat = cut(
      Mental_wellness_index_0_100,
      breaks = c(-Inf, 40, 70, Inf),
      labels = c("Low", "Medium", "High")
    )
  )

# Delete units with missing data in any of the key variables
survey20_clean <- survey20 %>%
  drop_na(
    Age, Gender, Screen_time_hours, Sleep_hours,
    Mental_wellness_index_0_100, screen_sleep_ratio
  )

# Rename variables (shorter names for later use)
survey20_clean <- survey20_clean %>%
  rename(
    ScreenH  = Screen_time_hours,
    SleepH   = Sleep_hours,
    Wellness = Mental_wellness_index_0_100
  ) %>%
  mutate(
    # Tidy up the Gender factor level order (the data uses "Non-binary/Other")
    Gender = fct_relevel(Gender, "Male", "Female", "Non-binary/Other")
  )

# Create new data.frames based on conditions 
df_high_screen <- filter(survey20_clean, ScreenH >= 8)   # high screen time (≥8 h/day)
df_good_sleep  <- filter(survey20_clean, SleepH  >= 7)   # good sleep (≥7 h/day)
df_high_well   <- filter(survey20_clean, Wellness >= 70) # high wellness (≥70/100)

# Quick checks (prints counts and sample rows)
list(
  n_before = nrow(survey20),
  n_after_clean = nrow(survey20_clean),
  head_clean = head(survey20_clean)
)
## $n_before
## [1] 20
## 
## $n_after_clean
## [1] 20
## 
## $head_clean
## # A tibble: 6 × 8
##     Age Gender ScreenH SleepH Wellness screen_sleep_ratio Age_group Wellness_cat
##   <dbl> <fct>    <dbl>  <dbl>    <dbl>              <dbl> <fct>     <fct>       
## 1    27 Female    8.16   5.7      30                 1.43 26–40     Low         
## 2    16 Female    7.86   7.59     40.9               1.04 ≤25       Medium      
## 3    31 Male     10.8    7.87     19.3               1.37 26–40     Low         
## 4    30 Female   12.6    5.69     10.7               2.22 26–40     Low         
## 5    20 Female    8.7    6.16      2.7               1.41 ≤25       Low         
## 6    36 Male     10.1    8.17      9                 1.24 26–40     Low

1.3. Descriptive statistics + explanation

summary(survey20_clean)
##       Age                     Gender      ScreenH           SleepH     
##  Min.   :16.00   Male            : 8   Min.   : 5.040   Min.   :5.690  
##  1st Qu.:23.75   Female          :12   1st Qu.: 8.115   1st Qu.:6.200  
##  Median :29.00   Non-binary/Other: 0   Median : 9.325   Median :7.095  
##  Mean   :28.65                         Mean   : 8.960   Mean   :7.063  
##  3rd Qu.:34.50                         3rd Qu.: 9.935   3rd Qu.:7.758  
##  Max.   :45.00                         Max.   :12.630   Max.   :8.660  
##     Wellness      screen_sleep_ratio Age_group  Wellness_cat
##  Min.   : 0.000   Min.   :0.6374     ≤25  : 7   Low   :16   
##  1st Qu.: 7.475   1st Qu.:1.1451     26–40:12   Medium: 4   
##  Median :10.650   Median :1.3515     41+  : 1   High  : 0   
##  Mean   :20.220   Mean   :1.3018                            
##  3rd Qu.:32.025   3rd Qu.:1.4348                            
##  Max.   :68.400   Max.   :2.2197
# Selected descriptive statistics for the key variables
survey20_clean %>%
  summarise(
    Mean_Age     = mean(Age, na.rm = TRUE),
    Median_Age   = median(Age, na.rm = TRUE),
    SD_Age       = sd(Age, na.rm = TRUE),
    Mean_ScreenH = mean(ScreenH, na.rm = TRUE),
    Median_Sleep = median(SleepH, na.rm = TRUE),
    Mean_Well    = mean(Wellness, na.rm = TRUE),
    SD_Well      = sd(Wellness, na.rm = TRUE)
  )
## # A tibble: 1 × 7
##   Mean_Age Median_Age SD_Age Mean_ScreenH Median_Sleep Mean_Well SD_Well
##      <dbl>      <dbl>  <dbl>        <dbl>        <dbl>     <dbl>   <dbl>
## 1     28.6         29   7.31         8.96         7.10      20.2    18.2

The mean is the arithmetic average of a variable. In this dataset, the mean age of respondents is 28.65 years, the mean screen time is 8.96 hours/day, the mean sleep duration is 7.06 hours/day, and the mean mental wellness index is 20.22 (out of 100). These values represent the central tendency of each variable.

The median is the middle value when all observations are ordered from lowest to highest. For example, the median age is 29 years, meaning half of the respondents are younger and half are older. The median screen time is 9.33 hours/day, the median sleep duration is 7.10 hours/day, and the median wellness index is 10.65, which shows that most respondents report quite low wellness compared to the maximum of 100.

The standard deviation (SD) measures how much the values vary around the mean; a larger SD means greater spread. In this sample, the SD of age is about 7.3 years, showing moderate variation in respondent ages. For wellness, the SD is 18.2 points, almost as large as the mean of 20.2, showing very high relative variability in well-being scores.
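
These ideas can be checked numerically. A minimal sketch (using the cleaned sample from above) computes Pearson's second skewness coefficient, 3 × (mean - median) / SD, where positive values indicate right skew:

# Pearson's second skewness coefficient: positive => right-skewed
pearson_skew <- function(x) {
  3 * (mean(x, na.rm = TRUE) - median(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
sapply(survey20_clean[, c("Age", "ScreenH", "SleepH", "Wellness")], pearson_skew)

From the statistics above, Wellness gives roughly 3 × (20.22 - 10.65) / 18.2 ≈ 1.6, a clear right skew.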

1.4. Graph distributions

library(ggplot2)

# Histogram of Age
ggplot(survey20_clean, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "lightgreen", color = "black") +
  labs(title = "Distribution of Age", x = "Age (years)", y = "Count")

# Histogram of Screen Time Hours
ggplot(survey20_clean, aes(x = ScreenH)) +
  geom_histogram(binwidth = 1, fill = "yellow", color = "black") +
  labs(title = "Distribution of Screen Time", x = "Screen time (hours/day)", y = "Count")

# Boxplot of Sleep Hours by Gender
ggplot(survey20_clean, aes(x = Gender, y = SleepH, fill = Gender)) +
  geom_boxplot() +
  labs(title = "Sleep Hours by Gender", x = "Gender", y = "Sleep (hours/day)") +
  theme(legend.position = "none")

# Histogram of Wellness Index
ggplot(survey20_clean, aes(x = Wellness)) +
  geom_histogram(binwidth = 10, fill = "orange", color = "black") +
  labs(title = "Distribution of Wellness Index", 
       x = "Mental Wellness (0–100)", y = "Count")

# Scatterplot: Screen Time vs Mental Wellness
ggplot(survey20_clean, aes(x = ScreenH, y = Wellness)) +
  geom_point(color = "purple", size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Screen Time vs Mental Wellness",
       x = "Screen time (hours/day)", 
       y = "Mental wellness index (0–100)")

The histograms show that the age distribution is concentrated around the 30s, with few respondents over 40. Screen time is centered around 9 hours/day, with most participants spending between 8 and 10 hours. The boxplot of sleep hours by gender reveals a clear difference: on average, males sleep about 8 hours per night, while females average closer to 6.5 hours, suggesting a potential gender gap in sleep duration. The wellness index is heavily skewed, with most values clustered at the low end (below 40). The scatterplot of screen time vs wellness indicates a negative trend, suggesting that higher screen time may be associated with lower mental wellness.
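
The negative trend seen in the scatterplot can be quantified with a correlation test; a quick sketch on the same 20-unit sample (the exact values depend on the seed used when sampling):

# Pearson correlation between screen time and wellness, with a test of H0: rho = 0
cor.test(survey20_clean$ScreenH, survey20_clean$Wellness)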

Task 2 - Business school

2.1. Distribution of Undergraduate Degrees

# Libraries
library(readxl)
library(dplyr)
library(ggplot2)

# Load data 
df <- read_excel("Business School.xlsx")

# Inspect raw unique values 
sort(unique(df$`Undergrad Degree`))
## [1] "Art"              "Business"         "Computer Science" "Engineering"     
## [5] "Finance"
# Clean the raw text and categorize
df <- df %>%
  mutate(
    # Clean: trim, lowercase, collapse multiple spaces
    deg_clean = gsub("\\s+", " ", tolower(trimws(`Undergrad Degree`))),
    # Categorize via substring matches; each pattern also covers the exact
    # match, and the first matching rule wins
    UndergradCat = case_when(
      grepl("business",         deg_clean) ~ "Business",
      grepl("finance",          deg_clean) ~ "Finance",
      grepl("computer science", deg_clean) ~ "Computer Science",
      grepl("engineering",      deg_clean) ~ "Engineering",
      grepl("art",              deg_clean) ~ "Art",
      TRUE ~ "Other"
    )
  )

# Count JUST the five requested categories
target_levels <- c("Business","Finance","Computer Science","Engineering","Art")
deg_counts <- df %>%
  filter(UndergradCat %in% target_levels) %>%
  count(UndergradCat, name = "Count", sort = TRUE)

# Safety check: stop with a message if nothing matched
if (nrow(deg_counts) == 0) {
  stop("No entries matched the five categories after cleaning. Check the 'deg_clean' values printed above and adjust patterns.")
}

# Print ONLY the name of the most common category (as required)
most_common_degree <- deg_counts$UndergradCat[1]
cat(most_common_degree, "\n")
## Business
# Sentence with the count:
cat("Most common undergraduate degree is:", most_common_degree,
    "with", deg_counts$Count[1], "students.\n")
## Most common undergraduate degree is: Business with 35 students.
# Plot: Distribution of categories
ggplot(deg_counts, aes(x = reorder(UndergradCat, Count), y = Count)) +
  geom_col() +
  geom_text(aes(label = Count), hjust = -0.2, size = 3) +
  coord_flip() +
  expand_limits(y = max(deg_counts$Count) * 1.15) +
  labs(
    title = "Distribution of Undergraduate Degrees",
    x = "Undergraduate Degree",
    y = "Count"
  ) +
  theme_minimal(base_size = 12)

2.2. Distribution of Annual Salary

library(dplyr)
library(ggplot2)

## Descriptive statistics for Annual Salary ----
salary_stats <- df %>%
  summarise(
    n = sum(!is.na(`Annual Salary`)),
    mean = mean(`Annual Salary`, na.rm = TRUE),
    median = median(`Annual Salary`, na.rm = TRUE),
    sd = sd(`Annual Salary`, na.rm = TRUE),
    min = min(`Annual Salary`, na.rm = TRUE),
    max = max(`Annual Salary`, na.rm = TRUE),
    IQR = IQR(`Annual Salary`, na.rm = TRUE)
  )

salary_stats
## # A tibble: 1 × 7
##       n   mean median     sd   min    max   IQR
##   <int>  <dbl>  <dbl>  <dbl> <dbl>  <dbl> <dbl>
## 1   100 109058 103500 41501. 20000 340000 36875
## Histogram of Annual Salary
ggplot(df, aes(x = `Annual Salary`)) +
  geom_histogram(bins = 20, fill = "pink", color = "black") +
  labs(
    title = "Distribution of Annual Salary",
    x = "Annual Salary",
    y = "Count"
  ) +
  theme_minimal()

Description of distribution: The mean salary is higher than the median, which suggests the distribution is right-skewed (a few students earn much more than most). The standard deviation is relatively large compared to the mean, indicating substantial variation in salaries. The histogram shows that most salaries cluster in the lower range, while a smaller number of students earn very high salaries (long right tail). This type of distribution is common in income data because high outliers (top earners) pull the average up.
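
This reading can be checked numerically; a minimal sketch compares mean and median and computes Pearson's second skewness coefficient (from the statistics above, 3 × (109058 - 103500) / 41501 ≈ 0.4, consistent with right skew):

# Positive difference and positive skew coefficient both indicate right skew
sal <- df$`Annual Salary`
c(
  mean_minus_median = mean(sal, na.rm = TRUE) - median(sal, na.rm = TRUE),
  pearson_skew = 3 * (mean(sal, na.rm = TRUE) - median(sal, na.rm = TRUE)) / sd(sal, na.rm = TRUE)
)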

2.3. Hypothesis

library(dplyr)

## One-sample t-test: H0: mu = 74
t_res <- t.test(df$`MBA Grade`, mu = 74)

t_res   # show t-test results
## 
##  One Sample t-test
## 
## data:  df$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
##  74.51764 77.56346
## sample estimates:
## mean of x 
##  76.04055
## Effect size: Cohen's d
mean_grade <- mean(df$`MBA Grade`, na.rm = TRUE)
sd_grade   <- sd(df$`MBA Grade`, na.rm = TRUE)
n_grade    <- sum(!is.na(df$`MBA Grade`))

cohens_d <- (mean_grade - 74) / sd_grade

cohens_d
## [1] 0.2658658

Explanation: The one-sample t-test shows that the average MBA grade differs significantly from 74, t(99) = 2.66, p = 0.009 (< 0.05), so we reject H₀: μ = 74. The sample mean is 76.04, and Cohen's d ≈ 0.27 indicates that this statistically significant difference corresponds to a small effect in practical terms.
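
For reference, the reported t statistic can be reproduced by hand from the quantities already computed for Cohen's d, using t = (x̄ - μ₀) / (s / √n):

# One-sample t statistic by hand; should match t = 2.6587 from t.test() above
t_manual <- (mean_grade - 74) / (sd_grade / sqrt(n_grade))
t_manual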

Task 3 - Apartments

Import the dataset Apartments.xlsx

apartments <- read_excel("Apartments.xlsx")  # reads the first sheet by default

Description:

  • Age: Age of an apartment in years
  • Distance: The distance from city center in km
  • Price: Price per m2
  • Parking: 0-No, 1-Yes
  • Balcony: 0-No, 1-Yes

Change categorical variables into factors.

library(readxl)

# Read 
apartments <- read_excel("Apartments.xlsx")

# Helper: map many encodings to a "No"/"Yes" factor
to_yes_no_factor <- function(x) {
  # Convert to trimmed lower-case character
  xc   <- trimws(as.character(x))
  xlow <- tolower(xc)

  # Try to pull out any numeric signal (handles "1-Yes", "0 - No", etc.)
  num <- suppressWarnings(as.numeric(gsub("[^0-9.-]", "", xlow)))

  out <- rep(NA_character_, length(xlow))

  # Numeric logic: 0 -> No; >0 -> Yes
  out[!is.na(num) & num == 0] <- "No"
  out[!is.na(num) & num >  0] <- "Yes"

  # Textual logic (English + a few common variants)
  out[is.na(out) & grepl("^(yes|y|true|t|da|ja|with|has|present)$", xlow)]  <- "Yes"
  out[is.na(out) & grepl("^(no|n|false|f|ne|without|none|absent)$", xlow)]  <- "No"

  factor(out, levels = c("No","Yes"))
}

# Apply to the two categorical columns
apartments$Parking <- to_yes_no_factor(apartments$Parking)
apartments$Balcony <- to_yes_no_factor(apartments$Balcony)

# Quick verification
str(apartments[, c("Parking","Balcony")])
## tibble [85 × 2] (S3: tbl_df/tbl/data.frame)
##  $ Parking: Factor w/ 2 levels "No","Yes": 1 2 1 1 2 1 1 2 2 2 ...
##  $ Balcony: Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 1 ...
table(apartments$Parking,  useNA = "ifany")
## 
##  No Yes 
##  42  43
table(apartments$Balcony,  useNA = "ifany")
## 
##  No Yes 
##  48  37

Test the hypothesis H₀: μ_Price = 1900 EUR. What can you conclude?

# One-sample t-test for mean Price
t_test_result <- t.test(apartments$Price, mu = 1900)

t_test_result
## 
##  One Sample t-test
## 
## data:  apartments$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
##  1937.443 2100.440
## sample estimates:
## mean of x 
##  2018.941

Conclusion: The average price per m² in our sample is ≈ 2019 €. The p-value of the t-test is 0.0047, which is smaller than 0.05. Therefore, we reject the null hypothesis H₀ (μ = 1900). We can conclude that the true mean apartment price is significantly different from 1900 € at the 5% significance level.
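
The same decision can be read off the 95% confidence interval stored in the test object: since [1937.44, 2100.44] does not contain 1900, H₀ is rejected at the 5% level. A minimal check:

# Reject H0 at alpha = 0.05 iff mu0 lies outside the 95% CI
ci  <- t_test_result$conf.int
mu0 <- 1900
mu0 < ci[1] || mu0 > ci[2]   # TRUE here, since 1900 < 1937.44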

Estimate the simple regression function Price = f(Age). Save the results in an object named fit1 and explain the estimated regression coefficient, the correlation coefficient, and the coefficient of determination.

# Estimate simple regression Price = f(Age)
fit1 <- lm(Price ~ Age, data = apartments)

# Regression summary
summary(fit1)
## 
## Call:
## lm(formula = Price ~ Age, data = apartments)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -623.9 -278.0  -69.8  243.5  776.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2185.455     87.043  25.108   <2e-16 ***
## Age           -8.975      4.164  -2.156    0.034 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared:  0.05302,    Adjusted R-squared:  0.04161 
## F-statistic: 4.647 on 1 and 83 DF,  p-value: 0.03401
# Correlation coefficient
cor_coef <- cor(apartments$Price, apartments$Age)

# Coefficient of determination (R-squared)
r_squared <- summary(fit1)$r.squared

list(correlation = cor_coef,
     r_squared = r_squared)
## $correlation
## [1] -0.230255
## 
## $r_squared
## [1] 0.05301737

Explanation: The estimated regression coefficient for Age is -8.975, meaning that each additional year of age is associated, on average, with a price per m² lower by about 9 €. The correlation coefficient is -0.23, indicating a weak negative linear relationship between Price and Age. The coefficient of determination R² = 0.053 shows that Age alone explains only about 5.3% of the variation in prices, so the simple model fits the data poorly.

Show the scatterplot matrix of Price, Age, and Distance. Based on the matrix, determine whether there is a potential problem with multicollinearity.

# Scatterplot matrix of Price, Age, and Distance
pairs(~ Price + Age + Distance, data = apartments,
      main = "Scatterplot Matrix: Price, Age, Distance",
      pch = 19, col = "purple")

Interpretation: Price shows a weak negative relationship with both Age and Distance. Multicollinearity concerns correlations among the explanatory variables themselves, and since Age and Distance do not appear strongly correlated with each other, multicollinearity does not look like a problem here.
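
The visual impression can be backed with numbers; the correlation matrix below is the numeric counterpart of the scatterplot matrix (the Price-Age correlation of -0.23 was already computed above):

# Correlation matrix; the Age-Distance entry is the one relevant
# for judging multicollinearity among the explanatory variables
cor(apartments[, c("Price", "Age", "Distance")], use = "pairwise.complete.obs")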

Estimate the multiple regression function: Price = f(Age, Distance). Save it in object named fit2.

fit2 <- lm(Price ~ Age + Distance, data = apartments)

summary(fit2)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -603.23 -219.94  -85.68  211.31  689.58 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2460.101     76.632   32.10  < 2e-16 ***
## Age           -7.934      3.225   -2.46    0.016 *  
## Distance     -20.667      2.748   -7.52 6.18e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared:  0.4396, Adjusted R-squared:  0.4259 
## F-statistic: 32.16 on 2 and 82 DF,  p-value: 4.896e-11

Check for multicollinearity with VIF statistics. Explain the findings.

library(performance)

check_collinearity(fit2)
## # Check for Multicollinearity
## 
## Low Correlation
## 
##      Term  VIF  VIF 95% CI adj. VIF Tolerance Tolerance 95% CI
##       Age 1.00 [1.00, Inf]     1.00      1.00     [0.00, 1.00]
##  Distance 1.00 [1.00, Inf]     1.00      1.00     [0.00, 1.00]

Explanation: The VIF values for both Age and Distance are equal to 1, which means there is no multicollinearity between the explanatory variables. This confirms that Age and Distance are independent enough to be included in the regression model without inflating the variances of their coefficients.
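
The VIF values can also be reproduced by hand from the auxiliary regressions, using VIF_j = 1 / (1 - R_j²), where R_j² comes from regressing predictor j on the remaining predictors. A sketch for Age:

# Auxiliary regression of Age on Distance; 1/(1 - R^2) gives the VIF
r2_age  <- summary(lm(Age ~ Distance, data = apartments))$r.squared
vif_age <- 1 / (1 - r2_age)
vif_age   # close to 1, in line with check_collinearity() above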

Calculate standardized residuals and Cook's distances for model fit2. Remove any potentially problematic units (outliers or units with high influence).

# Standardized residuals
std_resid <- rstandard(fit2)

# Cook's distances
cooks_d <- cooks.distance(fit2)

# Combine into a data frame for easier inspection
diagnostics <- data.frame(std_resid, cooks_d)
head(diagnostics)
##    std_resid     cooks_d
## 1 -0.6653487 0.007386569
## 2  1.7832876 0.030365432
## 3 -0.5937629 0.005882612
## 4  0.7543794 0.008299153
## 5 -1.0733987 0.005112584
## 6 -0.7775190 0.004900891
# Identify potentially problematic observations
# Rule of thumb:
#   |standardized residual| > 2 → possible outlier
#   Cook's distance > 4/n → influential (n = sample size)

n <- nrow(apartments)
problematic <- which(abs(std_resid) > 2 | cooks_d > 4/n)

problematic
## 22 33 38 53 55 
## 22 33 38 53 55
# Remove problematic observations
apartments_clean <- apartments[-problematic, ]

# Re-estimate regression without them
fit2_clean <- lm(Price ~ Age + Distance, data = apartments_clean)

summary(fit2_clean)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -411.50 -203.69  -45.24  191.11  492.56 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2502.467     75.024  33.356  < 2e-16 ***
## Age           -8.674      3.221  -2.693  0.00869 ** 
## Distance     -24.063      2.692  -8.939 1.57e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared:  0.5361, Adjusted R-squared:  0.524 
## F-statistic: 44.49 on 2 and 77 DF,  p-value: 1.437e-13

Check for potential heteroskedasticity with a scatterplot of standardized residuals against standardized fitted values. Explain the findings.

# Standardized residuals
std_resid <- rstandard(fit2_clean)

# Standardized fitted values
std_fitted <- scale(fitted(fit2_clean))

# Scatterplot
plot(std_fitted, std_resid,
     xlab = "Standardized Fitted Values",
     ylab = "Standardized Residuals",
     main = "Residuals vs. Fitted (Check for Heteroskedasticity)",
     pch = 19, col = "lightblue")

abline(h = 0, lty = 2, col = "red")

Explanation: The scatterplot of standardized residuals against standardized fitted values shows a cone-shaped pattern, where the spread of residuals increases as fitted values rise. This indicates the presence of heteroskedasticity, meaning the variance of errors is not constant across all levels of predicted prices.
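
A formal counterpart to this visual check is the Breusch-Pagan test; a minimal sketch, assuming the lmtest package is installed (it is not used elsewhere in this document):

# Breusch-Pagan test; H0: constant error variance (homoskedasticity)
library(lmtest)
bptest(fit2_clean)
# A p-value below 0.05 would formally confirm the heteroskedasticity
# suggested by the cone-shaped residual plot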

Are the standardized residuals distributed normally? Show a graph and formally test it. Explain the findings.

# Standardized residuals from cleaned model
std_resid <- rstandard(fit2_clean)

# Histogram
hist(std_resid, breaks = 10, col = "purple",
     main = "Histogram of Standardized Residuals",
     xlab = "Standardized Residuals")

# Q-Q plot
qqnorm(std_resid, main = "Q-Q Plot of Standardized Residuals", col = "purple")
qqline(std_resid, col = "red", lwd = 2)

# Formal Shapiro-Wilk test
shapiro.test(std_resid)
## 
##  Shapiro-Wilk normality test
## 
## data:  std_resid
## W = 0.94156, p-value = 0.001168

Explanation: The histogram and Q-Q plot of standardized residuals show noticeable deviations from a normal distribution, especially in the tails. The Shapiro–Wilk test yields W = 0.94, p = 0.0012, which is below the 0.05 threshold. Therefore, we reject the null hypothesis of normality and conclude that the standardized residuals are not normally distributed.

Estimate fit2 again without the excluded units and show the summary of the model. Explain all coefficients.

# Estimate model again without problematic units
fit2_clean <- lm(Price ~ Age + Distance, data = apartments_clean)

# Show summary
summary(fit2_clean)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -411.50 -203.69  -45.24  191.11  492.56 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2502.467     75.024  33.356  < 2e-16 ***
## Age           -8.674      3.221  -2.693  0.00869 ** 
## Distance     -24.063      2.692  -8.939 1.57e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared:  0.5361, Adjusted R-squared:  0.524 
## F-statistic: 44.49 on 2 and 77 DF,  p-value: 1.437e-13

The regression model Price = 2502.47 – 8.67 × Age – 24.06 × Distance shows that, holding distance constant, each additional year of age lowers the price per m² by about 9 €, and, holding age constant, each kilometer farther from the city center reduces it by about 24 €. Both effects are statistically significant, with distance having the stronger impact. The model explains about 54% of the variation in prices (R² = 0.536), indicating a reasonably good fit.
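
To illustrate how the fitted equation is used, the model can predict the price for a hypothetical apartment; the inputs below (10 years old, 5 km from the center) are made up for illustration:

# Predicted price per m2: 2502.47 - 8.674*10 - 24.063*5, about 2295 EUR
predict(fit2_clean, newdata = data.frame(Age = 10, Distance = 5))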

Estimate the linear regression function Price = f(Age, Distance, Parking and Balcony). Be careful to correctly include categorical variables. Save the object named fit3.

# Estimate multiple regression including Age, Distance, Parking, and Balcony
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = apartments_clean)

# Show summary
summary(fit3)
## 
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = apartments_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -390.93 -198.19  -53.64  186.73  518.34 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2393.316     93.930  25.480  < 2e-16 ***
## Age           -7.970      3.191  -2.498   0.0147 *  
## Distance     -21.961      2.830  -7.762 3.39e-11 ***
## ParkingYes   128.700     60.801   2.117   0.0376 *  
## BalconyYes     6.032     57.307   0.105   0.9165    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared:  0.5623, Adjusted R-squared:  0.5389 
## F-statistic: 24.08 on 4 and 75 DF,  p-value: 7.764e-13

With function anova check if model fit3 fits data better than model fit2.

# Compare fit2 (Age + Distance) with fit3 (Age + Distance + Parking + Balcony)
anova(fit2_clean, fit3)
## Analysis of Variance Table
## 
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
##   Res.Df     RSS Df Sum of Sq      F Pr(>F)
## 1     77 5077362                           
## 2     75 4791128  2    286234 2.2403 0.1135

The ANOVA test compares the simpler model (fit2) with the extended model (fit3) that includes Parking and Balcony. The p-value is 0.1135, which is greater than 0.05, so we fail to reject the null hypothesis that the coefficients of Parking and Balcony are jointly zero. This means that, although Parking is significant on its own, Parking and Balcony together do not significantly increase the model's explanatory power compared to using only Age and Distance.
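
The F statistic in the ANOVA table can be reconstructed from the two residual sums of squares, F = ((RSS₁ - RSS₂) / q) / (RSS₂ / df₂), with q = 2 added coefficients and df₂ = 75:

# Partial F test by hand, using the RSS values from the anova() table
rss1 <- 5077362   # fit2_clean: Age + Distance
rss2 <- 4791128   # fit3: Age + Distance + Parking + Balcony
((rss1 - rss2) / 2) / (rss2 / 75)   # ~ 2.24, matching F = 2.2403 above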

Show the results of fit3 and explain regression coefficient for both categorical variables. Can you write down the hypothesis which is being tested with F-statistics, shown at the bottom of the output?

# Show regression results for fit3
summary(fit3)
## 
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = apartments_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -390.93 -198.19  -53.64  186.73  518.34 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2393.316     93.930  25.480  < 2e-16 ***
## Age           -7.970      3.191  -2.498   0.0147 *  
## Distance     -21.961      2.830  -7.762 3.39e-11 ***
## ParkingYes   128.700     60.801   2.117   0.0376 *  
## BalconyYes     6.032     57.307   0.105   0.9165    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared:  0.5623, Adjusted R-squared:  0.5389 
## F-statistic: 24.08 on 4 and 75 DF,  p-value: 7.764e-13

Explanation: In fit3, apartments with parking cost on average about 129 € per m² more than otherwise comparable apartments without parking, a significant effect (p = 0.038), while a balcony adds only about 6 € per m², which is not significant (p = 0.917). The F-statistic at the bottom of the output tests H₀: β_Age = β_Distance = β_Parking = β_Balcony = 0, i.e. that none of the explanatory variables has any effect on apartment prices, against the alternative that at least one coefficient differs from zero. Since the F-test p-value is far below 0.001, we reject the null and conclude that the model as a whole is statistically significant.
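
The overall F statistic can likewise be recovered from the coefficient of determination, F = (R² / k) / ((1 - R²) / (n - k - 1)), with k = 4 slope coefficients and n = 80 remaining observations:

# Overall F test by hand, from R-squared of fit3
r2 <- summary(fit3)$r.squared     # 0.5623
k  <- 4                           # number of slope coefficients
n  <- nrow(apartments_clean)      # 80 units after removing 5
(r2 / k) / ((1 - r2) / (n - k - 1))   # ~ 24.08, as in summary(fit3)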

Save the fitted values and calculate the residual for apartment ID 2.

# Save fitted values and residuals
fitted_values <- fitted(fit3)
resids        <- resid(fit3)   # 'resids' avoids masking base R's residuals()

# Apartment ID2
id <- 2
cat("Apartment ID", id, "\n")
## Apartment ID 2
cat("  Fitted value (predicted price per m²):", round(fitted_values[id], 2), "€\n")
##   Fitted value (predicted price per m²): 2356.6 €
cat("  Residual (observed - fitted):", round(residuals[id], 2), "€\n")
##   Residual (observed - fitted): 443.4 €
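
As a consistency check, the observed price decomposes exactly into fitted value plus residual, so for apartment ID 2 the observed price per m² should equal 2356.6 + 443.4 = 2800 €:

# Observed = fitted + residual, by construction of OLS residuals
fitted_values[id] + resids[id]   # reconstructed observed price (~2800)
apartments_clean$Price[id]       # should match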