#install.packages("dplyr")
#install.packages("psych")
#install.packages("ggplot2")
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Task 1: Find your own data set that contains at least 20 units and at least 4 variables (most of which are numeric, but it is good to have at least one categorical variable as well).

task1 <- read.csv("./ST vs MW.csv")
task <- task1[,-c(1,4,5,7,8,10,12,13,14,16)]
task <- task %>%
  rename(
    ScreenTime = screen_time_hours,
    SleepHours = sleep_hours,
    StressLevel = stress_level_0_10,
    MentalWellness = mental_wellness_index_0_100
  )

head(task, 5)
##   age gender ScreenTime SleepHours StressLevel MentalWellness
## 1  33 Female      10.79       6.63         9.3            9.3
## 2  28 Female       7.40       8.05         5.7           56.2
## 3  35 Female       9.78       6.48         9.1            3.6
## 4  42   Male      11.13       6.89        10.0            0.0
## 5  28   Male      13.22       5.79        10.0            0.0

#Renaming gender:

task <- task %>% 
  mutate(gender = case_when(
    gender == "Male" ~ "Moški",
    gender == "Female" ~ "Ženska",
    gender == "Non-binary/Other" ~ "Ne-binaren/Drugo",
    TRUE ~ gender   # keep other values unchanged (just in case)
  ))

head(task, 7)
##   age           gender ScreenTime SleepHours StressLevel MentalWellness
## 1  33           Ženska      10.79       6.63         9.3            9.3
## 2  28           Ženska       7.40       8.05         5.7           56.2
## 3  35           Ženska       9.78       6.48         9.1            3.6
## 4  42            Moški      11.13       6.89        10.0            0.0
## 5  28            Moški      13.22       5.79        10.0            0.0
## 6  28 Ne-binaren/Drugo       9.83       7.19        10.0            5.0
## 7  42 Ne-binaren/Drugo       6.02       7.44         6.1           43.7

#Creating a new variable:

task <- task %>%
  mutate(SleepScreenRatio = SleepHours / ScreenTime)

#Creating a new data frame based on conditions:

high_wellness <- task %>%
  filter(MentalWellness > 70)

head(high_wellness)
##   age gender ScreenTime SleepHours StressLevel MentalWellness SleepScreenRatio
## 1  26  Moški       6.28       8.48         1.6           86.8         1.350318
## 2  26 Ženska       4.96       8.01         4.6           74.3         1.614919
## 3  23 Ženska       7.04       8.04         4.6           72.6         1.142045
## 4  26 Ženska       7.83       8.30         3.2           80.9         1.060026
## 5  39 Ženska       1.00       7.26         0.0           87.2         7.260000
## 6  20 Ženska       6.31       9.00         2.7           97.0         1.426307

describe(task)
##                  vars   n  mean    sd median trimmed   mad   min   max range  skew kurtosis   se
## age                 1 400 29.78  7.47  30.00   29.55  7.41 16.00 60.00 44.00  0.35     0.12 0.37
## gender*             2 400  2.13  0.98   3.00    2.16  0.00  1.00  3.00  2.00 -0.26    -1.92 0.05
## ScreenTime          3 400  9.02  2.49   9.09    9.01  2.39  1.00 19.17 18.17  0.08     0.66 0.12
## SleepHours          4 400  7.01  0.85   7.03    7.01  0.92  4.64  9.74  5.10  0.04    -0.27 0.04
## StressLevel         5 400  8.15  2.09   8.80    8.49  1.78  0.00 10.00 10.00 -1.22     1.11 0.10
## MentalWellness      6 400 20.33 20.38  14.80   17.22 19.57  0.00 97.00 97.00  1.22     1.14 1.02
## SleepScreenRatio    7 400  0.88  0.55   0.76    0.81  0.25  0.35  7.26  6.91  6.96    70.52 0.03
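The same descriptives could also be broken down by gender with psych::describeBy; a minimal sketch (output omitted):

describeBy(task, group = task$gender)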

#Histogram of Screen Time

library(ggplot2)

ggplot(task, aes(x = ScreenTime)) +
  geom_histogram(binwidth = 1, fill = "pink", color = "black") +
  labs(title = "Distribution of Screen Time",
       x = "Screen Time (hours per day)",
       y = "Count")

#Boxplot of Stress Level by Gender

ggplot(task, aes(x = gender, y = StressLevel, fill = gender)) +
  geom_boxplot() +
  labs(title = "Stress Levels by Gender", x = "Gender", y = "Stress Level (0–10)"
       )

#Scatterplot of Sleep Hours vs. Mental Wellness

ggplot(task, aes(x = SleepHours, y = MentalWellness)) +
  geom_point(alpha = 0.5, color = "darkgreen") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Sleep Hours vs Mental Wellness", 
       x = "Sleep Hours per Night", 
       y = "Mental Wellness Index (0–100)")
## `geom_smooth()` using formula = 'y ~ x'
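To back the visual impression with a number, one could run a correlation test; a sketch (output omitted):

cor.test(task$SleepHours, task$MentalWellness)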


Task 2: You have a data set for 100 MBA students of the current generation (Business School.xlsx in the Task 2 folder). In the previous year, the average grade in this program was 74 (MBA Grade).

#install.packages("readxl")
#install.packages("effectsize")
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
## 
##     phi
library(readxl)
task2 <- read_xlsx("./Business School.xlsx")

head(task2)
## # A tibble: 6 × 9
##   `Student ID` `Undergrad Degree` `Undergrad Grade` `MBA Grade` `Work Experience` `Employability (Before)`
##          <dbl> <chr>                          <dbl>       <dbl> <chr>                                <dbl>
## 1            1 Business                        68.4        90.2 No                                     252
## 2            2 Computer Science                70.2        68.7 Yes                                    101
## 3            3 Finance                         76.4        83.3 No                                     401
## 4            4 Business                        82.6        88.7 No                                     287
## 5            5 Finance                         76.9        75.4 No                                     275
## 6            6 Computer Science                83.3        82.1 No                                     254
## # ℹ 3 more variables: `Employability (After)` <dbl>, Status <chr>, `Annual Salary` <dbl>

#Graph distribution of Undergrad Degrees

ggplot(task2, aes(x = `Undergrad Degree`)) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(title = "Distribution of Undergrad Degrees",
       x = "Undergraduate Degree",
       y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

- Interpretation: The tallest bar corresponds to the most common undergraduate degree, which is Business; the shortest bar corresponds to the least common degree, which is Art.
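As a quick check of this reading, the counts behind the bars can be tabulated with dplyr (loaded above); a sketch, output omitted:

task2 %>% count(`Undergrad Degree`, sort = TRUE)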

#Descriptive statistics & histogram for Annual Salary

summary(task2$`Annual Salary`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20000   87125  103500  109058  124000  340000
ggplot(task2, aes(x = `Annual Salary`)) +
  geom_histogram(binwidth = 10000, fill = "orange", color = "black") +
  labs(title = "Distribution of Annual Salaries",
       x = "Annual Salary (USD)",
       y = "Count")

#Hypothesis test: We test the null hypothesis \(H_0: \mu_{MBA\ Grade} = 74\).

t.test(task2$`MBA Grade`, mu = 74)
## 
##  One Sample t-test
## 
## data:  task2$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
##  74.51764 77.56346
## sample estimates:
## mean of x 
##  76.04055
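  • Since p = 0.009 < 0.05 and the 95% confidence interval [74.52, 77.56] excludes 74, we reject H0: the current generation's mean MBA grade (76.04) differs significantly from, and in fact exceeds, last year's average of 74.

The effectsize package loaded above could additionally quantify the size of this difference; a sketch (output omitted):

cohens_d(task2$`MBA Grade`, mu = 74)   # standardized difference from mu = 74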

Task 3: You analyze the price per m² for a sample of apartments in the Ljubljana region (Apartments.xlsx in the Task 3 folder).

Import the dataset Apartments.xlsx

library(readxl)
task3 <- read_excel("Apartments.xlsx")
head(task3)
## # A tibble: 6 × 5
##     Age Distance Price Parking Balcony
##   <dbl>    <dbl> <dbl>   <dbl>   <dbl>
## 1     7       28  1640       0       1
## 2    18        1  2800       1       0
## 3     7       28  1660       0       0
## 4    28       29  1850       0       1
## 5    18       18  1640       1       1
## 6    28       12  1770       0       1

Description:

  • Age: Age of an apartment in years
  • Distance: The distance from city center in km
  • Price: Price per m²
  • Parking: 0-No, 1-Yes
  • Balcony: 0-No, 1-Yes

Change categorical variables into factors.

task3$Parking <- factor(task3$Parking,
                        levels = c(0, 1),
                        labels = c("No", "Yes"))

task3$Balcony <- factor(task3$Balcony,
                        levels = c(0, 1),
                        labels = c("No", "Yes"))

str(task3)
## tibble [85 × 5] (S3: tbl_df/tbl/data.frame)
##  $ Age     : num [1:85] 7 18 7 28 18 28 14 18 22 25 ...
##  $ Distance: num [1:85] 28 1 28 29 18 12 20 6 7 2 ...
##  $ Price   : num [1:85] 1640 2800 1660 1850 1640 1770 1850 1970 2270 2570 ...
##  $ Parking : Factor w/ 2 levels "No","Yes": 1 2 1 1 2 1 1 2 2 2 ...
##  $ Balcony : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 1 ...

Test the hypothesis \(H_0: \mu_{Price} = 1900\) EUR. What can you conclude?

t.test(task3$Price, mu = 1900)
## 
##  One Sample t-test
## 
## data:  task3$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
##  1937.443 2100.440
## sample estimates:
## mean of x 
##  2018.941
  • A one-sample t-test was performed to test whether the average price per square meter of apartments in the Ljubljana region equals 1900 EUR. The results showed that the mean price (2018.94 EUR) was significantly higher than 1900 EUR (t = 2.90, p = 0.005). The 95% confidence interval [1937.4, 2100.4] does not include 1900. Therefore, we reject the null hypothesis and conclude that the average price per m² in this sample is significantly above 1900 EUR.

Estimate the simple regression function: Price = f(Age). Save the results in an object named fit1 and explain the estimate of the regression coefficient, the coefficient of correlation, and the coefficient of determination.

fit1 <- lm(Price ~ Age, data = task3)

summary(fit1)
## 
## Call:
## lm(formula = Price ~ Age, data = task3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -623.9 -278.0  -69.8  243.5  776.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2185.455     87.043  25.108   <2e-16 ***
## Age           -8.975      4.164  -2.156    0.034 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared:  0.05302,    Adjusted R-squared:  0.04161 
## F-statistic: 4.647 on 1 and 83 DF,  p-value: 0.03401
  • The regression shows that apartment Age has a statistically significant negative effect on price (p = 0.034). The slope (−8.98) means that, on average, each additional year of age reduces the price per m² by about 9 EUR. The intercept (2185 EUR) is the expected price of a new apartment (Age = 0). The coefficient of correlation is weakly negative (r ≈ −0.23), and the coefficient of determination (R² = 0.053) shows that Age explains only about 5% of the variation in prices; most of the variation is due to other factors.
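summary() does not print the correlation coefficient itself; it could be computed directly, or recovered from R². A sketch (output omitted):

cor(task3$Price, task3$Age)      # Pearson r, approx. -0.23
-sqrt(summary(fit1)$r.squared)   # same value: square root of R^2, sign taken from the negative slope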

Show the scatterplot matrix between Price, Age, and Distance. Based on the matrix, determine if there is a potential problem with multicollinearity.

#install.packages("GGally")  
library(GGally)

ggpairs(task3[, c("Price", "Age", "Distance")])

- The scatterplot matrix shows a weak negative relationship between Price and Age and a stronger negative relationship between Price and Distance. Importantly, Age and Distance are practically uncorrelated (r = 0.043), so there is no sign of a multicollinearity problem between the explanatory variables.
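The pairwise correlations shown in the matrix can also be extracted numerically; a sketch (output omitted):

cor(task3[, c("Price", "Age", "Distance")])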

Estimate the multiple regression function: Price = f(Age, Distance). Save it in object named fit2.

fit2 <- lm(Price ~ Age + Distance, data = task3)
summary(fit2)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = task3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -603.23 -219.94  -85.68  211.31  689.58 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2460.101     76.632   32.10  < 2e-16 ***
## Age           -7.934      3.225   -2.46    0.016 *  
## Distance     -20.667      2.748   -7.52 6.18e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared:  0.4396, Adjusted R-squared:  0.4259 
## F-statistic: 32.16 on 2 and 82 DF,  p-value: 4.896e-11
  • Both Age and Distance significantly reduce apartment prices, with distance having the stronger effect. Together, they explain around 44% of the variation in price per m².

Check the multicollinearity with VIF statistics. Explain the findings.

#install.packages("car")
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:psych':
## 
##     logit
vif(fit2)
##      Age Distance 
## 1.001845 1.001845
  • The VIF values for Age and Distance are both around 1.0, confirming that there is no multicollinearity in the model. This means the predictors are independent and do not distort the regression estimates.
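For reference, VIF_j = 1 / (1 − R_j²), where R_j² comes from regressing predictor j on the remaining predictors (with only two predictors, both VIFs coincide). A sketch reproducing the value by hand (output omitted):

r2_age <- summary(lm(Age ~ Distance, data = task3))$r.squared
1 / (1 - r2_age)   # should match vif(fit2), approx. 1.0018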

Calculate standardized residuals and Cook's distances for model fit2. Remove any potentially problematic units (outliers or units with high influence).

task3$std_resid <- rstandard(fit2)

task3$cooks_d <- cooks.distance(fit2)

head(task3[, c("Price", "Age", "Distance", "std_resid", "cooks_d")])
## # A tibble: 6 × 5
##   Price   Age Distance std_resid cooks_d
##   <dbl> <dbl>    <dbl>     <dbl>   <dbl>
## 1  1640     7       28    -0.665 0.00739
## 2  2800    18        1     1.78  0.0304 
## 3  1660     7       28    -0.594 0.00588
## 4  1850    28       29     0.754 0.00830
## 5  1640    18       18    -1.07  0.00511
## 6  1770    28       12    -0.778 0.00490
outliers <- which(abs(task3$std_resid) > 2)

n <- nrow(task3)
influential <- which(task3$cooks_d > 4/n)

outliers
## 33 38 53 
## 33 38 53
influential
## 22 33 38 53 55 
## 22 33 38 53 55
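Before dropping any units, the influence pattern could also be inspected visually; a base-R sketch (plot not shown):

plot(fit2, which = 4)                  # index plot of Cook's distances
abline(h = 4 / nrow(task3), lty = 2)   # the 4/n rule-of-thumb cutoff applied above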
problematic <- union(outliers, influential)

task3_clean <- task3[-problematic, ]

fit2_clean <- lm(Price ~ Age + Distance, data = task3_clean)
summary(fit2_clean)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = task3_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -411.50 -203.69  -45.24  191.11  492.56 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2502.467     75.024  33.356  < 2e-16 ***
## Age           -8.674      3.221  -2.693  0.00869 ** 
## Distance     -24.063      2.692  -8.939 1.57e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared:  0.5361, Adjusted R-squared:  0.524 
## F-statistic: 44.49 on 2 and 77 DF,  p-value: 1.437e-13

Check for potential heteroskedasticity with a scatterplot of standardized residuals against standardized fitted values. Explain the findings.

library(ggplot2)

std_resid <- rstandard(fit2)
std_fitted <- scale(fit2$fitted.values)

ggplot(data.frame(std_fitted, std_resid), aes(x = std_fitted, y = std_resid)) +
  geom_point(color = "blue") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs Fitted Values",
       x = "Standardized Fitted Values",
       y = "Standardized Residuals")

- The scatterplot of standardized residuals against standardized fitted values shows that residuals are centered around zero, which is expected. However, the variance of residuals appears to increase for higher fitted values, forming a slight “fan shape.” This suggests some degree of heteroskedasticity in the model.
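A formal counterpart to this visual check is the Breusch-Pagan test; a sketch assuming the lmtest package is installed (output omitted):

#install.packages("lmtest")
library(lmtest)
bptest(fit2)   # H0: constant error variance; a small p-value suggests heteroskedasticity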

Are the standardized residuals distributed normally? Show the graph and formally test it. Explain the findings.

hist(std_resid, breaks = 10, col = "skyblue",
     main = "Histogram of Standardized Residuals",
     xlab = "Standardized Residuals")

shapiro.test(std_resid)
## 
##  Shapiro-Wilk normality test
## 
## data:  std_resid
## W = 0.95306, p-value = 0.00366
  • The graphical checks (histogram) suggest that the standardized residuals deviate somewhat from the normal distribution, particularly in the tails. The Shapiro–Wilk test confirms this (W = 0.953, p = 0.004 < 0.05), indicating that the residuals are not normally distributed.
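A Q-Q plot makes the tail deviations easier to judge than the histogram; a base-R sketch (plot not shown):

qqnorm(std_resid)
qqline(std_resid, col = "red")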

Estimate fit2 again without the excluded units and show the summary of the model. Explain all coefficients.

fit2_clean <- lm(Price ~ Age + Distance, data = task3_clean)

summary(fit2_clean)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = task3_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -411.50 -203.69  -45.24  191.11  492.56 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2502.467     75.024  33.356  < 2e-16 ***
## Age           -8.674      3.221  -2.693  0.00869 ** 
## Distance     -24.063      2.692  -8.939 1.57e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared:  0.5361, Adjusted R-squared:  0.524 
## F-statistic: 44.49 on 2 and 77 DF,  p-value: 1.437e-13
  • Intercept (2502.47): A brand-new apartment (Age = 0) located at the city center (Distance = 0) is estimated to cost ≈2502 EUR/m².

  • Age (–8.67, p = 0.009): Each additional year lowers the price per m² by about 8.7 EUR, holding Distance constant. This effect is statistically significant.

  • Distance (–24.06, p < 0.001): Each extra kilometer away from the city center decreases the price per m² by about 24 EUR, holding Age constant. This effect is highly significant and stronger than Age.

  • R² = 0.536: About 54% of the variation in apartment prices is explained by Age and Distance.

  • Residual standard error = 256.8: The typical prediction error is around 257 EUR/m².

  • F-test (p < 0.001): The model is statistically significant overall.

  • After removing the problematic observations, the regression results improved. Both Age and Distance remain significant predictors of price, with Distance having the stronger effect. The model now explains about 54% of the variation in apartment prices, compared to 44% before cleaning.

Estimate the linear regression function Price = f(Age, Distance, Parking, Balcony). Be careful to correctly include the categorical variables. Save the result in an object named fit3.

fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = task3_clean)

summary(fit3)
## 
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = task3_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -390.93 -198.19  -53.64  186.73  518.34 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2393.316     93.930  25.480  < 2e-16 ***
## Age           -7.970      3.191  -2.498   0.0147 *  
## Distance     -21.961      2.830  -7.762 3.39e-11 ***
## ParkingYes   128.700     60.801   2.117   0.0376 *  
## BalconyYes     6.032     57.307   0.105   0.9165    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared:  0.5623, Adjusted R-squared:  0.5389 
## F-statistic: 24.08 on 4 and 75 DF,  p-value: 7.764e-13

With the function anova, check if model fit3 fits the data better than model fit2 (here fit2_clean, so that both models are estimated on the same cleaned data).

anova(fit2_clean, fit3)
## Analysis of Variance Table
## 
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
##   Res.Df     RSS Df Sum of Sq      F Pr(>F)
## 1     77 5077362                           
## 2     75 4791128  2    286234 2.2403 0.1135
  • The ANOVA comparison shows that including Parking and Balcony does not significantly improve the model fit (F = 2.24, p = 0.1135). Therefore, the simpler model with only Age and Distance (fit2_clean) is preferred.
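An information criterion offers another view on the same model choice; a sketch (output omitted):

AIC(fit2_clean, fit3)   # the model with the lower AIC is preferred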

Show the results of fit3 and explain the regression coefficients for both categorical variables. Can you write down the hypothesis being tested by the F-statistic shown at the bottom of the output?

summary(fit3)
## 
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = task3_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -390.93 -198.19  -53.64  186.73  518.34 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2393.316     93.930  25.480  < 2e-16 ***
## Age           -7.970      3.191  -2.498   0.0147 *  
## Distance     -21.961      2.830  -7.762 3.39e-11 ***
## ParkingYes   128.700     60.801   2.117   0.0376 *  
## BalconyYes     6.032     57.307   0.105   0.9165    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared:  0.5623, Adjusted R-squared:  0.5389 
## F-statistic: 24.08 on 4 and 75 DF,  p-value: 7.764e-13
  • In model fit3, Age and Distance both negatively affect prices. ParkingYes (+128.70, p = 0.038) means that apartments with parking cost about 129 EUR/m² more than otherwise identical apartments without parking. BalconyYes (+6.03, p = 0.917) is small and not statistically significant, so a balcony has no detectable effect on price. The F-statistic tests \(H_0: \beta_{Age} = \beta_{Distance} = \beta_{Parking} = \beta_{Balcony} = 0\), i.e. that none of the predictors has any effect, against the alternative that at least one coefficient differs from zero. Since F = 24.08 with p < 0.001, we reject H0: the model as a whole explains a substantial share (56%) of the variation in apartment prices.

Save the fitted values and calculate the residual for apartment ID 2.

task3_clean$fitted <- fitted(fit3)
task3_clean$residuals <- resid(fit3)

task3_clean[2, c("Price", "fitted", "residuals")]
## # A tibble: 1 × 3
##   Price fitted residuals
##   <dbl>  <dbl>     <dbl>
## 1  2800  2357.      443.
task3_clean$ID <- 1:nrow(task3_clean)

task3_clean[task3_clean$ID == 2, c("Price", "fitted", "residuals")]
## # A tibble: 1 × 3
##   Price fitted residuals
##   <dbl>  <dbl>     <dbl>
## 1  2800  2357.      443.
  • Apartment ID 2 is priced about €443/m² higher than what the model (fit3) predicts → a positive residual.
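The residual is simply the observed price minus the fitted value, which can be verified in one line; a sketch (output omitted):

task3_clean$Price[2] - fitted(fit3)[2]   # 2800 - 2357 ≈ 443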

THE END