#install.packages("dplyr")
#install.packages("psych")
#install.packages("ggplot2")
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Task 1: Find your own data set that contains at least 20 units and at least 4 variables (most of which are numeric, but it is good to have at least one categorical variable as well).

task1 <- read.csv("./ST vs MW.csv")
task <- task1[,-c(1,4,5,7,8,10,12,13,14,16)]
task <- task %>%
  rename(
    ScreenTime = screen_time_hours,
    SleepHours = sleep_hours,
    StressLevel = stress_level_0_10,
    MentalWellness = mental_wellness_index_0_100
  )

head(task, 5)
##   age gender ScreenTime SleepHours StressLevel MentalWellness
## 1  33 Female      10.79       6.63         9.3            9.3
## 2  28 Female       7.40       8.05         5.7           56.2
## 3  35 Female       9.78       6.48         9.1            3.6
## 4  42   Male      11.13       6.89        10.0            0.0
## 5  28   Male      13.22       5.79        10.0            0.0

#Renaming gender:

task <- task %>% 
  mutate(gender = case_when(
    gender == "Male" ~ "Moški",
    gender == "Female" ~ "Ženska",
    gender == "Non-binary/Other" ~ "Ne-binaren/Drugo",
    TRUE ~ gender   # keep other values unchanged (just in case)
  ))

head(task, 7)
##   age           gender ScreenTime SleepHours StressLevel MentalWellness
## 1  33           Ženska      10.79       6.63         9.3            9.3
## 2  28           Ženska       7.40       8.05         5.7           56.2
## 3  35           Ženska       9.78       6.48         9.1            3.6
## 4  42            Moški      11.13       6.89        10.0            0.0
## 5  28            Moški      13.22       5.79        10.0            0.0
## 6  28 Ne-binaren/Drugo       9.83       7.19        10.0            5.0
## 7  42 Ne-binaren/Drugo       6.02       7.44         6.1           43.7

#Creating a new variable:

task <- task %>%
  mutate(SleepScreenRatio = SleepHours / ScreenTime)

#Creating a new data frame based on conditions:

high_wellness <- task %>%
  filter(MentalWellness > 70)

head(high_wellness)
##   age gender ScreenTime SleepHours StressLevel MentalWellness SleepScreenRatio
## 1  26  Moški       6.28       8.48         1.6           86.8         1.350318
## 2  26 Ženska       4.96       8.01         4.6           74.3         1.614919
## 3  23 Ženska       7.04       8.04         4.6           72.6         1.142045
## 4  26 Ženska       7.83       8.30         3.2           80.9         1.060026
## 5  39 Ženska       1.00       7.26         0.0           87.2         7.260000
## 6  20 Ženska       6.31       9.00         2.7           97.0         1.426307

describe(task)
##                  vars   n  mean    sd median trimmed   mad   min   max range  skew kurtosis   se
## age                 1 400 29.78  7.47  30.00   29.55  7.41 16.00 60.00 44.00  0.35     0.12 0.37
## gender*             2 400  2.13  0.98   3.00    2.16  0.00  1.00  3.00  2.00 -0.26    -1.92 0.05
## ScreenTime          3 400  9.02  2.49   9.09    9.01  2.39  1.00 19.17 18.17  0.08     0.66 0.12
## SleepHours          4 400  7.01  0.85   7.03    7.01  0.92  4.64  9.74  5.10  0.04    -0.27 0.04
## StressLevel         5 400  8.15  2.09   8.80    8.49  1.78  0.00 10.00 10.00 -1.22     1.11 0.10
## MentalWellness      6 400 20.33 20.38  14.80   17.22 19.57  0.00 97.00 97.00  1.22     1.14 1.02
## SleepScreenRatio    7 400  0.88  0.55   0.76    0.81  0.25  0.35  7.26  6.91  6.96    70.52 0.03
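The same descriptives could also be broken down by gender with psych::describeBy; a minimal sketch (output omitted):

describeBy(task, group = task$gender)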

#Histogram of Screen Time

library(ggplot2)

ggplot(task, aes(x = ScreenTime)) +
  geom_histogram(binwidth = 1, fill = "pink", color = "black") +
  labs(title = "Distribution of Screen Time",
       x = "Screen Time (hours per day)",
       y = "Count")

#Boxplot of Stress Level by Gender

ggplot(task, aes(x = gender, y = StressLevel, fill = gender)) +
  geom_boxplot() +
  labs(title = "Stress Levels by Gender", x = "Gender", y = "Stress Level (0–10)"
       )

#Scatterplot of Sleep Hours vs. Mental Wellness

ggplot(task, aes(x = SleepHours, y = MentalWellness)) +
  geom_point(alpha = 0.5, color = "darkgreen") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Sleep Hours vs Mental Wellness", 
       x = "Sleep Hours per Night", 
       y = "Mental Wellness Index (0–100)")
## `geom_smooth()` using formula = 'y ~ x'
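To back the visual impression with a number, one could run a correlation test; a sketch (output omitted):

cor.test(task$SleepHours, task$MentalWellness)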


Task 2: You have a data set for 100 MBA students of the current generation (Business School.xlsx in the Task 2 folder). In the previous year, the average grade in this program was 74 (MBA Grade).

#install.packages("readxl")
#install.packages("effectsize")
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
## 
##     phi
library(readxl)
task2 <- read_xlsx("./Business School.xlsx")

head(task2)
## # A tibble: 6 × 9
##   `Student ID` `Undergrad Degree` `Undergrad Grade` `MBA Grade` `Work Experience` `Employability (Before)`
##          <dbl> <chr>                          <dbl>       <dbl> <chr>                                <dbl>
## 1            1 Business                        68.4        90.2 No                                     252
## 2            2 Computer Science                70.2        68.7 Yes                                    101
## 3            3 Finance                         76.4        83.3 No                                     401
## 4            4 Business                        82.6        88.7 No                                     287
## 5            5 Finance                         76.9        75.4 No                                     275
## 6            6 Computer Science                83.3        82.1 No                                     254
## # ℹ 3 more variables: `Employability (After)` <dbl>, Status <chr>, `Annual Salary` <dbl>

#Graph distribution of Undergrad Degrees

ggplot(task2, aes(x = `Undergrad Degree`)) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(title = "Distribution of Undergrad Degrees",
       x = "Undergraduate Degree",
       y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

- Interpretation: The tallest bar corresponds to the most common undergraduate degree, which is Business; the shortest bar corresponds to the least common degree, which is Art.
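As a quick check of this reading, the counts behind the bars can be tabulated with dplyr (loaded above); a sketch, output omitted:

task2 %>% count(`Undergrad Degree`, sort = TRUE)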

#Descriptive statistics & histogram for Annual Salary

summary(task2$`Annual Salary`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20000   87125  103500  109058  124000  340000
ggplot(task2, aes(x = `Annual Salary`)) +
  geom_histogram(binwidth = 10000, fill = "orange", color = "black") +
  labs(title = "Distribution of Annual Salaries",
       x = "Annual Salary (USD)",
       y = "Count")

#Hypothesis test: We test the null hypothesis \(H_0: \mu_{MBA\ Grade} = 74\).

t.test(task2$`MBA Grade`, mu = 74)
## 
##  One Sample t-test
## 
## data:  task2$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
##  74.51764 77.56346
## sample estimates:
## mean of x 
##  76.04055
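  • Since p = 0.009 < 0.05 and the 95% confidence interval [74.52, 77.56] excludes 74, we reject H0: the current generation's mean MBA grade (76.04) differs significantly from, and in fact exceeds, last year's average of 74.

The effectsize package loaded above could additionally quantify the size of this difference; a sketch (output omitted):

cohens_d(task2$`MBA Grade`, mu = 74)   # standardized difference from mu = 74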

Task 3: You analyze the price per m² for a sample of apartments in the Ljubljana region (Apartments.xlsx in the Task 3 folder).

Import the dataset Apartments.xlsx

library(readxl)
task3 <- read_excel("Apartments.xlsx")
head(task3)
## # A tibble: 6 × 5
##     Age Distance Price Parking Balcony
##   <dbl>    <dbl> <dbl>   <dbl>   <dbl>
## 1     7       28  1640       0       1
## 2    18        1  2800       1       0
## 3     7       28  1660       0       0
## 4    28       29  1850       0       1
## 5    18       18  1640       1       1
## 6    28       12  1770       0       1

Description:

  • Age: Age of an apartment in years
  • Distance: The distance from city center in km
  • Price: Price per m²
  • Parking: 0-No, 1-Yes
  • Balcony: 0-No, 1-Yes

Change categorical variables into factors.

task3$Parking <- factor(task3$Parking,
                        levels = c(0, 1),
                        labels = c("No", "Yes"))

task3$Balcony <- factor(task3$Balcony,
                        levels = c(0, 1),
                        labels = c("No", "Yes"))

str(task3)
## tibble [85 × 5] (S3: tbl_df/tbl/data.frame)
##  $ Age     : num [1:85] 7 18 7 28 18 28 14 18 22 25 ...
##  $ Distance: num [1:85] 28 1 28 29 18 12 20 6 7 2 ...
##  $ Price   : num [1:85] 1640 2800 1660 1850 1640 1770 1850 1970 2270 2570 ...
##  $ Parking : Factor w/ 2 levels "No","Yes": 1 2 1 1 2 1 1 2 2 2 ...
##  $ Balcony : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 1 ...

Test the hypothesis \(H_0: \mu_{Price} = 1900\) EUR. What can you conclude?

t.test(task3$Price, mu = 1900)
## 
##  One Sample t-test
## 
## data:  task3$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
##  1937.443 2100.440
## sample estimates:
## mean of x 
##  2018.941
  • A one-sample t-test was performed to test whether the average price per square meter of apartments in the Ljubljana region equals 1900 EUR. The results showed that the mean price (2018.94 EUR) was significantly higher than 1900 EUR (t = 2.90, p = 0.005). The 95% confidence interval [1937.4, 2100.4] does not include 1900. Therefore, we reject the null hypothesis and conclude that the average price per m² in this sample is significantly above 1900 EUR.

Estimate the simple regression function: Price = f(Age). Save the results in an object named fit1 and explain the estimate of the regression coefficient, the coefficient of correlation, and the coefficient of determination.

fit1 <- lm(Price ~ Age, data = task3)

summary(fit1)
## 
## Call:
## lm(formula = Price ~ Age, data = task3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -623.9 -278.0  -69.8  243.5  776.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2185.455     87.043  25.108   <2e-16 ***
## Age           -8.975      4.164  -2.156    0.034 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared:  0.05302,    Adjusted R-squared:  0.04161 
## F-statistic: 4.647 on 1 and 83 DF,  p-value: 0.03401
  • The regression shows that apartment Age has a statistically significant negative effect on price (p = 0.034). The slope (−8.98) means that, on average, each additional year of age reduces the price per m² by about 9 EUR. The intercept (2185 EUR) is the expected price of a new apartment (Age = 0). The coefficient of correlation is weakly negative (r ≈ −0.23), and the coefficient of determination (R² = 0.053) shows that Age explains only about 5% of the variation in prices; most of the variation is due to other factors.
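summary() does not print the correlation coefficient itself; it could be computed directly, or recovered from R². A sketch (output omitted):

cor(task3$Price, task3$Age)      # Pearson r, approx. -0.23
-sqrt(summary(fit1)$r.squared)   # same value: square root of R^2, sign taken from the negative slope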

Show the scatterplot matrix between Price, Age, and Distance. Based on the matrix, determine if there is a potential problem with multicollinearity.

#install.packages("GGally")  
library(GGally)

ggpairs(task3[, c("Price", "Age", "Distance")])

- The scatterplot matrix shows a weak negative relationship between Price and Age and a stronger negative relationship between Price and Distance. Importantly, Age and Distance are practically uncorrelated (r = 0.043), so there is no sign of a multicollinearity problem between the explanatory variables.
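The pairwise correlations shown in the matrix can also be extracted numerically; a sketch (output omitted):

cor(task3[, c("Price", "Age", "Distance")])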

Estimate the multiple regression function: Price = f(Age, Distance). Save it in object named fit2.

fit2 <- lm(Price ~ Age + Distance, data = task3)
summary(fit2)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = task3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -603.23 -219.94  -85.68  211.31  689.58 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2460.101     76.632   32.10  < 2e-16 ***
## Age           -7.934      3.225   -2.46    0.016 *  
## Distance     -20.667      2.748   -7.52 6.18e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared:  0.4396, Adjusted R-squared:  0.4259 
## F-statistic: 32.16 on 2 and 82 DF,  p-value: 4.896e-11
  • Both Age and Distance significantly reduce apartment prices, with distance having the stronger effect. Together, they explain around 44% of the variation in price per m².

Check the multicollinearity with VIF statistics. Explain the findings.

#install.packages("car")
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:psych':
## 
##     logit
vif(fit2)
##      Age Distance 
## 1.001845 1.001845
  • The VIF values for Age and Distance are both around 1.0, confirming that there is no multicollinearity in the model. This means the predictors are independent and do not distort the regression estimates.
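For reference, VIF_j = 1 / (1 − R_j²), where R_j² comes from regressing predictor j on the remaining predictors (with only two predictors, both VIFs coincide). A sketch reproducing the value by hand (output omitted):

r2_age <- summary(lm(Age ~ Distance, data = task3))$r.squared
1 / (1 - r2_age)   # should match vif(fit2), approx. 1.0018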

Calculate standardized residuals and Cook's distances for model fit2. Remove any potentially problematic units (outliers or units with high influence).

task3$std_resid <- rstandard(fit2)

task3$cooks_d <- cooks.distance(fit2)

head(task3[, c("Price", "Age", "Distance", "std_resid", "cooks_d")])
## # A tibble: 6 × 5
##   Price   Age Distance std_resid cooks_d
##   <dbl> <dbl>    <dbl>     <dbl>   <dbl>
## 1  1640     7       28    -0.665 0.00739
## 2  2800    18        1     1.78  0.0304 
## 3  1660     7       28    -0.594 0.00588
## 4  1850    28       29     0.754 0.00830
## 5  1640    18       18    -1.07  0.00511
## 6  1770    28       12    -0.778 0.00490
outliers <- which(abs(task3$std_resid) > 2)

n <- nrow(task3)
influential <- which(task3$cooks_d > 4/n)

outliers
## 33 38 53 
## 33 38 53
influential
## 22 33 38 53 55 
## 22 33 38 53 55
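Before dropping any units, the influence pattern could also be inspected visually; a base-R sketch (plot not shown):

plot(fit2, which = 4)                  # index plot of Cook's distances
abline(h = 4 / nrow(task3), lty = 2)   # the 4/n rule-of-thumb cutoff applied above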
problematic <- union(outliers, influential)

task3_clean <- task3[-problematic, ]

fit2_clean <- lm(Price ~ Age + Distance, data = task3_clean)
summary(fit2_clean)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = task3_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -411.50 -203.69  -45.24  191.11  492.56 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2502.467     75.024  33.356  < 2e-16 ***
## Age           -8.674      3.221  -2.693  0.00869 ** 
## Distance     -24.063      2.692  -8.939 1.57e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared:  0.5361, Adjusted R-squared:  0.524 
## F-statistic: 44.49 on 2 and 77 DF,  p-value: 1.437e-13

Check for potential heteroskedasticity with a scatterplot of standardized residuals against standardized fitted values. Explain the findings.

library(ggplot2)

std_resid <- rstandard(fit2)
std_fitted <- scale(fit2$fitted.values)

ggplot(data.frame(std_fitted, std_resid), aes(x = std_fitted, y = std_resid)) +
  geom_point(color = "blue") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs Fitted Values",
       x = "Standardized Fitted Values",
       y = "Standardized Residuals")

- The scatterplot of standardized residuals against standardized fitted values shows that residuals are centered around zero, which is expected. However, the variance of residuals appears to increase for higher fitted values, forming a slight “fan shape.” This suggests some degree of heteroskedasticity in the model.
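A formal counterpart to this visual check is the Breusch-Pagan test; a sketch assuming the lmtest package is installed (output omitted):

#install.packages("lmtest")
library(lmtest)
bptest(fit2)   # H0: constant error variance; a small p-value suggests heteroskedasticity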

Are the standardized residuals distributed normally? Show the graph and formally test it. Explain the findings.

hist(std_resid, breaks = 10, col = "skyblue",
     main = "Histogram of Standardized Residuals",
     xlab = "Standardized Residuals")

shapiro.test(std_resid)
## 
##  Shapiro-Wilk normality test
## 
## data:  std_resid
## W = 0.95306, p-value = 0.00366
  • The graphical checks (histogram) suggest that the standardized residuals deviate somewhat from the normal distribution, particularly in the tails. The Shapiro–Wilk test confirms this (W = 0.953, p = 0.004 < 0.05), indicating that the residuals are not normally distributed.
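A Q-Q plot makes the tail deviations easier to judge than the histogram; a base-R sketch (plot not shown):

qqnorm(std_resid)
qqline(std_resid, col = "red")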

Estimate fit2 again without the excluded units and show the summary of the model. Explain all coefficients.

fit2_clean <- lm(Price ~ Age + Distance, data = task3_clean)

summary(fit2_clean)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = task3_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -411.50 -203.69  -45.24  191.11  492.56 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2502.467     75.024  33.356  < 2e-16 ***
## Age           -8.674      3.221  -2.693  0.00869 ** 
## Distance     -24.063      2.692  -8.939 1.57e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared:  0.5361, Adjusted R-squared:  0.524 
## F-statistic: 44.49 on 2 and 77 DF,  p-value: 1.437e-13
  • Intercept (2502.47): A brand-new apartment (Age = 0) located at the city center (Distance = 0) is estimated to cost ≈2502 EUR/m².

  • Age (–8.67, p = 0.009): Each additional year lowers the price per m² by about 8.7 EUR, holding Distance constant. This effect is statistically significant.

  • Distance (–24.06, p < 0.001): Each extra kilometer away from the city center decreases the price per m² by about 24 EUR, holding Age constant. This effect is highly significant and stronger than Age.

  • R² = 0.536: About 54% of the variation in apartment prices is explained by Age and Distance.

  • Residual standard error = 256.8: The typical prediction error is around 257 EUR/m².

  • F-test (p < 0.001): The model is statistically significant overall.

  • After removing the problematic observations, the regression results improved. Both Age and Distance remain significant predictors of price, with Distance having the stronger effect. The model now explains about 54% of the variation in apartment prices, compared to 44% before cleaning.

Estimate the linear regression function Price = f(Age, Distance, Parking, Balcony). Be careful to correctly include the categorical variables. Save the result in an object named fit3.

fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = task3_clean)

summary(fit3)
## 
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = task3_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -390.93 -198.19  -53.64  186.73  518.34 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2393.316     93.930  25.480  < 2e-16 ***
## Age           -7.970      3.191  -2.498   0.0147 *  
## Distance     -21.961      2.830  -7.762 3.39e-11 ***
## ParkingYes   128.700     60.801   2.117   0.0376 *  
## BalconyYes     6.032     57.307   0.105   0.9165    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared:  0.5623, Adjusted R-squared:  0.5389 
## F-statistic: 24.08 on 4 and 75 DF,  p-value: 7.764e-13

With the function anova, check if model fit3 fits the data better than model fit2 (here fit2_clean, so that both models are estimated on the same cleaned data).

anova(fit2_clean, fit3)
## Analysis of Variance Table
## 
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
##   Res.Df     RSS Df Sum of Sq      F Pr(>F)
## 1     77 5077362                           
## 2     75 4791128  2    286234 2.2403 0.1135
  • The ANOVA comparison shows that including Parking and Balcony does not significantly improve the model fit (F = 2.24, p = 0.1135). Therefore, the simpler model with only Age and Distance (fit2_clean) is preferred.
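An information criterion offers another view on the same model choice; a sketch (output omitted):

AIC(fit2_clean, fit3)   # the model with the lower AIC is preferred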

Show the results of fit3 and explain the regression coefficients for both categorical variables. Can you write down the hypothesis being tested by the F-statistic shown at the bottom of the output?

summary(fit3)
## 
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = task3_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -390.93 -198.19  -53.64  186.73  518.34 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2393.316     93.930  25.480  < 2e-16 ***
## Age           -7.970      3.191  -2.498   0.0147 *  
## Distance     -21.961      2.830  -7.762 3.39e-11 ***
## ParkingYes   128.700     60.801   2.117   0.0376 *  
## BalconyYes     6.032     57.307   0.105   0.9165    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared:  0.5623, Adjusted R-squared:  0.5389 
## F-statistic: 24.08 on 4 and 75 DF,  p-value: 7.764e-13
  • In model fit3, Age and Distance both negatively affect prices. ParkingYes (+128.70, p = 0.038) means that apartments with parking cost about 129 EUR/m² more than otherwise identical apartments without parking. BalconyYes (+6.03, p = 0.917) is small and not statistically significant, so a balcony has no detectable effect on price. The F-statistic tests \(H_0: \beta_{Age} = \beta_{Distance} = \beta_{Parking} = \beta_{Balcony} = 0\), i.e. that none of the predictors has any effect, against the alternative that at least one coefficient differs from zero. Since F = 24.08 with p < 0.001, we reject H0: the model as a whole explains a substantial share (56%) of the variation in apartment prices.

Save the fitted values and calculate the residual for apartment ID 2.

task3_clean$fitted <- fitted(fit3)
task3_clean$residuals <- resid(fit3)

task3_clean[2, c("Price", "fitted", "residuals")]
## # A tibble: 1 × 3
##   Price fitted residuals
##   <dbl>  <dbl>     <dbl>
## 1  2800  2357.      443.
task3_clean$ID <- 1:nrow(task3_clean)

task3_clean[task3_clean$ID == 2, c("Price", "fitted", "residuals")]
## # A tibble: 1 × 3
##   Price fitted residuals
##   <dbl>  <dbl>     <dbl>
## 1  2800  2357.      443.
  • Apartment ID 2 is priced about €443/m² higher than what the model (fit3) predicts → a positive residual.
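The residual is simply the observed price minus the fitted value, which can be verified in one line; a sketch (output omitted):

task3_clean$Price[2] - fitted(fit3)[2]   # 2800 - 2357 ≈ 443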

THE END