#install.packages("dplyr")
#install.packages("psych")
#install.packages("ggplot2")
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
task1 <- read.csv("./ST vs MW.csv")
task <- task1[,-c(1,4,5,7,8,10,12,13,14,16)]
task <- task %>%
rename(
ScreenTime = screen_time_hours,
SleepHours = sleep_hours,
StressLevel = stress_level_0_10,
MentalWellness = mental_wellness_index_0_100
)
head(task, 5)
## age gender ScreenTime SleepHours StressLevel MentalWellness
## 1 33 Female 10.79 6.63 9.3 9.3
## 2 28 Female 7.40 8.05 5.7 56.2
## 3 35 Female 9.78 6.48 9.1 3.6
## 4 42 Male 11.13 6.89 10.0 0.0
## 5 28 Male 13.22 5.79 10.0 0.0
#Renaming gender:
task <- task %>%
mutate(gender = case_when(
gender == "Male" ~ "Moški",
gender == "Female" ~ "Ženska",
gender == "Non-binary/Other" ~ "Ne-binaren/Drugo",
TRUE ~ gender # keep other values unchanged (just in case)
))
head(task, 7)
## age gender ScreenTime SleepHours StressLevel MentalWellness
## 1 33 Ženska 10.79 6.63 9.3 9.3
## 2 28 Ženska 7.40 8.05 5.7 56.2
## 3 35 Ženska 9.78 6.48 9.1 3.6
## 4 42 Moški 11.13 6.89 10.0 0.0
## 5 28 Moški 13.22 5.79 10.0 0.0
## 6 28 Ne-binaren/Drugo 9.83 7.19 10.0 5.0
## 7 42 Ne-binaren/Drugo 6.02 7.44 6.1 43.7
#Creating a new data frame based on conditions:
task <- task %>%
mutate(SleepScreenRatio = SleepHours / ScreenTime)
#Creating a new variable:
high_wellness <- task %>%
filter(MentalWellness > 70)
head(high_wellness)
## age gender ScreenTime SleepHours StressLevel MentalWellness SleepScreenRatio
## 1 26 Moški 6.28 8.48 1.6 86.8 1.350318
## 2 26 Ženska 4.96 8.01 4.6 74.3 1.614919
## 3 23 Ženska 7.04 8.04 4.6 72.6 1.142045
## 4 26 Ženska 7.83 8.30 3.2 80.9 1.060026
## 5 39 Ženska 1.00 7.26 0.0 87.2 7.260000
## 6 20 Ženska 6.31 9.00 2.7 97.0 1.426307
describe(task)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## age 1 400 29.78 7.47 30.00 29.55 7.41 16.00 60.00 44.00 0.35 0.12 0.37
## gender* 2 400 2.13 0.98 3.00 2.16 0.00 1.00 3.00 2.00 -0.26 -1.92 0.05
## ScreenTime 3 400 9.02 2.49 9.09 9.01 2.39 1.00 19.17 18.17 0.08 0.66 0.12
## SleepHours 4 400 7.01 0.85 7.03 7.01 0.92 4.64 9.74 5.10 0.04 -0.27 0.04
## StressLevel 5 400 8.15 2.09 8.80 8.49 1.78 0.00 10.00 10.00 -1.22 1.11 0.10
## MentalWellness 6 400 20.33 20.38 14.80 17.22 19.57 0.00 97.00 97.00 1.22 1.14 1.02
## SleepScreenRatio 7 400 0.88 0.55 0.76 0.81 0.25 0.35 7.26 6.91 6.96 70.52 0.03
#Histogram of Screen Time
library(ggplot2)
ggplot(task, aes(x = ScreenTime)) +
geom_histogram(binwidth = 1, fill = "pink", color = "black") +
labs(title = "Distribution of Screen Time",
x = "Screen Time (hours per day)",
y = "Count")
#Boxplot of Stress level by Gender
ggplot(task, aes(x = gender, y = StressLevel, fill = gender)) +
geom_boxplot() +
labs(title = "Stress Levels by Gender", x = "Gender", y = "Stress Level (0–10)"
)
#Scatterplot of Sleep Hours vs. Mental Wellness
ggplot(task, aes(x = SleepHours, y = MentalWellness)) +
geom_point(alpha = 0.5, color = "darkgreen") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Sleep Hours vs Mental Wellness",
x = "Sleep Hours per Night",
y = "Mental Wellness Index (0–100)")
## `geom_smooth()` using formula = 'y ~ x'
#install.packages("readxl")
#install.packages("effectsize")
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
library(readxl)
task2 <- read_xlsx("./Business School.xlsx")
head(task2)
## # A tibble: 6 × 9
## `Student ID` `Undergrad Degree` `Undergrad Grade` `MBA Grade` `Work Experience` `Employability (Before)`
## <dbl> <chr> <dbl> <dbl> <chr> <dbl>
## 1 1 Business 68.4 90.2 No 252
## 2 2 Computer Science 70.2 68.7 Yes 101
## 3 3 Finance 76.4 83.3 No 401
## 4 4 Business 82.6 88.7 No 287
## 5 5 Finance 76.9 75.4 No 275
## 6 6 Computer Science 83.3 82.1 No 254
## # ℹ 3 more variables: `Employability (After)` <dbl>, Status <chr>, `Annual Salary` <dbl>
#Graph distribution of Undergrad Degrees
ggplot(task2, aes(x = `Undergrad Degree`)) +
geom_bar(fill = "skyblue", color = "black") +
labs(title = "Distribution of Undergrad Degrees",
x = "Undergraduate Degree",
y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
- Interpretation: The tallest bar represents the most
common undergraduate degree, which is Business, lowest bar represent the
least undergraduate degree, which is Art.
#Descriptive statistics & histogram for Annual Salary
summary(task2$`Annual Salary`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 87125 103500 109058 124000 340000
ggplot(task2, aes(x = `Annual Salary`)) +
geom_histogram(binwidth = 10000, fill = "orange", color = "black") +
labs(title = "Distribution of Annual Salaries",
x = "Annual Salary (USD)",
y = "Count")
The minimum salary is 20,000 USD, while the maximum is very high at 340,000 USD, showing a large range.
The median salary is 103,500 USD and the mean is 109,058 USD. Since the mean is higher than the median, this indicates a right-skewed distribution.
The middle 50% of students (interquartile range) earn between 87,125 USD (Q1) and 124,000 USD (Q3). Most students cluster between 80,000–130,000 USD.
#Hypothesis test: We test the null hypothesis \(H_0: \mu_{MBA\ Grade} = 74\).
t.test(task2$`MBA Grade`, mu = 74)
##
## One Sample t-test
##
## data: task2$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
library(readxl)
task3 <- read_excel("Apartments.xlsx")
head(task3)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
task3$Parking <- factor(task3$Parking,
levels = c(0, 1),
labels = c("No", "Yes"))
task3$Balcony <- factor(task3$Balcony,
levels = c(0, 1),
labels = c("No", "Yes"))
str(task3)
## tibble [85 × 5] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:85] 7 18 7 28 18 28 14 18 22 25 ...
## $ Distance: num [1:85] 28 1 28 29 18 12 20 6 7 2 ...
## $ Price : num [1:85] 1640 2800 1660 1850 1640 1770 1850 1970 2270 2570 ...
## $ Parking : Factor w/ 2 levels "No","Yes": 1 2 1 1 2 1 1 2 2 2 ...
## $ Balcony : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 1 ...
t.test(task3$Price, mu = 1900)
##
## One Sample t-test
##
## data: task3$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
fit1 <- lm(Price ~ Age, data = task3)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = task3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
#install.packages("GGally")
library(GGally)
ggpairs(task3[, c("Price", "Age", "Distance")])
- The scatterplot matrix shows a weak negative relationship between
Price and Age and a stronger negative relationship between Price and
Distance. Importantly, Age and Distance are not correlated (r = 0.043),
which means there is no multicollinearity problem between the
explanatory variables.
fit2 <- lm(Price ~ Age + Distance, data = task3)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = task3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
#install.packages("car")
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:psych':
##
## logit
vif(fit2)
## Age Distance
## 1.001845 1.001845
task3$std_resid <- rstandard(fit2)
task3$cooks_d <- cooks.distance(fit2)
head(task3[, c("Price", "Age", "Distance", "std_resid", "cooks_d")])
## # A tibble: 6 × 5
## Price Age Distance std_resid cooks_d
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1640 7 28 -0.665 0.00739
## 2 2800 18 1 1.78 0.0304
## 3 1660 7 28 -0.594 0.00588
## 4 1850 28 29 0.754 0.00830
## 5 1640 18 18 -1.07 0.00511
## 6 1770 28 12 -0.778 0.00490
outliers <- which(abs(task3$std_resid) > 2)
n <- nrow(task3)
influential <- which(task3$cooks_d > 4/n)
outliers
## 33 38 53
## 33 38 53
influential
## 22 33 38 53 55
## 22 33 38 53 55
problematic <- union(outliers, influential)
task3_clean <- task3[-problematic, ]
fit2_clean <- lm(Price ~ Age + Distance, data = task3_clean)
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = task3_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
library(ggplot2)
std_resid <- rstandard(fit2)
std_fitted <- scale(fit2$fitted.values)
ggplot(data.frame(std_fitted, std_resid), aes(x = std_fitted, y = std_resid)) +
geom_point(color = "blue") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = "Residuals vs Fitted Values",
x = "Standardized Fitted Values",
y = "Standardized Residuals")
- The scatterplot of standardized residuals against standardized fitted
values shows that residuals are centered around zero, which is expected.
However, the variance of residuals appears to increase for higher fitted
values, forming a slight “fan shape.” This suggests some degree of
heteroskedasticity in the model.
hist(std_resid, breaks = 10, col = "skyblue",
main = "Histogram of Standardized Residuals",
xlab = "Standardized Residuals")
shapiro.test(std_resid)
##
## Shapiro-Wilk normality test
##
## data: std_resid
## W = 0.95306, p-value = 0.00366
fit2_clean <- lm(Price ~ Age + Distance, data = task3_clean)
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = task3_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
Intercept (2502.47): A brand-new apartment (Age = 0) located at the city center (Distance = 0) is estimated to cost ≈2502 EUR/m².
Age (–8.67, p = 0.009): Each additional year lowers the price per m² by about 8.7 EUR, holding Distance constant. This effect is statistically significant.
Distance (–24.06, p < 0.001): Each extra kilometer away from the city center decreases the price per m² by about 24 EUR, holding Age constant. This effect is highly significant and stronger than Age.
R² = 0.536: About 54% of the variation in apartment prices is explained by Age and Distance.
Residual standard error = 256.8: The average prediction error is around 257 EUR/m².
F-test (p < 0.001): The model is statistically significant overall.
After removing problematic observations, the regression results improved. Both Age and Distance remain significant predictors of price, with Distance having the stronger effect. The model now explains 54% of the variation in apartment prices, compared to 44% before cleaning
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = task3_clean)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = task3_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 2e-16 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 3.39e-11 ***
## ParkingYes 128.700 60.801 2.117 0.0376 *
## BalconyYes 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13
anova(fit2_clean, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 77 5077362
## 2 75 4791128 2 286234 2.2403 0.1135
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = task3_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 2e-16 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 3.39e-11 ***
## ParkingYes 128.700 60.801 2.117 0.0376 *
## BalconyYes 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13
task3_clean$fitted <- fitted(fit3)
task3_clean$residuals <- resid(fit3)
task3_clean[2, c("Price", "fitted", "residuals")]
## # A tibble: 1 × 3
## Price fitted residuals
## <dbl> <dbl> <dbl>
## 1 2800 2357. 443.
task3_clean$ID <- 1:nrow(task3_clean)
task3_clean[task3_clean$ID == 2, c("Price", "fitted", "residuals")]
## # A tibble: 1 × 3
## Price fitted residuals
## <dbl> <dbl> <dbl>
## 1 2800 2357. 443.