Show code
library(here)
library(haven)
library(tidyverse)
library(ggplot2)
library(gtsummary)
library(broom)
library(dplyr)
library(corrplot)
library(readxl)Muamar Iskandar bin Mohamed Yusoff.
library(here)
library(haven)
library(tidyverse)
library(ggplot2)
library(gtsummary)
library(broom)
library(dplyr)
library(corrplot)
library(readxl)datais <- read_xlsx(here("Multiple_Linear_Regression_Hypothetical_Data.xlsx"))summary(datais) exam_score study_hours sleep_hours stress_level
Min. : 64.96 Min. : 0.400 Min. :3.770 Min. : 1.00
1st Qu.:105.94 1st Qu.: 4.335 1st Qu.:6.450 1st Qu.: 3.00
Median :126.02 Median : 5.895 Median :7.050 Median : 5.00
Mean :131.20 Mean : 6.054 Mean :6.980 Mean : 5.31
3rd Qu.:153.74 3rd Qu.: 7.968 3rd Qu.:7.478 3rd Qu.: 8.00
Max. :216.89 Max. :10.780 Max. :9.600 Max. :10.00
library(tidyr)
datais %>%
select(study_hours, sleep_hours, stress_level) %>%
tbl_summary(
statistic = list(all_continuous() ~ "{mean} ({sd})")
)| Characteristic | N = 1001 |
|---|---|
| study_hours | 6.05 (2.27) |
| sleep_hours | 6.98 (0.98) |
| stress_level | 5.31 (3.01) |
| 1 Mean (SD) | |
ggplot(datais, aes(x = exam_score)) +
geom_histogram(binwidth = 5, fill = "darkgreen", color = "black") +
labs(
title = "Distribution of Exam Scores",
x = "Exam Score",
y = "Frequency"
) +
theme_minimal()ggplot(datais, aes(x = study_hours, y = exam_score)) +
geom_point(color = "blue") +
labs(
title = "Exam Score vs Study Hours",
x = "Study Hours",
y = "Exam Score"
) +
theme_minimal()ggplot(datais, aes(x = sleep_hours, y = exam_score)) +
geom_point(color = "orange") +
labs(
title = "Exam Score vs Sleep Hours",
x = "Sleep Hours",
y = "Exam Score"
) +
theme_minimal()datais <- datais %>%
mutate(
stress_cat = case_when(
stress_level <= 3 ~ "Low stress",
stress_level <= 6 ~ "Moderate stress",
TRUE ~ "High stress"
)
)ggplot(datais, aes(x = stress_cat, y = study_hours, fill = stress_cat)) +
geom_boxplot() +
labs(
title = "Study Hours by Stress Category",
x = "Stress Category",
y = "Study Hours"
) +
theme_minimal()ggplot(datais, aes(x = stress_cat, y = sleep_hours, fill = stress_cat)) +
geom_boxplot() +
labs(
title = "Sleep Hours by Stress Category",
x = "Stress Category",
y = "Sleep Hours"
) +
theme_minimal()corrdata <- datais %>%
select(exam_score, study_hours, sleep_hours, stress_level) %>%
cor()
corrplot(corrdata, method = "circle", type = "upper", tl.col = "black", tl.srt = 45)model1 <- lm(exam_score ~ study_hours, data = datais)
summary(model1)
Call:
lm(formula = exam_score ~ study_hours, data = datais)
Residuals:
Min 1Q Median 3Q Max
-46.587 -18.287 -2.806 17.548 59.532
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 67.425 7.066 9.542 1.19e-15 ***
study_hours 10.535 1.094 9.632 7.61e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 24.68 on 98 degrees of freedom
Multiple R-squared: 0.4863, Adjusted R-squared: 0.4811
F-statistic: 92.78 on 1 and 98 DF, p-value: 7.606e-16
model2 <- lm(exam_score ~ sleep_hours, data = datais)
summary(model2)
Call:
lm(formula = exam_score ~ sleep_hours, data = datais)
Residuals:
Min 1Q Median 3Q Max
-61.336 -27.764 -3.739 21.322 88.006
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 99.501 24.794 4.013 0.000117 ***
sleep_hours 4.542 3.518 1.291 0.199799
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 34.14 on 98 degrees of freedom
Multiple R-squared: 0.01672, Adjusted R-squared: 0.006685
F-statistic: 1.666 on 1 and 98 DF, p-value: 0.1998
model3 <- lm(exam_score ~ stress_level, data = datais)
summary(model3)
Call:
lm(formula = exam_score ~ stress_level, data = datais)
Residuals:
Min 1Q Median 3Q Max
-71.384 -19.179 0.353 17.867 61.816
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 96.8474 5.7642 16.802 < 2e-16 ***
stress_level 6.4696 0.9455 6.843 6.77e-10 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 28.32 on 98 degrees of freedom
Multiple R-squared: 0.3233, Adjusted R-squared: 0.3164
F-statistic: 46.82 on 1 and 98 DF, p-value: 6.77e-10
model4 <- lm(exam_score ~ study_hours + sleep_hours, data = datais)
summary(model4)
Call:
lm(formula = exam_score ~ study_hours + sleep_hours, data = datais)
Residuals:
Min 1Q Median 3Q Max
-45.129 -18.519 -1.474 16.743 59.990
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 32.294 18.936 1.705 0.0913 .
study_hours 10.575 1.078 9.813 3.4e-16 ***
sleep_hours 4.998 2.506 1.995 0.0489 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 24.31 on 97 degrees of freedom
Multiple R-squared: 0.5066, Adjusted R-squared: 0.4964
F-statistic: 49.79 on 2 and 97 DF, p-value: 1.323e-15
model5 <- lm(exam_score ~ study_hours + stress_level, data = datais)
summary(model5)
Call:
lm(formula = exam_score ~ study_hours + stress_level, data = datais)
Residuals:
Min 1Q Median 3Q Max
-24.0089 -8.3616 0.9474 6.9240 23.3707
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 21.6341 3.7557 5.76 9.89e-08 ***
study_hours 11.5974 0.4736 24.49 < 2e-16 ***
stress_level 7.4124 0.3567 20.78 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 10.62 on 97 degrees of freedom
Multiple R-squared: 0.9058, Adjusted R-squared: 0.9039
F-statistic: 466.3 on 2 and 97 DF, p-value: < 2.2e-16
model6 <- lm(exam_score ~ sleep_hours + stress_level, data = datais)
summary(model6)
Call:
lm(formula = exam_score ~ sleep_hours + stress_level, data = datais)
Residuals:
Min 1Q Median 3Q Max
-72.506 -18.210 0.135 18.392 63.682
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 75.3552 20.8616 3.612 0.000483 ***
sleep_hours 3.1341 2.9239 1.072 0.286428
stress_level 6.3973 0.9471 6.754 1.06e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 28.3 on 97 degrees of freedom
Multiple R-squared: 0.3312, Adjusted R-squared: 0.3175
F-statistic: 24.02 on 2 and 97 DF, p-value: 3.354e-09
model7 <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data =
datais)
summary(model7)
Call:
lm(formula = exam_score ~ study_hours + sleep_hours + stress_level,
data = datais)
Residuals:
Min 1Q Median 3Q Max
-25.170 -8.138 1.587 5.615 23.023
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.9899 8.0465 -0.247 0.80520
study_hours 11.6136 0.4514 25.726 < 2e-16 ***
sleep_hours 3.4296 1.0462 3.278 0.00146 **
stress_level 7.3346 0.3408 21.521 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 10.13 on 96 degrees of freedom
Multiple R-squared: 0.9153, Adjusted R-squared: 0.9126
F-statistic: 345.7 on 3 and 96 DF, p-value: < 2.2e-16
tidy(model1, conf.int = TRUE)# A tibble: 2 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 67.4 7.07 9.54 1.19e-15 53.4 81.4
2 study_hours 10.5 1.09 9.63 7.61e-16 8.36 12.7
tidy(model2, conf.int = TRUE)# A tibble: 2 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 99.5 24.8 4.01 0.000117 50.3 149.
2 sleep_hours 4.54 3.52 1.29 0.200 -2.44 11.5
tidy(model3, conf.int = TRUE)# A tibble: 2 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 96.8 5.76 16.8 1.30e-30 85.4 108.
2 stress_level 6.47 0.945 6.84 6.77e-10 4.59 8.35
tidy(model4, conf.int = TRUE)# A tibble: 3 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 32.3 18.9 1.71 9.13e- 2 -5.29 69.9
2 study_hours 10.6 1.08 9.81 3.40e-16 8.44 12.7
3 sleep_hours 5.00 2.51 1.99 4.89e- 2 0.0255 9.97
tidy(model5, conf.int = TRUE)# A tibble: 3 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 21.6 3.76 5.76 9.89e- 8 14.2 29.1
2 study_hours 11.6 0.474 24.5 2.56e-43 10.7 12.5
3 stress_level 7.41 0.357 20.8 1.68e-37 6.70 8.12
tidy(model6, conf.int = TRUE)# A tibble: 3 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 75.4 20.9 3.61 0.000483 34.0 117.
2 sleep_hours 3.13 2.92 1.07 0.286 -2.67 8.94
3 stress_level 6.40 0.947 6.75 0.00000000106 4.52 8.28
tidy(model7, conf.int = TRUE)# A tibble: 4 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -1.99 8.05 -0.247 8.05e- 1 -18.0 14.0
2 study_hours 11.6 0.451 25.7 7.39e-45 10.7 12.5
3 sleep_hours 3.43 1.05 3.28 1.46e- 3 1.35 5.51
4 stress_level 7.33 0.341 21.5 1.65e-38 6.66 8.01
multivar.m <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)
stepwise.bw <- step(multivar.m, direction = "backward")Start: AIC=466.93
exam_score ~ study_hours + sleep_hours + stress_level
Df Sum of Sq RSS AIC
<none> 9842 466.93
- sleep_hours 1 1102 10944 475.54
- stress_level 1 47482 57324 641.13
- study_hours 1 67850 77692 671.53
minimal.model <- lm(exam_score ~ 1, data = datais)
full.model <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)step.both <- step(minimal.model, direction = "both", scope = list(upper = full.model))Start: AIC=707.77
exam_score ~ 1
Df Sum of Sq RSS AIC
+ study_hours 1 56498 59676 643.15
+ stress_level 1 37561 78612 670.71
<none> 116173 707.77
+ sleep_hours 1 1942 114231 708.08
Step: AIC=643.15
exam_score ~ study_hours
Df Sum of Sq RSS AIC
+ stress_level 1 48732 10944 475.54
+ sleep_hours 1 2352 57324 641.13
<none> 59676 643.15
- study_hours 1 56498 116173 707.77
Step: AIC=475.54
exam_score ~ study_hours + stress_level
Df Sum of Sq RSS AIC
+ sleep_hours 1 1102 9842 466.93
<none> 10944 475.54
- stress_level 1 48732 59676 643.15
- study_hours 1 67668 78612 670.71
Step: AIC=466.93
exam_score ~ study_hours + stress_level + sleep_hours
Df Sum of Sq RSS AIC
<none> 9842 466.93
- sleep_hours 1 1102 10944 475.54
- stress_level 1 47482 57324 641.13
- study_hours 1 67850 77692 671.53
step.fw <- step(minimal.model, direction = "forward", scope = list(upper = full.model))Start: AIC=707.77
exam_score ~ 1
Df Sum of Sq RSS AIC
+ study_hours 1 56498 59676 643.15
+ stress_level 1 37561 78612 670.71
<none> 116173 707.77
+ sleep_hours 1 1942 114231 708.08
Step: AIC=643.15
exam_score ~ study_hours
Df Sum of Sq RSS AIC
+ stress_level 1 48732 10944 475.54
+ sleep_hours 1 2352 57324 641.13
<none> 59676 643.15
Step: AIC=475.54
exam_score ~ study_hours + stress_level
Df Sum of Sq RSS AIC
+ sleep_hours 1 1101.8 9842.1 466.93
<none> 10943.9 475.54
Step: AIC=466.93
exam_score ~ study_hours + stress_level + sleep_hours
step.both$anova Step Df Deviance Resid. Df Resid. Dev AIC
1 NA NA 99 116173.278 707.7668
2 + study_hours -1 56497.536 98 59675.742 643.1511
3 + stress_level -1 48731.825 97 10943.917 475.5369
4 + sleep_hours -1 1101.835 96 9842.083 466.9252
step.fw$anova Step Df Deviance Resid. Df Resid. Dev AIC
1 NA NA 99 116173.278 707.7668
2 + study_hours -1 56497.536 98 59675.742 643.1511
3 + stress_level -1 48731.825 97 10943.917 475.5369
4 + sleep_hours -1 1101.835 96 9842.083 466.9252
stepwise.bw$anova Step Df Deviance Resid. Df Resid. Dev AIC
1 NA NA 96 9842.083 466.9252
final.model <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)
summary(final.model)
Call:
lm(formula = exam_score ~ study_hours + sleep_hours + stress_level,
data = datais)
Residuals:
Min 1Q Median 3Q Max
-25.170 -8.138 1.587 5.615 23.023
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.9899 8.0465 -0.247 0.80520
study_hours 11.6136 0.4514 25.726 < 2e-16 ***
sleep_hours 3.4296 1.0462 3.278 0.00146 **
stress_level 7.3346 0.3408 21.521 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 10.13 on 96 degrees of freedom
Multiple R-squared: 0.9153, Adjusted R-squared: 0.9126
F-statistic: 345.7 on 3 and 96 DF, p-value: < 2.2e-16
interaction.model1 <- lm(exam_score ~ study_hours + sleep_hours + stress_level + study_hours * stress_level, data = datais)
tidy(interaction.model1)# A tibble: 5 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 55.7 4.60 12.1 5.59e-21
2 study_hours 3.46 0.457 7.56 2.50e-11
3 sleep_hours 2.34 0.467 5.02 2.43e- 6
4 stress_level -2.07 0.497 -4.17 6.78e- 5
5 study_hours:stress_level 1.57 0.0789 19.9 1.43e-35
nointeraction.model1 <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)
tidy(nointeraction.model1)# A tibble: 4 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -1.99 8.05 -0.247 8.05e- 1
2 study_hours 11.6 0.451 25.7 7.39e-45
3 sleep_hours 3.43 1.05 3.28 1.46e- 3
4 stress_level 7.33 0.341 21.5 1.65e-38
anova(nointeraction.model1, interaction.model1)Analysis of Variance Table
Model 1: exam_score ~ study_hours + sleep_hours + stress_level
Model 2: exam_score ~ study_hours + sleep_hours + stress_level + study_hours *
stress_level
Res.Df RSS Df Sum of Sq F Pr(>F)
1 96 9842.1
2 95 1911.8 1 7930.3 394.07 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
prelim.final.model <- lm(exam_score ~ study_hours + stress_level + sleep_hours + study_hours*stress_level, data = datais)
plot(prelim.final.model)library(lmtest)
bptest(prelim.final.model)
studentized Breusch-Pagan test
data: prelim.final.model
BP = 3.6463, df = 4, p-value = 0.456
shapiro.test(prelim.final.model$residuals)
Shapiro-Wilk normality test
data: prelim.final.model$residuals
W = 0.98631, p-value = 0.3928
augment(prelim.final.model) %>%
ggplot(aes(x = study_hours, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Residuals vs Study Hours",
x = "Study Hours",
y = "Residuals"
) +
theme_minimal()augment(prelim.final.model) %>%
ggplot(aes(x = stress_level, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Residuals vs Stress Level",
x = "Stress Level",
y = "Residuals"
) +
theme_minimal()augment(prelim.final.model) %>%
ggplot(aes(x = sleep_hours, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Residuals vs Sleep Hours",
x = "Sleep Hours",
y = "Residuals"
) +
theme_minimal()res.mod <- residuals(prelim.final.model)
head(res.mod) 1 2 3 4 5 6
0.9814899 3.6097311 0.7203962 4.7086989 -1.3011740 2.8956146
hist(res.mod)library(DT)
datais.pred.res <- augment(prelim.final.model)
datais.pred.res %>% datatable()new_data <- expand.grid(
study_hours = c(2, 4, 6, 8),
stress_level = c(1, 3, 5),
sleep_hours = c(2,4,6,8)
)
new_data study_hours stress_level sleep_hours
1 2 1 2
2 4 1 2
3 6 1 2
4 8 1 2
5 2 3 2
6 4 3 2
7 6 3 2
8 8 3 2
9 2 5 2
10 4 5 2
11 6 5 2
12 8 5 2
13 2 1 4
14 4 1 4
15 6 1 4
16 8 1 4
17 2 3 4
18 4 3 4
19 6 3 4
20 8 3 4
21 2 5 4
22 4 5 4
23 6 5 4
24 8 5 4
25 2 1 6
26 4 1 6
27 6 1 6
28 8 1 6
29 2 3 6
30 4 3 6
31 6 3 6
32 8 3 6
33 2 5 6
34 4 5 6
35 6 5 6
36 8 5 6
37 2 1 8
38 4 1 8
39 6 1 8
40 8 1 8
41 2 3 8
42 4 3 8
43 6 3 8
44 8 3 8
45 2 5 8
46 4 5 8
47 6 5 8
48 8 5 8
pred_exam_score <- augment(
prelim.final.model,
newdata = new_data
)
pred_exam_score %>% datatable()= 2(7)+2/4225=0.0038???
### Cook’s distance
cutoff <- 4/((nrow(datais)-length(prelim.final.model$coefficients)-2))
plot(prelim.final.model, which = 4, cook.levels = cutoff)datais.pred.res %>%
filter(.std.resid > 2 | .std.resid < -2 | .hat > 0.0038) %>%
datatable()non.influen.obs <-
datais.pred.res %>%
filter(.std.resid < 2 & .std.resid > -2 & .hat < 0.0038)