Show code
library(here)
library(haven)
library(tidyverse)
library(ggplot2)
library(gtsummary)
library(gt)
library(broom)
library(dplyr)
library(corrplot)
library(readxl)
library(MuMIn)library(here)
library(haven)
library(tidyverse)
library(ggplot2)
library(gtsummary)
library(gt)
library(broom)
library(dplyr)
library(corrplot)
library(readxl)
library(MuMIn)datais <- read_xlsx(here("Multiple_Linear_Regression_Hypothetical_Data.xlsx"))summary(datais) exam_score study_hours sleep_hours stress_level
Min. : 64.96 Min. : 0.400 Min. :3.770 Min. : 1.00
1st Qu.:105.94 1st Qu.: 4.335 1st Qu.:6.450 1st Qu.: 3.00
Median :126.02 Median : 5.895 Median :7.050 Median : 5.00
Mean :131.20 Mean : 6.054 Mean :6.980 Mean : 5.31
3rd Qu.:153.74 3rd Qu.: 7.968 3rd Qu.:7.478 3rd Qu.: 8.00
Max. :216.89 Max. :10.780 Max. :9.600 Max. :10.00
library(tidyr)
datais %>%
select(study_hours, sleep_hours, stress_level) %>%
tbl_summary(
statistic = list(all_continuous() ~ "{mean} ({sd})")
)| Characteristic | N = 1001 |
|---|---|
| study_hours | 6.05 (2.27) |
| sleep_hours | 6.98 (0.98) |
| stress_level | 5.31 (3.01) |
| 1 Mean (SD) | |
ggplot(datais, aes(x = exam_score)) +
geom_histogram(binwidth = 5, fill = "darkgreen", color = "black") +
labs(
title = "Distribution of Exam Scores",
x = "Exam Score",
y = "Frequency"
) +
theme_minimal()ggplot(datais, aes(x = study_hours, y = exam_score)) +
geom_point(color = "blue") +
labs(
title = "Exam Score vs Study Hours",
x = "Study Hours",
y = "Exam Score"
) +
theme_minimal()ggplot(datais, aes(x = sleep_hours, y = exam_score)) +
geom_point(color = "orange") +
labs(
title = "Exam Score vs Sleep Hours",
x = "Sleep Hours",
y = "Exam Score"
) +
theme_minimal()datais <- datais %>%
mutate(
stress_cat = case_when(
stress_level <= 3 ~ "Low stress",
stress_level <= 6 ~ "Moderate stress",
TRUE ~ "High stress"
)
)
ggplot(datais, aes(x = study_hours)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "black") +
facet_grid(stress_cat ~ .) +
labs(
title = "Distribution of Study Hours by Stress Level",
x = "Study Hours",
y = "Frequency"
) +
theme_minimal()ggplot(datais, aes(x = sleep_hours)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "black") +
facet_grid(stress_cat ~ .) +
labs(
title = "Distribution of Sleep Hours by Stress Level",
x = "Sleep Hours",
y = "Frequency"
) +
theme_minimal()corrdata <- datais %>%
select(exam_score, study_hours, sleep_hours, stress_level) %>%
cor()
corrplot(corrdata, method = "circle", type = "upper", tl.col = "black", tl.srt = 45)model1 <- lm(exam_score ~ study_hours, data = datais)
summary(model1)
Call:
lm(formula = exam_score ~ study_hours, data = datais)
Residuals:
Min 1Q Median 3Q Max
-46.587 -18.287 -2.806 17.548 59.532
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 67.425 7.066 9.542 1.19e-15 ***
study_hours 10.535 1.094 9.632 7.61e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 24.68 on 98 degrees of freedom
Multiple R-squared: 0.4863, Adjusted R-squared: 0.4811
F-statistic: 92.78 on 1 and 98 DF, p-value: 7.606e-16
tbl_regression(model1) %>%
bold_p(t = 0.05) %>%
bold_labels() %>%
as_gt() %>%
tab_header(
title = "Univariate Study Hours")| Univariate Study Hours | |||
|---|---|---|---|
| Characteristic | Beta | 95% CI | p-value |
| study_hours | 11 | 8.4, 13 | <0.001 |
| Abbreviation: CI = Confidence Interval | |||
model2 <- lm(exam_score ~ sleep_hours, data = datais)
summary(model2)
Call:
lm(formula = exam_score ~ sleep_hours, data = datais)
Residuals:
Min 1Q Median 3Q Max
-61.336 -27.764 -3.739 21.322 88.006
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 99.501 24.794 4.013 0.000117 ***
sleep_hours 4.542 3.518 1.291 0.199799
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 34.14 on 98 degrees of freedom
Multiple R-squared: 0.01672, Adjusted R-squared: 0.006685
F-statistic: 1.666 on 1 and 98 DF, p-value: 0.1998
tbl_regression(model2) %>%
bold_p(t = 0.05) %>%
bold_labels() %>%
as_gt() %>%
tab_header(
title = "Univariate Sleep Hours")| Univariate Sleep Hours | |||
|---|---|---|---|
| Characteristic | Beta | 95% CI | p-value |
| sleep_hours | 4.5 | -2.4, 12 | 0.2 |
| Abbreviation: CI = Confidence Interval | |||
model3 <- lm(exam_score ~ stress_level, data = datais)
summary(model3)
Call:
lm(formula = exam_score ~ stress_level, data = datais)
Residuals:
Min 1Q Median 3Q Max
-71.384 -19.179 0.353 17.867 61.816
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 96.8474 5.7642 16.802 < 2e-16 ***
stress_level 6.4696 0.9455 6.843 6.77e-10 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 28.32 on 98 degrees of freedom
Multiple R-squared: 0.3233, Adjusted R-squared: 0.3164
F-statistic: 46.82 on 1 and 98 DF, p-value: 6.77e-10
tbl_regression(model3) %>%
bold_p(t = 0.05) %>%
bold_labels() %>%
as_gt() %>%
tab_header(
title = "Univariate Stress Level")| Univariate Stress Level | |||
|---|---|---|---|
| Characteristic | Beta | 95% CI | p-value |
| stress_level | 6.5 | 4.6, 8.3 | <0.001 |
| Abbreviation: CI = Confidence Interval | |||
model4 <- lm(exam_score ~ study_hours + sleep_hours , data = datais)
summary(model4)
Call:
lm(formula = exam_score ~ study_hours + sleep_hours, data = datais)
Residuals:
Min 1Q Median 3Q Max
-45.129 -18.519 -1.474 16.743 59.990
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 32.294 18.936 1.705 0.0913 .
study_hours 10.575 1.078 9.813 3.4e-16 ***
sleep_hours 4.998 2.506 1.995 0.0489 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 24.31 on 97 degrees of freedom
Multiple R-squared: 0.5066, Adjusted R-squared: 0.4964
F-statistic: 49.79 on 2 and 97 DF, p-value: 1.323e-15
model5 <- lm(exam_score ~ study_hours + stress_level, data = datais)
summary(model5)
Call:
lm(formula = exam_score ~ study_hours + stress_level, data = datais)
Residuals:
Min 1Q Median 3Q Max
-24.0089 -8.3616 0.9474 6.9240 23.3707
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 21.6341 3.7557 5.76 9.89e-08 ***
study_hours 11.5974 0.4736 24.49 < 2e-16 ***
stress_level 7.4124 0.3567 20.78 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 10.62 on 97 degrees of freedom
Multiple R-squared: 0.9058, Adjusted R-squared: 0.9039
F-statistic: 466.3 on 2 and 97 DF, p-value: < 2.2e-16
model6 <- lm(exam_score ~ sleep_hours + stress_level, data = datais)
summary(model6)
Call:
lm(formula = exam_score ~ sleep_hours + stress_level, data = datais)
Residuals:
Min 1Q Median 3Q Max
-72.506 -18.210 0.135 18.392 63.682
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 75.3552 20.8616 3.612 0.000483 ***
sleep_hours 3.1341 2.9239 1.072 0.286428
stress_level 6.3973 0.9471 6.754 1.06e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 28.3 on 97 degrees of freedom
Multiple R-squared: 0.3312, Adjusted R-squared: 0.3175
F-statistic: 24.02 on 2 and 97 DF, p-value: 3.354e-09
tidy(model1, conf.int = TRUE)# A tibble: 2 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 67.4 7.07 9.54 1.19e-15 53.4 81.4
2 study_hours 10.5 1.09 9.63 7.61e-16 8.36 12.7
tidy(model2, conf.int = TRUE)# A tibble: 2 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 99.5 24.8 4.01 0.000117 50.3 149.
2 sleep_hours 4.54 3.52 1.29 0.200 -2.44 11.5
tidy(model3, conf.int = TRUE)# A tibble: 2 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 96.8 5.76 16.8 1.30e-30 85.4 108.
2 stress_level 6.47 0.945 6.84 6.77e-10 4.59 8.35
tidy(model4, conf.int = TRUE)# A tibble: 3 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 32.3 18.9 1.71 9.13e- 2 -5.29 69.9
2 study_hours 10.6 1.08 9.81 3.40e-16 8.44 12.7
3 sleep_hours 5.00 2.51 1.99 4.89e- 2 0.0255 9.97
tidy(model5, conf.int = TRUE)# A tibble: 3 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 21.6 3.76 5.76 9.89e- 8 14.2 29.1
2 study_hours 11.6 0.474 24.5 2.56e-43 10.7 12.5
3 stress_level 7.41 0.357 20.8 1.68e-37 6.70 8.12
tidy(model6, conf.int = TRUE)# A tibble: 3 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 75.4 20.9 3.61 0.000483 34.0 117.
2 sleep_hours 3.13 2.92 1.07 0.286 -2.67 8.94
3 stress_level 6.40 0.947 6.75 0.00000000106 4.52 8.28
model7 <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)
tidy(model7, conf.int = TRUE)# A tibble: 4 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -1.99 8.05 -0.247 8.05e- 1 -18.0 14.0
2 study_hours 11.6 0.451 25.7 7.39e-45 10.7 12.5
3 sleep_hours 3.43 1.05 3.28 1.46e- 3 1.35 5.51
4 stress_level 7.33 0.341 21.5 1.65e-38 6.66 8.01
anova(model1, model7)Analysis of Variance Table
Model 1: exam_score ~ study_hours
Model 2: exam_score ~ study_hours + sleep_hours + stress_level
Res.Df RSS Df Sum of Sq F Pr(>F)
1 98 59676
2 96 9842 2 49834 243.04 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model2, model7)Analysis of Variance Table
Model 1: exam_score ~ sleep_hours
Model 2: exam_score ~ study_hours + sleep_hours + stress_level
Res.Df RSS Df Sum of Sq F Pr(>F)
1 98 114231
2 96 9842 2 104389 509.11 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model3, model7)Analysis of Variance Table
Model 1: exam_score ~ stress_level
Model 2: exam_score ~ study_hours + sleep_hours + stress_level
Res.Df RSS Df Sum of Sq F Pr(>F)
1 98 78612
2 96 9842 2 68770 335.39 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model4, model7)Analysis of Variance Table
Model 1: exam_score ~ study_hours + sleep_hours
Model 2: exam_score ~ study_hours + sleep_hours + stress_level
Res.Df RSS Df Sum of Sq F Pr(>F)
1 97 57324
2 96 9842 1 47482 463.14 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model5, model7)Analysis of Variance Table
Model 1: exam_score ~ study_hours + stress_level
Model 2: exam_score ~ study_hours + sleep_hours + stress_level
Res.Df RSS Df Sum of Sq F Pr(>F)
1 97 10943.9
2 96 9842.1 1 1101.8 10.747 0.001455 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model6, model7)Analysis of Variance Table
Model 1: exam_score ~ sleep_hours + stress_level
Model 2: exam_score ~ study_hours + sleep_hours + stress_level
Res.Df RSS Df Sum of Sq F Pr(>F)
1 97 77692
2 96 9842 1 67850 661.81 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model1,model2,model3,model4,model5,model6,model7)Analysis of Variance Table
Model 1: exam_score ~ study_hours
Model 2: exam_score ~ sleep_hours
Model 3: exam_score ~ stress_level
Model 4: exam_score ~ study_hours + sleep_hours
Model 5: exam_score ~ study_hours + stress_level
Model 6: exam_score ~ sleep_hours + stress_level
Model 7: exam_score ~ study_hours + sleep_hours + stress_level
Res.Df RSS Df Sum of Sq F Pr(>F)
1 98 59676
2 98 114231 0 -54555
3 98 78612 0 35619
4 97 57324 1 21288 207.65 < 2.2e-16 ***
5 97 10944 0 46380
6 97 77692 0 -66748
7 96 9842 1 67850 661.81 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Based on the ANOVA results, we can compare the models to see which one provides a better fit for the data. The final model (model7) includes all three predictors: study_hours, sleep_hours, and stress_level. All Predictors in univariable analysis with p-value <0.25 are included in multivariable analysis (here, we include all predictors).
We now build a model using stepwise variable selection methods (forward, backward, both) based on AIC.
multivar.m <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)
stepwise.bw <- step(multivar.m, direction = "backward")Start: AIC=466.93
exam_score ~ study_hours + sleep_hours + stress_level
Df Sum of Sq RSS AIC
<none> 9842 466.93
- sleep_hours 1 1102 10944 475.54
- stress_level 1 47482 57324 641.13
- study_hours 1 67850 77692 671.53
minimal.model <- lm(exam_score ~ 1, data = datais)final.model <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)step.both <- step(minimal.model, direction = "both", scope = list(upper = final.model))Start: AIC=707.77
exam_score ~ 1
Df Sum of Sq RSS AIC
+ study_hours 1 56498 59676 643.15
+ stress_level 1 37561 78612 670.71
<none> 116173 707.77
+ sleep_hours 1 1942 114231 708.08
Step: AIC=643.15
exam_score ~ study_hours
Df Sum of Sq RSS AIC
+ stress_level 1 48732 10944 475.54
+ sleep_hours 1 2352 57324 641.13
<none> 59676 643.15
- study_hours 1 56498 116173 707.77
Step: AIC=475.54
exam_score ~ study_hours + stress_level
Df Sum of Sq RSS AIC
+ sleep_hours 1 1102 9842 466.93
<none> 10944 475.54
- stress_level 1 48732 59676 643.15
- study_hours 1 67668 78612 670.71
Step: AIC=466.93
exam_score ~ study_hours + stress_level + sleep_hours
Df Sum of Sq RSS AIC
<none> 9842 466.93
- sleep_hours 1 1102 10944 475.54
- stress_level 1 47482 57324 641.13
- study_hours 1 67850 77692 671.53
step.fw <- step(minimal.model, direction = "forward", scope = list(upper = final.model))Start: AIC=707.77
exam_score ~ 1
Df Sum of Sq RSS AIC
+ study_hours 1 56498 59676 643.15
+ stress_level 1 37561 78612 670.71
<none> 116173 707.77
+ sleep_hours 1 1942 114231 708.08
Step: AIC=643.15
exam_score ~ study_hours
Df Sum of Sq RSS AIC
+ stress_level 1 48732 10944 475.54
+ sleep_hours 1 2352 57324 641.13
<none> 59676 643.15
Step: AIC=475.54
exam_score ~ study_hours + stress_level
Df Sum of Sq RSS AIC
+ sleep_hours 1 1101.8 9842.1 466.93
<none> 10943.9 475.54
Step: AIC=466.93
exam_score ~ study_hours + stress_level + sleep_hours
step.both$anova Step Df Deviance Resid. Df Resid. Dev AIC
1 NA NA 99 116173.278 707.7668
2 + study_hours -1 56497.536 98 59675.742 643.1511
3 + stress_level -1 48731.825 97 10943.917 475.5369
4 + sleep_hours -1 1101.835 96 9842.083 466.9252
step.fw$anova Step Df Deviance Resid. Df Resid. Dev AIC
1 NA NA 99 116173.278 707.7668
2 + study_hours -1 56497.536 98 59675.742 643.1511
3 + stress_level -1 48731.825 97 10943.917 475.5369
4 + sleep_hours -1 1101.835 96 9842.083 466.9252
stepwise.bw$anova Step Df Deviance Resid. Df Resid. Dev AIC
1 NA NA 96 9842.083 466.9252
modelfinal <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)
summary(modelfinal)
Call:
lm(formula = exam_score ~ study_hours + sleep_hours + stress_level,
data = datais)
Residuals:
Min 1Q Median 3Q Max
-25.170 -8.138 1.587 5.615 23.023
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.9899 8.0465 -0.247 0.80520
study_hours 11.6136 0.4514 25.726 < 2e-16 ***
sleep_hours 3.4296 1.0462 3.278 0.00146 **
stress_level 7.3346 0.3408 21.521 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 10.13 on 96 degrees of freedom
Multiple R-squared: 0.9153, Adjusted R-squared: 0.9126
F-statistic: 345.7 on 3 and 96 DF, p-value: < 2.2e-16
anova_combined <- bind_rows(
as.data.frame(stepwise.bw$anova) %>% mutate(Method = "Backward"),
as.data.frame(step.both$anova) %>% mutate(Method = "Both"),
as.data.frame(step.fw$anova) %>% mutate(Method = "Forward")
) %>%
relocate(Method, .before = 1)
anova_combined Method Step Df Deviance Resid. Df Resid. Dev AIC
1 Backward NA NA 96 9842.083 466.9252
2 Both NA NA 99 116173.278 707.7668
3 Both + study_hours -1 56497.536 98 59675.742 643.1511
4 Both + stress_level -1 48731.825 97 10943.917 475.5369
5 Both + sleep_hours -1 1101.835 96 9842.083 466.9252
6 Forward NA NA 99 116173.278 707.7668
7 Forward + study_hours -1 56497.536 98 59675.742 643.1511
8 Forward + stress_level -1 48731.825 97 10943.917 475.5369
9 Forward + sleep_hours -1 1101.835 96 9842.083 466.9252
Intepretation: All three selection method yield the same final model including all predictors.
Using AICc for multi model comparison
model_list <- list(minimal.model, model1, model4, final.model)
model_comparison <- model.sel(model_list)
model_comparisonModel selection table
(Int) std_hrs slp_hrs str_lvl df logLik AICc delta weight
4 -1.99 11.61 3.430 7.335 5 -371.356 753.4 0.00 1
3 32.29 10.57 4.998 4 -459.459 927.3 173.99 0
2 67.43 10.54 3 -461.469 929.2 175.84 0
1 131.20 2 -494.777 993.7 240.33 0
Models ranked by AICc(x)
Interpretation: The model with the lowest AICc is considered the best-fitting model among the candidates.In this case, the final model which consist of all variables are considered the best-fitting model.
interaction.model1 <- lm(exam_score ~ study_hours + sleep_hours + stress_level + study_hours * stress_level, data = datais)
tidy(interaction.model1)# A tibble: 5 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 55.7 4.60 12.1 5.59e-21
2 study_hours 3.46 0.457 7.56 2.50e-11
3 sleep_hours 2.34 0.467 5.02 2.43e- 6
4 stress_level -2.07 0.497 -4.17 6.78e- 5
5 study_hours:stress_level 1.57 0.0789 19.9 1.43e-35
tbl_regression(interaction.model1) %>%
bold_p(t = 0.05) %>%
bold_labels() %>%
as_gt() %>%
tab_header(
title = "Multiple Linear Regression Model with Interaction of Study Hours and Stress Level")| Multiple Linear Regression Model with Interaction of Study Hours and Stress Level | |||
|---|---|---|---|
| Characteristic | Beta | 95% CI | p-value |
| study_hours | 3.5 | 2.5, 4.4 | <0.001 |
| sleep_hours | 2.3 | 1.4, 3.3 | <0.001 |
| stress_level | -2.1 | -3.1, -1.1 | <0.001 |
| study_hours * stress_level | 1.6 | 1.4, 1.7 | <0.001 |
| Abbreviation: CI = Confidence Interval | |||
nointeraction.model1 <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)
tidy(nointeraction.model1)# A tibble: 4 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -1.99 8.05 -0.247 8.05e- 1
2 study_hours 11.6 0.451 25.7 7.39e-45
3 sleep_hours 3.43 1.05 3.28 1.46e- 3
4 stress_level 7.33 0.341 21.5 1.65e-38
anova(nointeraction.model1, interaction.model1)Analysis of Variance Table
Model 1: exam_score ~ study_hours + sleep_hours + stress_level
Model 2: exam_score ~ study_hours + sleep_hours + stress_level + study_hours *
stress_level
Res.Df RSS Df Sum of Sq F Pr(>F)
1 96 9842.1
2 95 1911.8 1 7930.3 394.07 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
prelim.final.model <- lm(exam_score ~ study_hours + stress_level + sleep_hours + study_hours*stress_level, data = datais)
plot(prelim.final.model)augment(prelim.final.model) %>%
ggplot(aes(x = study_hours, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Residuals vs Study Hours",
x = "Study Hours",
y = "Residuals"
) +
theme_minimal()augment(prelim.final.model) %>%
ggplot(aes(x = stress_level, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Residuals vs Stress Level",
x = "Stress Level",
y = "Residuals"
) +
theme_minimal()augment(prelim.final.model) %>%
ggplot(aes(x = sleep_hours, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Residuals vs Sleep Hours",
x = "Sleep Hours",
y = "Residuals"
) +
theme_minimal()augment(prelim.final.model) %>%
ggplot(aes(x = study_hours, y = .resid))+
geom_point()+
geom_smooth()res.mod <- residuals(prelim.final.model)
head(res.mod) 1 2 3 4 5 6
0.9814899 3.6097311 0.7203962 4.7086989 -1.3011740 2.8956146
hist(res.mod)library(DT)
datais.pred.res <- augment(prelim.final.model)
datais.pred.res %>% datatable()new_data <- expand.grid(
study_hours = c(2, 4, 6, 8),
stress_level = c(1, 3, 5),
sleep_hours = c(2,4,6,8)
)
new_data study_hours stress_level sleep_hours
1 2 1 2
2 4 1 2
3 6 1 2
4 8 1 2
5 2 3 2
6 4 3 2
7 6 3 2
8 8 3 2
9 2 5 2
10 4 5 2
11 6 5 2
12 8 5 2
13 2 1 4
14 4 1 4
15 6 1 4
16 8 1 4
17 2 3 4
18 4 3 4
19 6 3 4
20 8 3 4
21 2 5 4
22 4 5 4
23 6 5 4
24 8 5 4
25 2 1 6
26 4 1 6
27 6 1 6
28 8 1 6
29 2 3 6
30 4 3 6
31 6 3 6
32 8 3 6
33 2 5 6
34 4 5 6
35 6 5 6
36 8 5 6
37 2 1 8
38 4 1 8
39 6 1 8
40 8 1 8
41 2 3 8
42 4 3 8
43 6 3 8
44 8 3 8
45 2 5 8
46 4 5 8
47 6 5 8
48 8 5 8
pred_exam_score <- augment(
prelim.final.model,
newdata = new_data
)
pred_exam_score %>% datatable()ggplot(pred_exam_score,
aes(x = study_hours,
y = .fitted,
color = as.factor(stress_level))) +
geom_line(size = 1) +
geom_point(size = 2) +
labs(
title = "Predicted Exam Score by Study Hours and Stress Level",
x = "Study Hours",
y = "Predicted Exam Score",
color = "Stress Level"
) +
theme_minimal()Identifying Influential Data Points in Regression
We used three diagnostic criteria to identify potentially influential observations:
Cook’s Distance Threshold: values above 1, or above 4n Standardized Residuals Threshold: values greater than 2 or less than −2 Leverage Threshold: values above 2k+2n
Example: 2(7)+24225=0.0038 Where: - n = number of observations - k = number of predictors in the model
These thresholds help flag data points that may disproportionately influence model estimates or distort inference.
#add diagnostic columns
exam_score_diag <- augment(prelim.final.model,datais)
#Define thresholds
n_obs <- nrow(datais)
p_preds <- length(coef(prelim.final.model))
cooks_cutoff <- 4 / (n_obs)
leverage_cutoff <- 2 * (p_preds + 2) / n_obs
#filter influentials observations
influential_obs <- exam_score_diag %>%
filter(.cooksd > cooks_cutoff | abs(.std.resid) > 2 | .hat > leverage_cutoff)
# Print count
nrow(influential_obs)[1] 9
cutoff <- 4/((nrow(datais)-length(prelim.final.model$coefficients)-2))
plot(prelim.final.model, which = 4, cook.levels = cutoff)datais.pred.res %>%
filter(.std.resid > 2 | .std.resid < -2 | .hat > 0.0038) %>%
datatable()Interpretation: Cook’s distance diagnostics identified three potentially influential observations (observations 48, 53, and 83). Sensitivity analyses excluding these observations were conducted to assess model robustness. The overall pattern and direction of the regression coefficients remained unchanged, indicating that the model results were not unduly influenced by these observations.
non.influen.obs <-
datais.pred.res %>%
filter(.std.resid < 2 & .std.resid > -2 & .hat < 0.0038)# Remove influentials and Refit
clean_data <- exam_score_diag %>%
filter(.cooksd <= cooks_cutoff & abs(.std.resid) <= 2 & .hat <= leverage_cutoff)
# Re-run the final model
final_model_clean <- lm(
exam_score ~ study_hours + sleep_hours + stress_level + study_hours * stress_level,
data = clean_data
)
tbl_regression(final_model_clean) %>%
bold_p(t = 0.05) %>%
bold_labels() %>%
add_glance_source_note(include = c(r.squared, AIC)) %>%
as_gt() %>%
tab_header(title = "Final Model after Removing Influential Observations")| Final Model after Removing Influential Observations | |||
|---|---|---|---|
| Characteristic | Beta | 95% CI | p-value |
| study_hours | 3.3 | 2.4, 4.1 | <0.001 |
| sleep_hours | 2.9 | 2.1, 3.7 | <0.001 |
| stress_level | -2.0 | -2.9, -1.1 | <0.001 |
| study_hours * stress_level | 1.6 | 1.4, 1.7 | <0.001 |
| Abbreviation: CI = Confidence Interval | |||
| R² = 0.989; AIC = 505 | |||
interaction.model1 <- lm(
exam_score ~ study_hours * stress_level,
data = datais
)
par(mfrow = c(2,2))
plot(interaction.model1)Interpretation:
After removing influential observations, the regression coefficients and significance levels remained largely unchanged. This indicates that the original model was robust and not overly sensitive to a few extreme data points. The diagnostic plots for the cleaned model continue to support the assumptions of linear regression, including linearity, homoscedasticity, normality of residuals, and absence of undue influence.
interaction.model1 <- lm(
exam_score ~ study_hours * stress_level,
data = datais
)new_data <- expand.grid(
study_hours = c(2, 4, 6, 8),
stress_level = c(1, 3, 5)
)predict(interaction.model1, newdata = new_data) 1 2 3 4 5 6 7 8
80.68831 90.31932 99.95033 109.58134 82.53996 98.62227 114.70459 130.78690
9 10 11 12
84.39160 106.92522 129.45885 151.99247
prelim.final.model <- lm(
exam_score ~ study_hours + sleep_hours + stress_level + study_hours * stress_level,
data = datais
)tbl <- tbl_regression(
prelim.final.model,
intercept = TRUE,
label = list(
study_hours ~ "Study hours (hours/day)",
sleep_hours ~ "Sleep hours (hours/day)",
stress_level ~ "Stress level (score)",
`study_hours:stress_level` ~ "Study hours × Stress level"
)
)tbl_final <- tbl %>%
add_glance_source_note(include = c("r.squared", "adj.r.squared", "AIC")) %>%
bold_labels() %>%
bold_p(t = 0.05) %>%
modify_header(estimate ~ "**Adj. beta**") %>%
modify_table_body(
~ .x %>%
mutate(
label = case_when(
label == "study_hours" ~ "Study hours (hours/day)",
label == "sleep_hours" ~ "Sleep hours (hours/day)",
label == "stress_level" ~ "Stress level (score)",
label == "study_hours:stress_level" ~ "Study hours × Stress level",
TRUE ~ label
)
)
)tbl_final %>%
as_gt() %>%
tab_header(
title = "Final Multiple Linear Regression Model",
subtitle = "Predictors of Exam score"
)| Final Multiple Linear Regression Model | |||
|---|---|---|---|
| Predictors of Exam score | |||
| Characteristic | Adj. beta | 95% CI | p-value |
| (Intercept) | 56 | 47, 65 | <0.001 |
| Study hours (hours/day) | 3.5 | 2.5, 4.4 | <0.001 |
| Sleep hours (hours/day) | 2.3 | 1.4, 3.3 | <0.001 |
| Stress level (score) | -2.1 | -3.1, -1.1 | <0.001 |
| Study hours (hours/day) * Stress level (score) | 1.6 | 1.4, 1.7 | <0.001 |
| Abbreviation: CI = Confidence Interval | |||
| R² = 0.984; Adjusted R² = 0.983; AIC = 591 | |||
Interpretation:
Main Effects The model reveals that both study duration and sleep duration are significant positive predictors of academic performance. Holding all other variables constant: Study Hours: Each additional hour of study per day is associated with a 3.5-point increase in exam scores (\(p < 0.001\)). Sleep Hours: Each additional hour of sleep is associated with a 2.3-point increase in scores (\(p < 0.001\)). Stress Level: Conversely, stress level is a significant negative predictor; for every one-unit increase in the stress score, exam performance is expected to decrease by 2.1 points (\(\beta = -2.1, p < 0.001\)).
Interaction Effect Crucially, the analysis identified a significant positive interaction between study hours and stress levels (\(\beta = 1.6, p < 0.001\)). This suggests that the relationship between study hours and exam performance is moderated by the student’s stress level.Specifically, the positive coefficient for the interaction term indicates that the beneficial effect of studying on exam scores becomes more pronounced as stress levels increase, or conversely, that increased study time may help mitigate the negative impact of high stress on performance.
Model Robustness The intercept of 56 represents the predicted exam score when all continuous predictors are at zero. The narrow 95% confidence intervals and the high \(R^2\) value suggest that the model is highly stable and possesses strong predictive power within the context of this dataset.
In conclusion, our multiple linear regression analysis identified study hours, sleep hours, and stress level as significant predictors of exam scores. The interaction between study hours and stress level further highlighted the complex relationship affecting academic performance. The final model demonstrated good fit and robustness, providing valuable insights for students aiming to optimize their study habits and manage stress effectively. Future research could explore additional factors influencing exam performance to further enhance predictive accuracy.