Multiple Linear Regression Analysis

Author

Muamar Iskandar bin Mohamed Yusoff

Published

December 23, 2025

Quarto

Muamar Iskandar bin Mohamed Yusoff.

Load library

library(here)
library(haven)
library(tidyverse)
library(ggplot2)
library(gtsummary)
library(broom)
library(dplyr)
library(corrplot)
library(readxl)

Load data

datais <- read_xlsx(here("Multiple_Linear_Regression_Hypothetical_Data.xlsx"))

Describe data

summary(datais)
   exam_score      study_hours      sleep_hours     stress_level  
 Min.   : 64.96   Min.   : 0.400   Min.   :3.770   Min.   : 1.00  
 1st Qu.:105.94   1st Qu.: 4.335   1st Qu.:6.450   1st Qu.: 3.00  
 Median :126.02   Median : 5.895   Median :7.050   Median : 5.00  
 Mean   :131.20   Mean   : 6.054   Mean   :6.980   Mean   : 5.31  
 3rd Qu.:153.74   3rd Qu.: 7.968   3rd Qu.:7.478   3rd Qu.: 8.00  
 Max.   :216.89   Max.   :10.780   Max.   :9.600   Max.   :10.00  

Tidy into table

library(tidyr)
datais %>%
  select(study_hours, sleep_hours, stress_level) %>%
  tbl_summary(
    statistic = list(all_continuous() ~ "{mean} ({sd})")
  )
Characteristic N = 1001
study_hours 6.05 (2.27)
sleep_hours 6.98 (0.98)
stress_level 5.31 (3.01)
1 Mean (SD)

Explore and wrangle data

Perform plots, to look for outliers, range of data, missing data, error in data entry, wrong groupings, distribution of data

ggplot(datais, aes(x = exam_score)) +
  geom_histogram(binwidth = 5, fill = "darkgreen", color = "black") +
  labs(
    title = "Distribution of Exam Scores",
    x = "Exam Score",
    y = "Frequency"
  ) +
  theme_minimal()

ggplot(datais, aes(x = study_hours, y = exam_score)) +
  geom_point(color = "blue") +
  labs(
    title = "Exam Score vs Study Hours",
    x = "Study Hours",
    y = "Exam Score"
  ) +
  theme_minimal()

ggplot(datais, aes(x = sleep_hours, y = exam_score)) +
  geom_point(color = "orange") +
  labs(
    title = "Exam Score vs Sleep Hours",
    x = "Sleep Hours",
    y = "Exam Score"
  ) +
  theme_minimal()

convert stress level into categorical

datais <- datais %>%
  mutate(
    stress_cat = case_when(
      stress_level <= 3 ~ "Low stress",
      stress_level <= 6 ~ "Moderate stress",
      TRUE ~ "High stress"
    )
  )

assess study hours distribution based on stress category

ggplot(datais, aes(x = stress_cat, y = study_hours, fill = stress_cat)) +
  geom_boxplot() +
  labs(
    title = "Study Hours by Stress Category",
    x = "Stress Category",
    y = "Study Hours"
  ) +
  theme_minimal()

assess sleep hours distribution based on stress category

ggplot(datais, aes(x = stress_cat, y = sleep_hours, fill = stress_cat)) +
  geom_boxplot() +
  labs( 
    title = "Sleep Hours by Stress Category",
    x = "Stress Category",
    y = "Sleep Hours"
  ) +
  theme_minimal()

Check correlations between variables

corrdata <- datais %>%
  select(exam_score, study_hours, sleep_hours, stress_level) %>%
  cor()
corrplot(corrdata, method = "circle", type = "upper", tl.col = "black", tl.srt = 45)

Build Linear Regression Model

Univariable model

Exam score based on study hours

model1 <- lm(exam_score ~ study_hours, data = datais)
summary(model1)

Call:
lm(formula = exam_score ~ study_hours, data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-46.587 -18.287  -2.806  17.548  59.532 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   67.425      7.066   9.542 1.19e-15 ***
study_hours   10.535      1.094   9.632 7.61e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 24.68 on 98 degrees of freedom
Multiple R-squared:  0.4863,    Adjusted R-squared:  0.4811 
F-statistic: 92.78 on 1 and 98 DF,  p-value: 7.606e-16

Exam score based on sleep hours

model2 <- lm(exam_score ~ sleep_hours, data = datais)
summary(model2)

Call:
lm(formula = exam_score ~ sleep_hours, data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-61.336 -27.764  -3.739  21.322  88.006 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   99.501     24.794   4.013 0.000117 ***
sleep_hours    4.542      3.518   1.291 0.199799    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 34.14 on 98 degrees of freedom
Multiple R-squared:  0.01672,   Adjusted R-squared:  0.006685 
F-statistic: 1.666 on 1 and 98 DF,  p-value: 0.1998

Exam score based on stress level

model3 <- lm(exam_score ~ stress_level, data = datais)
summary(model3)

Call:
lm(formula = exam_score ~ stress_level, data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-71.384 -19.179   0.353  17.867  61.816 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   96.8474     5.7642  16.802  < 2e-16 ***
stress_level   6.4696     0.9455   6.843 6.77e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 28.32 on 98 degrees of freedom
Multiple R-squared:  0.3233,    Adjusted R-squared:  0.3164 
F-statistic: 46.82 on 1 and 98 DF,  p-value: 6.77e-10

Multivariable model

Study hours + Sleep Hours

model4 <- lm(exam_score ~ study_hours + sleep_hours, data = datais)
summary(model4)

Call:
lm(formula = exam_score ~ study_hours + sleep_hours, data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-45.129 -18.519  -1.474  16.743  59.990 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   32.294     18.936   1.705   0.0913 .  
study_hours   10.575      1.078   9.813  3.4e-16 ***
sleep_hours    4.998      2.506   1.995   0.0489 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 24.31 on 97 degrees of freedom
Multiple R-squared:  0.5066,    Adjusted R-squared:  0.4964 
F-statistic: 49.79 on 2 and 97 DF,  p-value: 1.323e-15

Study hours + Stress level

model5 <- lm(exam_score ~ study_hours + stress_level, data = datais)
summary(model5)

Call:
lm(formula = exam_score ~ study_hours + stress_level, data = datais)

Residuals:
     Min       1Q   Median       3Q      Max 
-24.0089  -8.3616   0.9474   6.9240  23.3707 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   21.6341     3.7557    5.76 9.89e-08 ***
study_hours   11.5974     0.4736   24.49  < 2e-16 ***
stress_level   7.4124     0.3567   20.78  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.62 on 97 degrees of freedom
Multiple R-squared:  0.9058,    Adjusted R-squared:  0.9039 
F-statistic: 466.3 on 2 and 97 DF,  p-value: < 2.2e-16

Sleep hours + Stress level

model6 <- lm(exam_score ~ sleep_hours + stress_level, data = datais)
summary(model6)

Call:
lm(formula = exam_score ~ sleep_hours + stress_level, data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-72.506 -18.210   0.135  18.392  63.682 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   75.3552    20.8616   3.612 0.000483 ***
sleep_hours    3.1341     2.9239   1.072 0.286428    
stress_level   6.3973     0.9471   6.754 1.06e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 28.3 on 97 degrees of freedom
Multiple R-squared:  0.3312,    Adjusted R-squared:  0.3175 
F-statistic: 24.02 on 2 and 97 DF,  p-value: 3.354e-09

Study hours + Sleep hours + Stress level

model7 <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data =
 datais)
summary(model7)

Call:
lm(formula = exam_score ~ study_hours + sleep_hours + stress_level, 
    data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-25.170  -8.138   1.587   5.615  23.023 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -1.9899     8.0465  -0.247  0.80520    
study_hours   11.6136     0.4514  25.726  < 2e-16 ***
sleep_hours    3.4296     1.0462   3.278  0.00146 ** 
stress_level   7.3346     0.3408  21.521  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.13 on 96 degrees of freedom
Multiple R-squared:  0.9153,    Adjusted R-squared:  0.9126 
F-statistic: 345.7 on 3 and 96 DF,  p-value: < 2.2e-16
tidy(model1, conf.int = TRUE)
# A tibble: 2 × 7
  term        estimate std.error statistic  p.value conf.low conf.high
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
1 (Intercept)     67.4      7.07      9.54 1.19e-15    53.4       81.4
2 study_hours     10.5      1.09      9.63 7.61e-16     8.36      12.7
tidy(model2, conf.int = TRUE)
# A tibble: 2 × 7
  term        estimate std.error statistic  p.value conf.low conf.high
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
1 (Intercept)    99.5      24.8       4.01 0.000117    50.3      149. 
2 sleep_hours     4.54      3.52      1.29 0.200       -2.44      11.5
tidy(model3, conf.int = TRUE)
# A tibble: 2 × 7
  term         estimate std.error statistic  p.value conf.low conf.high
  <chr>           <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
1 (Intercept)     96.8      5.76      16.8  1.30e-30    85.4     108.  
2 stress_level     6.47     0.945      6.84 6.77e-10     4.59      8.35
tidy(model4, conf.int = TRUE)
# A tibble: 3 × 7
  term        estimate std.error statistic  p.value conf.low conf.high
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
1 (Intercept)    32.3      18.9       1.71 9.13e- 2  -5.29       69.9 
2 study_hours    10.6       1.08      9.81 3.40e-16   8.44       12.7 
3 sleep_hours     5.00      2.51      1.99 4.89e- 2   0.0255      9.97
tidy(model5, conf.int = TRUE)
# A tibble: 3 × 7
  term         estimate std.error statistic  p.value conf.low conf.high
  <chr>           <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
1 (Intercept)     21.6      3.76       5.76 9.89e- 8    14.2      29.1 
2 study_hours     11.6      0.474     24.5  2.56e-43    10.7      12.5 
3 stress_level     7.41     0.357     20.8  1.68e-37     6.70      8.12
tidy(model6, conf.int = TRUE)
# A tibble: 3 × 7
  term         estimate std.error statistic       p.value conf.low conf.high
  <chr>           <dbl>     <dbl>     <dbl>         <dbl>    <dbl>     <dbl>
1 (Intercept)     75.4     20.9        3.61 0.000483         34.0     117.  
2 sleep_hours      3.13     2.92       1.07 0.286            -2.67      8.94
3 stress_level     6.40     0.947      6.75 0.00000000106     4.52      8.28
tidy(model7, conf.int = TRUE)
# A tibble: 4 × 7
  term         estimate std.error statistic  p.value conf.low conf.high
  <chr>           <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
1 (Intercept)     -1.99     8.05     -0.247 8.05e- 1   -18.0      14.0 
2 study_hours     11.6      0.451    25.7   7.39e-45    10.7      12.5 
3 sleep_hours      3.43     1.05      3.28  1.46e- 3     1.35      5.51
4 stress_level     7.33     0.341    21.5   1.65e-38     6.66      8.01

Model selection

Start with backward elimination method

multivar.m <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)
stepwise.bw <- step(multivar.m, direction = "backward")
Start:  AIC=466.93
exam_score ~ study_hours + sleep_hours + stress_level

               Df Sum of Sq   RSS    AIC
<none>                       9842 466.93
- sleep_hours   1      1102 10944 475.54
- stress_level  1     47482 57324 641.13
- study_hours   1     67850 77692 671.53

Make minimal and full model

minimal.model <- lm(exam_score ~ 1, data = datais)
full.model <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)

run both and forward stepwise selection

step.both <- step(minimal.model, direction = "both", scope = list(upper = full.model))
Start:  AIC=707.77
exam_score ~ 1

               Df Sum of Sq    RSS    AIC
+ study_hours   1     56498  59676 643.15
+ stress_level  1     37561  78612 670.71
<none>                      116173 707.77
+ sleep_hours   1      1942 114231 708.08

Step:  AIC=643.15
exam_score ~ study_hours

               Df Sum of Sq    RSS    AIC
+ stress_level  1     48732  10944 475.54
+ sleep_hours   1      2352  57324 641.13
<none>                       59676 643.15
- study_hours   1     56498 116173 707.77

Step:  AIC=475.54
exam_score ~ study_hours + stress_level

               Df Sum of Sq   RSS    AIC
+ sleep_hours   1      1102  9842 466.93
<none>                      10944 475.54
- stress_level  1     48732 59676 643.15
- study_hours   1     67668 78612 670.71

Step:  AIC=466.93
exam_score ~ study_hours + stress_level + sleep_hours

               Df Sum of Sq   RSS    AIC
<none>                       9842 466.93
- sleep_hours   1      1102 10944 475.54
- stress_level  1     47482 57324 641.13
- study_hours   1     67850 77692 671.53
step.fw <- step(minimal.model, direction = "forward", scope = list(upper = full.model))
Start:  AIC=707.77
exam_score ~ 1

               Df Sum of Sq    RSS    AIC
+ study_hours   1     56498  59676 643.15
+ stress_level  1     37561  78612 670.71
<none>                      116173 707.77
+ sleep_hours   1      1942 114231 708.08

Step:  AIC=643.15
exam_score ~ study_hours

               Df Sum of Sq   RSS    AIC
+ stress_level  1     48732 10944 475.54
+ sleep_hours   1      2352 57324 641.13
<none>                      59676 643.15

Step:  AIC=475.54
exam_score ~ study_hours + stress_level

              Df Sum of Sq     RSS    AIC
+ sleep_hours  1    1101.8  9842.1 466.93
<none>                     10943.9 475.54

Step:  AIC=466.93
exam_score ~ study_hours + stress_level + sleep_hours

View ANOVA table for stepwise selections

step.both$anova
            Step Df  Deviance Resid. Df Resid. Dev      AIC
1                NA        NA        99 116173.278 707.7668
2  + study_hours -1 56497.536        98  59675.742 643.1511
3 + stress_level -1 48731.825        97  10943.917 475.5369
4  + sleep_hours -1  1101.835        96   9842.083 466.9252
step.fw$anova
            Step Df  Deviance Resid. Df Resid. Dev      AIC
1                NA        NA        99 116173.278 707.7668
2  + study_hours -1 56497.536        98  59675.742 643.1511
3 + stress_level -1 48731.825        97  10943.917 475.5369
4  + sleep_hours -1  1101.835        96   9842.083 466.9252
stepwise.bw$anova
  Step Df Deviance Resid. Df Resid. Dev      AIC
1      NA       NA        96   9842.083 466.9252

Make Final Model

Based on variable selection

We decide all variable should be included.

final.model <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)
summary(final.model)

Call:
lm(formula = exam_score ~ study_hours + sleep_hours + stress_level, 
    data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-25.170  -8.138   1.587   5.615  23.023 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -1.9899     8.0465  -0.247  0.80520    
study_hours   11.6136     0.4514  25.726  < 2e-16 ***
sleep_hours    3.4296     1.0462   3.278  0.00146 ** 
stress_level   7.3346     0.3408  21.521  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.13 on 96 degrees of freedom
Multiple R-squared:  0.9153,    Adjusted R-squared:  0.9126 
F-statistic: 345.7 on 3 and 96 DF,  p-value: < 2.2e-16

To check either Study hours and Sleep hours are correlated

cor(datais$study_hours, datais$sleep_hours)
[1] -0.01858065

Results show very low correlation (0.12) between Study hours and Sleep hours.

Check for interaction

Only include interaction term that is meaningful

interaction.model1 <- lm(exam_score ~ study_hours + sleep_hours + stress_level + study_hours * stress_level, data = datais)
tidy(interaction.model1)
# A tibble: 5 × 5
  term                     estimate std.error statistic  p.value
  <chr>                       <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)                 55.7     4.60       12.1  5.59e-21
2 study_hours                  3.46    0.457       7.56 2.50e-11
3 sleep_hours                  2.34    0.467       5.02 2.43e- 6
4 stress_level                -2.07    0.497      -4.17 6.78e- 5
5 study_hours:stress_level     1.57    0.0789     19.9  1.43e-35
nointeraction.model1 <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)
tidy(nointeraction.model1)
# A tibble: 4 × 5
  term         estimate std.error statistic  p.value
  <chr>           <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)     -1.99     8.05     -0.247 8.05e- 1
2 study_hours     11.6      0.451    25.7   7.39e-45
3 sleep_hours      3.43     1.05      3.28  1.46e- 3
4 stress_level     7.33     0.341    21.5   1.65e-38

compare interaction model with no interaction model

anova(nointeraction.model1, interaction.model1)
Analysis of Variance Table

Model 1: exam_score ~ study_hours + sleep_hours + stress_level
Model 2: exam_score ~ study_hours + sleep_hours + stress_level + study_hours * 
    stress_level
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1     96 9842.1                                  
2     95 1911.8  1    7930.3 394.07 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model assessment

Build Preliminary final model

prelim.final.model <- lm(exam_score ~ study_hours + stress_level + sleep_hours + study_hours*stress_level, data = datais)

plot(prelim.final.model)

Check QQ Plot

library(lmtest)
bptest(prelim.final.model)

    studentized Breusch-Pagan test

data:  prelim.final.model
BP = 3.6463, df = 4, p-value = 0.456

Check Shapiro wilk test

shapiro.test(prelim.final.model$residuals)

    Shapiro-Wilk normality test

data:  prelim.final.model$residuals
W = 0.98631, p-value = 0.3928

Plot residuals against independent variables in the model

1. study hours

augment(prelim.final.model) %>%
  ggplot(aes(x = study_hours, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residuals vs Study Hours",
    x = "Study Hours",
    y = "Residuals"
  ) +
  theme_minimal()

2. stress level

augment(prelim.final.model) %>%
  ggplot(aes(x = stress_level, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residuals vs Stress Level",
    x = "Stress Level",
    y = "Residuals"
  ) +
  theme_minimal()

3. Sleep hours

augment(prelim.final.model) %>%
  ggplot(aes(x = sleep_hours, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residuals vs Sleep Hours",
    x = "Sleep Hours",
    y = "Residuals"
  ) +
  theme_minimal()

Fitted values, residuals and influential values

make residuals value

res.mod <- residuals(prelim.final.model)
head(res.mod)
         1          2          3          4          5          6 
 0.9814899  3.6097311  0.7203962  4.7086989 -1.3011740  2.8956146 
hist(res.mod)

Alternatively, we can use broom::augment() to generate predictions, residuals and cluster assignments.

library(DT)
datais.pred.res <- augment(prelim.final.model)
datais.pred.res %>% datatable()

Predictions

Make new data frame for prediction

new_data <- expand.grid(
  study_hours  = c(2, 4, 6, 8),
  stress_level = c(1, 3, 5),
  sleep_hours = c(2,4,6,8)
)

new_data
   study_hours stress_level sleep_hours
1            2            1           2
2            4            1           2
3            6            1           2
4            8            1           2
5            2            3           2
6            4            3           2
7            6            3           2
8            8            3           2
9            2            5           2
10           4            5           2
11           6            5           2
12           8            5           2
13           2            1           4
14           4            1           4
15           6            1           4
16           8            1           4
17           2            3           4
18           4            3           4
19           6            3           4
20           8            3           4
21           2            5           4
22           4            5           4
23           6            5           4
24           8            5           4
25           2            1           6
26           4            1           6
27           6            1           6
28           8            1           6
29           2            3           6
30           4            3           6
31           6            3           6
32           8            3           6
33           2            5           6
34           4            5           6
35           6            5           6
36           8            5           6
37           2            1           8
38           4            1           8
39           6            1           8
40           8            1           8
41           2            3           8
42           4            3           8
43           6            3           8
44           8            3           8
45           2            5           8
46           4            5           8
47           6            5           8
48           8            5           8

predictions table

pred_exam_score <- augment(
  prelim.final.model,
  newdata = new_data
)

pred_exam_score %>% datatable()

Influentials

1. Cooks distance : values above 1 or above 4/n

2. standardized residuals : values above 2 or lower than −2

3. leverage above 2k+2n

= 2(7)+2/4225=0.0038???

### Cook’s distance

cutoff <- 4/((nrow(datais)-length(prelim.final.model$coefficients)-2))
plot(prelim.final.model, which = 4, cook.levels = cutoff)

datais.pred.res %>% 
  filter(.std.resid > 2 | .std.resid < -2 | .hat > 0.0038) %>%
  datatable()

keep standardized residuals between -2 and 2

non.influen.obs <- 
  datais.pred.res %>% 
  filter(.std.resid < 2 & .std.resid > -2 & .hat < 0.0038)

Compare models