Multiple Linear Regression Analysis

Author

Muamar Iskandar bin Mohamed Yusoff

Published

Invalid Date

Quarto

Muamar Iskandar bin Mohamed Yusoff.

Load library

Show code
library(here)
library(haven)
library(tidyverse)
library(ggplot2)
library(gtsummary)
library(broom)
library(dplyr)
library(corrplot)
library(readxl)

Load data

Show code
datais <- read_xlsx(here("Multiple_Linear_Regression_Hypothetical_Data.xlsx"))

Describe data

Show code
summary(datais)
   exam_score      study_hours      sleep_hours     stress_level  
 Min.   : 64.96   Min.   : 0.400   Min.   :3.770   Min.   : 1.00  
 1st Qu.:105.94   1st Qu.: 4.335   1st Qu.:6.450   1st Qu.: 3.00  
 Median :126.02   Median : 5.895   Median :7.050   Median : 5.00  
 Mean   :131.20   Mean   : 6.054   Mean   :6.980   Mean   : 5.31  
 3rd Qu.:153.74   3rd Qu.: 7.968   3rd Qu.:7.478   3rd Qu.: 8.00  
 Max.   :216.89   Max.   :10.780   Max.   :9.600   Max.   :10.00  

Tidy into table

Show code
library(tidyr)
datais %>%
  select(study_hours, sleep_hours, stress_level) %>%
  tbl_summary(
    statistic = list(all_continuous() ~ "{mean} ({sd})")
  )
Characteristic N = 1001
study_hours 6.05 (2.27)
sleep_hours 6.98 (0.98)
stress_level 5.31 (3.01)
1 Mean (SD)

Explore and wrangle data

Perform plots, to look for outliers, range of data, missing data, error in data entry, wrong groupings, distribution of data

Show code
ggplot(datais, aes(x = exam_score)) +
  geom_histogram(binwidth = 5, fill = "darkgreen", color = "black") +
  labs(
    title = "Distribution of Exam Scores",
    x = "Exam Score",
    y = "Frequency"
  ) +
  theme_minimal()

Show code
ggplot(datais, aes(x = study_hours, y = exam_score)) +
  geom_point(color = "blue") +
  labs(
    title = "Exam Score vs Study Hours",
    x = "Study Hours",
    y = "Exam Score"
  ) +
  theme_minimal()

Show code
ggplot(datais, aes(x = sleep_hours, y = exam_score)) +
  geom_point(color = "orange") +
  labs(
    title = "Exam Score vs Sleep Hours",
    x = "Sleep Hours",
    y = "Exam Score"
  ) +
  theme_minimal()

convert stress level into categorical

Show code
datais <- datais %>%
  mutate(
    stress_cat = case_when(
      stress_level <= 3 ~ "Low stress",
      stress_level <= 6 ~ "Moderate stress",
      TRUE ~ "High stress"
    )
  )

assess study hours distribution based on stress category

Show code
ggplot(datais, aes(x = stress_cat, y = study_hours, fill = stress_cat)) +
  geom_boxplot() +
  labs(
    title = "Study Hours by Stress Category",
    x = "Stress Category",
    y = "Study Hours"
  ) +
  theme_minimal()

assess sleep hours distribution based on stress category

Show code
ggplot(datais, aes(x = stress_cat, y = sleep_hours, fill = stress_cat)) +
  geom_boxplot() +
  labs( 
    title = "Sleep Hours by Stress Category",
    x = "Stress Category",
    y = "Sleep Hours"
  ) +
  theme_minimal()

Check correlations between variables

Show code
corrdata <- datais %>%
  select(exam_score, study_hours, sleep_hours, stress_level) %>%
  cor()
corrplot(corrdata, method = "circle", type = "upper", tl.col = "black", tl.srt = 45)

Build Linear Regression Model

Univariable model

Exam score based on study hours

Show code
model1 <- lm(exam_score ~ study_hours, data = datais)
summary(model1)

Call:
lm(formula = exam_score ~ study_hours, data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-46.587 -18.287  -2.806  17.548  59.532 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   67.425      7.066   9.542 1.19e-15 ***
study_hours   10.535      1.094   9.632 7.61e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 24.68 on 98 degrees of freedom
Multiple R-squared:  0.4863,    Adjusted R-squared:  0.4811 
F-statistic: 92.78 on 1 and 98 DF,  p-value: 7.606e-16

Exam score based on sleep hours

Show code
model2 <- lm(exam_score ~ sleep_hours, data = datais)
summary(model2)

Call:
lm(formula = exam_score ~ sleep_hours, data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-61.336 -27.764  -3.739  21.322  88.006 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   99.501     24.794   4.013 0.000117 ***
sleep_hours    4.542      3.518   1.291 0.199799    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 34.14 on 98 degrees of freedom
Multiple R-squared:  0.01672,   Adjusted R-squared:  0.006685 
F-statistic: 1.666 on 1 and 98 DF,  p-value: 0.1998

Exam score based on stress level

Show code
model3 <- lm(exam_score ~ stress_level, data = datais)
summary(model3)

Call:
lm(formula = exam_score ~ stress_level, data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-71.384 -19.179   0.353  17.867  61.816 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   96.8474     5.7642  16.802  < 2e-16 ***
stress_level   6.4696     0.9455   6.843 6.77e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 28.32 on 98 degrees of freedom
Multiple R-squared:  0.3233,    Adjusted R-squared:  0.3164 
F-statistic: 46.82 on 1 and 98 DF,  p-value: 6.77e-10

Multivariable model

Study hours + Sleep Hours

Show code
model4 <- lm(exam_score ~ study_hours + sleep_hours, data = datais)
summary(model4)

Call:
lm(formula = exam_score ~ study_hours + sleep_hours, data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-45.129 -18.519  -1.474  16.743  59.990 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   32.294     18.936   1.705   0.0913 .  
study_hours   10.575      1.078   9.813  3.4e-16 ***
sleep_hours    4.998      2.506   1.995   0.0489 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 24.31 on 97 degrees of freedom
Multiple R-squared:  0.5066,    Adjusted R-squared:  0.4964 
F-statistic: 49.79 on 2 and 97 DF,  p-value: 1.323e-15

Study hours + Stress level

Show code
model5 <- lm(exam_score ~ study_hours + stress_level, data = datais)
summary(model5)

Call:
lm(formula = exam_score ~ study_hours + stress_level, data = datais)

Residuals:
     Min       1Q   Median       3Q      Max 
-24.0089  -8.3616   0.9474   6.9240  23.3707 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   21.6341     3.7557    5.76 9.89e-08 ***
study_hours   11.5974     0.4736   24.49  < 2e-16 ***
stress_level   7.4124     0.3567   20.78  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.62 on 97 degrees of freedom
Multiple R-squared:  0.9058,    Adjusted R-squared:  0.9039 
F-statistic: 466.3 on 2 and 97 DF,  p-value: < 2.2e-16

Sleep hours + Stress level

Show code
model6 <- lm(exam_score ~ sleep_hours + stress_level, data = datais)
summary(model6)

Call:
lm(formula = exam_score ~ sleep_hours + stress_level, data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-72.506 -18.210   0.135  18.392  63.682 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   75.3552    20.8616   3.612 0.000483 ***
sleep_hours    3.1341     2.9239   1.072 0.286428    
stress_level   6.3973     0.9471   6.754 1.06e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 28.3 on 97 degrees of freedom
Multiple R-squared:  0.3312,    Adjusted R-squared:  0.3175 
F-statistic: 24.02 on 2 and 97 DF,  p-value: 3.354e-09

Study hours + Sleep hours + Stress level

Show code
model7 <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data =
 datais)
summary(model7)

Call:
lm(formula = exam_score ~ study_hours + sleep_hours + stress_level, 
    data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-25.170  -8.138   1.587   5.615  23.023 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -1.9899     8.0465  -0.247  0.80520    
study_hours   11.6136     0.4514  25.726  < 2e-16 ***
sleep_hours    3.4296     1.0462   3.278  0.00146 ** 
stress_level   7.3346     0.3408  21.521  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.13 on 96 degrees of freedom
Multiple R-squared:  0.9153,    Adjusted R-squared:  0.9126 
F-statistic: 345.7 on 3 and 96 DF,  p-value: < 2.2e-16
Show code
tidy(model1, conf.int = TRUE)
# A tibble: 2 × 7
  term        estimate std.error statistic  p.value conf.low conf.high
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
1 (Intercept)     67.4      7.07      9.54 1.19e-15    53.4       81.4
2 study_hours     10.5      1.09      9.63 7.61e-16     8.36      12.7
Show code
tidy(model2, conf.int = TRUE)
# A tibble: 2 × 7
  term        estimate std.error statistic  p.value conf.low conf.high
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
1 (Intercept)    99.5      24.8       4.01 0.000117    50.3      149. 
2 sleep_hours     4.54      3.52      1.29 0.200       -2.44      11.5
Show code
tidy(model3, conf.int = TRUE)
# A tibble: 2 × 7
  term         estimate std.error statistic  p.value conf.low conf.high
  <chr>           <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
1 (Intercept)     96.8      5.76      16.8  1.30e-30    85.4     108.  
2 stress_level     6.47     0.945      6.84 6.77e-10     4.59      8.35
Show code
tidy(model4, conf.int = TRUE)
# A tibble: 3 × 7
  term        estimate std.error statistic  p.value conf.low conf.high
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
1 (Intercept)    32.3      18.9       1.71 9.13e- 2  -5.29       69.9 
2 study_hours    10.6       1.08      9.81 3.40e-16   8.44       12.7 
3 sleep_hours     5.00      2.51      1.99 4.89e- 2   0.0255      9.97
Show code
tidy(model5, conf.int = TRUE)
# A tibble: 3 × 7
  term         estimate std.error statistic  p.value conf.low conf.high
  <chr>           <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
1 (Intercept)     21.6      3.76       5.76 9.89e- 8    14.2      29.1 
2 study_hours     11.6      0.474     24.5  2.56e-43    10.7      12.5 
3 stress_level     7.41     0.357     20.8  1.68e-37     6.70      8.12
Show code
tidy(model6, conf.int = TRUE)
# A tibble: 3 × 7
  term         estimate std.error statistic       p.value conf.low conf.high
  <chr>           <dbl>     <dbl>     <dbl>         <dbl>    <dbl>     <dbl>
1 (Intercept)     75.4     20.9        3.61 0.000483         34.0     117.  
2 sleep_hours      3.13     2.92       1.07 0.286            -2.67      8.94
3 stress_level     6.40     0.947      6.75 0.00000000106     4.52      8.28
Show code
tidy(model7, conf.int = TRUE)
# A tibble: 4 × 7
  term         estimate std.error statistic  p.value conf.low conf.high
  <chr>           <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
1 (Intercept)     -1.99     8.05     -0.247 8.05e- 1   -18.0      14.0 
2 study_hours     11.6      0.451    25.7   7.39e-45    10.7      12.5 
3 sleep_hours      3.43     1.05      3.28  1.46e- 3     1.35      5.51
4 stress_level     7.33     0.341    21.5   1.65e-38     6.66      8.01

Model selection

Start with backward elimination method

Show code
multivar.m <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)
stepwise.bw <- step(multivar.m, direction = "backward")
Start:  AIC=466.93
exam_score ~ study_hours + sleep_hours + stress_level

               Df Sum of Sq   RSS    AIC
<none>                       9842 466.93
- sleep_hours   1      1102 10944 475.54
- stress_level  1     47482 57324 641.13
- study_hours   1     67850 77692 671.53

Make minimal and full model

Show code
minimal.model <- lm(exam_score ~ 1, data = datais)
full.model <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)

run both and forward stepwise selection

Show code
step.both <- step(minimal.model, direction = "both", scope = list(upper = full.model))
Start:  AIC=707.77
exam_score ~ 1

               Df Sum of Sq    RSS    AIC
+ study_hours   1     56498  59676 643.15
+ stress_level  1     37561  78612 670.71
<none>                      116173 707.77
+ sleep_hours   1      1942 114231 708.08

Step:  AIC=643.15
exam_score ~ study_hours

               Df Sum of Sq    RSS    AIC
+ stress_level  1     48732  10944 475.54
+ sleep_hours   1      2352  57324 641.13
<none>                       59676 643.15
- study_hours   1     56498 116173 707.77

Step:  AIC=475.54
exam_score ~ study_hours + stress_level

               Df Sum of Sq   RSS    AIC
+ sleep_hours   1      1102  9842 466.93
<none>                      10944 475.54
- stress_level  1     48732 59676 643.15
- study_hours   1     67668 78612 670.71

Step:  AIC=466.93
exam_score ~ study_hours + stress_level + sleep_hours

               Df Sum of Sq   RSS    AIC
<none>                       9842 466.93
- sleep_hours   1      1102 10944 475.54
- stress_level  1     47482 57324 641.13
- study_hours   1     67850 77692 671.53
Show code
step.fw <- step(minimal.model, direction = "forward", scope = list(upper = full.model))
Start:  AIC=707.77
exam_score ~ 1

               Df Sum of Sq    RSS    AIC
+ study_hours   1     56498  59676 643.15
+ stress_level  1     37561  78612 670.71
<none>                      116173 707.77
+ sleep_hours   1      1942 114231 708.08

Step:  AIC=643.15
exam_score ~ study_hours

               Df Sum of Sq   RSS    AIC
+ stress_level  1     48732 10944 475.54
+ sleep_hours   1      2352 57324 641.13
<none>                      59676 643.15

Step:  AIC=475.54
exam_score ~ study_hours + stress_level

              Df Sum of Sq     RSS    AIC
+ sleep_hours  1    1101.8  9842.1 466.93
<none>                     10943.9 475.54

Step:  AIC=466.93
exam_score ~ study_hours + stress_level + sleep_hours

View ANOVA table for stepwise selections

Show code
step.both$anova
            Step Df  Deviance Resid. Df Resid. Dev      AIC
1                NA        NA        99 116173.278 707.7668
2  + study_hours -1 56497.536        98  59675.742 643.1511
3 + stress_level -1 48731.825        97  10943.917 475.5369
4  + sleep_hours -1  1101.835        96   9842.083 466.9252
Show code
step.fw$anova
            Step Df  Deviance Resid. Df Resid. Dev      AIC
1                NA        NA        99 116173.278 707.7668
2  + study_hours -1 56497.536        98  59675.742 643.1511
3 + stress_level -1 48731.825        97  10943.917 475.5369
4  + sleep_hours -1  1101.835        96   9842.083 466.9252
Show code
stepwise.bw$anova
  Step Df Deviance Resid. Df Resid. Dev      AIC
1      NA       NA        96   9842.083 466.9252

Make Final Model

Based on variable selection

We decide all variable should be included.

Show code
final.model <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)
summary(final.model)

Call:
lm(formula = exam_score ~ study_hours + sleep_hours + stress_level, 
    data = datais)

Residuals:
    Min      1Q  Median      3Q     Max 
-25.170  -8.138   1.587   5.615  23.023 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -1.9899     8.0465  -0.247  0.80520    
study_hours   11.6136     0.4514  25.726  < 2e-16 ***
sleep_hours    3.4296     1.0462   3.278  0.00146 ** 
stress_level   7.3346     0.3408  21.521  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.13 on 96 degrees of freedom
Multiple R-squared:  0.9153,    Adjusted R-squared:  0.9126 
F-statistic: 345.7 on 3 and 96 DF,  p-value: < 2.2e-16

To check either Study hours and Sleep hours are correlated

Show code
cor(datais$study_hours, datais$sleep_hours)
[1] -0.01858065

Results show very low correlation (0.12) between Study hours and Sleep hours.

Check for interaction

Only include interaction term that is meaningful

Show code
interaction.model1 <- lm(exam_score ~ study_hours + sleep_hours + stress_level + study_hours * stress_level, data = datais)
tidy(interaction.model1)
# A tibble: 5 × 5
  term                     estimate std.error statistic  p.value
  <chr>                       <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)                 55.7     4.60       12.1  5.59e-21
2 study_hours                  3.46    0.457       7.56 2.50e-11
3 sleep_hours                  2.34    0.467       5.02 2.43e- 6
4 stress_level                -2.07    0.497      -4.17 6.78e- 5
5 study_hours:stress_level     1.57    0.0789     19.9  1.43e-35
Show code
nointeraction.model1 <- lm(exam_score ~ study_hours + sleep_hours + stress_level, data = datais)
tidy(nointeraction.model1)
# A tibble: 4 × 5
  term         estimate std.error statistic  p.value
  <chr>           <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)     -1.99     8.05     -0.247 8.05e- 1
2 study_hours     11.6      0.451    25.7   7.39e-45
3 sleep_hours      3.43     1.05      3.28  1.46e- 3
4 stress_level     7.33     0.341    21.5   1.65e-38

compare interaction model with no interaction model

Show code
anova(nointeraction.model1, interaction.model1)
Analysis of Variance Table

Model 1: exam_score ~ study_hours + sleep_hours + stress_level
Model 2: exam_score ~ study_hours + sleep_hours + stress_level + study_hours * 
    stress_level
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1     96 9842.1                                  
2     95 1911.8  1    7930.3 394.07 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model assessment

Build Preliminary final model

Show code
prelim.final.model <- lm(exam_score ~ study_hours + stress_level + sleep_hours + study_hours*stress_level, data = datais)

plot(prelim.final.model)

Check QQ Plot

Show code
library(lmtest)
bptest(prelim.final.model)

    studentized Breusch-Pagan test

data:  prelim.final.model
BP = 3.6463, df = 4, p-value = 0.456

Check Shapiro wilk test

Show code
shapiro.test(prelim.final.model$residuals)

    Shapiro-Wilk normality test

data:  prelim.final.model$residuals
W = 0.98631, p-value = 0.3928

Plot residuals against independent variables in the model

1. study hours

Show code
augment(prelim.final.model) %>%
  ggplot(aes(x = study_hours, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residuals vs Study Hours",
    x = "Study Hours",
    y = "Residuals"
  ) +
  theme_minimal()

2. stress level

Show code
augment(prelim.final.model) %>%
  ggplot(aes(x = stress_level, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residuals vs Stress Level",
    x = "Stress Level",
    y = "Residuals"
  ) +
  theme_minimal()

3. Sleep hours

Show code
augment(prelim.final.model) %>%
  ggplot(aes(x = sleep_hours, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residuals vs Sleep Hours",
    x = "Sleep Hours",
    y = "Residuals"
  ) +
  theme_minimal()

Fitted values, residuals and influential values

make residuals value

Show code
res.mod <- residuals(prelim.final.model)
head(res.mod)
         1          2          3          4          5          6 
 0.9814899  3.6097311  0.7203962  4.7086989 -1.3011740  2.8956146 
Show code
hist(res.mod)

Alternatively, we can use broom::augment() to generate predictions, residuals and cluster assignments.

Show code
library(DT)
datais.pred.res <- augment(prelim.final.model)
datais.pred.res %>% datatable()

Predictions

Make new data frame for prediction

Show code
new_data <- expand.grid(
  study_hours  = c(2, 4, 6, 8),
  stress_level = c(1, 3, 5),
  sleep_hours = c(2,4,6,8)
)

new_data
   study_hours stress_level sleep_hours
1            2            1           2
2            4            1           2
3            6            1           2
4            8            1           2
5            2            3           2
6            4            3           2
7            6            3           2
8            8            3           2
9            2            5           2
10           4            5           2
11           6            5           2
12           8            5           2
13           2            1           4
14           4            1           4
15           6            1           4
16           8            1           4
17           2            3           4
18           4            3           4
19           6            3           4
20           8            3           4
21           2            5           4
22           4            5           4
23           6            5           4
24           8            5           4
25           2            1           6
26           4            1           6
27           6            1           6
28           8            1           6
29           2            3           6
30           4            3           6
31           6            3           6
32           8            3           6
33           2            5           6
34           4            5           6
35           6            5           6
36           8            5           6
37           2            1           8
38           4            1           8
39           6            1           8
40           8            1           8
41           2            3           8
42           4            3           8
43           6            3           8
44           8            3           8
45           2            5           8
46           4            5           8
47           6            5           8
48           8            5           8

predictions table

Show code
pred_exam_score <- augment(
  prelim.final.model,
  newdata = new_data
)

pred_exam_score %>% datatable()

Influentials

1. Cooks distance : values above 1 or above 4/n

2. standardized residuals : values above 2 or lower than −2

3. leverage above 2k+2n

= 2(7)+2/4225=0.0038???

### Cook’s distance

Show code
cutoff <- 4/((nrow(datais)-length(prelim.final.model$coefficients)-2))
plot(prelim.final.model, which = 4, cook.levels = cutoff)

Show code
datais.pred.res %>% 
  filter(.std.resid > 2 | .std.resid < -2 | .hat > 0.0038) %>%
  datatable()

keep standardized residuals between -2 and 2

Show code
non.influen.obs <- 
  datais.pred.res %>% 
  filter(.std.resid < 2 & .std.resid > -2 & .hat < 0.0038)

Compare models