PART A: Introduction and Data Generation

1.1 Dataset/ Simulation

1.2 Analysis overview

RQ: Does the amount of sleep, study hours, and stress level significantly affect students’ exam scores?

RO: To analyze the impact of SleepHours, StudyHours, and StressLevel on ExamScore using multiple linear regression.

1.3 Prepare Environment and Read data

1.3.1 Load required Libary

# Data manipulation and workflow
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

# Data reporting & tables
library(gtsummary)
library(knitr)

# Data visualisation
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
# Modelling and diagnostics
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:gtsummary':
## 
##     select
## 
## The following object is masked from 'package:dplyr':
## 
##     select
# Model ouputs
library(broom)
library(broom.helpers)
## 
## Attaching package: 'broom.helpers'
## 
## The following objects are masked from 'package:gtsummary':
## 
##     all_categorical, all_continuous, all_contrasts, all_dichotomous,
##     all_interaction, all_intercepts

1.3.2 glimpse dataset

glimpse(student_data)
## Rows: 200
## Columns: 4
## $ study_hours  <dbl> 2.439524, 2.769823, 4.558708, 3.070508, 3.129288, 4.71506…
## $ sleep_hours  <dbl> 9.638572, 8.574896, 6.681826, 7.651833, 6.502792, 6.42850…
## $ stress_level <dbl> 4.889666, 3.247023, 4.047878, 4.956738, 6.006044, 2.52418…
## $ exam_score   <dbl> 154.55861, 120.76109, 123.04272, 124.59006, 132.00996, 10…

1.3.3 Transform data

Convert obesity to factors with ordered levels

student_data <- student_data %>%
  mutate(
    StressCategory = case_when(
      stress_level <= 5 ~ "Low Stress",
      stress_level > 5  ~ "High Stress"
    ),
    StressCategory = factor(StressCategory, levels = c("Low Stress", "High Stress"))
  )

head(student_data)
##   study_hours sleep_hours stress_level exam_score StressCategory
## 1    2.439524    9.638572     4.889666   154.5586     Low Stress
## 2    2.769823    8.574896     3.247023   120.7611     Low Stress
## 3    4.558708    6.681826     4.047878   123.0427     Low Stress
## 4    3.070508    7.651833     4.956738   124.5901     Low Stress
## 5    3.129288    6.502792     6.006044   132.0100    High Stress
## 6    4.715065    6.428504     2.524180   109.2051     Low Stress

PART B: EXPLORATORY DATA ANALYSIS

2.1 Descriptive Statistics

library(summarytools)
## 
## Attaching package: 'summarytools'
## The following object is masked from 'package:tibble':
## 
##     view
dfSummary(student_data)
## Data Frame Summary  
## student_data  
## Dimensions: 200 x 5  
## Duplicates: 0  
## 
## -----------------------------------------------------------------------------------------------------------------
## No   Variable         Stats / Values             Freqs (% of Valid)    Graph                 Valid      Missing  
## ---- ---------------- -------------------------- --------------------- --------------------- ---------- ---------
## 1    study_hours      Mean (sd) : 3 (0.9)        200 distinct values         : :             200        0        
##      [numeric]        min < med < max:                                       : :             (100.0%)   (0.0%)   
##                       0.7 < 2.9 < 6.2                                      : : : :                               
##                       IQR (CV) : 1.2 (0.3)                               . : : : : .                             
##                                                                        . : : : : : : :                           
## 
## 2    sleep_hours      Mean (sd) : 7.1 (1.2)      200 distinct values         . : :           200        0        
##      [numeric]        min < med < max:                                       : : : .         (100.0%)   (0.0%)   
##                       4 < 7 < 10.1                                         . : : : :                             
##                       IQR (CV) : 1.6 (0.2)                                 : : : : : : .                         
##                                                                        . : : : : : : : : .                       
## 
## 3    stress_level     Mean (sd) : 5 (1.4)        200 distinct values           : :           200        0        
##      [numeric]        min < med < max:                                         : : .         (100.0%)   (0.0%)   
##                       0.8 < 5.1 < 8.6                                        . : : :                             
##                       IQR (CV) : 1.9 (0.3)                                   : : : :                             
##                                                                          . : : : : : :                           
## 
## 4    exam_score       Mean (sd) : 125.7 (19.6)   200 distinct values         :   .           200        0        
##      [numeric]        min < med < max:                                     . : : : :         (100.0%)   (0.0%)   
##                       85.1 < 124.4 < 173.6                                 : : : : :                             
##                       IQR (CV) : 26.9 (0.2)                            . : : : : : : .                           
##                                                                        : : : : : : : : : :                       
## 
## 5    StressCategory   1. Low Stress               94 (47.0%)           IIIIIIIII             200        0        
##      [factor]         2. High Stress             106 (53.0%)           IIIIIIIIII            (100.0%)   (0.0%)   
## -----------------------------------------------------------------------------------------------------------------

2.1.1 Descriptive summary by Stress level

tbl_summary(
  student_data,
  by = StressCategory,
  include = c(exam_score, sleep_hours, study_hours),  # exclude StressLevel
  label = list(exam_score ~ 'Exam Score',
               sleep_hours ~ 'Sleeping Hours (hours)',
               study_hours ~ 'Study Hours (hours)'),
  statistic = list(
    all_continuous() ~ "{mean} ({sd})",
    all_categorical() ~ "{n} ({p}%)")
) |>
  modify_header (label = "**Variable**",
                 stat_1 ~ "**Low Stress**<br><span style='font-weight:normal;'>n = {n}</span>",
                 stat_2 ~ "**High Stress**<br><span style='font-weight:normal;'>n = {n}</span>") |>
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Stress Level**") |>
  modify_caption("**Table 2: Descriptive Statistics Summary by Stress Level**")
Table 2: Descriptive Statistics Summary by Stress Level
Variable
Stress Level
Low Stress
n = 94
1
High Stress
n = 106
1
Exam Score 116 (16) 134 (18)
Sleeping Hours (hours) 7.14 (1.25) 6.97 (1.14)
Study Hours (hours) 2.97 (0.93) 3.01 (0.96)
1 Mean (SD)

2.2 Visualisation of dataset distribution

2.2.1 For Continous Variables

2.2.1.1: Histogram

student_data |>
  pivot_longer(
    cols = c(exam_score,sleep_hours,study_hours),
    names_to = "variable",
    values_to = "value"
  ) |>
  ggplot(aes(x = value)) +
  geom_histogram(bins = 10, fill = "lightblue", color = "black") +
  facet_wrap(~ variable, scales = "free") +
  labs(
    x = "Value",
    y = "Frequency",
    caption = "Figure 1: Distribution of Continuous Variables") +
  theme(plot.caption = element_text(hjust = 0.0, face = "bold", size = 10))

2.2.1.2: Box plot

student_data |>
  pivot_longer(
    cols = where(is.numeric),
    names_to= "variable",
    values_to = "value"
  ) |>
  ggplot(aes(StressCategory, value, fill = StressCategory)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ variable, scales = "free_y") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    x = "Stress Level",
    y = "Value",
    caption = "Figure 2: Distribution of Continous Variables by Stress Level"
  ) +
  theme(plot.caption = element_text(hjust = 0.0, face = "bold", size = 10))

2.2.1.3: Scatter plot

library(patchwork)
## 
## Attaching package: 'patchwork'
## The following object is masked from 'package:MASS':
## 
##     area
#| fig-cap: "Figure 3: Scatter plots showing relationship between continuous variables"

exam_sleep <- ggplot(student_data, aes(x = sleep_hours, y = exam_score)) +
  geom_point(color = "magenta", alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE,
              color = "red", fill = scales::alpha("lightpink", 0.2)) +
  theme_minimal() +
  labs(title = "Exam Score vs \nSleep Hours",
       x = "Sleeping Hours",
       y = "Exam Score")

exam_study <- ggplot(student_data, aes(x = study_hours, y = exam_score)) +
  geom_point(color = "darkgreen", alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE,
              color = "green", fill = scales::alpha("lightgreen", 0.2)) +
  theme_minimal() +
  labs(title = "Exam Score vs \nStudy Hours",
       x = "Studying hours",
       y = "Exam Score")

sleep_study <- ggplot(student_data, aes(x = sleep_hours, y = study_hours)) +
  geom_point(color = "darkblue", alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE,
              color = "blue", fill = scales::alpha("skyblue", 0.2)) +
  theme_minimal() +
  labs(title = "Sleeping Hours vs \nStudy Hours",
       x = "Sleeping Hours",
       y = "Study Hours")

Combine scatter plots with patchwork

(exam_sleep + exam_study + sleep_study) + 
  plot_layout(ncol = 3) & 
  theme(plot.title = element_text(hjust = 0.0, face = "bold", size = 9)) 
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

2.2.1.4: Bar Plot

ggplot(student_data, aes(x = StressCategory, fill = StressCategory)) +
  geom_bar(width = 0.4) +
  theme_minimal() +
  labs(caption = "Figure 4: Distribution of Stress Level",
       x = "Stress Level",
       y = "Count") +
  scale_fill_brewer(palette = "Set3") +
  theme(plot.caption = element_text(hjust = 0.0, face = "bold", size = 10))

2.3 Correlation Matrix for Continous Variable

# Select continuous variables
student_data2 <- student_data |> 
  dplyr::select(exam_score,study_hours,sleep_hours)

# Compute correlation matrix
cor.data <- 
  cor(student_data2,
      use = "complete.obs", 
      method = "pearson")
head(round(cor.data,2))
##             exam_score study_hours sleep_hours
## exam_score        1.00        0.24        0.65
## study_hours       0.24        1.00       -0.03
## sleep_hours       0.65       -0.03        1.00
# Build correlogram
cor_matrix <- cor(student_data |> select_if(is.numeric), use = "complete.obs")
corrplot(cor_matrix,
         type = "upper",
         method = "circle",
         addCoef.col = "black",
         tl.col = "black",
         tl.srt = 45)

2.4: Exploratory data analysis summary

The analysis shows that exam scores are generally higher in those who have low stress,and increased in those who are sleeping more and studying more.
Visualisation indicate linearity between continous variables and the outcome.

PART C: REGRESSION ANALYSIS

3.1 Univariate analysis

3.1.1 Sleeping hours in Hours

slr.sleep <- lm(exam_score ~ sleep_hours, data = student_data)

tidy(slr.sleep, conf.int = TRUE) |> 
  mutate(p.value = format.pval(p.value, digits = 2, eps = 0.001))
## # A tibble: 2 × 7
##   term        estimate std.error statistic p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl> <chr>      <dbl>     <dbl>
## 1 (Intercept)     50.3     6.31       7.98 <0.001     37.9       62.8
## 2 sleep_hours     10.7     0.882     12.1  <0.001      8.94      12.4
glance(slr.sleep)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.426         0.423  14.9      147. 1.27e-25     1  -823. 1651. 1661.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

3.1.2 Study Hours by Hours

slr.study <- lm(exam_score ~ study_hours, data = student_data)

tidy(slr.study, conf.int = TRUE) |> 
  mutate(p.value = format.pval(p.value, digits = 2, eps = 0.001))
## # A tibble: 2 × 7
##   term        estimate std.error statistic p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl> <chr>      <dbl>     <dbl>
## 1 (Intercept)   111.        4.49     24.7  <0.001    102.      120.  
## 2 study_hours     4.98      1.43      3.48 <0.001      2.16      7.80
glance(slr.study)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1    0.0576        0.0528  19.1      12.1 0.000621     1  -872. 1750. 1760.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

3.1.3 Stress Level

slr.stress <- lm(exam_score ~ stress_level, data = student_data)

tidy(slr.stress, conf.int = TRUE) |> 
  mutate(p.value = format.pval(p.value, digits = 2, eps = 0.001))
## # A tibble: 2 × 7
##   term         estimate std.error statistic p.value conf.low conf.high
##   <chr>           <dbl>     <dbl>     <dbl> <chr>      <dbl>     <dbl>
## 1 (Intercept)     84.8      4.04       21.0 <0.001     76.8      92.7 
## 2 stress_level     8.10     0.770      10.5 <0.001      6.58      9.62
glance(slr.stress)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.358         0.355  15.7      111. 7.70e-21     1  -834. 1673. 1683.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

3.1.4 Univariable table summary

tbl_univ <- gtsummary::tbl_uvregression(
  student_data,
  method = lm,
  y = exam_score,
  include = c(sleep_hours,study_hours,StressCategory),
  label = list(sleep_hours ~ "Sleeping Hours by Hours",
               study_hours ~ "Study Hours by Hours",
               StressCategory ~ "Stress Level")
) |>
  modify_fmt_fun(
    estimate ~ function(x) style_number(x, digits = 2),
    conf.low ~ function(x) style_number(x, digits = 2),
    conf.high ~ function(x) style_number(x, digits = 2)
  ) |>
  gtsummary::bold_p()

#print table summary
tbl_univ
Characteristic N Beta 95% CI p-value
Sleeping Hours by Hours 200 10.68 8.94, 12.42 <0.001
Study Hours by Hours 200 4.98 2.16, 7.80 <0.001
Stress Level 200


    Low Stress
— —
    High Stress
17.81 12.93, 22.70 <0.001
Abbreviation: CI = Confidence Interval

3.2: Multivariable analysis

3.2.1: MVA with main effect only

mvamain <- lm(exam_score ~ sleep_hours + study_hours + StressCategory, data = student_data)
tidy(mvamain, conf.int = TRUE) |>
  mutate(p.value = format.pval(p.value, digits = 2, esp = 0.001))
## # A tibble: 4 × 7
##   term                   estimate std.error statistic p.value conf.low conf.high
##   <chr>                     <dbl>     <dbl>     <dbl> <chr>      <dbl>     <dbl>
## 1 (Intercept)               19.3      4.96       3.89 0.00014     9.49     29.1 
## 2 sleep_hours               11.4      0.595     19.2  < 2e-16    10.2      12.6 
## 3 study_hours                5.19     0.752      6.91 6.7e-11     3.71      6.68
## 4 StressCategoryHigh St…    19.7      1.42      13.8  < 2e-16    16.9      22.5

3.2.2: With Interaction of Sleep hours and Stress category

mvaint <- lm(exam_score ~ sleep_hours + study_hours + StressCategory + sleep_hours * StressCategory, data = student_data)
tidy(mvaint, conf.int = TRUE) |>
  mutate(p.value = format.pval(p.value, digits = 2, esp = 0.001))
## # A tibble: 5 × 7
##   term                   estimate std.error statistic p.value conf.low conf.high
##   <chr>                     <dbl>     <dbl>     <dbl> <chr>      <dbl>     <dbl>
## 1 (Intercept)               33.9      6.35       5.33 2.7e-07    21.4      46.4 
## 2 sleep_hours                9.42     0.808     11.7  < 2e-16     7.82     11.0 
## 3 study_hours                5.07     0.732      6.93 6.1e-11     3.62      6.51
## 4 StressCategoryHigh St…    -9.18     8.28      -1.11 0.26898   -25.5       7.15
## 5 sleep_hours:StressCat…     4.09     1.16       3.53 0.00052     1.81      6.37

3.2.3: Model comparison

3.2.3.1 Using ANOVA

modelcomparison <- anova(mvamain,mvaint)
mod_compare_tidy <- tidy(modelcomparison) |>
  mutate(Model = case_when(
    term == "exam_score ~ sleep_hours + study_hours + StressCategory" ~ "Model 1: Main effects",
    term == "exam_score ~ sleep_hours + study_hours + StressCategory + sleep_hours * StressCategory" ~ "Model 2: Interaction model"),
    p.value = format.pval(p.value, digits = 2, eps = 0.001)
  ) |>
  dplyr::select(Model, df.residual, rss, df, sumsq, statistic, p.value)

mod_compare_tidy
## # A tibble: 2 × 7
##   Model                      df.residual    rss    df sumsq statistic p.value
##   <chr>                            <dbl>  <dbl> <dbl> <dbl>     <dbl> <chr>  
## 1 Model 1: Main effects              196 19588.    NA   NA       NA   NA     
## 2 Model 2: Interaction model         195 18411.     1 1178.      12.5 <0.001

3.2.3.2 Model selection and Goodness of fit

model_fit <- bind_rows(
  glance(mvamain) |> mutate(Model = "Model 1: Main effects"),
  glance(mvaint) |> mutate(Model = "Model 2: Interaction model")
) |>
  dplyr::select(Model, r.squared, adj.r.squared, AIC, sigma, nobs)

model_fit
## # A tibble: 2 × 6
##   Model                      r.squared adj.r.squared   AIC sigma  nobs
##   <chr>                          <dbl>         <dbl> <dbl> <dbl> <int>
## 1 Model 1: Main effects          0.743         0.739 1494. 10.00   200
## 2 Model 2: Interaction model     0.759         0.754 1484.  9.72   200

3.2.3.3 Using Perfomance package

library(performance)
compare_performance(mvamain, mvaint, rank = TRUE)
## # Comparison of Model Performance Indices
## 
## Name    | Model |    R2 | R2 (adj.) |  RMSE | Sigma | AIC weights
## -----------------------------------------------------------------
## mvaint  |    lm | 0.759 |     0.754 | 9.594 | 9.717 |       0.995
## mvamain |    lm | 0.743 |     0.739 | 9.897 | 9.997 |       0.005
## 
## Name    | AICc weights | BIC weights | Performance-Score
## --------------------------------------------------------
## mvaint  |        0.994 |       0.972 |           100.00%
## mvamain |        0.006 |       0.028 |             0.00%

3.2.3.4 The Preliminary Final Model

prelim_m <- tbl_regression(
  mvaint,
  intercept = TRUE,
  conf.level = 0.95,
  pvalue_fun = ~style_pvalue(.x, digits = 3, eps = 0.001),
  label = list(sleep_hours ~ "Sleep Quantity by Hours",
               study_hours ~ "Study time by Hours",
               StressCategory ~ "Stress Level"),
  estimate_fun = ~style_number(.x, digits = 2),
) |>
  add_glance_table(
    include = c(r.squared, adj.r.squared, AIC, nobs)) |>
  modify_caption("**Table 4: Preliminary Final Model with Fit Statistics**")

prelim_m
Table 4: Preliminary Final Model with Fit Statistics
Characteristic Beta 95% CI p-value
(Intercept) 33.88 21.35, 46.41 <0.001
Sleep Quantity by Hours 9.42 7.82, 11.01 <0.001
Study time by Hours 5.07 3.62, 6.51 <0.001
Stress Level


    Low Stress — —
    High Stress -9.18 -25.51, 7.15 0.269
Sleep Quantity by Hours * Stress Level


    Sleep Quantity by Hours * High Stress 4.09 1.81, 6.37 <0.001
R² 0.759

Adjusted R² 0.754

AIC 1,484

No. Obs. 200

Abbreviation: CI = Confidence Interval

PART D: MODEL CHECKING

4.1 Checking Assumptions

4.1.1 Assumption 1: Linearity between predictors and residuals

4.1.1.1 Overall Linearity

Linearity was assessed using the residuals-versus-fitted values plot.

Residuals vs Fitted plot

plot(mvaint, which = 1)

4.1.1.2 Linearity of each continuous predictors

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

Residuals vs working experience in years

p1 <- augment(mvaint) |>
  ggplot(aes(x = sleep_hours, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red")

Residuals vs physical activity score

p2 <- augment(mvaint) |>
  ggplot(aes(x = study_hours, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red")

grid.arrange (p1,p2, ncol = 2)

4.1.2 Assumption of Normality of residuals

par(mfrow = c(1, 2))

data.res <- augment(mvaint)

plot(mvaint, which = 2)

hist(data.res$.resid)

Shapiro Wilk test
shapiro.test(mvaint$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mvaint$residuals
## W = 0.97172, p-value = 0.0004624

4.1.3 Assumption of Homoscedasticity

4.1.3.1 Visualisation

par(mfrow = c(1, 2))

plot(mvaint, which = c(1, 3))

Interpretation: The diagnostic plots indicate that the assumptions of linearity and homoscedasticity are largely met, and the regression model appears adequate despite the presence of a few influential observations.

4.1.3.2 Breusch Pagan test

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(mvaint)
## 
##  studentized Breusch-Pagan test
## 
## data:  mvaint
## BP = 14.729, df = 4, p-value = 0.005299

4.1.4 Assumption of Independence

Independence of residuals was evaluated using visual diagnostic plots, autocorrelation function (ACF) plots, and the Durbin–Watson test (1)(2).

4.1.4.1 Visualisation

plot(residuals(mvaint), type = "o")
abline(h = 0, col = "red", lty = 2)

4.1.4.2 Autocorrelation Function (ACF) plot

acf(data.res$.resid, main = "ACF of Residuals")

4.1.4.3 Durbin-Watson Test

library(lmtest)
dwtest(mvaint)
## 
##  Durbin-Watson test
## 
## data:  mvaint
## DW = 2.0683, p-value = 0.6883
## alternative hypothesis: true autocorrelation is greater than 0

4.1.4.4 Check Multicollinearity

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
vif(mvaint)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
##                sleep_hours                study_hours 
##                   1.963067                   1.003429 
##             StressCategory sleep_hours:StressCategory 
##                  36.180818                  36.283695

4.2 Checking Influentials

4.2.1 Cook’s distance

# Add diagnostics columns
data.inf <- augment(mvaint, student_data)

# Define thresholds
n_obs <- nrow(student_data)
p_preds <- length(coef(mvaint))
cooks_cutoff <- 4 / n_obs
leverage_cutoff <- (2 * p_preds + 2) / n_obs

# Filter Influential Observations
influential_obs <- data.inf %>%
  filter(.cooksd > cooks_cutoff | abs(.std.resid) > 2 | .hat > leverage_cutoff) 

# Print count
cat("Number of influential observation:", nrow(influential_obs), "\n")
## Number of influential observation: 17
cat("Cook's distance cutoff:", round(cooks_cutoff, 4), "\n")
## Cook's distance cutoff: 0.02

4.2.1.2 Cook’s distance plot

plot(mvaint, which = 4)

4.2.2 Leverage and standardized residuals

Residuals vs leverage plot

plot(mvaint, which = 5)

data.inf <- data.inf |> 
  dplyr::mutate(obs_id = row_number())

top10_influential <- data.inf |>
  arrange(desc(.cooksd), desc(abs(.std.resid))) |>
  slice(1:10)

top10_influential |>
  dplyr::select(
    obs_id,
    exam_score,
    study_hours,
    sleep_hours,
    StressCategory,
    .cooksd,
    .hat,
    .std.resid
  )
## # A tibble: 10 × 8
##    obs_id exam_score study_hours sleep_hours StressCategory .cooksd   .hat
##     <int>      <dbl>       <dbl>       <dbl> <fct>            <dbl>  <dbl>
##  1     16       96.4       4.79         9.01 Low Stress      0.287  0.0556
##  2    149      160.        5.10         5.83 High Stress     0.0985 0.0442
##  3     65      115.        1.93         9.75 Low Stress      0.0602 0.0620
##  4      1      155.        2.44         9.64 Low Stress      0.0397 0.0543
##  5     56       95.2       4.52         6.89 Low Stress      0.0378 0.0243
##  6    110      160.        3.92         9.45 Low Stress      0.0365 0.0537
##  7    108       85.1       1.33         7.25 Low Stress      0.0327 0.0259
##  8    198       96.1       1.75         8.01 Low Stress      0.0258 0.0237
##  9     97      165.        5.19         9.24 High Stress     0.0193 0.0729
## 10    135      145.        0.947        7.45 High Stress     0.0175 0.0354
## # ℹ 1 more variable: .std.resid <dbl>

4.2.3 DFBEATS

# Compute DFBETAS and display as a datatable
library(DT)

dfb <- dfbetas(mvaint)
dfb_long <- dfb |> 
  as.data.frame() |> 
  mutate(id = row_number()) |> 
  pivot_longer(
    cols = -id,
    names_to = "term",
    values_to = "dfbeta"
  )

# Cutoff
dfb_threshold <- 2 / sqrt(nrow(student_data))

# Identify influential observations
influential_dfb <- dfb_long |> 
  filter(abs(dfbeta) > dfb_threshold) |>
  arrange(desc(abs(dfbeta)))
influential_dfb %>%
  datatable()

4.2.4 Using Standardized Residuals and Leverage Value Cutoff

leverage_cutoff <- 2 * (length(coef(mvaint)) + 2) / nrow(student_data)
data.inf |> 
  filter(.std.resid > 2 | .std.resid < -2 | .hat > leverage_cutoff) |>
  datatable()
We have identified 13 potential outliers based on standardized residuals and leverage cutoff.

Remedial Measures

4.3.1 Remove outliers and refit model

dfx <- data.inf |>
  dplyr::filter(.std.resid < 2 & .std.resid > -2 & .hat < leverage_cutoff)

model3 <- lm(exam_score ~ study_hours + sleep_hours*StressCategory, data = dfx)

tidy(model3, conf.int = TRUE) |>
  mutate(p.value = format.pval(p.value, digits = 3, eps = 0.001))
## # A tibble: 5 × 7
##   term                   estimate std.error statistic p.value conf.low conf.high
##   <chr>                     <dbl>     <dbl>     <dbl> <chr>      <dbl>     <dbl>
## 1 (Intercept)               28.5      5.76       4.94 <0.001     17.1      39.8 
## 2 study_hours                4.86     0.667      7.30 <0.001      3.55      6.18
## 3 sleep_hours               10.5      0.746     14.1  <0.001      9.06     12.0 
## 4 StressCategoryHigh St…    -8.20     7.41      -1.11 0.27      -22.8       6.42
## 5 sleep_hours:StressCat…     3.64     1.04       3.49 <0.001      1.58      5.70
glance(model3)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.811         0.807  7.99      195. 1.08e-64     4  -651. 1315. 1334.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

4.3.2 Compare model fittness between preliminary final model with model after removing outliers

### Preliminary Final Model
model_performance(mvaint)
## # Indices of model performance
## 
## AIC    |   AICc |    BIC |    R2 | R2 (adj.) |  RMSE | Sigma
## ------------------------------------------------------------
## 1484.1 | 1484.5 | 1503.8 | 0.759 |     0.754 | 9.594 | 9.717
### Refitted model after remove outlier
model_performance(model3)
## # Indices of model performance
## 
## AIC    |   AICc |    BIC |    R2 | R2 (adj.) |  RMSE | Sigma
## ------------------------------------------------------------
## 1314.9 | 1315.4 | 1334.3 | 0.811 |     0.807 | 7.884 | 7.992
### Model comparison was done using compare_performance function
compare_performance(mvaint, model3, rank = TRUE)
## When comparing models, please note that probably not all models were fit
##   from same data.
## # Comparison of Model Performance Indices
## 
## Name   | Model |    R2 | R2 (adj.) |  RMSE | Sigma | AIC weights | AICc weights
## -------------------------------------------------------------------------------
## model3 |    lm | 0.811 |     0.807 | 7.884 | 7.992 |        1.00 |         1.00
## mvaint |    lm | 0.759 |     0.754 | 9.594 | 9.717 |    1.89e-37 |     1.92e-37
## 
## Name   | BIC weights | Performance-Score
## ----------------------------------------
## model3 |        1.00 |           100.00%
## mvaint |    1.55e-37 |             0.00%

4.3.3 Compare coefficient changes after removing outliers

coef_compare <- tidy(mvaint) |>
  dplyr::select(term, estimate) |>
  dplyr::rename(estimate_prelim = estimate) |>
  dplyr::left_join(
    tidy(model3) |>
      dplyr::select(term, estimate) |>
      dplyr::rename(estimate_refit = estimate),
    by = "term"
  ) |>
  mutate(
    change_in_estimate = estimate_refit - estimate_prelim,
    percent_change = (change_in_estimate / estimate_prelim) * 100
  )
coef_compare
## # A tibble: 5 × 5
##   term          estimate_prelim estimate_refit change_in_estimate percent_change
##   <chr>                   <dbl>          <dbl>              <dbl>          <dbl>
## 1 (Intercept)             33.9           28.5              -5.42          -16.0 
## 2 sleep_hours              9.42          10.5               1.12           11.9 
## 3 study_hours              5.07           4.86             -0.202          -3.98
## 4 StressCatego…           -9.18          -8.20              0.976         -10.6 
## 5 sleep_hours:…            4.09           3.64             -0.447         -10.9

4.3.4 Compare model diagnostics after removing outliers

par(mfrow = c(1,2), mar = c(5,4,4,2))  # bottom, left, top, right

# Row 1: Residuals vs Fitted
plot(mvaint, which = 1, main = "Original Model")
plot(model3, which = 1, main = "Without Outliers")

# Row 2: Q-Q Plot
par(mfrow = c(1,2), mar = c(5,4,4,2))  # bottom, left, top, right
plot(mvaint, which = 2, main = "Original Model")
plot(model3, which = 2, main = "Without Outliers")

# Row 3: Scale-Location
par(mfrow = c(1,2), mar = c(5,4,4,2))  # bottom, left, top, right
plot(mvaint, which = 3, main = "Original Model")
plot(model3, which = 3, main = "Without Outliers")

# Row 4: Cook's Distance
par(mfrow = c(1,2), mar = c(5,4,4,2))  # bottom, left, top, right
plot(mvaint, which = 4, main = "Original Model")
plot(model3, which = 4, main = "Without Outliers")

PART E: RESULT PRESENTATION AND INTERPRETATION

5.1 Final Model

library(gt)

final_m <- tbl_regression(
  model3,
  intercept = TRUE,
  label = list(sleep_hours ~ "Sleep Quantity by Hours",
               study_hours ~ "Study time by Hours",
               StressCategory ~ "Stress Level"),
) |>
  bold_p(t = 0.05) |>
  bold_labels() |>
  add_glance_table(include = c(r.squared, adj.r.squared, AIC, BIC)) |>
  modify_fmt_fun(
    estimate ~ function(x) style_number(x, digits = 2),
    conf.low ~ function(x) style_number(x, digits = 2),
    conf.high ~ function(x) style_number(x, digits = 2)) |>
  modify_header(label = "**Variables**",
                estimate = "**Adj. B**",
                statistic = "**t-stat**") |>
  modify_caption("**Final Multivariable Linear Regression Model for EXam Score**")

final_m
Final Multivariable Linear Regression Model for EXam Score
Variables Adj. B t-stat 95% CI p-value
(Intercept) 28.46 4.94 17.08, 39.83 <0.001
Study time by Hours 4.86 7.30 3.55, 6.18 <0.001
Sleep Quantity by Hours 10.53 14.1 9.06, 12.00 <0.001
Stress Level



    Low Stress — — —
    High Stress -8.20 -1.11 -22.83, 6.42 0.3
Sleep Quantity by Hours * Stress Level



    Sleep Quantity by Hours * High Stress 3.64 3.49 1.58, 5.70 <0.001
R² 0.81


Adjusted R² 0.81


AIC 1,314.94


BIC 1,334.32


Abbreviation: CI = Confidence Interval

5.2 Final Model Equation : 28.46 + 4.86(study hours) + 10.53(sleep hours) - 10.53(high stress) + 3.64(sleep hours x high stress) + epsilon

5.3 Interpretation

high exam score for who are studying more and sleep more, less stress

5.4 Model performance visualisation

#  Actual vs Predicted
augmented_data <- augment(model3)
ggplot(augmented_data, aes(x = exam_score, y = .fitted)) +
  geom_point(alpha = 0.4, color = "#667eea", size = 2) +
  geom_abline(intercept = 0, slope = 1, color = "#F57C00", 
              linewidth = 1.2, linetype = "dashed") +
  geom_smooth(method = "lm", se = FALSE, color = "#D32F2F", linewidth = 1) +
  labs(title = "Actual vs Predicted Exam Score",
       x = "Actual Exam Score",
       y = "Predicted Exam Score") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'

PART F: PREDICTION

6.1 Create new data for prediction

newdata <- expand.grid(
  sleep_hours = mean(student_data$sleep_hours),
  study_hours = seq(0, 10, by = 0.5),
  StressCategory = levels(student_data$StressCategory)
)

newdata$StressCategory <- factor(
  newdata$StressCategory,
  levels = levels(student_data$StressCategory)
)

6.2 Generate predictions with confidence intervals using final model

pred.exmscr <- augment(model3, newdata = newdata, interval = "confidence", level = 0.95)
## Warning: The `level` argument is not supported in the `augment()` method for
## `lm` objects and will be ignored.
pred.exmscr |> datatable()

6.3 Prediction visualization

ggplot(pred.exmscr, 
       aes(x = study_hours, 
           y = .fitted,
           color = StressCategory, 
           fill = StressCategory)) +
  geom_ribbon(
    aes(ymin = .lower, ymax = .upper),
    alpha = 0.2,
    color = NA) +
  geom_line(linewidth = 1.2) +
  scale_color_brewer(palette = "Set2") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Predicted Effect of Study Hours on Exam Score",
    subtitle = "Stratified by Stress Level",
    x = "Study Hours",
    y = "Predicted Exam Score",
    color = "Stress Level"
  ) +
  theme_minimal(base_size = 12)