# Data manipulation and workflow
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Data reporting & tables
library(gtsummary)
library(knitr)
# Data visualisation
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
# Modelling and diagnostics
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:gtsummary':
##
## select
##
## The following object is masked from 'package:dplyr':
##
## select
# Model ouputs
library(broom)
library(broom.helpers)
##
## Attaching package: 'broom.helpers'
##
## The following objects are masked from 'package:gtsummary':
##
## all_categorical, all_continuous, all_contrasts, all_dichotomous,
## all_interaction, all_intercepts
glimpse(student_data)
## Rows: 200
## Columns: 4
## $ study_hours <dbl> 2.439524, 2.769823, 4.558708, 3.070508, 3.129288, 4.71506…
## $ sleep_hours <dbl> 9.638572, 8.574896, 6.681826, 7.651833, 6.502792, 6.42850…
## $ stress_level <dbl> 4.889666, 3.247023, 4.047878, 4.956738, 6.006044, 2.52418…
## $ exam_score <dbl> 154.55861, 120.76109, 123.04272, 124.59006, 132.00996, 10…
student_data <- student_data %>%
mutate(
StressCategory = case_when(
stress_level <= 5 ~ "Low Stress",
stress_level > 5 ~ "High Stress"
),
StressCategory = factor(StressCategory, levels = c("Low Stress", "High Stress"))
)
head(student_data)
## study_hours sleep_hours stress_level exam_score StressCategory
## 1 2.439524 9.638572 4.889666 154.5586 Low Stress
## 2 2.769823 8.574896 3.247023 120.7611 Low Stress
## 3 4.558708 6.681826 4.047878 123.0427 Low Stress
## 4 3.070508 7.651833 4.956738 124.5901 Low Stress
## 5 3.129288 6.502792 6.006044 132.0100 High Stress
## 6 4.715065 6.428504 2.524180 109.2051 Low Stress
library(summarytools)
##
## Attaching package: 'summarytools'
## The following object is masked from 'package:tibble':
##
## view
dfSummary(student_data)
## Data Frame Summary
## student_data
## Dimensions: 200 x 5
## Duplicates: 0
##
## -----------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- ---------------- -------------------------- --------------------- --------------------- ---------- ---------
## 1 study_hours Mean (sd) : 3 (0.9) 200 distinct values : : 200 0
## [numeric] min < med < max: : : (100.0%) (0.0%)
## 0.7 < 2.9 < 6.2 : : : :
## IQR (CV) : 1.2 (0.3) . : : : : .
## . : : : : : : :
##
## 2 sleep_hours Mean (sd) : 7.1 (1.2) 200 distinct values . : : 200 0
## [numeric] min < med < max: : : : . (100.0%) (0.0%)
## 4 < 7 < 10.1 . : : : :
## IQR (CV) : 1.6 (0.2) : : : : : : .
## . : : : : : : : : .
##
## 3 stress_level Mean (sd) : 5 (1.4) 200 distinct values : : 200 0
## [numeric] min < med < max: : : . (100.0%) (0.0%)
## 0.8 < 5.1 < 8.6 . : : :
## IQR (CV) : 1.9 (0.3) : : : :
## . : : : : : :
##
## 4 exam_score Mean (sd) : 125.7 (19.6) 200 distinct values : . 200 0
## [numeric] min < med < max: . : : : : (100.0%) (0.0%)
## 85.1 < 124.4 < 173.6 : : : : :
## IQR (CV) : 26.9 (0.2) . : : : : : : .
## : : : : : : : : : :
##
## 5 StressCategory 1. Low Stress 94 (47.0%) IIIIIIIII 200 0
## [factor] 2. High Stress 106 (53.0%) IIIIIIIIII (100.0%) (0.0%)
## -----------------------------------------------------------------------------------------------------------------
tbl_summary(
student_data,
by = StressCategory,
include = c(exam_score, sleep_hours, study_hours), # exclude StressLevel
label = list(exam_score ~ 'Exam Score',
sleep_hours ~ 'Sleeping Hours (hours)',
study_hours ~ 'Study Hours (hours)'),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)")
) |>
modify_header (label = "**Variable**",
stat_1 ~ "**Low Stress**<br><span style='font-weight:normal;'>n = {n}</span>",
stat_2 ~ "**High Stress**<br><span style='font-weight:normal;'>n = {n}</span>") |>
modify_spanning_header(c("stat_1", "stat_2") ~ "**Stress Level**") |>
modify_caption("**Table 2: Descriptive Statistics Summary by Stress Level**")
| Variable |
Stress Level
|
|
|---|---|---|
| Low Stress n = 941 |
High Stress n = 1061 |
|
| Exam Score | 116 (16) | 134 (18) |
| Sleeping Hours (hours) | 7.14 (1.25) | 6.97 (1.14) |
| Study Hours (hours) | 2.97 (0.93) | 3.01 (0.96) |
| 1 Mean (SD) | ||
student_data |>
pivot_longer(
cols = c(exam_score,sleep_hours,study_hours),
names_to = "variable",
values_to = "value"
) |>
ggplot(aes(x = value)) +
geom_histogram(bins = 10, fill = "lightblue", color = "black") +
facet_wrap(~ variable, scales = "free") +
labs(
x = "Value",
y = "Frequency",
caption = "Figure 1: Distribution of Continuous Variables") +
theme(plot.caption = element_text(hjust = 0.0, face = "bold", size = 10))
student_data |>
pivot_longer(
cols = where(is.numeric),
names_to= "variable",
values_to = "value"
) |>
ggplot(aes(StressCategory, value, fill = StressCategory)) +
geom_boxplot(alpha = 0.7) +
facet_wrap(~ variable, scales = "free_y") +
scale_fill_brewer(palette = "Set2") +
labs(
x = "Stress Level",
y = "Value",
caption = "Figure 2: Distribution of Continous Variables by Stress Level"
) +
theme(plot.caption = element_text(hjust = 0.0, face = "bold", size = 10))
library(patchwork)
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:MASS':
##
## area
#| fig-cap: "Figure 3: Scatter plots showing relationship between continuous variables"
exam_sleep <- ggplot(student_data, aes(x = sleep_hours, y = exam_score)) +
geom_point(color = "magenta", alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE,
color = "red", fill = scales::alpha("lightpink", 0.2)) +
theme_minimal() +
labs(title = "Exam Score vs \nSleep Hours",
x = "Sleeping Hours",
y = "Exam Score")
exam_study <- ggplot(student_data, aes(x = study_hours, y = exam_score)) +
geom_point(color = "darkgreen", alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE,
color = "green", fill = scales::alpha("lightgreen", 0.2)) +
theme_minimal() +
labs(title = "Exam Score vs \nStudy Hours",
x = "Studying hours",
y = "Exam Score")
sleep_study <- ggplot(student_data, aes(x = sleep_hours, y = study_hours)) +
geom_point(color = "darkblue", alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE,
color = "blue", fill = scales::alpha("skyblue", 0.2)) +
theme_minimal() +
labs(title = "Sleeping Hours vs \nStudy Hours",
x = "Sleeping Hours",
y = "Study Hours")
(exam_sleep + exam_study + sleep_study) +
plot_layout(ncol = 3) &
theme(plot.title = element_text(hjust = 0.0, face = "bold", size = 9))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
ggplot(student_data, aes(x = StressCategory, fill = StressCategory)) +
geom_bar(width = 0.4) +
theme_minimal() +
labs(caption = "Figure 4: Distribution of Stress Level",
x = "Stress Level",
y = "Count") +
scale_fill_brewer(palette = "Set3") +
theme(plot.caption = element_text(hjust = 0.0, face = "bold", size = 10))
# Select continuous variables
student_data2 <- student_data |>
dplyr::select(exam_score,study_hours,sleep_hours)
# Compute correlation matrix
cor.data <-
cor(student_data2,
use = "complete.obs",
method = "pearson")
head(round(cor.data,2))
## exam_score study_hours sleep_hours
## exam_score 1.00 0.24 0.65
## study_hours 0.24 1.00 -0.03
## sleep_hours 0.65 -0.03 1.00
# Build correlogram
cor_matrix <- cor(student_data |> select_if(is.numeric), use = "complete.obs")
corrplot(cor_matrix,
type = "upper",
method = "circle",
addCoef.col = "black",
tl.col = "black",
tl.srt = 45)
slr.sleep <- lm(exam_score ~ sleep_hours, data = student_data)
tidy(slr.sleep, conf.int = TRUE) |>
mutate(p.value = format.pval(p.value, digits = 2, eps = 0.001))
## # A tibble: 2 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 (Intercept) 50.3 6.31 7.98 <0.001 37.9 62.8
## 2 sleep_hours 10.7 0.882 12.1 <0.001 8.94 12.4
glance(slr.sleep)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.426 0.423 14.9 147. 1.27e-25 1 -823. 1651. 1661.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
slr.study <- lm(exam_score ~ study_hours, data = student_data)
tidy(slr.study, conf.int = TRUE) |>
mutate(p.value = format.pval(p.value, digits = 2, eps = 0.001))
## # A tibble: 2 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 (Intercept) 111. 4.49 24.7 <0.001 102. 120.
## 2 study_hours 4.98 1.43 3.48 <0.001 2.16 7.80
glance(slr.study)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0576 0.0528 19.1 12.1 0.000621 1 -872. 1750. 1760.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
slr.stress <- lm(exam_score ~ stress_level, data = student_data)
tidy(slr.stress, conf.int = TRUE) |>
mutate(p.value = format.pval(p.value, digits = 2, eps = 0.001))
## # A tibble: 2 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 (Intercept) 84.8 4.04 21.0 <0.001 76.8 92.7
## 2 stress_level 8.10 0.770 10.5 <0.001 6.58 9.62
glance(slr.stress)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.358 0.355 15.7 111. 7.70e-21 1 -834. 1673. 1683.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
tbl_univ <- gtsummary::tbl_uvregression(
student_data,
method = lm,
y = exam_score,
include = c(sleep_hours,study_hours,StressCategory),
label = list(sleep_hours ~ "Sleeping Hours by Hours",
study_hours ~ "Study Hours by Hours",
StressCategory ~ "Stress Level")
) |>
modify_fmt_fun(
estimate ~ function(x) style_number(x, digits = 2),
conf.low ~ function(x) style_number(x, digits = 2),
conf.high ~ function(x) style_number(x, digits = 2)
) |>
gtsummary::bold_p()
#print table summary
tbl_univ
| Characteristic | N | Beta | 95% CI | p-value |
|---|---|---|---|---|
| Sleeping Hours by Hours | 200 | 10.68 | 8.94, 12.42 | <0.001 |
| Study Hours by Hours | 200 | 4.98 | 2.16, 7.80 | <0.001 |
| Stress Level | 200 | |||
|     Low Stress | — | — | ||
| Â Â Â Â High Stress | 17.81 | 12.93, 22.70 | <0.001 | |
| Abbreviation: CI = Confidence Interval | ||||
mvamain <- lm(exam_score ~ sleep_hours + study_hours + StressCategory, data = student_data)
tidy(mvamain, conf.int = TRUE) |>
mutate(p.value = format.pval(p.value, digits = 2, esp = 0.001))
## # A tibble: 4 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 (Intercept) 19.3 4.96 3.89 0.00014 9.49 29.1
## 2 sleep_hours 11.4 0.595 19.2 < 2e-16 10.2 12.6
## 3 study_hours 5.19 0.752 6.91 6.7e-11 3.71 6.68
## 4 StressCategoryHigh St… 19.7 1.42 13.8 < 2e-16 16.9 22.5
mvaint <- lm(exam_score ~ sleep_hours + study_hours + StressCategory + sleep_hours * StressCategory, data = student_data)
tidy(mvaint, conf.int = TRUE) |>
mutate(p.value = format.pval(p.value, digits = 2, esp = 0.001))
## # A tibble: 5 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 (Intercept) 33.9 6.35 5.33 2.7e-07 21.4 46.4
## 2 sleep_hours 9.42 0.808 11.7 < 2e-16 7.82 11.0
## 3 study_hours 5.07 0.732 6.93 6.1e-11 3.62 6.51
## 4 StressCategoryHigh St… -9.18 8.28 -1.11 0.26898 -25.5 7.15
## 5 sleep_hours:StressCat… 4.09 1.16 3.53 0.00052 1.81 6.37
modelcomparison <- anova(mvamain,mvaint)
mod_compare_tidy <- tidy(modelcomparison) |>
mutate(Model = case_when(
term == "exam_score ~ sleep_hours + study_hours + StressCategory" ~ "Model 1: Main effects",
term == "exam_score ~ sleep_hours + study_hours + StressCategory + sleep_hours * StressCategory" ~ "Model 2: Interaction model"),
p.value = format.pval(p.value, digits = 2, eps = 0.001)
) |>
dplyr::select(Model, df.residual, rss, df, sumsq, statistic, p.value)
mod_compare_tidy
## # A tibble: 2 × 7
## Model df.residual rss df sumsq statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Model 1: Main effects 196 19588. NA NA NA NA
## 2 Model 2: Interaction model 195 18411. 1 1178. 12.5 <0.001
model_fit <- bind_rows(
glance(mvamain) |> mutate(Model = "Model 1: Main effects"),
glance(mvaint) |> mutate(Model = "Model 2: Interaction model")
) |>
dplyr::select(Model, r.squared, adj.r.squared, AIC, sigma, nobs)
model_fit
## # A tibble: 2 × 6
## Model r.squared adj.r.squared AIC sigma nobs
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Model 1: Main effects 0.743 0.739 1494. 10.00 200
## 2 Model 2: Interaction model 0.759 0.754 1484. 9.72 200
library(performance)
compare_performance(mvamain, mvaint, rank = TRUE)
## # Comparison of Model Performance Indices
##
## Name | Model | R2 | R2 (adj.) | RMSE | Sigma | AIC weights
## -----------------------------------------------------------------
## mvaint | lm | 0.759 | 0.754 | 9.594 | 9.717 | 0.995
## mvamain | lm | 0.743 | 0.739 | 9.897 | 9.997 | 0.005
##
## Name | AICc weights | BIC weights | Performance-Score
## --------------------------------------------------------
## mvaint | 0.994 | 0.972 | 100.00%
## mvamain | 0.006 | 0.028 | 0.00%
prelim_m <- tbl_regression(
mvaint,
intercept = TRUE,
conf.level = 0.95,
pvalue_fun = ~style_pvalue(.x, digits = 3, eps = 0.001),
label = list(sleep_hours ~ "Sleep Quantity by Hours",
study_hours ~ "Study time by Hours",
StressCategory ~ "Stress Level"),
estimate_fun = ~style_number(.x, digits = 2),
) |>
add_glance_table(
include = c(r.squared, adj.r.squared, AIC, nobs)) |>
modify_caption("**Table 4: Preliminary Final Model with Fit Statistics**")
prelim_m
| Characteristic | Beta | 95% CI | p-value |
|---|---|---|---|
| (Intercept) | 33.88 | 21.35, 46.41 | <0.001 |
| Sleep Quantity by Hours | 9.42 | 7.82, 11.01 | <0.001 |
| Study time by Hours | 5.07 | 3.62, 6.51 | <0.001 |
| Stress Level | |||
|     Low Stress | — | — | |
| Â Â Â Â High Stress | -9.18 | -25.51, 7.15 | 0.269 |
| Sleep Quantity by Hours * Stress Level | |||
| Â Â Â Â Sleep Quantity by Hours * High Stress | 4.09 | 1.81, 6.37 | <0.001 |
| R² | 0.759 | ||
| Adjusted R² | 0.754 | ||
| AIC | 1,484 | ||
| No. Obs. | 200 | ||
| Abbreviation: CI = Confidence Interval | |||
plot(mvaint, which = 1)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
p1 <- augment(mvaint) |>
ggplot(aes(x = sleep_hours, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, color = "red")
p2 <- augment(mvaint) |>
ggplot(aes(x = study_hours, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, color = "red")
grid.arrange (p1,p2, ncol = 2)
par(mfrow = c(1, 2))
data.res <- augment(mvaint)
plot(mvaint, which = 2)
hist(data.res$.resid)
shapiro.test(mvaint$residuals)
##
## Shapiro-Wilk normality test
##
## data: mvaint$residuals
## W = 0.97172, p-value = 0.0004624
par(mfrow = c(1, 2))
plot(mvaint, which = c(1, 3))
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(mvaint)
##
## studentized Breusch-Pagan test
##
## data: mvaint
## BP = 14.729, df = 4, p-value = 0.005299
plot(residuals(mvaint), type = "o")
abline(h = 0, col = "red", lty = 2)
acf(data.res$.resid, main = "ACF of Residuals")
library(lmtest)
dwtest(mvaint)
##
## Durbin-Watson test
##
## data: mvaint
## DW = 2.0683, p-value = 0.6883
## alternative hypothesis: true autocorrelation is greater than 0
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(mvaint)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## sleep_hours study_hours
## 1.963067 1.003429
## StressCategory sleep_hours:StressCategory
## 36.180818 36.283695
# Add diagnostics columns
data.inf <- augment(mvaint, student_data)
# Define thresholds
n_obs <- nrow(student_data)
p_preds <- length(coef(mvaint))
cooks_cutoff <- 4 / n_obs
leverage_cutoff <- (2 * p_preds + 2) / n_obs
# Filter Influential Observations
influential_obs <- data.inf %>%
filter(.cooksd > cooks_cutoff | abs(.std.resid) > 2 | .hat > leverage_cutoff)
# Print count
cat("Number of influential observation:", nrow(influential_obs), "\n")
## Number of influential observation: 17
cat("Cook's distance cutoff:", round(cooks_cutoff, 4), "\n")
## Cook's distance cutoff: 0.02
plot(mvaint, which = 4)
plot(mvaint, which = 5)
data.inf <- data.inf |>
dplyr::mutate(obs_id = row_number())
top10_influential <- data.inf |>
arrange(desc(.cooksd), desc(abs(.std.resid))) |>
slice(1:10)
top10_influential |>
dplyr::select(
obs_id,
exam_score,
study_hours,
sleep_hours,
StressCategory,
.cooksd,
.hat,
.std.resid
)
## # A tibble: 10 × 8
## obs_id exam_score study_hours sleep_hours StressCategory .cooksd .hat
## <int> <dbl> <dbl> <dbl> <fct> <dbl> <dbl>
## 1 16 96.4 4.79 9.01 Low Stress 0.287 0.0556
## 2 149 160. 5.10 5.83 High Stress 0.0985 0.0442
## 3 65 115. 1.93 9.75 Low Stress 0.0602 0.0620
## 4 1 155. 2.44 9.64 Low Stress 0.0397 0.0543
## 5 56 95.2 4.52 6.89 Low Stress 0.0378 0.0243
## 6 110 160. 3.92 9.45 Low Stress 0.0365 0.0537
## 7 108 85.1 1.33 7.25 Low Stress 0.0327 0.0259
## 8 198 96.1 1.75 8.01 Low Stress 0.0258 0.0237
## 9 97 165. 5.19 9.24 High Stress 0.0193 0.0729
## 10 135 145. 0.947 7.45 High Stress 0.0175 0.0354
## # ℹ 1 more variable: .std.resid <dbl>
# Compute DFBETAS and display as a datatable
library(DT)
dfb <- dfbetas(mvaint)
dfb_long <- dfb |>
as.data.frame() |>
mutate(id = row_number()) |>
pivot_longer(
cols = -id,
names_to = "term",
values_to = "dfbeta"
)
# Cutoff
dfb_threshold <- 2 / sqrt(nrow(student_data))
# Identify influential observations
influential_dfb <- dfb_long |>
filter(abs(dfbeta) > dfb_threshold) |>
arrange(desc(abs(dfbeta)))
influential_dfb %>%
datatable()
leverage_cutoff <- 2 * (length(coef(mvaint)) + 2) / nrow(student_data)
data.inf |>
filter(.std.resid > 2 | .std.resid < -2 | .hat > leverage_cutoff) |>
datatable()
dfx <- data.inf |>
dplyr::filter(.std.resid < 2 & .std.resid > -2 & .hat < leverage_cutoff)
model3 <- lm(exam_score ~ study_hours + sleep_hours*StressCategory, data = dfx)
tidy(model3, conf.int = TRUE) |>
mutate(p.value = format.pval(p.value, digits = 3, eps = 0.001))
## # A tibble: 5 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 (Intercept) 28.5 5.76 4.94 <0.001 17.1 39.8
## 2 study_hours 4.86 0.667 7.30 <0.001 3.55 6.18
## 3 sleep_hours 10.5 0.746 14.1 <0.001 9.06 12.0
## 4 StressCategoryHigh St… -8.20 7.41 -1.11 0.27 -22.8 6.42
## 5 sleep_hours:StressCat… 3.64 1.04 3.49 <0.001 1.58 5.70
glance(model3)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.811 0.807 7.99 195. 1.08e-64 4 -651. 1315. 1334.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
### Preliminary Final Model
model_performance(mvaint)
## # Indices of model performance
##
## AIC | AICc | BIC | R2 | R2 (adj.) | RMSE | Sigma
## ------------------------------------------------------------
## 1484.1 | 1484.5 | 1503.8 | 0.759 | 0.754 | 9.594 | 9.717
### Refitted model after remove outlier
model_performance(model3)
## # Indices of model performance
##
## AIC | AICc | BIC | R2 | R2 (adj.) | RMSE | Sigma
## ------------------------------------------------------------
## 1314.9 | 1315.4 | 1334.3 | 0.811 | 0.807 | 7.884 | 7.992
### Model comparison was done using compare_performance function
compare_performance(mvaint, model3, rank = TRUE)
## When comparing models, please note that probably not all models were fit
## from same data.
## # Comparison of Model Performance Indices
##
## Name | Model | R2 | R2 (adj.) | RMSE | Sigma | AIC weights | AICc weights
## -------------------------------------------------------------------------------
## model3 | lm | 0.811 | 0.807 | 7.884 | 7.992 | 1.00 | 1.00
## mvaint | lm | 0.759 | 0.754 | 9.594 | 9.717 | 1.89e-37 | 1.92e-37
##
## Name | BIC weights | Performance-Score
## ----------------------------------------
## model3 | 1.00 | 100.00%
## mvaint | 1.55e-37 | 0.00%
coef_compare <- tidy(mvaint) |>
dplyr::select(term, estimate) |>
dplyr::rename(estimate_prelim = estimate) |>
dplyr::left_join(
tidy(model3) |>
dplyr::select(term, estimate) |>
dplyr::rename(estimate_refit = estimate),
by = "term"
) |>
mutate(
change_in_estimate = estimate_refit - estimate_prelim,
percent_change = (change_in_estimate / estimate_prelim) * 100
)
coef_compare
## # A tibble: 5 × 5
## term estimate_prelim estimate_refit change_in_estimate percent_change
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 33.9 28.5 -5.42 -16.0
## 2 sleep_hours 9.42 10.5 1.12 11.9
## 3 study_hours 5.07 4.86 -0.202 -3.98
## 4 StressCatego… -9.18 -8.20 0.976 -10.6
## 5 sleep_hours:… 4.09 3.64 -0.447 -10.9
par(mfrow = c(1,2), mar = c(5,4,4,2)) # bottom, left, top, right
# Row 1: Residuals vs Fitted
plot(mvaint, which = 1, main = "Original Model")
plot(model3, which = 1, main = "Without Outliers")
# Row 2: Q-Q Plot
par(mfrow = c(1,2), mar = c(5,4,4,2)) # bottom, left, top, right
plot(mvaint, which = 2, main = "Original Model")
plot(model3, which = 2, main = "Without Outliers")
# Row 3: Scale-Location
par(mfrow = c(1,2), mar = c(5,4,4,2)) # bottom, left, top, right
plot(mvaint, which = 3, main = "Original Model")
plot(model3, which = 3, main = "Without Outliers")
# Row 4: Cook's Distance
par(mfrow = c(1,2), mar = c(5,4,4,2)) # bottom, left, top, right
plot(mvaint, which = 4, main = "Original Model")
plot(model3, which = 4, main = "Without Outliers")
library(gt)
final_m <- tbl_regression(
model3,
intercept = TRUE,
label = list(sleep_hours ~ "Sleep Quantity by Hours",
study_hours ~ "Study time by Hours",
StressCategory ~ "Stress Level"),
) |>
bold_p(t = 0.05) |>
bold_labels() |>
add_glance_table(include = c(r.squared, adj.r.squared, AIC, BIC)) |>
modify_fmt_fun(
estimate ~ function(x) style_number(x, digits = 2),
conf.low ~ function(x) style_number(x, digits = 2),
conf.high ~ function(x) style_number(x, digits = 2)) |>
modify_header(label = "**Variables**",
estimate = "**Adj. B**",
statistic = "**t-stat**") |>
modify_caption("**Final Multivariable Linear Regression Model for EXam Score**")
final_m
| Variables | Adj. B | t-stat | 95% CI | p-value |
|---|---|---|---|---|
| (Intercept) | 28.46 | 4.94 | 17.08, 39.83 | <0.001 |
| Study time by Hours | 4.86 | 7.30 | 3.55, 6.18 | <0.001 |
| Sleep Quantity by Hours | 10.53 | 14.1 | 9.06, 12.00 | <0.001 |
| Stress Level | ||||
|     Low Stress | — | — | — | |
| Â Â Â Â High Stress | -8.20 | -1.11 | -22.83, 6.42 | 0.3 |
| Sleep Quantity by Hours * Stress Level | ||||
| Â Â Â Â Sleep Quantity by Hours * High Stress | 3.64 | 3.49 | 1.58, 5.70 | <0.001 |
| R² | 0.81 | |||
| Adjusted R² | 0.81 | |||
| AIC | 1,314.94 | |||
| BIC | 1,334.32 | |||
| Abbreviation: CI = Confidence Interval | ||||
# Actual vs Predicted
augmented_data <- augment(model3)
ggplot(augmented_data, aes(x = exam_score, y = .fitted)) +
geom_point(alpha = 0.4, color = "#667eea", size = 2) +
geom_abline(intercept = 0, slope = 1, color = "#F57C00",
linewidth = 1.2, linetype = "dashed") +
geom_smooth(method = "lm", se = FALSE, color = "#D32F2F", linewidth = 1) +
labs(title = "Actual vs Predicted Exam Score",
x = "Actual Exam Score",
y = "Predicted Exam Score") +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'
newdata <- expand.grid(
sleep_hours = mean(student_data$sleep_hours),
study_hours = seq(0, 10, by = 0.5),
StressCategory = levels(student_data$StressCategory)
)
newdata$StressCategory <- factor(
newdata$StressCategory,
levels = levels(student_data$StressCategory)
)
pred.exmscr <- augment(model3, newdata = newdata, interval = "confidence", level = 0.95)
## Warning: The `level` argument is not supported in the `augment()` method for
## `lm` objects and will be ignored.
pred.exmscr |> datatable()
ggplot(pred.exmscr,
aes(x = study_hours,
y = .fitted,
color = StressCategory,
fill = StressCategory)) +
geom_ribbon(
aes(ymin = .lower, ymax = .upper),
alpha = 0.2,
color = NA) +
geom_line(linewidth = 1.2) +
scale_color_brewer(palette = "Set2") +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Predicted Effect of Study Hours on Exam Score",
subtitle = "Stratified by Stress Level",
x = "Study Hours",
y = "Predicted Exam Score",
color = "Stress Level"
) +
theme_minimal(base_size = 12)