Show code
library(here)
library(haven)
library(tidyverse)
library(ggplot2)
library(gtsummary)
library(broom)
library(dplyr)
library(corrplot)
library(readr)
library(readxl)
library(knitr)library(here)
library(haven)
library(tidyverse)
library(ggplot2)
library(gtsummary)
library(broom)
library(dplyr)
library(corrplot)
library(readr)
library(readxl)
library(knitr)data1 <- read_xlsx(here("hypothetical_qol_data.xlsx"))
view(data1)
str(data1)tibble [200 × 4] (S3: tbl_df/tbl/data.frame)
$ qol : num [1:200] 57.8 47 48 56.1 55.5 ...
$ years_working: num [1:200] 19 13.9 20.2 27.2 13.1 ...
$ phys_activity: num [1:200] 5.72 6.12 7.17 7.11 2.24 3.12 6.03 6.03 6.03 10 ...
$ obesity : chr [1:200] "not obese" "obese" "obese" "not obese" ...
data1 %>%
summarise(
mean_qol = mean(qol, na.rm = TRUE),
sd_qol = sd(qol, na.rm = TRUE),
median_qol = median(qol, na.rm = TRUE),
iqr_qol = IQR(qol, na.rm = TRUE),
mean_years = mean(years_working, na.rm = TRUE),
sd_years = sd(years_working, na.rm = TRUE),
median_years = median(years_working, na.rm = TRUE),
iqr_years = IQR(years_working, na.rm = TRUE),
mean_phys = mean(phys_activity, na.rm = TRUE),
sd_phys = sd(phys_activity, na.rm = TRUE),
median_phys = median(phys_activity, na.rm = TRUE),
iqr_phys = IQR(phys_activity, na.rm = TRUE)
)# A tibble: 1 × 12
mean_qol sd_qol median_qol iqr_qol mean_years sd_years median_years iqr_years
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 52.5 6.21 52.4 8.00 14.7 7.35 15.0 9.65
# ℹ 4 more variables: mean_phys <dbl>, sd_phys <dbl>, median_phys <dbl>,
# iqr_phys <dbl>
table(data1$obesity)
not obese obese
120 80
prop.table(table(data1$obesity))
not obese obese
0.6 0.4
summary_table <- data1 %>%
summarise(
mean_qol = mean(qol, na.rm = TRUE),
sd_qol = sd(qol, na.rm = TRUE),
median_qol = median(qol, na.rm = TRUE),
iqr_qol = IQR(qol, na.rm = TRUE),
mean_years = mean(years_working, na.rm = TRUE),
sd_years = sd(years_working, na.rm = TRUE),
median_years = median(years_working, na.rm = TRUE),
iqr_years = IQR(years_working, na.rm = TRUE),
mean_phys = mean(phys_activity, na.rm = TRUE),
sd_phys = sd(phys_activity, na.rm = TRUE),
median_phys = median(phys_activity, na.rm = TRUE),
iqr_phys = IQR(phys_activity, na.rm = TRUE)
) %>%
pivot_longer(cols = everything(),
names_to = c("stat", "variable"),
names_sep = "_",
values_to = "value") %>%
pivot_wider(names_from = stat, values_from = value)
print(summary_table)# A tibble: 3 × 5
variable mean sd median iqr
<chr> <dbl> <dbl> <dbl> <dbl>
1 qol 52.5 6.21 52.4 8.00
2 years 14.7 7.35 15.0 9.65
3 phys 5.17 1.91 5.16 2.58
library(gt)
summary_table %>%
gt()| variable | mean | sd | median | iqr |
|---|---|---|---|---|
| qol | 52.51330 | 6.213524 | 52.375 | 8.0025 |
| years | 14.71440 | 7.354041 | 14.965 | 9.6475 |
| phys | 5.16545 | 1.908179 | 5.155 | 2.5850 |
# Histograms for continuous variables
ggplot(data1, aes(x = qol)) + geom_histogram(binwidth = 5, fill = "skyblue", color = "black")ggplot(data1, aes(x = years_working)) + geom_histogram(binwidth = 2, fill = "lightgreen", color = "black")ggplot(data1, aes(x = phys_activity)) + geom_histogram(binwidth = 1, fill = "orange", color = "black")# Box plots of continuous variables by categorical variable (obesity)
ggplot(data1, aes(x = obesity, y = qol, fill = obesity)) + geom_boxplot()ggplot(data1, aes(x = obesity, y = phys_activity, fill = obesity)) + geom_boxplot()# Scatter plots between continuous variables
ggplot(data1, aes(x = phys_activity, y = qol, color = obesity)) + geom_point() + geom_smooth(method = "lm")ggplot(data1, aes(x = years_working, y = qol, color = obesity)) + geom_point() + geom_smooth(method = "lm")# Bar plots for categorical variables
ggplot(data1, aes(x = obesity, fill = obesity)) + geom_bar()#correlation matrix
# Select only continuous variables
cont_vars <- data1 %>% select(qol, years_working, phys_activity)# Step 1: Select only continuous variables
cont_vars <- data1 %>% select(qol, years_working, phys_activity)
# Step 2: Create correlation matrix
cor_matrix <- cor(cont_vars, use = "complete.obs")
# Step 3: Print matrix
print(cor_matrix) qol years_working phys_activity
qol 1.0000000 0.2273044 0.1012412
years_working 0.2273044 1.0000000 0.0859336
phys_activity 0.1012412 0.0859336 1.0000000
# Step 4: Visualize (optional)
library(corrplot)
corrplot(cor_matrix, method = "circle", type = "upper", tl.cex = 0.8)# Make sure obesity is treated as a factor
data1$obesity <- factor(data1$obesity, levels = c("not obese", "obese"))
# Model 1: without interaction
model1 <- lm(qol ~ years_working + phys_activity + obesity, data = data1)
summary(model1)
Call:
lm(formula = qol ~ years_working + phys_activity + obesity, data = data1)
Residuals:
Min 1Q Median 3Q Max
-13.9316 -3.1254 -0.1097 3.4352 13.0759
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 51.33768 1.28228 40.036 < 2e-16 ***
years_working 0.18719 0.04985 3.755 0.000229 ***
phys_activity 0.19844 0.19230 1.032 0.303358
obesityobese -6.50958 0.74436 -8.745 1.01e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.153 on 196 degrees of freedom
Multiple R-squared: 0.3227, Adjusted R-squared: 0.3123
F-statistic: 31.13 on 3 and 196 DF, p-value: < 2.2e-16
# Model 2: with interaction between phys_activity and obesity
model2 <- lm(qol ~ years_working + phys_activity * obesity, data = data1)
summary(model2)
Call:
lm(formula = qol ~ years_working + phys_activity * obesity, data = data1)
Residuals:
Min 1Q Median 3Q Max
-13.2244 -3.4707 -0.2129 3.0165 13.2211
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 48.74359 1.52810 31.898 < 2e-16 ***
years_working 0.19298 0.04891 3.945 0.000111 ***
phys_activity 0.67819 0.24770 2.738 0.006754 **
obesityobese -0.67503 2.08576 -0.324 0.746561
phys_activity:obesityobese -1.13579 0.38036 -2.986 0.003188 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.052 on 195 degrees of freedom
Multiple R-squared: 0.3523, Adjusted R-squared: 0.339
F-statistic: 26.52 on 4 and 195 DF, p-value: < 2.2e-16
#compare models
anova(model1, model2)Analysis of Variance Table
Model 1: qol ~ years_working + phys_activity + obesity
Model 2: qol ~ years_working + phys_activity * obesity
Res.Df RSS Df Sum of Sq F Pr(>F)
1 196 5203.8
2 195 4976.3 1 227.55 8.9166 0.003188 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Linearity, Homoscedasticity, Normality, Independence
Diagnostic plots for model2
par(mfrow = c(2, 2)) # 4 plots in one panel
plot(model2)Explanation of plots: 1. Residuals vs Fitted → check linearity & homoscedasticity 2. Normal Q-Q → check normality of residuals 3. Scale-Location → check homoscedasticity 4. Residuals vs Leverage → check independence & influential points
cooksd <- cooks.distance(model2)
df_cook <- data.frame(obs = 1:length(cooksd), cooksd = cooksd)
ggplot(df_cook, aes(x = obs, y = cooksd)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_hline(yintercept = 4/length(cooksd), color = "red", linetype = "dashed") +
labs(title = "Cook's Distance", x = "Observation", y = "Cook's Distance") +
theme_minimal(base_size = 18) # increase base font sizepar(mfrow = c(1, 1))
dfbetas_values <- dfbetas(model2)
head(dfbetas_values) # first few rows (Intercept) years_working phys_activity obesityobese
1 -0.003081565 0.011924248 6.648270e-03 -0.001132980
2 -0.001054809 0.002410646 -9.564843e-05 0.002260047
3 0.002738968 -0.006259601 2.483653e-04 0.007669722
4 0.055755809 -0.064152854 -4.793487e-02 -0.022606905
5 0.085797409 -0.005276983 -8.038357e-02 -0.061357727
6 -0.039237408 0.003692035 3.361207e-02 0.027696893
phys_activity:obesityobese
1 -0.005110040
2 -0.008484178
3 -0.013100275
4 0.035415480
5 0.052692367
6 -0.022130345
# Now plot DFBETAS cleanly
plot(dfbetas_values[, "phys_activity"], type = "h",
main = "DFBETAS for phys_activity", ylab = "DFBETAS")
abline(h = c(-2/sqrt(nrow(data1)), 2/sqrt(nrow(data1))), col = "red", lty = 2)# Leverage (hat values)
hat_values <- hatvalues(model2)
plot(hat_values, type = "h", main = "Leverage Values", ylab = "Leverage")
abline(h = 2*mean(hat_values), col = "red", lty = 2) # rule of thumb cutoff# Create a full regression table
library(gtsummary)
tbl <- tbl_regression(
model2,
intercept = TRUE,
label = list(
years_working ~ "Years of Working (hours/day)",
phys_activity ~ "Physical Activity (hours/day)",
obesity ~ "Obesity Status (Yes)",
`phys_activity:obesity` ~ "Physical Activity × Obesity"
)
)
# View the table in RStudio Viewer
as_gtfunction (x, include = everything(), return_calls = FALSE, ...)
{
set_cli_abort_call()
check_class(x, "gtsummary")
check_scalar_logical(return_calls)
x <- do.call(get_theme_element("pkgwide-fun:pre_conversion",
default = identity), list(x))
x <- .table_styling_expr_to_row_number(x)
gt_calls <- table_styling_to_gt_calls(x = x, ...)
insert_expr_after <- get_theme_element("as_gt-lst:addl_cmds")
gt_calls <- reduce(.x = seq_along(insert_expr_after), .f = function(x,
y) {
add_expr_after(calls = x, add_after = names(insert_expr_after[y]),
expr = insert_expr_after[[y]], new_name = paste0("user_added",
y))
}, .init = gt_calls)
cards::process_selectors(data = vec_to_df(names(gt_calls)),
include = {
{
include
}
})
if (return_calls == TRUE) {
return(gt_calls[include])
}
.eval_list_of_exprs(gt_calls[include])
}
<bytecode: 0x000002566f607ee0>
<environment: namespace:gtsummary>
#to insert R, adjusted R squared and F-statistic
library(broom)
# Get model-level statistics
fit_stats <- glance(model2)
fit_stats[c("r.squared", "adj.r.squared", "statistic")]# A tibble: 1 × 3
r.squared adj.r.squared statistic
<dbl> <dbl> <dbl>
1 0.352 0.339 26.5
library(gtsummary)
tbl <- tbl_regression(
model2,
intercept = TRUE,
label = list(
years_working ~ "Years of Working (hours/day)",
phys_activity ~ "Physical Activity (hours/day)",
obesity ~ "Obesity Status (Yes)",
`phys_activity:obesity` ~ "Physical Activity × Obesity"
)
)
# Convert to gt table for Viewer
gt_tbl <- as_gt(tbl)
# Add model fit statistics as a footer
gt_tbl <- gt_tbl %>%
tab_footnote(
footnote = paste0(
"R² = ", round(fit_stats$r.squared, 3),
"; Adjusted R² = ", round(fit_stats$adj.r.squared, 3),
"; F-statistic = ", round(fit_stats$statistic, 3)
),
locations = cells_title(groups = "title")
)
gt_tbl| Characteristic | Beta | 95% CI | p-value |
|---|---|---|---|
| (Intercept) | 49 | 46, 52 | <0.001 |
| Years of Working (hours/day) | 0.19 | 0.10, 0.29 | <0.001 |
| Physical Activity (hours/day) | 0.68 | 0.19, 1.2 | 0.007 |
| Obesity Status (Yes) | |||
| not obese | — | — | |
| obese | -0.68 | -4.8, 3.4 | 0.7 |
| Physical Activity × Obesity | |||
| Physical Activity (hours/day) * obese | -1.1 | -1.9, -0.39 | 0.003 |
| 1 R² = 0.352; Adjusted R² = 0.339; F-statistic = 26.516 | |||
| Abbreviation: CI = Confidence Interval | |||
# Create a grid of values for prediction
newdata <- expand.grid(
years_working = mean(data1$years_working, na.rm = TRUE), # hold constant
phys_activity = seq(min(data1$phys_activity, na.rm = TRUE),
max(data1$phys_activity, na.rm = TRUE),
length.out = 100), # range of activity
obesity = levels(data1$obesity) # obese vs non-obese
)
# Add predicted values
newdata$pred_qol <- predict(model2, newdata)
# Plot interaction
ggplot(newdata, aes(x = phys_activity, y = pred_qol, color = obesity)) +
geom_line(size = 1.2) +
labs(
title = "Interaction Plot: QOL vs Physical Activity",
x = "Physical Activity (hours/day)",
y = "Predicted QOL",
color = "Obesity Status"
) +
theme_minimal(base_size = 14)Results
A total of 200 respondents were included in the analysis. The mean Quality of Life (QOL) score was 52.5 (SD = 6.21), while the mean years of working experience was 14.7 years (SD = 7.35). The mean physical activity score was 5.17 (SD = 1.91). Approximately 40% of the respondents were classified as obese.
1) Regression Model Without Interaction
In the initial multiple linear regression model including years of working, physical activity, and obesity status, two variables were statistically significant predictors of QOL. Years of working was positively associated with QOL (β = 0.19, p < 0.001), indicating that each additional year of working experience was associated with a 0.19-point increase in QOL, holding other variables constant. Obesity status demonstrated a strong negative association with QOL (β = −6.51, p < 0.001), whereby obese respondents reported approximately 6.5-point lower QOL compared with non-obese respondents. Physical activity was not statistically significant in this main-effects model (β = 0.20, p = 0.303). The model explained 31.2% of the variance in QOL (Adjusted R² = 0.312).
2) Regression Model With Interaction
A second model incorporating an interaction term between physical activity and obesity status demonstrated an improvement in model fit (Adjusted R² = 0.339). Years of working remained a significant positive predictor of QOL (β = 0.19, p < 0.001). Physical activity showed a significant positive association with QOL among non-obese respondents (β = 0.68, p = 0.0068). However, the interaction term between physical activity and obesity was statistically significant and negative (β = −1.14, p = 0.003), indicating that the beneficial effect of physical activity on QOL was attenuated among obese respondents. The main effect of obesity became non-significant once the interaction was included, suggesting that the impact of obesity on QOL is conditional on physical activity level rather than constant across groups.
An ANOVA model comparison confirmed that inclusion of the interaction term significantly improved model performance (ΔRSS = 227.6; F = 8.92, p = 0.003).
Model Diagnostics and Influential Observations
Residual diagnostic plots indicated that model assumptions were generally satisfied. The residuals were approximately normally distributed, with no clear evidence of non-linearity or severe heteroscedasticity. Minor deviations in variance dispersion were observed but were not judged to meaningfully affect model stability. Cook’s distance, leverage statistics, and DFBETAS analyses indicated the presence of a small number of moderately influential observations; however, none exceeded conventional thresholds, and no single case exerted disproportionate influence on parameter estimates.
Thank You