Load libraries, load dataset, set randomization seed.
## R markdown link: http://rpubs.com/jasohosk/1425924
## Suppress warnings and messages globally
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
## Load libraries
packages <- c("dplyr",
"ggplot2",
"GGally",
"psych",
"corrplot",
"boot",
"tidyr")
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) {
install.packages(packages[!installed])
}
lapply(packages, library, character.only = TRUE)
## Set working directory
setwd(
"/Users/jasonhoskin/Library/CloudStorage/OneDrive-IndianaUniversity/spring-2026/saa/sgm-caregivers"
)
## Import dataset
url <- "final-combined-table.csv"
df <- read.csv(url)
## Remove unnecessary variables
rm(installed, packages, url)
## Filter out rows with any NAs
df1 <- df |> tidyr::drop_na(
caregiver.burden,
ipaq.continuous,
network.size,
effective.size,
mean.degree,
ego.density,
work.hours.per.week,
loneliness,
qol
)
## Set seed
set.seed(123)
Provide descriptive statistics.
# ---- Continuous variables ----
continuous_vars <- df1 |>
select(
caregiver.burden,
ipaq.continuous,
network.size,
effective.size,
mean.degree,
ego.density,
work.hours.per.week,
loneliness,
qol,
biological.children,
not.biological.children
)
# Using psych::describe() for a clean summary table
continuous_descriptives <- psych::describe(continuous_vars) |>
as.data.frame() |>
select(n, mean, sd, median, min, max, skew, kurtosis) |>
round(2)
print(continuous_descriptives)
## n mean sd median min max skew
## caregiver.burden 19 19.84 5.30 20.00 11.00 29.00 0.12
## ipaq.continuous 19 3249.95 3794.43 1884.00 0.00 13488.00 1.90
## network.size 19 11.32 5.14 11.00 4.00 21.00 0.18
## effective.size 19 7.16 4.39 6.10 1.00 14.78 0.20
## mean.degree 19 3.84 2.39 3.22 0.13 9.69 0.58
## ego.density 19 0.49 0.25 0.41 0.19 1.00 0.77
## work.hours.per.week 19 27.37 19.55 35.00 0.00 60.00 -0.17
## loneliness 19 8.74 3.28 8.00 3.00 14.00 -0.24
## qol 19 6.11 2.00 7.00 2.00 9.00 -0.45
## biological.children 19 0.21 0.54 0.00 0.00 2.00 2.25
## not.biological.children 19 0.42 1.12 0.00 0.00 4.00 2.31
## kurtosis
## caregiver.burden -0.84
## ipaq.continuous 2.45
## network.size -1.29
## effective.size -1.36
## mean.degree -0.23
## ego.density -0.67
## work.hours.per.week -1.52
## loneliness -1.07
## qol -1.09
## biological.children 4.09
## not.biological.children 3.96
# ---- Categorical variables ----
categorical_vars <- c(
"current.gender",
"race",
"ethnicity",
"sexual.orientation",
"education",
"military",
"living.situation",
"marital.status",
"employment.status"
)
# Frequency tables with counts and percentages
categorical_summary <- lapply(categorical_vars, function(var) {
if (var %in% names(df1)) {
tab <- table(df1[[var]], useNA = "ifany")
prop <- round(prop.table(tab) * 100, 1)
data.frame(
variable = var,
level = names(tab),
n = as.integer(tab),
percent = as.numeric(prop)
)
} else {
message(paste("Variable not found:", var))
NULL
}
})
categorical_summary <- do.call(rbind, categorical_summary)
print(categorical_summary, row.names = FALSE)
## variable level n percent
## current.gender 0 9 47.4
## current.gender 1 8 42.1
## current.gender 2 2 10.5
## race 0 13 68.4
## race 1 1 5.3
## race 2 1 5.3
## race 3 4 21.1
## ethnicity 0 18 94.7
## ethnicity 1 1 5.3
## sexual.orientation 0 15 78.9
## sexual.orientation 1 2 10.5
## sexual.orientation 2 1 5.3
## sexual.orientation 3 1 5.3
## education 0 2 10.5
## education 1 1 5.3
## education 2 8 42.1
## education 4 7 36.8
## education 5 1 5.3
## military 0 18 94.7
## military 1 1 5.3
## living.situation 0 12 63.2
## living.situation 1 3 15.8
## living.situation 2 1 5.3
## living.situation 3 3 15.8
## marital.status 0 7 36.8
## marital.status 1 3 15.8
## marital.status 2 9 47.4
## employment.status 0 8 42.1
## employment.status 1 3 15.8
## employment.status 2 3 15.8
## employment.status 3 1 5.3
## employment.status 4 1 5.3
## employment.status 6 3 15.8
Generate scatterplot matrices for all continuous variable using Spearman correlations.
## Plot distributions with scatterplots and correlations.
ggpairs(
continuous_vars,
lower = list(
continuous = wrap(
"smooth",
method = "loess",
se = FALSE,
alpha = 0.6,
size = 1.5
)
),
upper = list(continuous = wrap(
"cor",
method = "spearman",
size = 3,
stars = FALSE
)),
diag = list(continuous = wrap("densityDiag", alpha = 0.5))
) +
theme_bw(base_size = 9) +
theme(axis.text = element_text(size = 6), strip.text = element_text(size = 7)) +
labs(title = "Scatterplot Matrix: All Continuous Variables", subtitle = "Spearman correlations (upper); LOESS smooth (lower); density (diagonal)")
Generate a focal plot that focuses on caregiver burden versus the other continuous variables.
burden_focal <- df1 |>
select(
caregiver.burden,
ipaq.continuous,
network.size,
effective.size,
mean.degree,
ego.density,
work.hours.per.week,
loneliness,
qol,
biological.children,
not.biological.children
) |>
pivot_longer(
cols = -caregiver.burden,
names_to = "variable",
values_to = "value"
)
ggplot(burden_focal, aes(x = caregiver.burden, y = value)) +
geom_point(alpha = 0.7, size = 2) +
geom_smooth(
method = "loess",
se = TRUE,
alpha = 0.2,
color = "steelblue"
) +
facet_wrap( ~ variable, scales = "free_y", ncol = 3) +
theme_bw() +
labs(title = "Caregiver Burden vs. All Continuous Variables", x = "Caregiver Burden (Zarit)", y = "Value")
pairs.panels(
continuous_vars,
method = "spearman",
hist.col = "lightblue",
density = TRUE,
ellipses = FALSE,
smooth = TRUE,
lm = FALSE,
cex.cor = 1,
stars = FALSE
)
Generate correlation heatmap.
cor_matrix <- cor(continuous_vars, method = "spearman", use = "complete.obs")
corrplot(
cor_matrix,
method = "color",
type = "upper",
order = "original",
addCoef.col = "black",
number.cex = 0.7,
tl.col = "black",
tl.srt = 45,
diag = FALSE,
title = "Spearman Correlation Heatmap",
mar = c(0, 0, 2, 0)
)
Assess network measures via a zoomed-in matrix.
network_vars <- df1 |>
select(
network.size,
effective.size,
mean.degree,
ego.density,
caregiver.burden,
ipaq.continuous
)
ggpairs(
network_vars,
lower = list(continuous = wrap(
"smooth",
method = "lm",
se = TRUE,
alpha = 0.5
)),
upper = list(continuous = wrap(
"cor", method = "spearman", stars = FALSE
)),
diag = list(continuous = "densityDiag")
) +
theme_bw() +
labs(title = "Network Structure Measures + Burden + Activity", subtitle = "Linear fit shown to assess redundancy")
Generate bootstrap confidence intervals for all correlations to have bias-corrected accelerted (BCa) intervals.
# Set seed
set.seed(123)
# Function to compute Spearman correlation on a bootstrap sample
boot_spearman <- function(data, indices) {
d <- data[indices, ]
cor(d[[1]], d[[2]], method = "spearman")
}
# Function to run bootstrap for a pair of variables with BCa intervals (falls back to percentile if BCa fails)
run_boot_cor <- function(data, var1, var2, R = 10000) {
pair_df <- data[, c(var1, var2)]
b <- boot(pair_df, boot_spearman, R = R)
ci <- tryCatch(
boot.ci(b, type = "bca"),
error = function(e)
boot.ci(b, type = "perc")
)
ci_type <- if (!is.null(ci$bca))
"BCa"
else
"percentile"
ci_vals <- if (!is.null(ci$bca))
ci$bca[4:5]
else
ci$percent[4:5]
data.frame(
var1 = var1,
var2 = var2,
rho = round(b$t0, 3),
lower = round(ci_vals[1], 3),
upper = round(ci_vals[2], 3),
ci_type = ci_type
)
}
# Build all unique pairs of continuous variables
var_names <- names(continuous_vars)
pairs_grid <- t(combn(var_names, 2))
# Run bootstrap for every pair
boot_results <- do.call(rbind, lapply(seq_len(nrow(pairs_grid)), function(i) {
run_boot_cor(continuous_vars, pairs_grid[i, 1], pairs_grid[i, 2], R = 10000)
}))
# Sort by absolute correlation strength
boot_results <- boot_results[order(-abs(boot_results$rho)), ]
print(boot_results, row.names = FALSE)
## var1 var2 rho lower upper ci_type
## network.size effective.size 0.893 0.619 0.965 BCa
## effective.size ego.density -0.845 -0.959 -0.649 BCa
## loneliness qol -0.693 -0.878 -0.302 BCa
## network.size ego.density -0.642 -0.834 -0.300 BCa
## biological.children not.biological.children 0.558 -0.149 0.997 BCa
## network.size not.biological.children 0.526 0.216 0.765 BCa
## work.hours.per.week not.biological.children 0.518 0.263 0.778 BCa
## caregiver.burden ipaq.continuous -0.516 -0.792 -0.135 BCa
## network.size mean.degree 0.492 0.063 0.770 BCa
## caregiver.burden loneliness 0.472 -0.096 0.763 BCa
## loneliness biological.children 0.426 -0.043 0.778 BCa
## effective.size not.biological.children 0.379 0.043 0.676 BCa
## caregiver.burden work.hours.per.week -0.370 -0.769 0.147 BCa
## work.hours.per.week biological.children 0.316 0.000 0.620 BCa
## network.size loneliness 0.314 -0.207 0.701 BCa
## mean.degree work.hours.per.week 0.303 -0.270 0.733 BCa
## effective.size loneliness 0.300 -0.171 0.670 BCa
## caregiver.burden ego.density -0.299 -0.683 0.167 BCa
## mean.degree biological.children -0.261 -0.714 0.537 BCa
## ipaq.continuous work.hours.per.week 0.257 -0.353 0.723 BCa
## ego.density work.hours.per.week 0.257 -0.195 0.657 BCa
## ipaq.continuous ego.density 0.242 -0.307 0.663 BCa
## mean.degree not.biological.children 0.228 -0.536 0.637 BCa
## ipaq.continuous loneliness -0.213 -0.623 0.273 BCa
## network.size qol -0.213 -0.612 0.296 BCa
## caregiver.burden biological.children -0.200 -0.587 0.484 BCa
## effective.size qol -0.183 -0.576 0.318 BCa
## effective.size mean.degree 0.171 -0.316 0.584 BCa
## caregiver.burden effective.size 0.170 -0.378 0.604 BCa
## caregiver.burden not.biological.children -0.170 -0.632 0.543 BCa
## ego.density biological.children 0.170 -0.142 0.505 BCa
## ipaq.continuous mean.degree 0.161 -0.417 0.643 BCa
## loneliness not.biological.children 0.159 -0.222 0.608 BCa
## caregiver.burden network.size 0.152 -0.392 0.606 BCa
## caregiver.burden mean.degree 0.148 -0.355 0.603 BCa
## mean.degree ego.density 0.141 -0.324 0.564 BCa
## caregiver.burden qol -0.132 -0.560 0.353 BCa
## network.size work.hours.per.week 0.125 -0.404 0.622 BCa
## mean.degree loneliness 0.123 -0.430 0.552 BCa
## mean.degree qol -0.116 -0.494 0.360 BCa
## ego.density loneliness -0.107 -0.568 0.420 BCa
## ipaq.continuous effective.size -0.102 -0.573 0.440 BCa
## qol biological.children -0.101 -0.511 0.237 BCa
## work.hours.per.week qol 0.100 -0.370 0.548 BCa
## effective.size biological.children 0.091 -0.245 0.424 BCa
## network.size biological.children 0.087 -0.443 0.504 BCa
## ipaq.continuous network.size 0.085 -0.442 0.548 BCa
## ego.density not.biological.children -0.082 -0.394 0.238 BCa
## qol not.biological.children -0.080 -0.465 0.256 BCa
## ipaq.continuous not.biological.children -0.068 -0.636 0.398 BCa
## work.hours.per.week loneliness -0.055 -0.506 0.399 BCa
## ego.density qol 0.049 -0.441 0.501 BCa
## ipaq.continuous qol 0.045 -0.393 0.465 BCa
## effective.size work.hours.per.week -0.024 -0.545 0.548 BCa
## ipaq.continuous biological.children 0.012 -0.635 0.415 BCa
# ---- Focal bootstrap: burden vs. each variable, visualized ----
# Forest-plot-style display of burden's correlations with everything else, with bootstrap CIs.
burden_boot <- boot_results |>
filter(var1 == "caregiver.burden" | var2 == "caregiver.burden") |>
mutate(other_var = ifelse(var1 == "caregiver.burden", var2, var1)) |>
arrange(rho)
burden_boot$other_var <- factor(burden_boot$other_var, levels = burden_boot$other_var)
ggplot(burden_boot, aes(x = rho, y = other_var)) +
geom_vline(xintercept = 0,
linetype = "dashed",
color = "gray50") +
geom_errorbar(aes(xmin = lower, xmax = upper),
width = 0.2,
orientation = "y") +
geom_point(size = 3, color = "steelblue") +
theme_bw() +
labs(
title = "Caregiver Burden: Bootstrapped Spearman Correlations",
subtitle = "Point estimates with 95% BCa bootstrap CIs (R = 10,000)",
x = expression(Spearman ~ rho),
y = NULL
)
# ---- Focal bootstrap: IPAQ vs. each variable ----
ipaq_boot <- boot_results |>
filter(var1 == "ipaq.continuous" | var2 == "ipaq.continuous") |>
mutate(other_var = ifelse(var1 == "ipaq.continuous", var2, var1)) |>
arrange(rho)
ipaq_boot$other_var <- factor(ipaq_boot$other_var, levels = ipaq_boot$other_var)
ggplot(ipaq_boot, aes(x = rho, y = other_var)) +
geom_vline(xintercept = 0,
linetype = "dashed",
color = "gray50") +
geom_errorbar(aes(xmin = lower, xmax = upper),
width = 0.2,
orientation = "y") +
geom_point(size = 3, color = "darkorange") +
theme_bw() +
labs(
title = "Physical Activity (IPAQ): Bootstrapped Spearman Correlations",
subtitle = "Point estimates with 95% BCa bootstrap CIs (R = 10,000)",
x = expression(Spearman ~ rho),
y = NULL
)