Load libraries, load dataset, set randomization seed.

## R markdown link: http://rpubs.com/jasohosk/1425924

## Suppress warnings and messages globally
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

## Load libraries
packages <- c("dplyr",
              "ggplot2",
              "GGally",
              "psych",
              "corrplot",
              "boot",
              "tidyr")

installed <- packages %in% rownames(installed.packages())
if (any(!installed)) {
  install.packages(packages[!installed])
}
lapply(packages, library, character.only = TRUE)

## Set working directory
setwd(
  "/Users/jasonhoskin/Library/CloudStorage/OneDrive-IndianaUniversity/spring-2026/saa/sgm-caregivers"
)

## Import dataset
url <- "final-combined-table.csv"
df <- read.csv(url)

## Remove unnecessary variables
rm(installed, packages, url)

## Filter out rows with any NAs
df1 <- df |> tidyr::drop_na(
  caregiver.burden,
  ipaq.continuous,
  network.size,
  effective.size,
  mean.degree,
  ego.density,
  work.hours.per.week,
  loneliness,
  qol
)

## Set seed
set.seed(123)

Provide descriptive statistics.

# ---- Continuous variables ----
continuous_vars <- df1 |>
  select(
    caregiver.burden,
    ipaq.continuous,
    network.size,
    effective.size,
    mean.degree,
    ego.density,
    work.hours.per.week,
    loneliness,
    qol,
    biological.children,
    not.biological.children
  )

# Using psych::describe() for a clean summary table
continuous_descriptives <- psych::describe(continuous_vars) |>
  as.data.frame() |>
  select(n, mean, sd, median, min, max, skew, kurtosis) |>
  round(2)

print(continuous_descriptives)
##                          n    mean      sd  median   min      max  skew
## caregiver.burden        19   19.84    5.30   20.00 11.00    29.00  0.12
## ipaq.continuous         19 3249.95 3794.43 1884.00  0.00 13488.00  1.90
## network.size            19   11.32    5.14   11.00  4.00    21.00  0.18
## effective.size          19    7.16    4.39    6.10  1.00    14.78  0.20
## mean.degree             19    3.84    2.39    3.22  0.13     9.69  0.58
## ego.density             19    0.49    0.25    0.41  0.19     1.00  0.77
## work.hours.per.week     19   27.37   19.55   35.00  0.00    60.00 -0.17
## loneliness              19    8.74    3.28    8.00  3.00    14.00 -0.24
## qol                     19    6.11    2.00    7.00  2.00     9.00 -0.45
## biological.children     19    0.21    0.54    0.00  0.00     2.00  2.25
## not.biological.children 19    0.42    1.12    0.00  0.00     4.00  2.31
##                         kurtosis
## caregiver.burden           -0.84
## ipaq.continuous             2.45
## network.size               -1.29
## effective.size             -1.36
## mean.degree                -0.23
## ego.density                -0.67
## work.hours.per.week        -1.52
## loneliness                 -1.07
## qol                        -1.09
## biological.children         4.09
## not.biological.children     3.96
# ---- Categorical variables ----
categorical_vars <- c(
  "current.gender",
  "race",
  "ethnicity",
  "sexual.orientation",
  "education",
  "military",
  "living.situation",
  "marital.status",
  "employment.status"
)

# Frequency tables with counts and percentages
categorical_summary <- lapply(categorical_vars, function(var) {
  if (var %in% names(df1)) {
    tab <- table(df1[[var]], useNA = "ifany")
    prop <- round(prop.table(tab) * 100, 1)
    data.frame(
      variable = var,
      level = names(tab),
      n = as.integer(tab),
      percent = as.numeric(prop)
    )
  } else {
    message(paste("Variable not found:", var))
    NULL
  }
})

categorical_summary <- do.call(rbind, categorical_summary)
print(categorical_summary, row.names = FALSE)
##            variable level  n percent
##      current.gender     0  9    47.4
##      current.gender     1  8    42.1
##      current.gender     2  2    10.5
##                race     0 13    68.4
##                race     1  1     5.3
##                race     2  1     5.3
##                race     3  4    21.1
##           ethnicity     0 18    94.7
##           ethnicity     1  1     5.3
##  sexual.orientation     0 15    78.9
##  sexual.orientation     1  2    10.5
##  sexual.orientation     2  1     5.3
##  sexual.orientation     3  1     5.3
##           education     0  2    10.5
##           education     1  1     5.3
##           education     2  8    42.1
##           education     4  7    36.8
##           education     5  1     5.3
##            military     0 18    94.7
##            military     1  1     5.3
##    living.situation     0 12    63.2
##    living.situation     1  3    15.8
##    living.situation     2  1     5.3
##    living.situation     3  3    15.8
##      marital.status     0  7    36.8
##      marital.status     1  3    15.8
##      marital.status     2  9    47.4
##   employment.status     0  8    42.1
##   employment.status     1  3    15.8
##   employment.status     2  3    15.8
##   employment.status     3  1     5.3
##   employment.status     4  1     5.3
##   employment.status     6  3    15.8

Generate scatterplot matrices for all continuous variable using Spearman correlations.

## Plot distributions with scatterplots and correlations.
ggpairs(
  continuous_vars,
  lower = list(
    continuous = wrap(
      "smooth",
      method = "loess",
      se = FALSE,
      alpha = 0.6,
      size = 1.5
    )
  ),
  upper = list(continuous = wrap(
    "cor",
    method = "spearman",
    size = 3,
    stars = FALSE
  )),
  diag  = list(continuous = wrap("densityDiag", alpha = 0.5))
) +
  theme_bw(base_size = 9) +
  theme(axis.text = element_text(size = 6), strip.text = element_text(size = 7)) +
  labs(title = "Scatterplot Matrix: All Continuous Variables", subtitle = "Spearman correlations (upper); LOESS smooth (lower); density (diagonal)")

Generate a focal plot that focuses on caregiver burden versus the other continuous variables.

burden_focal <- df1 |>
  select(
    caregiver.burden,
    ipaq.continuous,
    network.size,
    effective.size,
    mean.degree,
    ego.density,
    work.hours.per.week,
    loneliness,
    qol,
    biological.children,
    not.biological.children
  ) |>
  pivot_longer(
    cols = -caregiver.burden,
    names_to = "variable",
    values_to = "value"
  )

ggplot(burden_focal, aes(x = caregiver.burden, y = value)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_smooth(
    method = "loess",
    se = TRUE,
    alpha = 0.2,
    color = "steelblue"
  ) +
  facet_wrap( ~ variable, scales = "free_y", ncol = 3) +
  theme_bw() +
  labs(title = "Caregiver Burden vs. All Continuous Variables", x = "Caregiver Burden (Zarit)", y = "Value")

  1. Use “psych” package to generate histograms with correlations sized by magnitude and LOESS curves on scatterplots.
pairs.panels(
  continuous_vars,
  method = "spearman",
  hist.col = "lightblue",
  density = TRUE,
  ellipses = FALSE,
  smooth = TRUE,
  lm = FALSE,
  cex.cor = 1,
  stars = FALSE
)

Generate correlation heatmap.

cor_matrix <- cor(continuous_vars, method = "spearman", use = "complete.obs")

corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  order = "original",
  addCoef.col = "black",
  number.cex = 0.7,
  tl.col = "black",
  tl.srt = 45,
  diag = FALSE,
  title = "Spearman Correlation Heatmap",
  mar = c(0, 0, 2, 0)
)

Assess network measures via a zoomed-in matrix.

network_vars <- df1 |>
  select(
    network.size,
    effective.size,
    mean.degree,
    ego.density,
    caregiver.burden,
    ipaq.continuous
  )

ggpairs(
  network_vars,
  lower = list(continuous = wrap(
    "smooth",
    method = "lm",
    se = TRUE,
    alpha = 0.5
  )),
  upper = list(continuous = wrap(
    "cor", method = "spearman", stars = FALSE
  )),
  diag  = list(continuous = "densityDiag")
) +
  theme_bw() +
  labs(title = "Network Structure Measures + Burden + Activity", subtitle = "Linear fit shown to assess redundancy")

Generate bootstrap confidence intervals for all correlations to have bias-corrected accelerted (BCa) intervals.

# Set seed
set.seed(123)

# Function to compute Spearman correlation on a bootstrap sample
boot_spearman <- function(data, indices) {
  d <- data[indices, ]
  cor(d[[1]], d[[2]], method = "spearman")
}

# Function to run bootstrap for a pair of variables with BCa intervals (falls back to percentile if BCa fails)
run_boot_cor <- function(data, var1, var2, R = 10000) {
  pair_df <- data[, c(var1, var2)]
  b <- boot(pair_df, boot_spearman, R = R)
  ci <- tryCatch(
    boot.ci(b, type = "bca"),
    error = function(e)
      boot.ci(b, type = "perc")
  )
  ci_type <- if (!is.null(ci$bca))
    "BCa"
  else
    "percentile"
  ci_vals <- if (!is.null(ci$bca))
    ci$bca[4:5]
  else
    ci$percent[4:5]
  data.frame(
    var1 = var1,
    var2 = var2,
    rho  = round(b$t0, 3),
    lower = round(ci_vals[1], 3),
    upper = round(ci_vals[2], 3),
    ci_type = ci_type
  )
}

# Build all unique pairs of continuous variables
var_names <- names(continuous_vars)
pairs_grid <- t(combn(var_names, 2))

# Run bootstrap for every pair
boot_results <- do.call(rbind, lapply(seq_len(nrow(pairs_grid)), function(i) {
  run_boot_cor(continuous_vars, pairs_grid[i, 1], pairs_grid[i, 2], R = 10000)
}))

# Sort by absolute correlation strength
boot_results <- boot_results[order(-abs(boot_results$rho)), ]
print(boot_results, row.names = FALSE)
##                 var1                    var2    rho  lower  upper ci_type
##         network.size          effective.size  0.893  0.619  0.965     BCa
##       effective.size             ego.density -0.845 -0.959 -0.649     BCa
##           loneliness                     qol -0.693 -0.878 -0.302     BCa
##         network.size             ego.density -0.642 -0.834 -0.300     BCa
##  biological.children not.biological.children  0.558 -0.149  0.997     BCa
##         network.size not.biological.children  0.526  0.216  0.765     BCa
##  work.hours.per.week not.biological.children  0.518  0.263  0.778     BCa
##     caregiver.burden         ipaq.continuous -0.516 -0.792 -0.135     BCa
##         network.size             mean.degree  0.492  0.063  0.770     BCa
##     caregiver.burden              loneliness  0.472 -0.096  0.763     BCa
##           loneliness     biological.children  0.426 -0.043  0.778     BCa
##       effective.size not.biological.children  0.379  0.043  0.676     BCa
##     caregiver.burden     work.hours.per.week -0.370 -0.769  0.147     BCa
##  work.hours.per.week     biological.children  0.316  0.000  0.620     BCa
##         network.size              loneliness  0.314 -0.207  0.701     BCa
##          mean.degree     work.hours.per.week  0.303 -0.270  0.733     BCa
##       effective.size              loneliness  0.300 -0.171  0.670     BCa
##     caregiver.burden             ego.density -0.299 -0.683  0.167     BCa
##          mean.degree     biological.children -0.261 -0.714  0.537     BCa
##      ipaq.continuous     work.hours.per.week  0.257 -0.353  0.723     BCa
##          ego.density     work.hours.per.week  0.257 -0.195  0.657     BCa
##      ipaq.continuous             ego.density  0.242 -0.307  0.663     BCa
##          mean.degree not.biological.children  0.228 -0.536  0.637     BCa
##      ipaq.continuous              loneliness -0.213 -0.623  0.273     BCa
##         network.size                     qol -0.213 -0.612  0.296     BCa
##     caregiver.burden     biological.children -0.200 -0.587  0.484     BCa
##       effective.size                     qol -0.183 -0.576  0.318     BCa
##       effective.size             mean.degree  0.171 -0.316  0.584     BCa
##     caregiver.burden          effective.size  0.170 -0.378  0.604     BCa
##     caregiver.burden not.biological.children -0.170 -0.632  0.543     BCa
##          ego.density     biological.children  0.170 -0.142  0.505     BCa
##      ipaq.continuous             mean.degree  0.161 -0.417  0.643     BCa
##           loneliness not.biological.children  0.159 -0.222  0.608     BCa
##     caregiver.burden            network.size  0.152 -0.392  0.606     BCa
##     caregiver.burden             mean.degree  0.148 -0.355  0.603     BCa
##          mean.degree             ego.density  0.141 -0.324  0.564     BCa
##     caregiver.burden                     qol -0.132 -0.560  0.353     BCa
##         network.size     work.hours.per.week  0.125 -0.404  0.622     BCa
##          mean.degree              loneliness  0.123 -0.430  0.552     BCa
##          mean.degree                     qol -0.116 -0.494  0.360     BCa
##          ego.density              loneliness -0.107 -0.568  0.420     BCa
##      ipaq.continuous          effective.size -0.102 -0.573  0.440     BCa
##                  qol     biological.children -0.101 -0.511  0.237     BCa
##  work.hours.per.week                     qol  0.100 -0.370  0.548     BCa
##       effective.size     biological.children  0.091 -0.245  0.424     BCa
##         network.size     biological.children  0.087 -0.443  0.504     BCa
##      ipaq.continuous            network.size  0.085 -0.442  0.548     BCa
##          ego.density not.biological.children -0.082 -0.394  0.238     BCa
##                  qol not.biological.children -0.080 -0.465  0.256     BCa
##      ipaq.continuous not.biological.children -0.068 -0.636  0.398     BCa
##  work.hours.per.week              loneliness -0.055 -0.506  0.399     BCa
##          ego.density                     qol  0.049 -0.441  0.501     BCa
##      ipaq.continuous                     qol  0.045 -0.393  0.465     BCa
##       effective.size     work.hours.per.week -0.024 -0.545  0.548     BCa
##      ipaq.continuous     biological.children  0.012 -0.635  0.415     BCa
# ---- Focal bootstrap: burden vs. each variable, visualized ----
# Forest-plot-style display of burden's correlations with everything else, with bootstrap CIs.

burden_boot <- boot_results |>
  filter(var1 == "caregiver.burden" | var2 == "caregiver.burden") |>
  mutate(other_var = ifelse(var1 == "caregiver.burden", var2, var1)) |>
  arrange(rho)

burden_boot$other_var <- factor(burden_boot$other_var, levels = burden_boot$other_var)

ggplot(burden_boot, aes(x = rho, y = other_var)) +
  geom_vline(xintercept = 0,
             linetype = "dashed",
             color = "gray50") +
  geom_errorbar(aes(xmin = lower, xmax = upper),
                width = 0.2,
                orientation = "y") +
  geom_point(size = 3, color = "steelblue") +
  theme_bw() +
  labs(
    title = "Caregiver Burden: Bootstrapped Spearman Correlations",
    subtitle = "Point estimates with 95% BCa bootstrap CIs (R = 10,000)",
    x = expression(Spearman ~ rho),
    y = NULL
  )

# ---- Focal bootstrap: IPAQ vs. each variable ----
ipaq_boot <- boot_results |>
  filter(var1 == "ipaq.continuous" | var2 == "ipaq.continuous") |>
  mutate(other_var = ifelse(var1 == "ipaq.continuous", var2, var1)) |>
  arrange(rho)

ipaq_boot$other_var <- factor(ipaq_boot$other_var, levels = ipaq_boot$other_var)

ggplot(ipaq_boot, aes(x = rho, y = other_var)) +
  geom_vline(xintercept = 0,
             linetype = "dashed",
             color = "gray50") +
  geom_errorbar(aes(xmin = lower, xmax = upper),
                width = 0.2,
                orientation = "y") +
  geom_point(size = 3, color = "darkorange") +
  theme_bw() +
  labs(
    title = "Physical Activity (IPAQ): Bootstrapped Spearman Correlations",
    subtitle = "Point estimates with 95% BCa bootstrap CIs (R = 10,000)",
    x = expression(Spearman ~ rho),
    y = NULL
  )