1 Read Data

2 Order of Events

2.1 event, study pairs

all |> dplyr::distinct(event, study)

notice some peoples first visit ( V1 ) may be in HCA or, AABC – same being true for V2
- suggesting people joined study at different time periods

all <- all |>
  dplyr::mutate(
    event = factor(
      event,
      levels = c(
        "V1", "F1",
        "V2", "F2",
        "CR",
        "V3", "F3",
        "V4",
        "AF1"
      ),
      ordered = TRUE
    ),

    study = factor(
      study,
      levels = c("HCA", "AABC"),
      ordered = TRUE
    )
  )
unique(all$study)

## [1] HCA  AABC
## Levels: HCA < AABC

unique(all$event)

## [1] V1  F1  V2  F2  CR  V3  F3  AF1 V4 
## Levels: V1 < F1 < V2 < F2 < CR < V3 < F3 < V4 < AF1

patient_paths <-
  all |>
  filter(age_open != "90 or older") |>
  mutate(
    age_open = as.numeric(age_open),
    patient = str_remove(id, "^[A-Z]+")
  ) |>
  distinct(patient, study, event, age_open) |>
  arrange(patient, age_open, study, event) |>   
  group_by(patient) |>
  summarise(
    n_studies = n_distinct(study),
    n_events = n(),

    years_passed = if_else(
      n() > 1,
      max(age_open, na.rm = TRUE) - min(age_open, na.rm = TRUE),
      NA_real_
    ),

    study_event_path = paste(
      paste(study, event, sep = ":"),
      collapse = " → "
    ),

    .groups = "drop"
  ) |>
  dplyr::select(
    patient,
    years_passed,
    study_event_path,
    everything()
  ) |>
  arrange(desc(years_passed))

plot(patient_paths$years_passed, patient_paths$n_events, xlab = "Yrs Passed", ylab = "n_events")

3 Path Diagram

patient_paths

library(tidyverse)
patient_steps <-
  patient_paths |>
  separate_rows(study_event_path, sep = " → ") |>
  separate(
    study_event_path,
    into = c("study", "event"),
    sep = ":"
  ) |>
  mutate(
    study = factor(
      study,
      levels = c("HCA", "AABC"),
      ordered = TRUE
    ),
    event = factor(
      event,
      levels = c(
        "V1", "F1",
        "V2", "F2",
        "CR",
        "V3", "F3",
        "V4",
        "AF1"
      ),
      ordered = TRUE
    )
  ) |>
  group_by(patient) |>
  mutate(step = row_number()) |>
  ungroup()

patient_steps <-
  patient_steps |>
  mutate(
    study_event = interaction(
      study,
      event,
      sep = ":",
      lex.order = TRUE
    )
  )

patient_edges <-
  patient_steps |>
  arrange(patient, step) |>
  group_by(patient) |>
  mutate(
    next_study_event = lead(study_event),
    next_step = lead(step)
  ) |>
  filter(!is.na(next_study_event)) |>
  ungroup()

transition_counts <-
  patient_edges |>
  count(
    step,
    next_step,
    study_event,
    next_study_event,
    name = "n_patients"
  )

library(grid)

max_step <- max(patient_steps$step)

path_plot <-
  ggplot(transition_counts) +
  geom_segment(
    aes(
      x = step,
      xend = next_step,
      y = study_event,
      yend = next_study_event,
      color = n_patients,
      linewidth = n_patients
    ),
    arrow = arrow(length = unit(0.15, "cm")),
    alpha = 0.8
  ) +
  geom_point(
    data = patient_steps |> distinct(step, study_event),
    aes(x = step, y = study_event),
    size = 3
  ) +
  scale_x_continuous(
    breaks = seq(1, max_step, by = 1),
    limits = c(1, max_step)
  ) +
  scale_linewidth(range = c(0.5, 3)) +
  scale_color_gradient(
    low = "mistyrose",
    high = "red4"
  ) +
  labs(
    x = "Sequence Step",
    y = "Study:Event",
    color = "Number of Patients",
    linewidth = "Number of Patients",
    title = "Patient Study/Event Paths"
  ) +
  theme_minimal()

path_plot

Notice that people are going from F1 –> F2 instead of visiting : F1 –> V2 –> F2, so i want to see if they take about the same amount of time (yrs) to go from F1 –> F2 as the people who went F1 –> V2 –> F2 – which would suggest people are going to the visit but we just arent capturing their data.

3.1 Time between events

library(dplyr)
library(stringr)

event_levels <- c(
  "V1", "F1",
  "V2", "F2",
  "CR",
  "V3", "F3",
  "V4",
  "AF1"
)

time_between_events <-
  all |>
  filter(age_open != "90 or older") |>

  mutate(
    age_open = as.numeric(age_open),

    patient = str_remove(id, "^[A-Z]+"),

    event = factor(
      event,
      levels = event_levels,
      ordered = TRUE
    ),

    study = factor(
      study,
      levels = c("HCA", "AABC"),
      ordered = TRUE
    ),

    study_event = paste(study, event, sep = ":")
  ) |>

  distinct(
    patient,
    study,
    event,
    age_open,
    .keep_all = TRUE
  ) |>

  arrange(
    patient,
    event,
    study,
    age_open
  ) |>

  group_by(patient) |>

  mutate(
    from_step = row_number(),

    to_step = lead(from_step),

    from_study = study,
    from_event = event,
    from_study_event = study_event,
    from_age = age_open,

    to_study = lead(study),
    to_event = lead(event),
    to_study_event = lead(study_event),
    to_age = lead(age_open),

    years_between = to_age - from_age
  ) |>

  ungroup() |>

  filter(!is.na(to_study_event)) |>

  dplyr::select(
    patient,
    from_step,
    to_step,
    from_study,
    from_event,
    from_study_event,
    from_age,
    to_study,
    to_event,
    to_study_event,
    to_age,
    years_between
  )

event_steps <-
  bind_rows(
    time_between_events |>
      transmute(
        patient,
        step = from_step,
        study_event = from_study_event,
        age = from_age
      ),

    time_between_events |>
      transmute(
        patient,
        step = to_step,
        study_event = to_study_event,
        age = to_age
      )
  ) |>
  distinct(patient, step, study_event, age) |>
  arrange(patient, step)

patient_event_paths <-
  event_steps |>
  group_by(patient) |>
  summarise(
    full_path = paste(study_event, collapse = " → "),
    n_events = n(),
    .groups = "drop"
  )

path_direct <- paste(
  c("HCA:V1", "HCA:F1", "HCA:F2"),
  collapse = " → "
)

path_with_visit <- paste(
  c("HCA:V1", "HCA:F1", "HCA:V2", "HCA:F2"),
  collapse = " → "
)

direct_patients <-
  patient_event_paths |>
  filter(full_path == path_direct) |>
  pull(patient)

visit_patients <-
  patient_event_paths |>
  filter(full_path == path_with_visit) |>
  pull(patient)

3.1.1 immediate transition group F1–>F2

direct_f1_to_f2_time <-
  event_steps |>
  filter(patient %in% direct_patients) |>
  group_by(patient) |>
  summarise(
    path_type = "HCA:V1 → HCA:F1 → HCA:F2",
    age_f1 = age[study_event == "HCA:F1"],
    age_f2 = age[study_event == "HCA:F2"],
    years_f1_to_f2 = age_f2 - age_f1,
    .groups = "drop"
  )

3.1.2 Intermediate visit group

visit_f1_to_f2_time <-
  event_steps |>
  filter(patient %in% visit_patients) |>
  group_by(patient) |>
  summarise(
    path_type = "HCA:V1 → HCA:F1 → HCA:V2 → HCA:F2",
    age_f1 = age[study_event == "HCA:F1"],
    age_v2 = age[study_event == "HCA:V2"],
    age_f2 = age[study_event == "HCA:F2"],
    years_f1_to_v2 = age_v2 - age_f1,
    years_v2_to_f2 = age_f2 - age_v2,
    years_f1_to_f2 = age_f2 - age_f1,
    .groups = "drop"
  )

f1_to_f2_comparison <-
  bind_rows(
    direct_f1_to_f2_time,
    visit_f1_to_f2_time |>
      dplyr::select(patient, path_type, age_f1, age_f2, years_f1_to_f2)
  )

f1_to_f2_comparison

f1_to_f2_summary <-
  f1_to_f2_comparison |>
  group_by(path_type) |>
  summarise(
    n_patients = n(),
    mean_years_f1_to_f2 = mean(years_f1_to_f2, na.rm = TRUE),
    median_years_f1_to_f2 = median(years_f1_to_f2, na.rm = TRUE),
    sd_years_f1_to_f2 = sd(years_f1_to_f2, na.rm = TRUE),
    min_years_f1_to_f2 = min(years_f1_to_f2, na.rm = TRUE),
    max_years_f1_to_f2 = max(years_f1_to_f2, na.rm = TRUE),
    .groups = "drop"
  )

f1_to_f2_summary

ggplot(f1_to_f2_comparison,
       aes(x = path_type, y = years_f1_to_f2)) +
  geom_boxplot() +
  geom_jitter(width = 0.15, alpha = 0.5) +
  labs(
    x = "Exact patient path",
    y = "Years from HCA:F1 to HCA:F2",
    title = "Time from HCA:F1 to HCA:F2 by exact event path"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))

4 Missingness Relevance

A great deal of data is missing ( NA ) – therefore our data is heavily biased. We would like to determine in what way its biased. Perhaps by adjusting for this bias we see strong associations for each variable to the cognitive markers in studying dimentia.

4.1 Blood Data

library(dplyr)
library(naniar)
blood |> 
  miss_var_summary() |>
  as.data.frame() |>
  dplyr::select(variable, pct_miss)

direct_ldl is entirely missing, we may discard this feature
Many repeated values

4.2 Diet Data

diet |> 
  miss_var_summary() |>
  as.data.frame() |>
  dplyr::select(variable, pct_miss)

interpretation :

Notice the only features without missingness
Notice the large amount of missingness is the

Participant info/Demographics
- data_category
Notice that the missingness appears to repeat – showing runs of the same pct_miss for both features

4.3 Ct and Graph Repeated Missingness

4.4 Diet Data

ct <-
  diet |> 
  miss_var_summary() |> 
  mutate(
    ct = ifelse(pct_miss == 0, 0, 1)
    ) |> count(ct) 
ct

Interpretation :

There are only 4 non-missing values every other value is just 84.6 percent missing – which is a HUGE amt.

4.5 Blood Data

ct <- 
  blood |> 
  miss_var_summary() 

library(ggplot2)
ct |> 
  ggplot(aes(pct_miss)) + 
  geom_histogram(binwidth = 10)

interpretation

this one is slightly more varied
0, about 60, 90 100% percent missing – with 60, 90 happening quite frequently

ct <- ct |>
  mutate(
    char = format(pct_miss)
  )

ct <- as.vector(ct$pct_miss)/100
ct <- ct |> round(1) 
ct <- ct |> as.character()
barplot(table(ct))

We can see that the repeated values are near 0 (id features), 60%, 90% or 100%

5 Build Features

5.1 `patient` study & events ct

Purpose : identify how often a patients appear in our data.

For example :

Are they in both studies?
How many unique events?

5.1.1 Helper Func : Create `unique_id`, `Patient` Variable

library(dplyr)

diet <-
  diet |>
  mutate(
    patient = str_remove(id, "^[A-Z]+") 
  ) |> dplyr::select(patient, everything())

blood <-
  blood |>
  mutate(
    patient = str_remove(id, "^[A-Z]+") 
  ) |> dplyr::select(patient, everything())

6 `patient` specific missing ct

Purpose : identify how often patients appear in our data to better understand missing values.

6.1 Diet

diet_patients <-
  diet |>
  group_by(patient) |>
  summarize(
    n_rows = n(),
    total_missing =
      sum(across(everything(), ~ sum(is.na(.x)))),
    .groups = "drop"
  ) |>
  mutate(
    pct =
      round(
        total_missing /
          (n_rows * (ncol(diet) - 1)),
        2
      )
  ) |>
  arrange(desc(pct))

diet_patients <-
  diet_patients |>  mutate(
    p_rank = paste0("patient", row_number())
  ) |> dplyr::select(p_rank, everything(), patient)

plot(diet_patients$n_rows, diet_patients$total_missing)

6.2 blood

blood_patients <-
  blood |>
  group_by(patient) |>
  summarize(
    n_rows = n(),
    total_missing =
      sum(across(everything(), ~ sum(is.na(.x)))),
    total_cells =
      n_rows * ncol(pick(everything())),
    pct =
      round(total_missing / total_cells, 2),
    .groups = "drop"
  ) |>
  arrange(desc(pct))

blood_patients <-
  blood_patients |>  mutate(
    p_rank = paste0("patient", row_number())
  ) |> dplyr::select(p_rank, everything(), patient)

plot(blood_patients$n_rows,blood_patients$total_missing)

7 Correlations of `Cognitive` Features – ALL

Note :

we have repeated observations, so we are taking people who had more visits more into account
Relevant factor analysis Features^
- Memory_Tr35_60y,
  - Cognition Factor Analysis: Memory factor estimated in Union V1 35-60 yo subjects and applied to all subjects all visits
- FluidIQ_Tr35_60y,
  - Cognition Factor Analysis: Fluid IQ factor estimated in Union V1 35-60 yo subjects and applied to all subjects all visits
- CrystIQ_Tr35_60y
  - Cognition Factor Analysis: Crystallized IQ factor estimated in Union V1 35-60 yo subjects and applied to all subjects all visits

Memory <- all$Memory_Tr35_60y |> as.numeric()
fluidIQ <- all$FluidIQ_Tr35_60y |> as.numeric()
CrystIQ <- all$CrystIQ_Tr35_60y |> as.numeric()

7.1 `Memory`, `fluidIQ` dot-plt

# Memory ~ fluidIQ
Memory_fluidIQ_mdl <- lm(Memory ~ fluidIQ)

plot(Memory, fluidIQ)
abline(Memory_fluidIQ_mdl, col = "red")

7.2 `Memory`, `CrystIQ` dot-plt

# Memory ~ CrystIQ
Memory_CrystIQ_mdl <- lm(Memory ~ CrystIQ)

plot(Memory, CrystIQ)
abline(Memory_CrystIQ_mdl, col = "red")

7.3 `fluidIQ`, `CrystIQ` dot-plt

# fluidIQ ~ CrystIQ
fluidIQ_CrystIQ_mdl <- lm(fluidIQ ~ CrystIQ)

plot(fluidIQ, CrystIQ)
abline(fluidIQ_CrystIQ_mdl, col = "red")

diet_num <- diet |> dplyr::select(where(is.numeric))
blood_num <- blood |> dplyr::select(where(is.numeric))

diet_num <- cbind(Memory, fluidIQ, CrystIQ, diet_num)
blood_num <- cbind(Memory, fluidIQ, CrystIQ, blood_num)

8 Diet Correlations w/ `Memory`

mem_cor <-
  cor(diet_num$Memory, diet_num[4:length(diet_num)],
      use = "complete.obs")


mem_cor <-
  mem_cor |> 
  t() |> as.data.frame()

mem_cor <- 
  mem_cor |> 
  rename(Memory = V1) |> 
  round(3) |>
  dplyr::select(Memory)

mem_cor

9 Diet Correlations w/ `fluidIQ`

fluid_cor <-
  cor(diet_num$fluidIQ, diet_num[4:length(diet_num)],
      use = "complete.obs")


fluid_cor <-
  fluid_cor |> 
  t() |> as.data.frame()

fluid_cor |> 
  rename(fluidIQ = V1) |> 
  round(3) |>
  dplyr::select(fluidIQ)

10 Diet Correlations w/ `CrystIQ`

cryst_cor <-
  cor(diet_num$CrystIQ, diet_num[4:length(diet_num)],
      use = "complete.obs")


cryst_cor <-
  cryst_cor |> 
  t() |> as.data.frame()

cryst_cor <-
  cryst_cor |> 
  rename(CrystIQ = V1) |> 
  round(3) |> dplyr::select(CrystIQ) 


cryst_cor

Overall interpretation

Extremely weak correlations for all variables in Diet Data
- Diet Data doesnt appear strongly related

Possible Causes

underlying relationship isnt linear
Missing values may be biasing variables

diet_cor_lst <-
  list(
  cryst_cor=cryst_cor,
  fluid_cor=fluid_cor,
  mem_cor=mem_cor
)

11 Patient level analysis

blood |>
  group_by(patient) |> 
  dplyr::select(patient, where(is.numeric)) |> 
  summarize(
    across(everything(), ~ mean(.x, na.rm = TRUE))
  )

12 Patient Correlations of `Cognitive` Features

here we generate a report of correlations for each individual patient
Suppose we just average out their values across all Study and event

diet_patients

13 Multivariate Normal – Cognitive Features

cognitive_df <- 
  all |> 
  dplyr::select(id_event, id, event, study,
         Memory_Tr35_60y, 
         FluidIQ_Tr35_60y, 
         CrystIQ_Tr35_60y) |>
  mutate(
    patient = str_remove(id, "^[A-Z]+") 
  ) |> dplyr::select(patient, everything()) |>
  mutate(
    across(
      -c(patient, id_event, id, event, study),
      as.numeric
    )
  )

par(mfrow = c(1, 3))

hist(
  cognitive_df$Memory_Tr35_60y,
  main = "Memory",
  xlab = ""
)

hist(
  cognitive_df$FluidIQ_Tr35_60y,
  main = "FluidIQ",
  xlab = ""
)

hist(
  cognitive_df$CrystIQ_Tr35_60y,
  main = "CrystIQ",
  xlab = ""
)

notice each is normally distributed

cognitive_df |>
  dplyr::select(where(is.numeric)) |>
  summarize(
    mean_Memory = mean(Memory_Tr35_60y, na.rm = TRUE), 
    sd_Memory = sd(Memory_Tr35_60y, na.rm = TRUE), 
    mean_FluidIQ_y = mean(FluidIQ_Tr35_60y, na.rm = TRUE),
    sd_FluidIQ = sd(FluidIQ_Tr35_60y, na.rm = TRUE), 
    mean_CrystIQ = mean(CrystIQ_Tr35_60y, na.rm = TRUE),
    sd_CrystIQ = sd(CrystIQ_Tr35_60y, na.rm = TRUE)
  )

tbl <-
  all %>%
  count(study, event, name = "n") %>%
  mutate(percent = round(100 * n / sum(n), 2)) %>%
  arrange(study, event)

14 Event Study Pairs – ct

library(dplyr)
library(gt)
library(scales)

all %>%
  count(study, event, name = "n") %>%
  mutate(percent = n / sum(n)) %>%
  arrange(study, event) %>%
  gt(groupname_col = "study") %>%
  cols_label(
    event = "Event",
    n = "Count",
    percent = "Percent"
  ) %>%
  fmt_percent(
    columns = percent,
    decimals = 1
  ) %>%
  tab_header(
    title = md("**Event Counts by Study**"),
    subtitle = "Distribution of event-study pairs"
  ) %>%
  cols_align(
    align = "center",
    columns = c(n, percent)
  ) %>%
  tab_options(
    table.font.size = px(14),
    heading.title.font.size = px(18)
  )

Event	Count	Percent
Event Counts by Study
Distribution of event-study pairs
HCA
V1	1198	18.5%
F1	1136	17.6%
V2	609	9.4%
F2	843	13.0%
CR	491	7.6%
F3	550	8.5%
AABC
V1	198	3.1%
V2	306	4.7%
V3	471	7.3%
V4	96	1.5%
AF1	567	8.8%

15 Arrange in groups of id ages

patient_age_order <-
  all |>
  filter(age_open != "90 or older") |>

  mutate(
    patient = str_remove(id, "^[A-Z]+"),

    age_open = as.numeric(age_open),

    event = factor(
      event,
      levels = event_levels,
      ordered = TRUE
    ),

    study = factor(
      study,
      levels = c("HCA", "AABC"),
      ordered = TRUE
    )
  ) |>

  arrange(
    patient,
    age_open,
    study,
    event
  ) |>

  group_by(patient) |>

  mutate(
    visit_order = row_number()
  ) |>

  ungroup()

patient_age_order |> mutate(
    person =
      paste0(
        "p",
        dense_rank(patient)
      )
  ) |>
  dplyr::select(person, visit_order,
    age_open,
    study,
    event, 
    patient
    ) |>
  arrange(
    patient,
    age_open,
    study,
    event
  ) |>

  group_by(patient)

library(dplyr)
library(ggplot2)
library(grid)

study_event_levels <-
  expand.grid(
    study = c("HCA", "AABC"),
    event = c(
      "V1", "F1",
      "V2", "F2",
      "CR",
      "V3", "F3",
      "V4",
      "AF1"
    )
  ) |>
  arrange(study, event) |>
  transmute(
    study_event = paste(study, event, sep = ":")
  ) |>
  pull(study_event)

age_ordered_steps <-
  patient_age_order |>

  mutate(
    person =
      paste0(
        "p",
        dense_rank(patient)
      ),

    study = factor(
      study,
      levels = c("HCA", "AABC"),
      ordered = TRUE
    ),

    event = factor(
      event,
      levels = c(
        "V1", "F1",
        "V2", "F2",
        "CR",
        "V3", "F3",
        "V4",
        "AF1"
      ),
      ordered = TRUE
    ),

    study_event =
      factor(
        paste(study, event, sep = ":"),
        levels = study_event_levels,
        ordered = TRUE
      )
  ) |>

  arrange(
    patient,
    age_open,
    study,
    event
  ) |>

  group_by(patient) |>

  mutate(
    step = row_number()
  ) |>

  ungroup()

age_ordered_edges <-
  age_ordered_steps |>

  arrange(patient, step) |>

  group_by(patient) |>

  mutate(
    next_step = lead(step),

    next_study_event =
      lead(study_event),

    next_age =
      lead(age_open)
  ) |>

  filter(!is.na(next_study_event)) |>

  ungroup()

age_transition_counts <-
  age_ordered_edges |>

  count(
    step,
    next_step,
    study_event,
    next_study_event,
    name = "n_patients"
  )

max_step <- max(age_ordered_steps$step)

age_path_plot <-
  ggplot(age_transition_counts) +

  geom_segment(
    aes(
      x = step,
      xend = next_step,

      y = study_event,
      yend = next_study_event,

      linewidth = n_patients,
      color = n_patients
    ),

    arrow =
      arrow(length = unit(0.12, "cm")),

    alpha = 0.75
  ) +

  geom_point(
    data =
      age_ordered_steps |>
      distinct(step, study_event),

    aes(
      x = step,
      y = study_event
    ),

    size = 3
  ) +

  scale_x_continuous(
    breaks = seq(1, max_step, by = 1)
  ) +

  scale_linewidth(
    range = c(0.5, 4)
  ) +

  scale_color_gradient(
    low = "mistyrose",
    high = "red4"
  ) +

  labs(
    title =
      "Age-Ordered Patient Movement Through Study Events",

    subtitle =
      "Transitions follow youngest → oldest observations",

    x = "Sequential Observation Order",

    y = "Study : Event",

    linewidth = "Patients",

    color = "Patients"
  ) +

  theme_minimal()

age_path_plot

retention_df <-
  patient_age_order |>

  group_by(patient) |>

  summarise(

    n_visits = n(),

    first_age =
      min(age_open, na.rm = TRUE),

    last_age =
      max(age_open, na.rm = TRUE),

    years_followed =
      last_age - first_age,

    first_event =
      first(event),

    last_event =
      last(event),

    first_study =
      first(study),

    last_study =
      last(study),

    completed_v4 =
      any(event == "V4"),

    completed_f3 =
      any(event == "F3"),

    last_observed_event =
      paste(last_study, last_event, sep = ":"),

    .groups = "drop"
  )

retention_df

16 How do Measures Change with Dropoff?

visit_retention <-
  patient_age_order |>

  group_by(patient) |>

  mutate(
    max_visit =
      max(visit_order)
  ) |>

  ungroup()

d <- visit_retention |> group_by(patient) |> mutate(mean(max_visit))
barplot(table(d$max_visit))

table(d$max_visit)

## 
##    1    2    3    4    5    6    7    8    9 
##  130  290  309 1020 1220 1254 1190  672   90

summary(d$max_visit)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   6.000   5.483   7.000   9.000

17 Path Diagram 2

library(dplyr)
library(ggplot2)
library(grid)

study_event_levels <-
  c(
    "HCA:V1",
    "HCA:F1",
    "HCA:V2",
    "HCA:F2",
    "HCA:CR",
    "HCA:V3",
    "HCA:F3",
    "HCA:V4",
    "HCA:AF1",

    "AABC:V1",
    "AABC:F1",
    "AABC:V2",
    "AABC:F2",
    "AABC:CR",
    "AABC:V3",
    "AABC:F3",
    "AABC:V4",
    "AABC:AF1"
  )

retention_paths <-
  patient_age_order |>

  mutate(

    study_event =
      factor(
        paste(study, event, sep = ":"),
        levels = study_event_levels,
        ordered = TRUE
      )

  ) |>

  arrange(
    patient,
    age_open,
    study,
    event
  ) |>

  group_by(patient) |>

  mutate(

    visit_order = row_number(),

    next_visit = lead(visit_order),

    next_study_event =
      lead(study_event)

  ) |>

  ungroup()

node_counts <-
  retention_paths |>

  count(
    visit_order,
    study_event,
    name = "n_patients"
  )

edge_counts <-
  retention_paths |>

  filter(!is.na(next_study_event)) |>

  count(
    visit_order,
    next_visit,
    study_event,
    next_study_event,
    name = "n_patients"
  )

dropoff_counts <-
  retention_paths |>

  group_by(patient) |>

  summarise(
    final_visit =
      max(visit_order)
  ) |>

  count(final_visit)

retention_plot <-

  ggplot() +

  geom_segment(

    data = edge_counts,

    aes(
      x = visit_order,
      xend = next_visit,

      y = study_event,
      yend = next_study_event,

      linewidth = n_patients,
      color = n_patients
    ),

    alpha = 0.5,

    arrow =
      arrow(length = unit(0.10, "cm"))

  ) +

  geom_point(

    data = node_counts,

    aes(
      x = visit_order,
      y = study_event,
      size = n_patients
    ),

    color = "black",
    alpha = 0.85

  ) +

  geom_text(

    data =
      dropoff_counts,

    aes(
      x = final_visit,
      y = 1,
      label = paste0(
        "dropoff=", n
      )
    ),

    size = 3,
    vjust = -1

  ) +

  scale_x_continuous(
    breaks = 1:9
  ) +

  scale_size(
    range = c(2, 12)
  ) +

  scale_linewidth(
    range = c(0.3, 4)
  ) +

  scale_color_gradient(
    low = "lightblue",
    high = "darkred"
  ) +

  labs(

    title =
      "Longitudinal Retention and Movement Through Study Events",

    subtitle =
      "Node size = remaining participants | edge thickness = transitions",

    x =
      "Visit Order",

    y =
      "Study : Event",

    size =
      "Patients",

    linewidth =
      "Transitions",

    color =
      "Transitions"

  ) +

  theme_minimal()

retention_plot

18 patient-level modeling table

library(dplyr)

baseline_features <-

  all |>

  filter(age_open != "90 or older") |>

  mutate(
    patient = str_remove(id, "^[A-Z]+")
  ) |>

  group_by(patient) |>

  summarise(

    Memory =
      mean(as.numeric(Memory_Tr35_60y), na.rm = TRUE),

    FluidIQ =
      mean(as.numeric(FluidIQ_Tr35_60y), na.rm = TRUE),

    CrystIQ =
      mean(as.numeric(CrystIQ_Tr35_60y), na.rm = TRUE),

    age =
      mean(as.numeric(age_open), na.rm = TRUE),

    sex =
      first(sex),

    .groups = "drop"
  )

blood_summary <-

  blood |>

  group_by(patient) |>

  summarise(

    glucose =
      mean(glucose, na.rm = TRUE),

    hba1c =
      mean(hba1c, na.rm = TRUE),

    hdl =
      mean(hdl, na.rm = TRUE),

    cholesterol =
      mean(cholesterol, na.rm = TRUE),

    insulin =
      mean(insulin, na.rm = TRUE),

    .groups = "drop"
  )

diet_summary <-

  diet |>

  group_by(patient) |>

  summarise(

    asa24_fibe =
      mean(asa24_fibe, na.rm = TRUE),

    asa24_magn =
      mean(asa24_magn, na.rm = TRUE),

    asa24_fola =
      mean(asa24_fola, na.rm = TRUE),

    asa24_kcal =
      mean(asa24_kcal, na.rm = TRUE),

    asa24_sugr =
      mean(asa24_sugr, na.rm = TRUE),

    .groups = "drop"
  )

retention_master <-

  baseline_features |>

  left_join(
    blood_summary,
    by = "patient"
  ) |>

  left_join(
    diet_summary,
    by = "patient"
  ) |>

  left_join(
    retention_df,
    by = "patient"
  )

library(MASS)

full_model <-

  lm(

    Memory ~

      age +
      sex +

      CrystIQ +

      glucose +
      hba1c +
      hdl +
      cholesterol +
      insulin +

      asa24_fibe +
      asa24_magn +
      asa24_fola +
      asa24_kcal +
      asa24_sugr +

      n_visits +
      years_followed +
      completed_v4 +
      completed_f3,

    data = retention_master
  )

step_model <-

  stepAIC(
    full_model,
    direction = "both"
  )

## Start:  AIC=-524.24
## Memory ~ age + sex + CrystIQ + glucose + hba1c + hdl + cholesterol + 
##     insulin + asa24_fibe + asa24_magn + asa24_fola + asa24_kcal + 
##     asa24_sugr + n_visits + years_followed + completed_v4 + completed_f3
## 
##                  Df Sum of Sq    RSS     AIC
## - hdl             1     0.001 397.15 -526.24
## - glucose         1     0.083 397.23 -526.07
## - cholesterol     1     0.089 397.24 -526.06
## - asa24_sugr      1     0.090 397.24 -526.06
## - asa24_fola      1     0.103 397.25 -526.03
## - completed_v4    1     0.453 397.60 -525.33
## - asa24_magn      1     0.529 397.68 -525.18
## - completed_f3    1     0.649 397.80 -524.93
## - hba1c           1     0.758 397.91 -524.71
## - asa24_kcal      1     0.883 398.03 -524.46
## <none>                        397.15 -524.24
## - asa24_fibe      1     1.292 398.44 -523.64
## - years_followed  1     1.726 398.88 -522.77
## - n_visits        1     2.142 399.29 -521.94
## - insulin         1     3.269 400.42 -519.68
## - sex             1    30.471 427.62 -467.10
## - age             1    95.106 492.25 -354.49
## - CrystIQ         1   127.146 524.30 -304.05
## 
## Step:  AIC=-526.24
## Memory ~ age + sex + CrystIQ + glucose + hba1c + cholesterol + 
##     insulin + asa24_fibe + asa24_magn + asa24_fola + asa24_kcal + 
##     asa24_sugr + n_visits + years_followed + completed_v4 + completed_f3
## 
##                  Df Sum of Sq    RSS     AIC
## - glucose         1     0.081 397.23 -528.07
## - asa24_sugr      1     0.089 397.24 -528.06
## - cholesterol     1     0.100 397.25 -528.04
## - asa24_fola      1     0.103 397.25 -528.03
## - completed_v4    1     0.455 397.61 -527.32
## - asa24_magn      1     0.532 397.68 -527.17
## - completed_f3    1     0.649 397.80 -526.93
## - hba1c           1     0.785 397.94 -526.66
## - asa24_kcal      1     0.885 398.04 -526.46
## <none>                        397.15 -526.24
## - asa24_fibe      1     1.292 398.44 -525.64
## - years_followed  1     1.725 398.88 -524.77
## + hdl             1     0.001 397.15 -524.24
## - n_visits        1     2.142 399.29 -523.94
## - insulin         1     3.646 400.80 -520.93
## - sex             1    34.525 431.68 -461.55
## - age             1    99.389 496.54 -349.56
## - CrystIQ         1   127.401 524.55 -305.65
## 
## Step:  AIC=-528.07
## Memory ~ age + sex + CrystIQ + hba1c + cholesterol + insulin + 
##     asa24_fibe + asa24_magn + asa24_fola + asa24_kcal + asa24_sugr + 
##     n_visits + years_followed + completed_v4 + completed_f3
## 
##                  Df Sum of Sq    RSS     AIC
## - asa24_sugr      1     0.097 397.33 -529.88
## - cholesterol     1     0.099 397.33 -529.87
## - asa24_fola      1     0.108 397.34 -529.86
## - completed_v4    1     0.429 397.66 -529.21
## - asa24_magn      1     0.541 397.77 -528.98
## - completed_f3    1     0.674 397.91 -528.72
## - asa24_kcal      1     0.905 398.14 -528.25
## <none>                        397.23 -528.07
## - asa24_fibe      1     1.314 398.55 -527.43
## - years_followed  1     1.762 398.99 -526.53
## + glucose         1     0.081 397.15 -526.24
## + hdl             1     0.000 397.23 -526.07
## - n_visits        1     2.127 399.36 -525.80
## - hba1c           1     2.657 399.89 -524.74
## - insulin         1     3.565 400.80 -522.93
## - sex             1    35.472 432.70 -461.65
## - age             1   100.419 497.65 -349.77
## - CrystIQ         1   127.764 525.00 -306.98
## 
## Step:  AIC=-529.88
## Memory ~ age + sex + CrystIQ + hba1c + cholesterol + insulin + 
##     asa24_fibe + asa24_magn + asa24_fola + asa24_kcal + n_visits + 
##     years_followed + completed_v4 + completed_f3
## 
##                  Df Sum of Sq    RSS     AIC
## - cholesterol     1     0.095 397.42 -531.69
## - asa24_fola      1     0.103 397.43 -531.67
## - completed_v4    1     0.431 397.76 -531.01
## - asa24_magn      1     0.508 397.84 -530.86
## - completed_f3    1     0.670 398.00 -530.53
## - asa24_kcal      1     0.899 398.23 -530.07
## <none>                        397.33 -529.88
## - asa24_fibe      1     1.359 398.69 -529.15
## - years_followed  1     1.777 399.11 -528.31
## + asa24_sugr      1     0.097 397.23 -528.07
## + glucose         1     0.089 397.24 -528.06
## + hdl             1     0.000 397.33 -527.88
## - n_visits        1     2.143 399.47 -527.58
## - hba1c           1     2.735 400.06 -526.39
## - insulin         1     3.604 400.93 -524.66
## - sex             1    36.105 433.43 -462.30
## - age             1   102.629 499.96 -348.07
## - CrystIQ         1   127.923 525.25 -308.59
## 
## Step:  AIC=-531.69
## Memory ~ age + sex + CrystIQ + hba1c + insulin + asa24_fibe + 
##     asa24_magn + asa24_fola + asa24_kcal + n_visits + years_followed + 
##     completed_v4 + completed_f3
## 
##                  Df Sum of Sq    RSS     AIC
## - asa24_fola      1     0.105 397.53 -533.48
## - completed_v4    1     0.445 397.87 -532.79
## - asa24_magn      1     0.503 397.93 -532.68
## - completed_f3    1     0.687 398.11 -532.31
## - asa24_kcal      1     0.897 398.32 -531.88
## <none>                        397.42 -531.69
## - asa24_fibe      1     1.362 398.79 -530.95
## - years_followed  1     1.781 399.20 -530.11
## + cholesterol     1     0.095 397.33 -529.88
## + asa24_sugr      1     0.092 397.33 -529.87
## + glucose         1     0.088 397.34 -529.87
## + hdl             1     0.004 397.42 -529.70
## - n_visits        1     2.133 399.56 -529.41
## - hba1c           1     2.764 400.19 -528.14
## - insulin         1     3.541 400.97 -526.59
## - sex             1    38.200 435.62 -460.27
## - age             1   103.679 501.10 -348.24
## - CrystIQ         1   128.562 525.99 -309.47
## 
## Step:  AIC=-533.48
## Memory ~ age + sex + CrystIQ + hba1c + insulin + asa24_fibe + 
##     asa24_magn + asa24_kcal + n_visits + years_followed + completed_v4 + 
##     completed_f3
## 
##                  Df Sum of Sq    RSS     AIC
## - completed_v4    1     0.469 398.00 -534.53
## - asa24_magn      1     0.555 398.08 -534.36
## - completed_f3    1     0.677 398.20 -534.12
## - asa24_kcal      1     0.799 398.33 -533.87
## <none>                        397.53 -533.48
## - years_followed  1     1.802 399.33 -531.86
## + asa24_fola      1     0.105 397.42 -531.69
## + cholesterol     1     0.096 397.43 -531.67
## + glucose         1     0.093 397.43 -531.66
## + asa24_sugr      1     0.088 397.44 -531.65
## - asa24_fibe      1     1.961 399.49 -531.54
## + hdl             1     0.004 397.52 -531.48
## - n_visits        1     2.179 399.71 -531.10
## - hba1c           1     2.727 400.25 -530.01
## - insulin         1     3.542 401.07 -528.38
## - sex             1    38.192 435.72 -462.09
## - age             1   104.200 501.73 -349.24
## - CrystIQ         1   129.113 526.64 -310.47
## 
## Step:  AIC=-534.53
## Memory ~ age + sex + CrystIQ + hba1c + insulin + asa24_fibe + 
##     asa24_magn + asa24_kcal + n_visits + years_followed + completed_f3
## 
##                  Df Sum of Sq    RSS     AIC
## - asa24_magn      1     0.528 398.52 -535.47
## - asa24_kcal      1     0.797 398.79 -534.93
## <none>                        398.00 -534.53
## - completed_f3    1     1.179 399.18 -534.17
## + completed_v4    1     0.469 397.53 -533.48
## - years_followed  1     1.541 399.54 -533.44
## - n_visits        1     1.724 399.72 -533.08
## + asa24_fola      1     0.129 397.87 -532.79
## + cholesterol     1     0.110 397.89 -532.76
## + asa24_sugr      1     0.089 397.91 -532.71
## + glucose         1     0.065 397.93 -532.66
## + hdl             1     0.008 397.99 -532.55
## - asa24_fibe      1     2.072 400.07 -532.38
## - hba1c           1     2.737 400.73 -531.05
## - insulin         1     3.547 401.54 -529.44
## - sex             1    39.872 437.87 -460.15
## - age             1   103.804 501.80 -351.13
## - CrystIQ         1   129.252 527.25 -311.55
## 
## Step:  AIC=-535.47
## Memory ~ age + sex + CrystIQ + hba1c + insulin + asa24_fibe + 
##     asa24_kcal + n_visits + years_followed + completed_f3
## 
##                  Df Sum of Sq    RSS     AIC
## - asa24_kcal      1     0.397 398.92 -536.68
## <none>                        398.52 -535.47
## - completed_f3    1     1.155 399.68 -535.16
## - years_followed  1     1.466 399.99 -534.54
## + asa24_magn      1     0.528 398.00 -534.53
## + completed_v4    1     0.442 398.08 -534.36
## - n_visits        1     1.624 400.15 -534.22
## + asa24_fola      1     0.183 398.34 -533.84
## + cholesterol     1     0.105 398.42 -533.68
## + glucose         1     0.073 398.45 -533.62
## + asa24_sugr      1     0.057 398.47 -533.59
## + hdl             1     0.013 398.51 -533.50
## - hba1c           1     2.931 401.46 -531.61
## - insulin         1     3.487 402.01 -530.51
## - asa24_fibe      1     7.178 405.70 -523.19
## - sex             1    39.766 438.29 -461.38
## - age             1   103.510 502.03 -352.75
## - CrystIQ         1   129.599 528.12 -312.22
## 
## Step:  AIC=-536.68
## Memory ~ age + sex + CrystIQ + hba1c + insulin + asa24_fibe + 
##     n_visits + years_followed + completed_f3
## 
##                  Df Sum of Sq    RSS     AIC
## <none>                        398.92 -536.68
## - completed_f3    1     1.210 400.13 -536.26
## - years_followed  1     1.452 400.37 -535.77
## + completed_v4    1     0.452 398.47 -535.59
## - n_visits        1     1.575 400.50 -535.53
## + asa24_kcal      1     0.397 398.52 -535.47
## + asa24_magn      1     0.127 398.79 -534.93
## + cholesterol     1     0.105 398.82 -534.89
## + glucose         1     0.072 398.85 -534.82
## + asa24_sugr      1     0.041 398.88 -534.76
## + asa24_fola      1     0.038 398.88 -534.75
## + hdl             1     0.021 398.90 -534.72
## - hba1c           1     2.964 401.89 -532.75
## - insulin         1     3.432 402.35 -531.82
## - asa24_fibe      1     7.948 406.87 -522.90
## - sex             1    43.351 442.27 -456.15
## - age             1   103.126 502.05 -354.73
## - CrystIQ         1   130.140 529.06 -312.81

summary(step_model)

## 
## Call:
## lm(formula = Memory ~ age + sex + CrystIQ + hba1c + insulin + 
##     asa24_fibe + n_visits + years_followed + completed_f3, data = retention_master)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.18045 -0.48211 -0.00079  0.48311  2.26815 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.562761   0.257084   6.079 1.88e-09 ***
## age              -0.026125   0.001828 -14.291  < 2e-16 ***
## sexM             -0.477937   0.051582  -9.266  < 2e-16 ***
## CrystIQ           0.519937   0.032387  16.054  < 2e-16 ***
## hba1c            -0.101033   0.041699  -2.423   0.0156 *  
## insulin           0.007689   0.002949   2.607   0.0093 ** 
## asa24_fibe        0.008220   0.002072   3.967 7.93e-05 ***
## n_visits          0.048320   0.027361   1.766   0.0778 .  
## years_followed   -0.039671   0.023395  -1.696   0.0903 .  
## completed_f3TRUE  0.095800   0.061898   1.548   0.1221    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7106 on 790 degrees of freedom
##   (550 observations deleted due to missingness)
## Multiple R-squared:  0.4914, Adjusted R-squared:  0.4856 
## F-statistic: 84.82 on 9 and 790 DF,  p-value: < 2.2e-16

The final step-wise regression model identified several important predictors of Memory performance in the HCA/AABC cohort. Crystallized IQ was the strongest positive predictor, while age showed a strong negative association with memory. Males also scored lower on average than females after adjustment for other variables.

Higher HbA1c levels were associated with worse memory performance, while higher dietary fiber intake was associated with better memory scores. Insulin also remained significant, suggesting a potential relationship between metabolic regulation and cognition.

Unlike earlier cross-sectional analyses, this model incorporated longitudinal retention variables such as number of visits and years followed to partially account for dropout and participation bias. Overall, the model explained approximately 49% of the variance in Memory scores, indicating that cognitive reserve, aging, metabolic health, diet, and retention behavior all contribute to cognitive outcomes in this dataset.

19 Missingness

# Percent missing for ALL variables in retention_master

missing_summary <-
  retention_master |>

  summarise(
    across(
      everything(),
      ~ mean(is.na(.x))
    )
  ) |>

  pivot_longer(
    everything(),
    names_to = "variable",
    values_to = "pct_missing"
  ) |>

  mutate(
    pct_missing =
      round(
        100 * pct_missing,
        2
      )
  ) |>

  arrange(desc(pct_missing))

missing_summary

20 GAM model

library(mgcv)

memory_gam <-
  gam(
    Memory ~

      sex +

      s(age) +

      s(CrystIQ) +

      s(hba1c) +

      s(insulin) +

      s(asa24_fibe) +

      n_visits +

      years_followed,

    data = retention_master,

    method = "REML"
  )

summary(memory_gam)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Memory ~ sex + s(age) + s(CrystIQ) + s(hba1c) + s(insulin) + 
##     s(asa24_fibe) + n_visits + years_followed
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -0.36067    0.06867  -5.252 1.93e-07 ***
## sexM           -0.49300    0.05151  -9.571  < 2e-16 ***
## n_visits        0.05030    0.02720   1.849   0.0648 .  
## years_followed -0.03312    0.02285  -1.450   0.1475    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df       F  p-value    
## s(age)        1.000  1.001 200.324  < 2e-16 ***
## s(CrystIQ)    2.626  3.355  81.978  < 2e-16 ***
## s(hba1c)      1.000  1.000   5.357   0.0209 *  
## s(insulin)    1.777  2.239   4.162   0.0133 *  
## s(asa24_fibe) 1.000  1.001  15.436 9.27e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.491   Deviance explained = 49.8%
## -REML = 879.81  Scale est. = 0.49949   n = 800

library(mgcv)
library(ggplot2)

# visualize GAM smooth terms

par(mfrow = c(2, 2))

plot(
  memory_gam,
  shade = TRUE,
  seWithMean = TRUE,
  pages = 1
)

par(mfrow = c(1, 1))

# residual diagnostics

par(mfrow = c(2, 2))

gam.check(memory_gam)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 12 iterations.
## Gradient range [-0.0001403381,0.0001573871]
## (score 879.8084 & scale 0.4994854).
## Hessian positive definite, eigenvalue range [2.220847e-05,395.5019].
## Model rank =  49 / 49 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                 k'  edf k-index p-value
## s(age)        9.00 1.00    1.04    0.90
## s(CrystIQ)    9.00 2.63    0.97    0.20
## s(hba1c)      9.00 1.00    1.01    0.58
## s(insulin)    9.00 1.78    1.03    0.76
## s(asa24_fibe) 9.00 1.00    1.01    0.61

par(mfrow = c(1, 1))

# nonlinear age effect alone

ggplot(
  retention_master,
  aes(
    x = age,
    y = Memory
  )
) +

  geom_point(alpha = 0.25) +

  geom_smooth(
    method = "gam",
    formula = y ~ s(x),
    color = "blue"
  ) +

  labs(
    title = "Nonlinear Age Effect on Memory",
    x = "Age",
    y = "Memory"
  ) +

  theme_minimal()

# nonlinear hba1c effect

ggplot(
  retention_master,
  aes(
    x = hba1c,
    y = Memory
  )
) +

  geom_point(alpha = 0.25) +

  geom_smooth(
    method = "gam",
    formula = y ~ s(x),
    color = "darkred"
  ) +

  labs(
    title = "Nonlinear HbA1c Effect on Memory",
    x = "HbA1c",
    y = "Memory"
  ) +

  theme_minimal()

# Crystallized IQ

ggplot(
  retention_master,
  aes(
    x = CrystIQ,
    y = Memory
  )
) +

  geom_point(alpha = 0.25) +

  geom_smooth(
    method = "gam",
    formula = y ~ s(x),
    se = TRUE
  ) +

  labs(
    title = "Nonlinear Effect of Crystallized IQ on Memory",
    x = "Crystallized IQ",
    y = "Memory"
  ) +

  theme_minimal()

# Insulin

ggplot(
  retention_master,
  aes(
    x = insulin,
    y = Memory
  )
) +

  geom_point(alpha = 0.25) +

  geom_smooth(
    method = "gam",
    formula = y ~ s(x),
    se = TRUE
  ) +

  labs(
    title = "Nonlinear Effect of Insulin on Memory",
    x = "Insulin",
    y = "Memory"
  ) +

  theme_minimal()

hist(retention_master$insulin)

hist(log10(retention_master$insulin))

# Dietary Fiber

ggplot(
  retention_master,
  aes(
    x = asa24_fibe,
    y = Memory
  )
) +

  geom_point(alpha = 0.25) +

  geom_smooth(
    method = "gam",
    formula = y ~ s(x),
    se = TRUE
  ) +

  labs(
    title = "Nonlinear Effect of Fiber Intake on Memory",
    x = "Dietary Fiber",
    y = "Memory"
  ) +

  theme_minimal()

hist(retention_master$asa24_fibe)

hist(log10(retention_master$asa24_fibe))

notice the log-transformed var are more normally distributed – indicating a multiplicative relationship

21 Adj Non-Normality

retention_master <-

  retention_master |>

  mutate(

    log_insulin =
      log1p(insulin),

    log_fiber =
      log1p(asa24_fibe)

  )

library(mgcv)

memory_gam_log <-

  gam(
    Memory ~

      sex +

      s(age) +

      s(CrystIQ) +

      s(hba1c) +

      s(log_insulin) +

      s(log_fiber) +

      n_visits +

      years_followed,

    data = retention_master,

    method = "REML"
  )

summary(memory_gam_log)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Memory ~ sex + s(age) + s(CrystIQ) + s(hba1c) + s(log_insulin) + 
##     s(log_fiber) + n_visits + years_followed
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -0.35988    0.06884  -5.228  2.2e-07 ***
## sexM           -0.48646    0.05151  -9.443  < 2e-16 ***
## n_visits        0.05034    0.02727   1.846   0.0652 .  
## years_followed -0.03389    0.02290  -1.480   0.1393    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf Ref.df       F p-value    
## s(age)         1.001  1.002 205.934 < 2e-16 ***
## s(CrystIQ)     2.653  3.388  78.419 < 2e-16 ***
## s(hba1c)       1.001  1.002   5.036 0.02505 *  
## s(log_insulin) 2.858  3.689   2.306 0.07696 .  
## s(log_fiber)   1.450  1.790   7.023 0.00116 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.49   Deviance explained = 49.7%
## -REML = 882.21  Scale est. = 0.50089   n = 800

# LOG INSULIN SPLINE

ggplot(
  retention_master,
  aes(
    x = log_insulin,
    y = Memory
  )
) +

  geom_point(alpha = 0.25) +

  geom_smooth(
    method = "gam",
    formula = y ~ s(x),
    se = TRUE
  ) +

  labs(
    title = "Spline Effect of Log(Insulin) on Memory",
    x = "log(1 + Insulin)",
    y = "Memory"
  ) +

  theme_minimal()

# LOG FIBER SPLINE

ggplot(
  retention_master,
  aes(
    x = log_fiber,
    y = Memory
  )
) +

  geom_point(alpha = 0.25) +

  geom_smooth(
    method = "gam",
    formula = y ~ s(x),
    se = TRUE
  ) +

  labs(
    title = "Spline Effect of Log(Fiber Intake) on Memory",
    x = "log(1 + Fiber Intake)",
    y = "Memory"
  ) +

  theme_minimal()

ggplot(
  retention_master,
  aes(
    x = log_insulin,
    y = Memory
  )
) +

  stat_summary_bin(
    fun = mean,
    bins = 25,
    geom = "point"
  ) +

  stat_summary_bin(
    fun = mean,
    bins = 25,
    geom = "line"
  ) +

  labs(
    title = "Binned Mean Memory by Log(Insulin)",
    x = "log(1 + Insulin)",
    y = "Mean Memory"
  ) +

  theme_minimal()

22 Cross fold validation

library(mgcv)
library(rsample)
library(purrr)
library(dplyr)
library(yardstick)

set.seed(141)

# 5-fold cross validation
folds <-
  vfold_cv(
    retention_master,
    v = 20
  )

cv_results <-
  folds |>

  mutate(

    model =
      map(

        splits,

        ~ gam(
          Memory ~

            sex +

            s(age) +

            s(CrystIQ) +

            s(hba1c) +

            insulin +

            asa24_fibe +

            n_visits +

            years_followed,

          data = analysis(.x),

          method = "REML"
        )

      ),

    preds =
      map2(

        model,
        splits,

        ~ {

          assess_data <- assessment(.y)

          assess_data |>

            mutate(
              pred =
                predict(
                  .x,
                  newdata = assess_data
                )
            )

        }

      ),

    metrics =
      map(

        preds,

        ~ metrics(
          .x,
          truth = Memory,
          estimate = pred
        )

      )

  )

# summarize CV performance

cv_summary <-
  cv_results |>

  dplyr::select(metrics) |>

  unnest(metrics)

cv_summary

cv_summary |>

  group_by(.metric) |>

  summarise(

    mean =
      mean(.estimate),

    sd =
      sd(.estimate)

  )

23 best fold

best_fold_index <-

  cv_results |>

  mutate(

    rsq =
      map_dbl(
        metrics,
        ~ filter(.x, .metric == "rsq")$.estimate
      )

  ) |>

  arrange(desc(rsq)) |>

  slice(1) |>

  pull(id)

best_fold_index

## [1] "Fold06"

best_model_df <-

  cv_results |>

  mutate(

    rsq =
      map_dbl(
        metrics,
        ~ filter(.x, .metric == "rsq")$.estimate
      )

  ) |>

  arrange(desc(rsq)) |>

  slice(1)

best_model <-
  best_model_df$model[[1]]

summary(best_model)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Memory ~ sex + s(age) + s(CrystIQ) + s(hba1c) + insulin + asa24_fibe + 
##     n_visits + years_followed
## 
## Parametric coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -0.623252   0.089640  -6.953 7.82e-12 ***
## sexM           -0.468244   0.053403  -8.768  < 2e-16 ***
## insulin         0.008033   0.003001   2.677 0.007603 ** 
## asa24_fibe      0.008122   0.002130   3.814 0.000148 ***
## n_visits        0.041051   0.028110   1.460 0.144609    
## years_followed -0.026041   0.023513  -1.107 0.268447    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df       F p-value    
## s(age)     1.005  1.010 182.146  <2e-16 ***
## s(CrystIQ) 2.690  3.437  73.504  <2e-16 ***
## s(hba1c)   1.001  1.001   5.263   0.022 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.483   Deviance explained =   49%
## -REML = 843.55  Scale est. = 0.50727   n = 757

par(mfrow = c(2, 2))

plot(
  best_model,
  shade = TRUE,
  pages = 1
)

par(mfrow = c(1, 1))

par(mfrow = c(2, 2))

gam.check(best_model)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 8 iterations.
## Gradient range [-0.0002795111,0.001470631]
## (score 843.5461 & scale 0.5072726).
## Hessian positive definite, eigenvalue range [9.771001e-05,374.0004].
## Model rank =  33 / 33 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##              k'  edf k-index p-value
## s(age)     9.00 1.00    1.04    0.80
## s(CrystIQ) 9.00 2.69    0.97    0.21
## s(hba1c)   9.00 1.00    0.98    0.30

par(mfrow = c(1, 1))

library(ggplot2)
library(patchwork)

# AGE

p_age <-

  ggplot(
    retention_master,
    aes(
      x = age,
      y = Memory
    )
  ) +

  geom_point(alpha = 0.25) +

  geom_smooth(
    method = "gam",
    formula = y ~ s(x),
    se = TRUE
  ) +

  labs(
    title = "Age",
    x = "Age",
    y = "Memory"
  ) +

  theme_minimal()

# CRYSTIQ

p_cryst <-

  ggplot(
    retention_master,
    aes(
      x = CrystIQ,
      y = Memory
    )
  ) +

  geom_point(alpha = 0.25) +

  geom_smooth(
    method = "gam",
    formula = y ~ s(x),
    se = TRUE
  ) +

  labs(
    title = "Crystallized IQ",
    x = "Crystallized IQ",
    y = "Memory"
  ) +

  theme_minimal()

# HBA1C

p_hba1c <-

  ggplot(
    retention_master,
    aes(
      x = hba1c,
      y = Memory
    )
  ) +

  geom_point(alpha = 0.25) +

  geom_smooth(
    method = "gam",
    formula = y ~ s(x),
    se = TRUE
  ) +

  labs(
    title = "HbA1c",
    x = "HbA1c",
    y = "Memory"
  ) +

  theme_minimal()

# INSULIN

p_insulin <-

  ggplot(
    retention_master,
    aes(
      x = insulin,
      y = Memory
    )
  ) +

  geom_point(alpha = 0.25) +

  geom_smooth(
    method = "gam",
    formula = y ~ s(x),
    se = TRUE
  ) +

  labs(
    title = "Insulin",
    x = "Insulin",
    y = "Memory"
  ) +

  theme_minimal()

# FIBER

p_fiber <-

  ggplot(
    retention_master,
    aes(
      x = asa24_fibe,
      y = Memory
    )
  ) +

  geom_point(alpha = 0.25) +

  geom_smooth(
    method = "gam",
    formula = y ~ s(x),
    se = TRUE
  ) +

  labs(
    title = "Dietary Fiber",
    x = "Fiber Intake",
    y = "Memory"
  ) +

  theme_minimal()

# COMBINE ALL PLOTS

(p_age | p_cryst) /
(p_hba1c | p_insulin) /
p_fiber

The generalized additive model (GAM) demonstrated good statistical performance and adequate model diagnostics, with successful convergence, stable optimization, and no evidence that spline complexity was insufficient. The spline basis checks suggested that the selected smoothness parameters appropriately captured nonlinear structure without overfitting.

Age showed a strong negative association with memory performance, with the estimated degrees of freedom (edf ≈ 1) indicating an approximately linear decline in memory with increasing age. Crystallized IQ demonstrated a significant nonlinear relationship with memory (edf ≈ 2.7), suggesting that the protective cognitive effects of crystallized intelligence vary across its range rather than following a simple linear trend. HbA1c also showed a significant association with memory, with higher levels linked to poorer cognitive performance, supporting a relationship between glucose dysregulation and cognition.

Among the linear predictors, male sex was associated with lower memory scores, while higher insulin levels and greater dietary fiber intake were associated with better memory performance after adjustment for demographic and metabolic covariates. Number of study visits and years followed were not statistically significant but were retained to partially account for longitudinal retention effects.

Overall, the model explained approximately 49% of the variance in memory scores (adjusted R² = 0.483), indicating substantial explanatory power for observational cognitive aging data. These findings support the broader AABC study hypothesis that modifiable metabolic and dietary factors may contribute to cognitive aging trajectories and resilience in brain health.

24 longitudinal model

“Is there a relationship between changes in diet and changes in cognition over time?”

library(dplyr)
library(stringr)
library(lme4)
library(lmerTest)

# ---------------------------------
# Build longitudinal merged dataset
# ---------------------------------

longitudinal_df <-

  all |>

  filter(age_open != "90 or older") |>

  mutate(

    patient =
      str_remove(id, "^[A-Z]+"),

    age =
      as.numeric(age_open),

    Memory =
      as.numeric(Memory_Tr35_60y)

  ) |>

  dplyr::select(
    patient,
    id_event,
    event,
    study,
    age,
    sex,
    Memory
  ) |>

  left_join(

    blood |>

      mutate(
        patient =
          str_remove(id, "^[A-Z]+")
      ) |>

      dplyr::select(
        patient,
        id_event,
        hba1c,
        insulin,
        glucose
      ),

    by = c("patient", "id_event")

  ) |>

  left_join(

    diet |>

      mutate(
        patient =
          str_remove(id, "^[A-Z]+")
      ) |>

      dplyr::select(
        patient,
        id_event,
        asa24_fibe,
        asa24_sugr,
        asa24_kcal
      ),

    by = c("patient", "id_event")

  )

# ---------------------------------
# Keep rows with key variables
# ---------------------------------

model_df <-

  longitudinal_df |>

  filter(

    !is.na(Memory),

    !is.na(age),

    !is.na(hba1c),

    !is.na(insulin),

    !is.na(asa24_fibe)

  )

24.1 Patient Loss

all |> mutate(
  patient = str_remove(id, "^[A-Z]+")) |> distinct(patient) |> nrow()

## [1] 1396

longitudinal_df |> distinct(patient) |> nrow()

## [1] 1350

model_df |> distinct(patient) |> nrow()

## [1] 692

model_df$event |> unique()

## [1] V3 V2 V4 V1
## Levels: V1 < F1 < V2 < F2 < CR < V3 < F3 < V4 < AF1

24.2 Patient Loss Paths

# -----------------------------------------
# Flexible retention plot function
# Works for:
#   all
#   longitudinal_df
#   model_df
# -----------------------------------------

library(dplyr)
library(stringr)
library(ggplot2)
library(grid)

make_retention_plot <- function(df, title_text) {

  # -----------------------------------------
  # Build age variable if needed
  # -----------------------------------------

  if(!"age_open" %in% colnames(df)) {

    if("age" %in% colnames(df)) {

      df <- df |>
        mutate(
          age_open = age
        )

    } else {

      stop("No age or age_open column found.")

    }

  }

  # -----------------------------------------
  # Build patient variable if needed
  # -----------------------------------------

  if(!"patient" %in% colnames(df)) {

    if("id" %in% colnames(df)) {

      df <- df |>
        mutate(
          patient =
            str_remove(id, "^[A-Z]+")
        )

    } else {

      stop("No patient or id column found.")

    }

  }

  # -----------------------------------------
  # Event ordering
  # -----------------------------------------

  study_event_levels <-
    c(
      "HCA:V1","HCA:F1","HCA:V2","HCA:F2",
      "HCA:CR","HCA:V3","HCA:F3","HCA:V4","HCA:AF1",
      "AABC:V1","AABC:F1","AABC:V2","AABC:F2",
      "AABC:CR","AABC:V3","AABC:F3","AABC:V4","AABC:AF1"
    )

  # -----------------------------------------
  # Build longitudinal structure
  # -----------------------------------------

  retention_paths <-

    df |>

    filter(!is.na(age_open)) |>

    mutate(

      age_open =
        suppressWarnings(as.numeric(age_open)),

      study_event =
        factor(
          paste(study, event, sep = ":"),
          levels = study_event_levels,
          ordered = TRUE
        )

    ) |>

    arrange(
      patient,
      age_open
    ) |>

    group_by(patient) |>

    mutate(

      visit_order =
        row_number(),

      next_visit =
        lead(visit_order),

      next_study_event =
        lead(study_event)

    ) |>

    ungroup()

  # -----------------------------------------
  # Node counts
  # -----------------------------------------

  node_counts <-

    retention_paths |>

    count(
      visit_order,
      study_event,
      name = "n_patients"
    )

  # -----------------------------------------
  # Edge counts
  # -----------------------------------------

  edge_counts <-

    retention_paths |>

    filter(!is.na(next_study_event)) |>

    count(
      visit_order,
      next_visit,
      study_event,
      next_study_event,
      name = "n_patients"
    )

  # -----------------------------------------
  # Dropoff counts
  # -----------------------------------------

  dropoff_counts <-

    retention_paths |>

    group_by(patient) |>

    summarise(
      final_visit =
        max(visit_order),
      .groups = "drop"
    ) |>

    count(final_visit)

  # -----------------------------------------
  # Plot
  # -----------------------------------------

  ggplot() +

    geom_segment(

      data = edge_counts,

      aes(
        x = visit_order,
        xend = next_visit,

        y = study_event,
        yend = next_study_event,

        linewidth = n_patients,
        color = n_patients
      ),

      alpha = 0.5,

      arrow =
        arrow(length = unit(0.10, "cm"))

    ) +

    geom_point(

      data = node_counts,

      aes(
        x = visit_order,
        y = study_event,
        size = n_patients
      ),

      color = "black",
      alpha = 0.85

    ) +

    geom_text(

      data = dropoff_counts,

      aes(
        x = final_visit,
        y = 1,
        label = paste0(
          "dropoff=", n
        )
      ),

      size = 3,
      vjust = -1

    ) +

    scale_x_continuous(
      breaks = 1:9
    ) +

    scale_size(
      range = c(2, 12)
    ) +

    scale_linewidth(
      range = c(0.3, 4)
    ) +

    scale_color_gradient(
      low = "lightblue",
      high = "darkred"
    ) +

    labs(

      title = title_text,

      subtitle =
        "Node size = remaining participants | edge thickness = transitions",

      x =
        "Visit Order",

      y =
        "Study : Event",

      size =
        "Patients",

      linewidth =
        "Transitions",

      color =
        "Transitions"

    ) +

    theme_minimal()

}

# -----------------------------------------
# Generate retention plots
# -----------------------------------------

retention_all <-

  make_retention_plot(
    all,
    "Retention Plot: ALL dataset"
  )

retention_longitudinal <-

  make_retention_plot(
    longitudinal_df,
    "Retention Plot: longitudinal_df"
  )

retention_model <-

  make_retention_plot(
    model_df,
    "Retention Plot: model_df"
  )

# -----------------------------------------
# Print plots
# -----------------------------------------

retention_all

retention_longitudinal

retention_model

24.3 memory_lmm

# ---------------------------------
# Longitudinal mixed-effects model
# ---------------------------------

memory_lmm <-

  lmer(

    Memory ~

      age +

      sex +

      hba1c +

      insulin +

      asa24_fibe +

      (1 | patient),

    data = model_df

  )

summary(memory_lmm)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Memory ~ age + sex + hba1c + insulin + asa24_fibe + (1 | patient)
##    Data: model_df
## 
## REML criterion at convergence: 2057.1
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.36665 -0.31327 -0.00039  0.32416  2.16056 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  patient  (Intercept) 0.6950   0.8337  
##  Residual             0.1612   0.4015  
## Number of obs: 794, groups:  patient, 692
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)  2.035e+00  2.922e-01  7.867e+02   6.963 7.01e-12 ***
## age         -3.187e-02  2.557e-03  7.169e+02 -12.463  < 2e-16 ***
## sexM        -4.400e-01  7.167e-02  6.993e+02  -6.139 1.39e-09 ***
## hba1c       -9.738e-02  4.558e-02  7.680e+02  -2.136    0.033 *  
## insulin     -2.633e-04  3.254e-03  6.960e+02  -0.081    0.936    
## asa24_fibe   1.058e-02  2.392e-03  6.611e+02   4.424 1.13e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##            (Intr) age    sexM   hba1c  insuln
## age        -0.446                            
## sexM        0.003 -0.054                     
## hba1c      -0.784 -0.142 -0.059              
## insulin    -0.029  0.117  0.011 -0.203       
## asa24_fibe -0.239  0.066 -0.152  0.049  0.055

24.4 Diagnostics

par(mfrow = c(2,2))

plot(memory_lmm)

qqnorm(residuals(memory_lmm))
qqline(residuals(memory_lmm))

hist(residuals(memory_lmm))

par(mfrow = c(1,1))

icc <-

  0.695 / (0.695 + 0.161)

icc

## [1] 0.8119159

repeated measures within a person are much more similar than measurements across different people.

memory_lmm_no_corr <-
  lmer(
    Memory ~
      age +
      sex +
      hba1c +
      asa24_fibe +
      (1 | patient),
    data = model_df
  )


summary(memory_lmm_no_corr)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Memory ~ age + sex + hba1c + asa24_fibe + (1 | patient)
##    Data: model_df
## 
## REML criterion at convergence: 2047.4
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.36946 -0.31390  0.00029  0.32360  2.16804 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  patient  (Intercept) 0.6944   0.8333  
##  Residual             0.1609   0.4011  
## Number of obs: 794, groups:  patient, 692
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)   2.034170   0.291900 787.831518   6.969 6.76e-12 ***
## age          -0.031841   0.002538 712.160475 -12.547  < 2e-16 ***
## sexM         -0.439949   0.071627 699.893234  -6.142 1.36e-09 ***
## hba1c        -0.098134   0.044603 767.634375  -2.200   0.0281 *  
## asa24_fibe    0.010591   0.002387 663.369856   4.437 1.07e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##            (Intr) age    sexM   hba1c 
## age        -0.446                     
## sexM        0.003 -0.056              
## hba1c      -0.807 -0.122 -0.058       
## asa24_fibe -0.238  0.060 -0.153  0.061

par(mfrow = c(2,2))

plot(memory_lmm_no_corr)

qqnorm(residuals(memory_lmm_no_corr))
qqline(residuals(memory_lmm_no_corr))

hist(residuals(memory_lmm_no_corr))

par(mfrow = c(1,1))

based on the plot we are systematically missing structure

anova(memory_lmm, memory_lmm_no_corr)

The likelihood-ratio test showed that adding insulin did not improve the longitudinal mixed-effects model (p = 0.94), meaning the model with insulin performed essentially identically to the simpler model without it. Because the simpler model also had a slightly lower AIC, it is preferred for interpretation and inference. This suggests that insulin’s earlier association with Memory was likely driven by between-person differences or confounding rather than a stable within-person longitudinal relationship. In contrast, age, HbA1c, and dietary fiber remained meaningful predictors after accounting for repeated observations within participants, indicating that cognitive aging, glucose regulation, and diet are the strongest longitudinal signals associated with Memory performance in this cohort.

25 GAM Longitudinal

model_df <-

  longitudinal_df |>

  left_join(

    baseline_features |>

      dplyr::select(
        patient,
        CrystIQ
      ),

    by = "patient"

  ) |>

  filter(

    !is.na(Memory),

    !is.na(age),

    !is.na(CrystIQ),

    !is.na(hba1c),

    !is.na(asa24_fibe)

  )

library(mgcv)

memory_gamm <-

  gamm(

    Memory ~

      sex +

      s(age) +

      s(CrystIQ) +

      s(hba1c) +

      s(asa24_fibe),

    random =
      list(
        patient = ~1
      ),

    data = model_df,

    method = "REML"

  )

summary(memory_gamm$gam)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Memory ~ sex + s(age) + s(CrystIQ) + s(hba1c) + s(asa24_fibe)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.31761    0.03991  -7.959 6.05e-15 ***
## sexM        -0.45675    0.06190  -7.378 4.08e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df      F p-value    
## s(age)        3.735  3.735 52.561  <2e-16 ***
## s(CrystIQ)    2.576  2.576 84.693  <2e-16 ***
## s(hba1c)      1.000  1.000  3.168  0.0755 .  
## s(asa24_fibe) 2.543  2.543  6.086  0.0027 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.442   
##   Scale est. = 0.15491   n = 799

plot(memory_gamm$gam, pages = 1)

gam.check(memory_gamm$gam)

## 
## 'gamm' based fit - care required with interpretation.
## Checks based on working residuals may be misleading.
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                 k'  edf k-index p-value    
## s(age)        9.00 3.74    1.04    0.92    
## s(CrystIQ)    9.00 2.58    0.89  <2e-16 ***
## s(hba1c)      9.00 1.00    0.96    0.10 .  
## s(asa24_fibe) 9.00 2.54    0.94    0.03 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

AIC(memory_lmm)

## [1] 2073.055

AIC(memory_gamm$lme)

## [1] 1875.152

library(gratia)
library(ggplot2)

draw(
  memory_gamm$gam,
  residuals = TRUE
)

memory_gamm_2d <-

  gamm(

    Memory ~

      sex +

      s(CrystIQ) +

      te(age, hba1c) +

      s(asa24_fibe),

    random =
      list(patient = ~1),

    data = model_df,

    method = "REML"
  )

 vis.gam(
  memory_gamm_2d$gam,
  view = c("age", "hba1c"),
  plot.type = "contour",
  color = "terrain"
)

vis.gam(
  memory_gamm_2d$gam,
  view = c("age", "hba1c"),
  theta = 35,
  phi = 30
)

library(mgcv)
library(plotly)
library(dplyr)

# prediction grid

surface_df <-

  expand.grid(

    age =
      seq(
        min(model_df$age),
        max(model_df$age),
        length.out = 60
      ),

    hba1c =
      seq(
        min(model_df$hba1c),
        max(model_df$hba1c),
        length.out = 60
      ),

    CrystIQ =
      mean(model_df$CrystIQ, na.rm = TRUE),

    asa24_fibe =
      mean(model_df$asa24_fibe, na.rm = TRUE),

    sex = "F"
  )

# predicted values

surface_df$pred <-

  predict(
    memory_gamm_2d$gam,
    newdata = surface_df
  )

# reshape into matrix

z_matrix <-

  matrix(
    surface_df$pred,
    nrow = 60,
    ncol = 60
  )

# interactive surface

plot_ly(

  x =
    unique(surface_df$age),

  y =
    unique(surface_df$hba1c),

  z =
    z_matrix,

  type = "surface"

) |>

  layout(

    title =
      "Interactive GAM Surface: Age × HbA1c on Memory",

    scene = list(

      xaxis =
        list(title = "Age"),

      yaxis =
        list(title = "HbA1c"),

      zaxis =
        list(title = "Predicted Memory")

    )

  )

26 Pred. Surface Split by Sex

library(mgcv)
library(plotly)
library(dplyr)

# -----------------------------
# Prediction grid by sex
# -----------------------------

surface_df <-

  expand.grid(

    age =
      seq(
        min(model_df$age),
        max(model_df$age),
        length.out = 60
      ),

    hba1c =
      seq(
        min(model_df$hba1c),
        max(model_df$hba1c),
        length.out = 60
      ),

    sex = c("F", "M"),

    CrystIQ =
      mean(model_df$CrystIQ, na.rm = TRUE),

    asa24_fibe =
      mean(model_df$asa24_fibe, na.rm = TRUE)

  )

# -----------------------------
# Predictions
# -----------------------------

surface_df$pred <-

  predict(
    memory_gamm_2d$gam,
    newdata = surface_df
  )

# -----------------------------
# Split surfaces
# -----------------------------

female_df <-
  filter(surface_df, sex == "F")

male_df <-
  filter(surface_df, sex == "M")

z_f <-

  matrix(
    female_df$pred,
    nrow = 60,
    ncol = 60
  )

z_m <-

  matrix(
    male_df$pred,
    nrow = 60,
    ncol = 60
  )

# -----------------------------
# Interactive subplot surfaces
# -----------------------------

p_f <-

  plot_ly(

    x = unique(female_df$age),

    y = unique(female_df$hba1c),

    z = z_f,

    type = "surface"

  ) |>

  layout(

    title = "Females",

    scene = list(

      xaxis = list(title = "Age"),

      yaxis = list(title = "HbA1c"),

      zaxis = list(title = "Predicted Memory")

    )

  )

p_m <-

  plot_ly(

    x = unique(male_df$age),

    y = unique(male_df$hba1c),

    z = z_m,

    type = "surface"

  ) |>

  layout(

    title = "Males",

    scene = list(

      xaxis = list(title = "Age"),

      yaxis = list(title = "HbA1c"),

      zaxis = list(title = "Predicted Memory")

    )

  )

subplot(
  p_f,
  p_m,
  margin = 0.05
)

27 GAM Longitudinal + Drop-off

model_df <-

  longitudinal_df |>

  left_join(

    baseline_features |>

      dplyr::select(
        patient,
        CrystIQ
      ),

    by = "patient"

  ) |>

  left_join(

    retention_df |>

      dplyr::select(
        patient,
        n_visits,
        years_followed,
        completed_f3,
        completed_v4
      ),

    by = "patient"

  ) |>

  filter(

    !is.na(Memory),

    !is.na(age),

    !is.na(CrystIQ),

    !is.na(hba1c),

    !is.na(asa24_fibe)

  )

memory_gamm_dropout <-

  gamm(

    Memory ~

      sex +

      s(age) +

      s(CrystIQ) +

      s(hba1c) +

      s(asa24_fibe) +

      n_visits +

      years_followed +

      completed_f3 +

      completed_v4,

    random =
      list(
        patient = ~1
      ),

    data = model_df,

    method = "REML"

  )

summary(memory_gamm_dropout$gam)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Memory ~ sex + s(age) + s(CrystIQ) + s(hba1c) + s(asa24_fibe) + 
##     n_visits + years_followed + completed_f3 + completed_v4
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.55784    0.08846  -6.306 4.77e-10 ***
## sexM             -0.44945    0.06217  -7.229 1.16e-12 ***
## n_visits          0.13772    0.03477   3.961 8.14e-05 ***
## years_followed   -0.08107    0.02808  -2.887   0.0040 ** 
## completed_f3TRUE -0.09440    0.07721  -1.223   0.2218    
## completed_v4TRUE -0.19848    0.10794  -1.839   0.0663 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df      F p-value    
## s(age)        3.812  3.812 55.064 < 2e-16 ***
## s(CrystIQ)    2.589  2.589 72.003 < 2e-16 ***
## s(hba1c)      1.000  1.000  2.611 0.10652    
## s(asa24_fibe) 2.446  2.446  5.668 0.00405 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.453   
##   Scale est. = 0.15541   n = 799

28 AIC comparison

# ---------------------------------
# Compare ALL cross-sectional
# and longitudinal models by AIC
# ---------------------------------

library(dplyr)

model_comparison <-

  tibble(

    model = c(

      # Cross-sectional models
      "Cross-sectional LM",
      "Cross-sectional GAM",
      "Cross-sectional GAM (log transformed)",

      # Longitudinal models
      "Longitudinal LMM",
      "Longitudinal LMM (no insulin)",
      "Longitudinal GAMM",
      "Longitudinal GAMM + dropout"

    ),

    AIC = c(

      # Cross-sectional
      AIC(step_model),

      AIC(memory_gam),

      AIC(memory_gam_log),

      # Longitudinal
      AIC(memory_lmm),

      AIC(memory_lmm_no_corr),

      AIC(memory_gamm$lme),

      AIC(memory_gamm_dropout$lme)

    )

  ) |>

  arrange(AIC)

model_comparison

library(ggplot2)

ggplot(
  model_comparison,
  aes(
    x = reorder(model, AIC),
    y = AIC
  )
) +

  geom_col() +

  coord_flip() +

  labs(
    title = "Model Comparison by AIC",
    x = "",
    y = "AIC (lower is better)"
  ) +

  theme_minimal()

29 Overall Report

The AABC/HCA analyses focused on whether modifiable lifestyle and metabolic factors are associated with cognitive aging, particularly Memory performance. The strongest and most consistent findings across models were:

Older age was associated with lower Memory scores.
Higher HbA1c (poorer glucose regulation) was associated with worse cognition.
Higher dietary fiber intake was associated with better Memory performance.
Crystallized IQ showed a strong protective relationship with cognition.
Several relationships were nonlinear, meaning GAM/GAMM models captured the data substantially better than standard linear models.

Cross-sectional GAM models produced the best overall fit (AIC ≈ 1731), explaining nearly 49% of the variance in Memory scores. Longitudinal mixed-effects models confirmed that repeated measurements within individuals were highly correlated (ICC ≈ 0.81), supporting the importance of participant-specific longitudinal modeling.

Missingness Report

Isaiah C. Mireles

2026-05-17