1 Read Data

2 Order of Events

2.1 event, study pairs

all |> dplyr::distinct(event, study)

notice some peoples first visit ( V1 ) may be in HCA or, AABC – same being true for V2
- suggesting people joined study at different time periods

all <- all |>
  dplyr::mutate(
    event = factor(
      event,
      levels = c(
        "V1", "F1",
        "V2", "F2",
        "CR",
        "V3", "F3",
        "V4",
        "AF1"
      ),
      ordered = TRUE
    ),

    study = factor(
      study,
      levels = c("HCA", "AABC"),
      ordered = TRUE
    )
  )

patient_paths <-
  all |>
  filter(age_open != "90 or older") |>
  mutate(age_open = as.numeric(age_open)) |> 
  mutate(
    patient = str_remove(id, "^[A-Z]+")
  ) |>
  distinct(patient, study, event, age_open) |>
  arrange(patient, study, event) |>
  group_by(patient) |>
  arrange(study, event) |> 
  summarise(
    n_studies = n_distinct(study),
    n_events = n(),

    years_passed = if_else(
      n() > 1,
      max(age_open, na.rm = TRUE) - min(age_open, na.rm = TRUE),
      NA_real_
    ),

    study_event_path = paste(
      paste(study, event, sep = ":"),
      collapse = " → "
    ),

    .groups = "drop"
  ) |>
  select(patient, years_passed, study_event_path, years_passed, everything()) |>
  arrange(desc(years_passed))
patient_paths

plot(patient_paths$years_passed, patient_paths$n_events, xlab = "Yrs Passed", ylab = "n_events")

3 Path Diagram

patient_paths

library(tidyverse)

patient_steps <-
  patient_paths |>
  separate_rows(study_event_path, sep = " → ") |>
  separate(
    study_event_path,
    into = c("study", "event"),
    sep = ":"
  ) |>
  mutate(
    study = factor(
      study,
      levels = c("HCA", "AABC"),
      ordered = TRUE
    ),
    event = factor(
      event,
      levels = c(
        "V1", "F1",
        "V2", "F2",
        "CR",
        "V3", "F3",
        "V4",
        "AF1"
      ),
      ordered = TRUE
    )
  ) |>
  arrange(patient, study, event) |>
  group_by(patient) |>
  mutate(step = row_number()) |>
  ungroup()

patient_steps <-
  patient_steps |>
  mutate(
    study_event = interaction(
      study,
      event,
      sep = ":",
      lex.order = TRUE
    )
  )

patient_edges <-
  patient_steps |>
  arrange(patient, step) |>
  group_by(patient) |>
  mutate(
    next_study_event = lead(study_event),
    next_step = lead(step)
  ) |>
  filter(!is.na(next_study_event)) |>
  ungroup()

transition_counts <-
  patient_edges |>
  count(
    step,
    next_step,
    study_event,
    next_study_event,
    name = "n_patients"
  )

library(grid)

max_step <- max(patient_steps$step)

path_plot <-
  ggplot(transition_counts) +
  geom_segment(
    aes(
      x = step,
      xend = next_step,
      y = study_event,
      yend = next_study_event,
      color = n_patients,
      linewidth = n_patients
    ),
    arrow = arrow(length = unit(0.15, "cm")),
    alpha = 0.8
  ) +
  geom_point(
    data = patient_steps |> distinct(step, study_event),
    aes(x = step, y = study_event),
    size = 3
  ) +
  scale_x_continuous(
    breaks = seq(1, max_step, by = 1),
    limits = c(1, max_step)
  ) +
  scale_linewidth(range = c(0.5, 3)) +
  scale_color_gradient(
    low = "mistyrose",
    high = "red4"
  ) +
  labs(
    x = "Sequence Step",
    y = "Study:Event",
    color = "Number of Patients",
    linewidth = "Number of Patients",
    title = "Patient Study/Event Paths"
  ) +
  theme_minimal()

path_plot

4 Missingness Relevance

A great deal of data is missing ( NA ) – therefore our data is heavily biased. We would like to determine in what way its biased. Perhaps by adjusting for this bias we see strong associations for each variable to the cognitive markers in studying dimentia.

4.1 Blood Data

library(dplyr)
library(naniar)
blood |> 
  miss_var_summary() |>
  as.data.frame() |>
  select(variable, pct_miss)

direct_ldl is entirely missing, we may discard this feature
Many repeated values

4.2 Diet Data

diet |> 
  miss_var_summary() |>
  as.data.frame() |>
  select(variable, pct_miss)

interpretation :

Notice the only features without missingness
Notice the large amount of missingness is the

Participant info/Demographics
- data_category
Notice that the missingness appears to repeat – showing runs of the same pct_miss for both features

4.3 Ct and Graph Repeated Missingness

4.4 Diet Data

ct <-
  diet |> 
  miss_var_summary() |> 
  mutate(
    ct = ifelse(pct_miss == 0, 0, 1)
    ) |> count(ct) 
ct

Interpretation :

There are only 4 non-missing values every other value is just 84.6 percent missing – which is a HUGE amt.

4.5 Blood Data

ct <- 
  blood |> 
  miss_var_summary() 

library(ggplot2)
ct |> 
  ggplot(aes(pct_miss)) + 
  geom_histogram(binwidth = 10)

interpretation

this one is slightly more varied
0, about 60, 90 100% percent missing – with 60, 90 happening quite frequently

ct <- ct |>
  mutate(
    char = format(pct_miss)
  )

ct <- as.vector(ct$pct_miss)/100
ct <- ct |> round(1) 
ct <- ct |> as.character()
barplot(table(ct))

We can see that the repeated values are near 0 (id features), 60%, 90% or 100%

5 Build Features

5.1 `patient` study & events ct

Purpose : identify how often a patients appear in our data.

For example :

Are they in both studies?
How many unique events?

5.1.1 Helper Func : Create `unique_id`, `Patient` Variable

library(dplyr)

diet <-
  diet |>
  mutate(
    patient = str_remove(id, "^[A-Z]+") 
  ) |> select(patient, everything())

blood <-
  blood |>
  mutate(
    patient = str_remove(id, "^[A-Z]+") 
  ) |> select(patient, everything())

6 `patient` specific missing ct

Purpose : identify how often patients appear in our data to better understand missing values.

6.1 Diet

diet_patients <-
  diet |>
  group_by(patient) |>
  summarize(
    n_rows = n(),
    total_missing =
      sum(across(everything(), ~ sum(is.na(.x)))),
    .groups = "drop"
  ) |>
  mutate(
    pct =
      round(
        total_missing /
          (n_rows * (ncol(diet) - 1)),
        2
      )
  ) |>
  arrange(desc(pct))

diet_patients <-
  diet_patients |>  mutate(
    p_rank = paste0("patient", row_number())
  ) |> select(p_rank, everything(), patient)

plot(diet_patients$n_rows, diet_patients$total_missing)

6.2 blood

blood_patients <-
  blood |>
  group_by(patient) |>
  summarize(
    n_rows = n(),
    total_missing =
      sum(across(everything(), ~ sum(is.na(.x)))),
    total_cells =
      n_rows * ncol(pick(everything())),
    pct =
      round(total_missing / total_cells, 2),
    .groups = "drop"
  ) |>
  arrange(desc(pct))

blood_patients <-
  blood_patients |>  mutate(
    p_rank = paste0("patient", row_number())
  ) |> select(p_rank, everything(), patient)

plot(blood_patients$n_rows,blood_patients$total_missing)

7 Correlations of `Cognitive` Features – ALL

Note :

we have repeated observations, so we are taking people who had more visits more into account
Relevant factor analysis Features^
- Memory_Tr35_60y,
  - Cognition Factor Analysis: Memory factor estimated in Union V1 35-60 yo subjects and applied to all subjects all visits
- FluidIQ_Tr35_60y,
  - Cognition Factor Analysis: Fluid IQ factor estimated in Union V1 35-60 yo subjects and applied to all subjects all visits
- CrystIQ_Tr35_60y
  - Cognition Factor Analysis: Crystallized IQ factor estimated in Union V1 35-60 yo subjects and applied to all subjects all visits

Memory <- all$Memory_Tr35_60y |> as.numeric()
fluidIQ <- all$FluidIQ_Tr35_60y |> as.numeric()
CrystIQ <- all$CrystIQ_Tr35_60y |> as.numeric()

7.1 `Memory`, `fluidIQ` dot-plt

# Memory ~ fluidIQ
Memory_fluidIQ_mdl <- lm(Memory ~ fluidIQ)

plot(Memory, fluidIQ)
abline(Memory_fluidIQ_mdl, col = "red")

7.2 `Memory`, `CrystIQ` dot-plt

# Memory ~ CrystIQ
Memory_CrystIQ_mdl <- lm(Memory ~ CrystIQ)

plot(Memory, CrystIQ)
abline(Memory_CrystIQ_mdl, col = "red")

7.3 `fluidIQ`, `CrystIQ` dot-plt

# fluidIQ ~ CrystIQ
fluidIQ_CrystIQ_mdl <- lm(fluidIQ ~ CrystIQ)

plot(fluidIQ, CrystIQ)
abline(fluidIQ_CrystIQ_mdl, col = "red")

diet_num <- diet |> select(where(is.numeric))
blood_num <- blood |> select(where(is.numeric))

diet_num <- cbind(Memory, fluidIQ, CrystIQ, diet_num)
blood_num <- cbind(Memory, fluidIQ, CrystIQ, blood_num)

8 Diet Correlations w/ `Memory`

mem_cor <-
  cor(diet_num$Memory, diet_num[4:length(diet_num)],
      use = "complete.obs")


mem_cor <-
  mem_cor |> 
  t() |> as.data.frame()

mem_cor <- 
  mem_cor |> 
  rename(Memory = V1) |> 
  round(3) |>
  select(Memory)

mem_cor

9 Diet Correlations w/ `fluidIQ`

fluid_cor <-
  cor(diet_num$fluidIQ, diet_num[4:length(diet_num)],
      use = "complete.obs")


fluid_cor <-
  fluid_cor |> 
  t() |> as.data.frame()

fluid_cor |> 
  rename(fluidIQ = V1) |> 
  round(3) |>
  select(fluidIQ)

10 Diet Correlations w/ `CrystIQ`

cryst_cor <-
  cor(diet_num$CrystIQ, diet_num[4:length(diet_num)],
      use = "complete.obs")


cryst_cor <-
  cryst_cor |> 
  t() |> as.data.frame()

cryst_cor <-
  cryst_cor |> 
  rename(CrystIQ = V1) |> 
  round(3) |> select(CrystIQ) 


cryst_cor

Overall interpretation

Extremely weak correlations for all variables in Diet Data
- Diet Data doesnt appear strongly related

Possible Causes

underlying relationship isnt linear
Missing values may be biasing variables

diet_cor_lst <-
  list(
  cryst_cor=cryst_cor,
  fluid_cor=fluid_cor,
  mem_cor=mem_cor
)

11 Patient level analysis

blood |>
  group_by(patient) |> 
  select(patient, where(is.numeric)) |> 
  summarize(
    across(everything(), ~ mean(.x, na.rm = TRUE))
  )

12 Patient Correlations of `Cognitive` Features

here we generate a report of correlations for each individual patient
Suppose we just average out their values across all Study and event

diet_patients

13 Multivariate Normal – Cognitive Features

cognitive_df <- 
  all |> 
  select(id_event, id, event, study,
         Memory_Tr35_60y, 
         FluidIQ_Tr35_60y, 
         CrystIQ_Tr35_60y) |>
  mutate(
    patient = str_remove(id, "^[A-Z]+") 
  ) |> select(patient, everything()) |>
  mutate(
    across(
      -c(patient, id_event, id, event, study),
      as.numeric
    )
  )

par(mfrow = c(1, 3))

hist(
  cognitive_df$Memory_Tr35_60y,
  main = "Memory",
  xlab = ""
)

hist(
  cognitive_df$FluidIQ_Tr35_60y,
  main = "FluidIQ",
  xlab = ""
)

hist(
  cognitive_df$CrystIQ_Tr35_60y,
  main = "CrystIQ",
  xlab = ""
)

notice each is normally distributed

cognitive_df |>
  select(where(is.numeric)) |>
  summarize(
    mean_Memory = mean(Memory_Tr35_60y, na.rm = TRUE), 
    sd_Memory = sd(Memory_Tr35_60y, na.rm = TRUE), 
    mean_FluidIQ_y = mean(FluidIQ_Tr35_60y, na.rm = TRUE),
    sd_FluidIQ = sd(FluidIQ_Tr35_60y, na.rm = TRUE), 
    mean_CrystIQ = mean(CrystIQ_Tr35_60y, na.rm = TRUE),
    sd_CrystIQ = sd(CrystIQ_Tr35_60y, na.rm = TRUE)
  )

Missingness Report

Isaiah C. Mireles

2026-05-17

1 Read Data

2 Order of Events

2.1 event, study pairs

3 Path Diagram

4 Missingness Relevance

4.1 Blood Data

4.2 Diet Data

4.3 Ct and Graph Repeated Missingness

4.4 Diet Data

4.5 Blood Data

5 Build Features

5.1 `patient` study & events ct

5.1.1 Helper Func : Create `unique_id`, `Patient` Variable

6 `patient` specific missing ct

6.1 Diet

6.2 blood

7 Correlations of `Cognitive` Features – ALL

7.1 `Memory`, `fluidIQ` dot-plt

7.2 `Memory`, `CrystIQ` dot-plt

7.3 `fluidIQ`, `CrystIQ` dot-plt

8 Diet Correlations w/ `Memory`

9 Diet Correlations w/ `fluidIQ`

10 Diet Correlations w/ `CrystIQ`

11 Patient level analysis

12 Patient Correlations of `Cognitive` Features

13 Multivariate Normal – Cognitive Features

Missingness Report

Isaiah C. Mireles

2026-05-17

1 Read Data

2 Order of Events

2.1 event, study pairs

3 Path Diagram

4 Missingness Relevance

4.1 Blood Data

4.2 Diet Data

4.3 Ct and Graph Repeated Missingness

4.4 Diet Data

4.5 Blood Data

5 Build Features

5.1 patient study & events ct

5.1.1 Helper Func : Create unique_id, Patient Variable

6 patient specific missing ct

6.1 Diet

6.2 blood

7 Correlations of Cognitive Features – ALL

7.1 Memory, fluidIQ dot-plt

7.2 Memory, CrystIQ dot-plt

7.3 fluidIQ, CrystIQ dot-plt

8 Diet Correlations w/ Memory

9 Diet Correlations w/ fluidIQ

10 Diet Correlations w/ CrystIQ

11 Patient level analysis

12 Patient Correlations of Cognitive Features

13 Multivariate Normal – Cognitive Features

5.1 `patient` study & events ct

5.1.1 Helper Func : Create `unique_id`, `Patient` Variable

6 `patient` specific missing ct

7 Correlations of `Cognitive` Features – ALL

7.1 `Memory`, `fluidIQ` dot-plt

7.2 `Memory`, `CrystIQ` dot-plt

7.3 `fluidIQ`, `CrystIQ` dot-plt

8 Diet Correlations w/ `Memory`

9 Diet Correlations w/ `fluidIQ`

10 Diet Correlations w/ `CrystIQ`

12 Patient Correlations of `Cognitive` Features