1 Read Data

2 Order of Events

2.1 event, study pairs

all |> dplyr::distinct(event, study)
  • notice some peoples first visit ( V1 ) may be in HCA or, AABC – same being true for V2

    • suggesting people joined study at different time periods
all <- all |>
  dplyr::mutate(
    event = factor(
      event,
      levels = c(
        "V1", "F1",
        "V2", "F2",
        "CR",
        "V3", "F3",
        "V4",
        "AF1"
      ),
      ordered = TRUE
    ),

    study = factor(
      study,
      levels = c("HCA", "AABC"),
      ordered = TRUE
    )
  )
patient_paths <-
  all |>
  filter(age_open != "90 or older") |>
  mutate(age_open = as.numeric(age_open)) |> 
  mutate(
    patient = str_remove(id, "^[A-Z]+")
  ) |>
  distinct(patient, study, event, age_open) |>
  arrange(patient, study, event) |>
  group_by(patient) |>
  arrange(study, event) |> 
  summarise(
    n_studies = n_distinct(study),
    n_events = n(),

    years_passed = if_else(
      n() > 1,
      max(age_open, na.rm = TRUE) - min(age_open, na.rm = TRUE),
      NA_real_
    ),

    study_event_path = paste(
      paste(study, event, sep = ":"),
      collapse = " → "
    ),

    .groups = "drop"
  ) |>
  select(patient, years_passed, study_event_path, years_passed, everything()) |>
  arrange(desc(years_passed))
patient_paths
plot(patient_paths$years_passed, patient_paths$n_events, xlab = "Yrs Passed", ylab = "n_events")

3 Path Diagram

patient_paths
library(tidyverse)

patient_steps <-
  patient_paths |>
  separate_rows(study_event_path, sep = " → ") |>
  separate(
    study_event_path,
    into = c("study", "event"),
    sep = ":"
  ) |>
  mutate(
    study = factor(
      study,
      levels = c("HCA", "AABC"),
      ordered = TRUE
    ),
    event = factor(
      event,
      levels = c(
        "V1", "F1",
        "V2", "F2",
        "CR",
        "V3", "F3",
        "V4",
        "AF1"
      ),
      ordered = TRUE
    )
  ) |>
  arrange(patient, study, event) |>
  group_by(patient) |>
  mutate(step = row_number()) |>
  ungroup()
patient_steps <-
  patient_steps |>
  mutate(
    study_event = interaction(
      study,
      event,
      sep = ":",
      lex.order = TRUE
    )
  )
patient_edges <-
  patient_steps |>
  arrange(patient, step) |>
  group_by(patient) |>
  mutate(
    next_study_event = lead(study_event),
    next_step = lead(step)
  ) |>
  filter(!is.na(next_study_event)) |>
  ungroup()
transition_counts <-
  patient_edges |>
  count(
    step,
    next_step,
    study_event,
    next_study_event,
    name = "n_patients"
  )
library(grid)

max_step <- max(patient_steps$step)

path_plot <-
  ggplot(transition_counts) +
  geom_segment(
    aes(
      x = step,
      xend = next_step,
      y = study_event,
      yend = next_study_event,
      color = n_patients,
      linewidth = n_patients
    ),
    arrow = arrow(length = unit(0.15, "cm")),
    alpha = 0.8
  ) +
  geom_point(
    data = patient_steps |> distinct(step, study_event),
    aes(x = step, y = study_event),
    size = 3
  ) +
  scale_x_continuous(
    breaks = seq(1, max_step, by = 1),
    limits = c(1, max_step)
  ) +
  scale_linewidth(range = c(0.5, 3)) +
  scale_color_gradient(
    low = "mistyrose",
    high = "red4"
  ) +
  labs(
    x = "Sequence Step",
    y = "Study:Event",
    color = "Number of Patients",
    linewidth = "Number of Patients",
    title = "Patient Study/Event Paths"
  ) +
  theme_minimal()

path_plot

4 Missingness Relevance

A great deal of data is missing ( NA ) – therefore our data is heavily biased. We would like to determine in what way its biased. Perhaps by adjusting for this bias we see strong associations for each variable to the cognitive markers in studying dimentia.

4.1 Blood Data

library(dplyr)
library(naniar)
blood |> 
  miss_var_summary() |>
  as.data.frame() |>
  select(variable, pct_miss) 
  • direct_ldl is entirely missing, we may discard this feature

  • Many repeated values

4.2 Diet Data

diet |> 
  miss_var_summary() |>
  as.data.frame() |>
  select(variable, pct_miss) 

interpretation :

  • Notice the only features without missingness

  • Notice the large amount of missingness is the

    Participant info/Demographics
    • data_category
  • Notice that the missingness appears to repeat – showing runs of the same pct_miss for both features

4.3 Ct and Graph Repeated Missingness

4.4 Diet Data

ct <-
  diet |> 
  miss_var_summary() |> 
  mutate(
    ct = ifelse(pct_miss == 0, 0, 1)
    ) |> count(ct) 
ct

Interpretation :

  • There are only 4 non-missing values every other value is just 84.6 percent missing – which is a HUGE amt.

4.5 Blood Data

ct <- 
  blood |> 
  miss_var_summary() 

library(ggplot2)
ct |> 
  ggplot(aes(pct_miss)) + 
  geom_histogram(binwidth = 10) 

interpretation

  • this one is slightly more varied

  • 0, about 60, 90 100% percent missing – with 60, 90 happening quite frequently

ct <- ct |>
  mutate(
    char = format(pct_miss)
  )

ct <- as.vector(ct$pct_miss)/100
ct <- ct |> round(1) 
ct <- ct |> as.character()
barplot(table(ct))

  • We can see that the repeated values are near 0 (id features), 60%, 90% or 100%

5 Build Features

5.1 patient study & events ct

Purpose : identify how often a patients appear in our data.

For example :

  • Are they in both studies?

  • How many unique events?

5.1.1 Helper Func : Create unique_id, Patient Variable

library(dplyr)

diet <-
  diet |>
  mutate(
    patient = str_remove(id, "^[A-Z]+") 
  ) |> select(patient, everything())

blood <-
  blood |>
  mutate(
    patient = str_remove(id, "^[A-Z]+") 
  ) |> select(patient, everything())

6 patient specific missing ct

Purpose : identify how often patients appear in our data to better understand missing values.

6.1 Diet

diet_patients <-
  diet |>
  group_by(patient) |>
  summarize(
    n_rows = n(),
    total_missing =
      sum(across(everything(), ~ sum(is.na(.x)))),
    .groups = "drop"
  ) |>
  mutate(
    pct =
      round(
        total_missing /
          (n_rows * (ncol(diet) - 1)),
        2
      )
  ) |>
  arrange(desc(pct))

diet_patients <-
  diet_patients |>  mutate(
    p_rank = paste0("patient", row_number())
  ) |> select(p_rank, everything(), patient)
plot(diet_patients$n_rows, diet_patients$total_missing)

6.2 blood

blood_patients <-
  blood |>
  group_by(patient) |>
  summarize(
    n_rows = n(),
    total_missing =
      sum(across(everything(), ~ sum(is.na(.x)))),
    total_cells =
      n_rows * ncol(pick(everything())),
    pct =
      round(total_missing / total_cells, 2),
    .groups = "drop"
  ) |>
  arrange(desc(pct))

blood_patients <-
  blood_patients |>  mutate(
    p_rank = paste0("patient", row_number())
  ) |> select(p_rank, everything(), patient)
plot(blood_patients$n_rows,blood_patients$total_missing)

7 Correlations of Cognitive Features – ALL

Note :

  • we have repeated observations, so we are taking people who had more visits more into account

  • Relevant factor analysis Features^

    • Memory_Tr35_60y,

      • Cognition Factor Analysis: Memory factor estimated in Union V1 35-60 yo subjects and applied to all subjects all visits
    • FluidIQ_Tr35_60y,

      • Cognition Factor Analysis: Fluid IQ factor estimated in Union V1 35-60 yo subjects and applied to all subjects all visits
    • CrystIQ_Tr35_60y

      • Cognition Factor Analysis: Crystallized IQ factor estimated in Union V1 35-60 yo subjects and applied to all subjects all visits
Memory <- all$Memory_Tr35_60y |> as.numeric()
fluidIQ <- all$FluidIQ_Tr35_60y |> as.numeric()
CrystIQ <- all$CrystIQ_Tr35_60y |> as.numeric()

7.1 Memory, fluidIQ dot-plt

# Memory ~ fluidIQ
Memory_fluidIQ_mdl <- lm(Memory ~ fluidIQ)

plot(Memory, fluidIQ)
abline(Memory_fluidIQ_mdl, col = "red")

7.2 Memory, CrystIQ dot-plt

# Memory ~ CrystIQ
Memory_CrystIQ_mdl <- lm(Memory ~ CrystIQ)

plot(Memory, CrystIQ)
abline(Memory_CrystIQ_mdl, col = "red")

7.3 fluidIQ, CrystIQ dot-plt

# fluidIQ ~ CrystIQ
fluidIQ_CrystIQ_mdl <- lm(fluidIQ ~ CrystIQ)

plot(fluidIQ, CrystIQ)
abline(fluidIQ_CrystIQ_mdl, col = "red")

diet_num <- diet |> select(where(is.numeric))
blood_num <- blood |> select(where(is.numeric))
diet_num <- cbind(Memory, fluidIQ, CrystIQ, diet_num)
blood_num <- cbind(Memory, fluidIQ, CrystIQ, blood_num)

8 Diet Correlations w/ Memory

mem_cor <-
  cor(diet_num$Memory, diet_num[4:length(diet_num)],
      use = "complete.obs")


mem_cor <-
  mem_cor |> 
  t() |> as.data.frame()
mem_cor <- 
  mem_cor |> 
  rename(Memory = V1) |> 
  round(3) |>
  select(Memory)

mem_cor 

9 Diet Correlations w/ fluidIQ

fluid_cor <-
  cor(diet_num$fluidIQ, diet_num[4:length(diet_num)],
      use = "complete.obs")


fluid_cor <-
  fluid_cor |> 
  t() |> as.data.frame()

fluid_cor |> 
  rename(fluidIQ = V1) |> 
  round(3) |>
  select(fluidIQ)

10 Diet Correlations w/ CrystIQ

cryst_cor <-
  cor(diet_num$CrystIQ, diet_num[4:length(diet_num)],
      use = "complete.obs")


cryst_cor <-
  cryst_cor |> 
  t() |> as.data.frame()

cryst_cor <-
  cryst_cor |> 
  rename(CrystIQ = V1) |> 
  round(3) |> select(CrystIQ) 


cryst_cor

Overall interpretation

  • Extremely weak correlations for all variables in Diet Data

    • Diet Data doesnt appear strongly related

Possible Causes

  • underlying relationship isnt linear

  • Missing values may be biasing variables

diet_cor_lst <-
  list(
  cryst_cor=cryst_cor,
  fluid_cor=fluid_cor,
  mem_cor=mem_cor
)

11 Patient level analysis

blood |>
  group_by(patient) |> 
  select(patient, where(is.numeric)) |> 
  summarize(
    across(everything(), ~ mean(.x, na.rm = TRUE))
  )

12 Patient Correlations of Cognitive Features

  • here we generate a report of correlations for each individual patient

  • Suppose we just average out their values across all Study and event

diet_patients

13 Multivariate Normal – Cognitive Features

cognitive_df <- 
  all |> 
  select(id_event, id, event, study,
         Memory_Tr35_60y, 
         FluidIQ_Tr35_60y, 
         CrystIQ_Tr35_60y) |>
  mutate(
    patient = str_remove(id, "^[A-Z]+") 
  ) |> select(patient, everything()) |>
  mutate(
    across(
      -c(patient, id_event, id, event, study),
      as.numeric
    )
  )
par(mfrow = c(1, 3))

hist(
  cognitive_df$Memory_Tr35_60y,
  main = "Memory",
  xlab = ""
)

hist(
  cognitive_df$FluidIQ_Tr35_60y,
  main = "FluidIQ",
  xlab = ""
)

hist(
  cognitive_df$CrystIQ_Tr35_60y,
  main = "CrystIQ",
  xlab = ""
)

  • notice each is normally distributed
cognitive_df |>
  select(where(is.numeric)) |>
  summarize(
    mean_Memory = mean(Memory_Tr35_60y, na.rm = TRUE), 
    sd_Memory = sd(Memory_Tr35_60y, na.rm = TRUE), 
    mean_FluidIQ_y = mean(FluidIQ_Tr35_60y, na.rm = TRUE),
    sd_FluidIQ = sd(FluidIQ_Tr35_60y, na.rm = TRUE), 
    mean_CrystIQ = mean(CrystIQ_Tr35_60y, na.rm = TRUE),
    sd_CrystIQ = sd(CrystIQ_Tr35_60y, na.rm = TRUE)
  )