library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

all_structure <- all %>%
  group_by(study, id) %>%
  summarise(
    n_events = n_distinct(id_event),
    events = paste(sort(unique(event)), collapse = ", "),
    .groups = "drop"
  )

all_structure |> arrange(desc(n_events))

## # A tibble: 2,117 × 4
##    study id         n_events events                
##    <chr> <chr>         <int> <chr>                 
##  1 HCA   HCA6037457        6 CR, F1, F2, F3, V1, V2
##  2 HCA   HCA6228767        6 CR, F1, F2, F3, V1, V2
##  3 HCA   HCA6249977        6 CR, F1, F2, F3, V1, V2
##  4 HCA   HCA6290368        6 CR, F1, F2, F3, V1, V2
##  5 HCA   HCA6374576        6 CR, F1, F2, F3, V1, V2
##  6 HCA   HCA6461066        6 CR, F1, F2, F3, V1, V2
##  7 HCA   HCA6498190        6 CR, F1, F2, F3, V1, V2
##  8 HCA   HCA6606571        6 CR, F1, F2, F3, V1, V2
##  9 HCA   HCA6618679        6 CR, F1, F2, F3, V1, V2
## 10 HCA   HCA6645985        6 CR, F1, F2, F3, V1, V2
## # ℹ 2,107 more rows

all_structure %>%
  ggplot(aes(x = n_events)) +
  geom_bar() +
  scale_x_continuous(breaks = 1:max(all_structure$n_events)) +
  labs(
    x = "Number of visits/events per participant",
    y = "Number of participants"
  )

summary(all_structure$n_events)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   3.054   4.000   6.000

usually people are getting about 4-6 events some are only going 1-3 times

all_structure %>%
  ggplot(aes(x = n_events)) +
  geom_bar() +
  facet_wrap(~study) +
  scale_x_continuous(
    breaks = 1:max(all_structure$n_events)
  ) +
  labs(
    x = "Number of visits/events per participant",
    y = "Number of participants"
  )

notice AABC sees less visits – which makes sense HCA is older w/ more events

all %>%
  group_by(study) %>%
  summarise(
    n_events = n_distinct(event),
    events = paste(sort(unique(event)), collapse = ", "),
    .groups = "drop"
  )

## # A tibble: 2 × 3
##   study n_events events                
##   <chr>    <int> <chr>                 
## 1 AABC         5 AF1, V1, V2, V3, V4   
## 2 HCA          6 CR, F1, F2, F3, V1, V2

notice that

Format & Combine data

library(tidyr)
relevant_dict <- relevant_dict |>
  separate(
    `variable name - description`,
    into = c("variable", "description"),
    sep = " - ",
    extra = "merge"
  )

library(dplyr)
# keep only id_event + age_open from all_dict : unique ages of people for each visit
age_df <- all %>%
  select(id_event, age_open) %>%
  distinct()

# join onto blood
blood <- blood %>%
  left_join(age_df, by = "id_event")

# join onto diet
diet <- diet %>%
  left_join(age_df, by = "id_event")

Research Question

What is the relationship between measures of diet derived from self-report (ASA24) and blood markers (e.g., glucose)? Is change in one measure over time reflected in the other?

Study Description – Research Design

blood$study |> unique()

## [1] "HCA"  "AABC"

HCA : Focused on brain connectivity, cognition, and healthy aging (before)

AABC : Focused on modifiable lifestyle factors (after)

So we should order it temporally HCA < AABC

blood <- blood %>%
  mutate(
    study = factor(
      study,
      levels = c("HCA", "AABC"),
      ordered = TRUE
    )
  )

diet <- diet %>%
  mutate(
    study = factor(
      study,
      levels = c("HCA", "AABC"),
      ordered = TRUE
    )
  )

Age Range of study

d <- diet[!(diet$age_open=="90 or older"),]
d$age_open <- as.numeric(d$age_open)
summary(d$age_open)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   36.00   50.00   62.00   62.31   75.00   89.00

hist(d$age_open)

How long was the study?

time_span <-
  d %>%
  group_by(id) %>%
  summarise(
    youngest_age = min(age_open, na.rm = TRUE),
    oldest_age   = max(age_open, na.rm = TRUE),
    n_visits     = n()
  ) |> 
  mutate(diff = oldest_age-youngest_age) %>%
  arrange(desc(diff))
time_span

## # A tibble: 1,350 × 5
##    id         youngest_age oldest_age n_visits  diff
##    <chr>             <dbl>      <dbl>    <int> <dbl>
##  1 HCA6010538           64         72        8     8
##  2 HCA6042046           58         66        5     8
##  3 HCA6131449           66         74        7     8
##  4 HCA6228767           50         58        8     8
##  5 HCA6249977           66         74        9     8
##  6 HCA6367781           67         75        8     8
##  7 HCA6397992           65         73        8     8
##  8 HCA6399087           51         59        7     8
##  9 HCA6429272           59         67        8     8
## 10 HCA6461066           65         73        9     8
## # ℹ 1,340 more rows

d %>%
  group_by(id) %>%
  summarise(
    youngest_age = min(age_open, na.rm = TRUE),
    oldest_age   = max(age_open, na.rm = TRUE),
    n_visits     = n()
  ) |> 
  mutate(diff = oldest_age-youngest_age) %>%
  arrange(desc(n_visits))

## # A tibble: 1,350 × 5
##    id         youngest_age oldest_age n_visits  diff
##    <chr>             <dbl>      <dbl>    <int> <dbl>
##  1 HCA6249977           66         74        9     8
##  2 HCA6290368           60         67        9     7
##  3 HCA6461066           65         73        9     8
##  4 HCA6645985           64         72        9     8
##  5 HCA6937998           63         70        9     7
##  6 HCA7530670           38         45        9     7
##  7 HCA7902782           58         65        9     7
##  8 HCA7987312           73         80        9     7
##  9 HCA8934497           60         66        9     6
## 10 HCA9217575           70         78        9     8
## # ℹ 1,340 more rows

plot(time_span$n_visits, time_span$diff)

notice the number of visits and amount of years someone gained through the study isnt 1:1 – for some people, by the second visit – upwards of 5 years passed

Youngest oldest by study

d %>%
  group_by(study) %>%
  summarise(
    youngest_age = min(age_open, na.rm = TRUE),
    oldest_age   = max(age_open, na.rm = TRUE),
    n_visits     = n()
  ) %>%
  arrange(desc(oldest_age))

## # A tibble: 2 × 4
##   study youngest_age oldest_age n_visits
##   <ord>        <dbl>      <dbl>    <int>
## 1 HCA             36         89     4631
## 2 AABC            36         89     1544

how many people (id) show up in HCA but not AABC?

hca_ids <- all %>%
  filter(study == "HCA") %>%
  distinct(id)

aabc_ids <- all %>%
  filter(study == "AABC") %>%
  distinct(id)

(anti_join(
  hca_ids,
  aabc_ids,
  by = "id"
) |> nrow() / 1396) |> round(2)

## [1] 0.34

how many people (id) show up in AABC but not HCA?

(anti_join(aabc_ids, hca_ids, by = "id") |> nrow()/1396) |> round(2)

## [1] 0.14

How many people show up in both?

(inner_join(hca_ids, aabc_ids, by = "id")  |> nrow()/1396) |> round(2)

## [1] 0.52

Number of unique participants

all |> distinct(id) |> nrow()

## [1] 1396

notice these groups sum to 13

When did each study happen – sequence of events :

diet |> distinct(study, event) |> arrange(study)

## # A tibble: 11 × 2
##    study event
##    <ord> <chr>
##  1 HCA   V1   
##  2 HCA   F1   
##  3 HCA   V2   
##  4 HCA   F2   
##  5 HCA   CR   
##  6 HCA   F3   
##  7 AABC  V3   
##  8 AABC  V2   
##  9 AABC  AF1  
## 10 AABC  V4   
## 11 AABC  V1

relevant_dict |> filter(variable == "event") |> select(description)

## # A tibble: 1 × 1
##   description                                                                   
##   <chr>                                                                         
## 1 Study-specific visit/event short name (V1, In-person visit 1 | V2, In-person …

blood$event |> unique()

## [1] "V1"  "F1"  "V2"  "F2"  "CR"  "V3"  "F3"  "AF1" "V4"

event descriptions :

Study-specific visit/event short name (V1, In-person visit 1 | V2, In-person visit 2 | V3, In-person visit 3 | V4, In-person visit 4 | F1, First followup survey one year after first in-person visit in HCA | F2, Second followup survey in HCA | F3, Third followup survey in HCA | CR, Surveys collected remotely during Covid lock-down without regard to visit timing | AF1, First followup survey one year after first in-person visit in AABC)

tbl <- diet %>%
     distinct(study, event, id) %>%
     count(study, event, name = "n_ids") %>%
     arrange(study, event)
tbl |> mutate(pct = (n_ids/nrow(diet)) |> round(2))

## # A tibble: 11 × 4
##    study event n_ids   pct
##    <ord> <chr> <int> <dbl>
##  1 HCA   CR      491  0.08
##  2 HCA   F1     1136  0.18
##  3 HCA   F2      843  0.13
##  4 HCA   F3      550  0.09
##  5 HCA   V1     1198  0.19
##  6 HCA   V2      609  0.09
##  7 AABC  AF1     567  0.09
##  8 AABC  V1      198  0.03
##  9 AABC  V2      306  0.05
## 10 AABC  V3      471  0.07
## 11 AABC  V4       96  0.01

Notice the correct ordering and flow of events^

event_levels <- c(
  "V1",
  "F1",
  "V2",
  "F2",
  "CR",
  "F3",
  "V3",
  "AF1",
  "V4"
)

blood <- blood %>%
  mutate(
    event = factor(
      event,
      levels = event_levels,
      ordered = TRUE
    )
  )

diet <- diet %>%
  mutate(
    event = factor(
      event,
      levels = event_levels,
      ordered = TRUE
    )
  )

Missingness

How Much & How is it missing?

col_diet <- colnames(diet[,-1])
col_blood <- colnames(blood[,-1])

library(dplyr)
library(tibble)

diet_missing <- tibble(
  variable = names(diet),
  n_missing = colSums(is.na(diet)),
  pct_missing = colMeans(is.na(diet)) * 100
) %>%
  arrange(desc(n_missing))

diet_missing

## # A tibble: 110 × 3
##    variable       n_missing pct_missing
##    <chr>              <dbl>       <dbl>
##  1 asa24_numfoods      5468        84.6
##  2 asa24_numcodes      5468        84.6
##  3 asa24_kcal          5468        84.6
##  4 asa24_prot          5468        84.6
##  5 asa24_tfat          5468        84.6
##  6 asa24_carb          5468        84.6
##  7 asa24_mois          5468        84.6
##  8 asa24_alc           5468        84.6
##  9 asa24_caff          5468        84.6
## 10 asa24_theo          5468        84.6
## # ℹ 100 more rows

blood_missing <- tibble(
  variable = names(blood),
  n_missing = colSums(is.na(blood)),
  pct_missing = colMeans(is.na(blood)) * 100
) %>%
  arrange(desc(n_missing))

blood_missing

## # A tibble: 82 × 3
##    variable    n_missing pct_missing
##    <chr>           <dbl>       <dbl>
##  1 direct_ldl       6465       100  
##  2 aldosterone      6236        96.5
##  3 IL13             6019        93.1
##  4 IL2              6017        93.1
##  5 IL17A            6010        93.0
##  6 CCL8             5995        92.7
##  7 TNFSF10          5995        92.7
##  8 IL4              5983        92.5
##  9 IL33             5968        92.3
## 10 CCL13            5959        92.2
## # ℹ 72 more rows

print("B")

## [1] "B"

summary(blood_missing$pct_missing); sd(blood_missing$pct_missing)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   60.22   92.11   76.29   92.11  100.00

## [1] 24.41539

print("D")

## [1] "D"

summary(diet_missing$pct_missing ); sd(diet_missing$pct_missing)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   84.58   84.58   80.73   84.58   84.58

## [1] 17.69824

We have a strong amount of missing values
Medians for blood & diet respectively 92%, 85%
However our missingness is constant for diet 84.5785 – while blood tends to have packets of missingness

blood_missing |> arrange(pct_missing)

## # A tibble: 82 × 3
##    variable   n_missing pct_missing
##    <chr>          <dbl>       <dbl>
##  1 id_event           0         0  
##  2 id                 0         0  
##  3 event              0         0  
##  4 study              0         0  
##  5 age_open           0         0  
##  6 insulin         3891        60.2
##  7 albumin         3892        60.2
##  8 chloride        3892        60.2
##  9 co2content      3892        60.2
## 10 creatinine      3892        60.2
## # ℹ 72 more rows

diet_missing |> arrange(pct_missing)

## # A tibble: 110 × 3
##    variable       n_missing pct_missing
##    <chr>              <dbl>       <dbl>
##  1 id_event               0         0  
##  2 id                     0         0  
##  3 event                  0         0  
##  4 study                  0         0  
##  5 age_open               0         0  
##  6 asa24_numfoods      5468        84.6
##  7 asa24_numcodes      5468        84.6
##  8 asa24_kcal          5468        84.6
##  9 asa24_prot          5468        84.6
## 10 asa24_tfat          5468        84.6
## # ℹ 100 more rows

order the missingness over time

Why might it be missing this way?

blood

## # A tibble: 6,465 × 82
##    id_event id    event study hba1c hscrp insulin vitamind albumin alkphos_total
##    <chr>    <chr> <ord> <ord> <dbl> <dbl>   <dbl>    <dbl>   <dbl>         <dbl>
##  1 HCA6000… HCA6… V1    HCA    NA   NA       NA       NA      NA              NA
##  2 HCA6002… HCA6… V1    HCA     5.1  2.86    19.3     35.3     4              57
##  3 HCA6002… HCA6… F1    HCA    NA   NA       NA       NA      NA              NA
##  4 HCA6002… HCA6… V2    HCA     5.2  8.03    18.1     62.7     4.2            58
##  5 HCA6002… HCA6… F2    HCA    NA   NA       NA       NA      NA              NA
##  6 HCA6002… HCA6… CR    HCA    NA   NA       NA       NA      NA              NA
##  7 HCA6002… HCA6… V3    AABC    4.8  3       16.7    154.      4.2            69
##  8 HCA6003… HCA6… V1    HCA     5.9  1.82    29.5     25.1     4.2            55
##  9 HCA6003… HCA6… F1    HCA    NA   NA       NA       NA      NA              NA
## 10 HCA6003… HCA6… F2    HCA    NA   NA       NA       NA      NA              NA
## # ℹ 6,455 more rows
## # ℹ 72 more variables: alt_sgpt <dbl>, ast_sgot <dbl>, calcium <dbl>,
## #   chloride <dbl>, co2content <dbl>, creatinine <dbl>, glucose <dbl>,
## #   potassium <dbl>, sodium <dbl>, totalbilirubin <dbl>, totalprotein <dbl>,
## #   ureanitrogen <dbl>, friedewald_ldl <dbl>, hdl <dbl>, cholesterol <dbl>,
## #   triglyceride <dbl>, direct_ldl <lgl>, estradiol <dbl>, testosterone <dbl>,
## #   lh <dbl>, fsh <dbl>, aldosterone <dbl>, dheas <dbl>, cortisol <dbl>, …

How many old ahh mfs?

diet[diet$age_open == "90 or older",]

## # A tibble: 290 × 110
##    id_event       id        event study asa24_numfoods asa24_numcodes asa24_kcal
##    <chr>          <chr>     <ord> <ord>          <dbl>          <dbl>      <dbl>
##  1 HCA6012744_V1  HCA60127… V1    HCA               NA             NA        NA 
##  2 HCA6012744_F1  HCA60127… F1    HCA               NA             NA        NA 
##  3 HCA6062456_V2  HCA60624… V2    HCA               NA             NA        NA 
##  4 HCA6062456_CR  HCA60624… CR    HCA               NA             NA        NA 
##  5 HCA6062456_V3  HCA60624… V3    AABC              21             21      2061.
##  6 HCA6062456_AF1 HCA60624… AF1   AABC              NA             NA        NA 
##  7 HCA6068670_V2  HCA60686… V2    HCA               NA             NA        NA 
##  8 HCA6068670_CR  HCA60686… CR    HCA               NA             NA        NA 
##  9 HCA6068670_V3  HCA60686… V3    AABC              13             13      2207.
## 10 HCA6102038_F1  HCA61020… F1    HCA               NA             NA        NA 
## # ℹ 280 more rows
## # ℹ 103 more variables: asa24_prot <dbl>, asa24_tfat <dbl>, asa24_carb <dbl>,
## #   asa24_mois <dbl>, asa24_alc <dbl>, asa24_caff <dbl>, asa24_theo <dbl>,
## #   asa24_sugr <dbl>, asa24_fibe <dbl>, asa24_calc <dbl>, asa24_iron <dbl>,
## #   asa24_magn <dbl>, asa24_phos <dbl>, asa24_pota <dbl>, asa24_sodi <dbl>,
## #   asa24_zinc <dbl>, asa24_copp <dbl>, asa24_sele <dbl>, asa24_vc <dbl>,
## #   asa24_vb1 <dbl>, asa24_vb2 <dbl>, asa24_niac <dbl>, asa24_vb6 <dbl>, …

paste0((290/6465) |> round(2), "%", " old mfs")

## [1] "0.04% old mfs"

Are old people loosing their minds

Missing Values

Isaiah C. Mireles

2026-05-16