1 Read Data

2 Missingness Relevance

A great deal of data is missing ( NA ) – therefore our data is heavily biased. We would like to determine in what way its biased. Perhaps by adjusting for this bias we see strong associations for each variable to the cognitive markers in studying dimentia.

2.1 Blood Data

library(dplyr)
library(naniar)
blood |> 
  miss_var_summary() |>
  as.data.frame() |>
  select(variable, pct_miss) |> 
  head(6)
##      variable pct_miss
## 1  direct_ldl    100  
## 2 aldosterone     96.5
## 3        IL13     93.1
## 4         IL2     93.1
## 5       IL17A     93.0
## 6        CCL8     92.7
  • direct_ldl is entirely missing, we may discard this feature

  • Many repeated values

2.2 Diet Data

diet |> 
  miss_var_summary() |>
  as.data.frame() |>
  select(variable, pct_miss) |> 
  head(6)
##         variable pct_miss
## 1 asa24_numfoods     84.6
## 2 asa24_numcodes     84.6
## 3     asa24_kcal     84.6
## 4     asa24_prot     84.6
## 5     asa24_tfat     84.6
## 6     asa24_carb     84.6

interpretation :

  • Notice the only features without missingness

  • Notice the large amount of missingness is the

    Participant info/Demographics
    • data_category
  • Notice that the missingness appears to repeat – showing runs of the same pct_miss for both features

2.3 Ct and Graph Repeated Missingness

2.4 Diet Data

ct <-
  diet |> 
  miss_var_summary() |> 
  mutate(
    ct = ifelse(pct_miss == 0, 0, 1)
    ) |> count(ct) 
ct
## # A tibble: 2 × 2
##      ct     n
##   <dbl> <int>
## 1     0     4
## 2     1   105

Interpretation :

  • There are only 4 non-missing values every other value is just 84.6 percent missing – which is a HUGE amt.

2.5 Blood Data

ct <- 
  blood |> 
  miss_var_summary() 

library(ggplot2)
ct |> 
  ggplot(aes(pct_miss)) + 
  geom_histogram(binwidth = 10) 

interpretation

  • this one is slightly more varied

  • 0, about 60, 90 100% percent missing – with 60, 90 happening quite frequently

ct <- ct |>
  mutate(
    char = format(pct_miss)
  )

ct <- as.vector(ct$pct_miss)/100
ct <- ct |> round(1) 
ct <- ct |> as.character()
barplot(table(ct))

  • We can see that the repeated values are near 0 (id features), 60%, 90% or 100%

3 Build Features

3.1 patient study & events ct

Purpose : identify how often a patients appear in our data.

For example :

  • Are they in both studies?

  • How many unique events?

3.1.1 Helper Func : Create unique_id, Patient Variable

library(stringr)

diet <-
  diet |>
  mutate(
    patient = str_remove(id, "^[A-Z]+") 
  ) |> select(patient, everything())

blood <-
  blood |>
  mutate(
    patient = str_remove(id, "^[A-Z]+") 
  ) |> select(patient, everything())

4 patient specific missing ct

Purpose : identify how often patients appear in our data to better understand missing values.

4.1 Diet

diet_patients <-
  diet |>
  group_by(patient) |>
  summarize(
    n_rows = n(),
    total_missing =
      sum(across(everything(), ~ sum(is.na(.x)))),
    .groups = "drop"
  ) |>
  mutate(
    pct =
      round(
        total_missing /
          (n_rows * (ncol(diet) - 1)),
        2
      )
  ) |>
  arrange(desc(pct))

diet_patients <-
  diet_patients |>  mutate(
    p_rank = paste0("patient", row_number())
  ) |> select(p_rank, everything(), patient)
plot(diet_patients$n_rows, diet_patients$total_missing)

4.2 blood

blood_patients <-
  blood |>
  group_by(patient) |>
  summarize(
    n_rows = n(),
    total_missing =
      sum(across(everything(), ~ sum(is.na(.x)))),
    total_cells =
      n_rows * ncol(pick(everything())),
    pct =
      round(total_missing / total_cells, 2),
    .groups = "drop"
  ) |>
  arrange(desc(pct))

blood_patients <-
  blood_patients |>  mutate(
    p_rank = paste0("patient", row_number())
  ) |> select(p_rank, everything(), patient)
plot(blood_patients$n_rows,blood_patients$total_missing)

5 Correlations of Cognitive Features – ALL

Note :

  • we have repeated observations, so we are taking people who had more visits more into account

  • Relevant factor analysis Features^

    • Memory_Tr35_60y,

      • Cognition Factor Analysis: Memory factor estimated in Union V1 35-60 yo subjects and applied to all subjects all visits
    • FluidIQ_Tr35_60y,

      • Cognition Factor Analysis: Fluid IQ factor estimated in Union V1 35-60 yo subjects and applied to all subjects all visits
    • CrystIQ_Tr35_60y

      • Cognition Factor Analysis: Crystallized IQ factor estimated in Union V1 35-60 yo subjects and applied to all subjects all visits
Memory <- all$Memory_Tr35_60y |> as.numeric()
fluidIQ <- all$FluidIQ_Tr35_60y |> as.numeric()
CrystIQ <- all$CrystIQ_Tr35_60y |> as.numeric()

5.1 Memory, fluidIQ dot-plt

# Memory ~ fluidIQ
Memory_fluidIQ_mdl <- lm(Memory ~ fluidIQ)

plot(Memory, fluidIQ)
abline(Memory_fluidIQ_mdl, col = "red")

5.2 Memory, CrystIQ dot-plt

# Memory ~ CrystIQ
Memory_CrystIQ_mdl <- lm(Memory ~ CrystIQ)

plot(Memory, CrystIQ)
abline(Memory_CrystIQ_mdl, col = "red")

5.3 fluidIQ, CrystIQ dot-plt

# fluidIQ ~ CrystIQ
fluidIQ_CrystIQ_mdl <- lm(fluidIQ ~ CrystIQ)

plot(fluidIQ, CrystIQ)
abline(fluidIQ_CrystIQ_mdl, col = "red")

diet_num <- diet |> select(where(is.numeric))
blood_num <- blood |> select(where(is.numeric))
diet_num <- cbind(Memory, fluidIQ, CrystIQ, diet_num)
blood_num <- cbind(Memory, fluidIQ, CrystIQ, blood_num)

6 Diet Correlations w/ Memory

mem_cor <-
  cor(diet_num$Memory, diet_num[4:length(diet_num)],
      use = "complete.obs")


mem_cor <-
  mem_cor |> 
  t() |> as.data.frame()
mem_cor |> 
  rename(Memory = V1) |> 
  round(3) |>
  mutate(abs = abs(Memory)) |> 
  arrange(desc(abs)) |> 
  head(5) |> select(Memory)
##                Memory
## asa24_fibe      0.168
## asa24_numfoods  0.156
## asa24_mois      0.156
## asa24_ff        0.148
## asa24_magn      0.139

7 Diet Correlations w/ fluidIQ

fluid_cor <-
  cor(diet_num$fluidIQ, diet_num[4:length(diet_num)],
      use = "complete.obs")


fluid_cor <-
  fluid_cor |> 
  t() |> as.data.frame()

fluid_cor |> 
  rename(fluidIQ = V1) |> 
  round(3) |>
  mutate(abs = abs(fluidIQ)) |> 
  arrange(desc(abs)) |> 
  head(5) |> select(fluidIQ)
##            fluidIQ
## asa24_fibe   0.191
## asa24_magn   0.180
## asa24_ff     0.176
## asa24_fola   0.171
## asa24_phos   0.170

8 Diet Correlations w/ CrystIQ

cryst_cor <-
  cor(diet_num$CrystIQ, diet_num[4:length(diet_num)],
      use = "complete.obs")


cryst_cor <-
  cryst_cor |> 
  t() |> as.data.frame()

cryst_cor |> 
  rename(CrystIQ = V1) |> 
  round(3) |>
  mutate(abs = abs(CrystIQ)) |> 
  arrange(desc(abs)) |> 
  head(5) |> select(CrystIQ) 
##                CrystIQ
## asa24_fibe       0.214
## asa24_numfoods   0.188
## asa24_ff         0.178
## asa24_magn       0.175
## asa24_copp       0.167

Overall interpretation

  • Extremely weak correlations for all variables in Diet Data

    • Diet Data doesnt appear strongly related

Possible Causes

  • underlying relationship isnt linear

  • Missing values may be biasing variables

9 Patient Correlations of Cognitive Features

  • here we generate a report of correlations for each individual patient

  • Suppose we just average out their values across all Study and event

diet_patients
## # A tibble: 1,396 × 5
##    p_rank    patient n_rows total_missing   pct
##    <chr>     <chr>    <int>         <int> <dbl>
##  1 patient1  6000030      1           105  0.96
##  2 patient2  6012744      2           210  0.96
##  3 patient3  6030645      4           420  0.96
##  4 patient4  6053758      4           420  0.96
##  5 patient5  6058162      5           525  0.96
##  6 patient6  6061757      4           420  0.96
##  7 patient7  6075263      4           420  0.96
##  8 patient8  6086975      4           420  0.96
##  9 patient9  6089779      4           420  0.96
## 10 patient10 6098073      4           420  0.96
## # ℹ 1,386 more rows