A great deal of data is missing ( NA ) – therefore our
data is heavily biased. We would like to determine in
what way its biased. Perhaps by adjusting for
this bias we see strong associations for each variable to the cognitive
markers in studying dimentia.
library(dplyr)
library(naniar)
blood |>
miss_var_summary() |>
as.data.frame() |>
select(variable, pct_miss) |>
head(6)
## variable pct_miss
## 1 direct_ldl 100
## 2 aldosterone 96.5
## 3 IL13 93.1
## 4 IL2 93.1
## 5 IL17A 93.0
## 6 CCL8 92.7
direct_ldl is entirely missing, we may discard this feature
Many repeated values
diet |>
miss_var_summary() |>
as.data.frame() |>
select(variable, pct_miss) |>
head(6)
## variable pct_miss
## 1 asa24_numfoods 84.6
## 2 asa24_numcodes 84.6
## 3 asa24_kcal 84.6
## 4 asa24_prot 84.6
## 5 asa24_tfat 84.6
## 6 asa24_carb 84.6
interpretation :
Notice the only features without missingness
Notice the large amount of missingness is the
| Participant info/Demographics |
data_categoryNotice that the missingness appears to repeat – showing
runs of the same pct_miss for both
features
ct <-
diet |>
miss_var_summary() |>
mutate(
ct = ifelse(pct_miss == 0, 0, 1)
) |> count(ct)
ct
## # A tibble: 2 × 2
## ct n
## <dbl> <int>
## 1 0 4
## 2 1 105
Interpretation :
84.6 percent missing – which is a HUGE
amt.ct <-
blood |>
miss_var_summary()
library(ggplot2)
ct |>
ggplot(aes(pct_miss)) +
geom_histogram(binwidth = 10)
interpretation
this one is slightly more varied
0, about 60, 90 100% percent missing – with 60, 90 happening quite frequently
ct <- ct |>
mutate(
char = format(pct_miss)
)
ct <- as.vector(ct$pct_miss)/100
ct <- ct |> round(1)
ct <- ct |> as.character()
barplot(table(ct))
patient
study & events ctPurpose : identify how often a patients appear in
our data.
For example :
Are they in both studies?
How many unique events?
unique_id, Patient Variablelibrary(stringr)
diet <-
diet |>
mutate(
patient = str_remove(id, "^[A-Z]+")
) |> select(patient, everything())
blood <-
blood |>
mutate(
patient = str_remove(id, "^[A-Z]+")
) |> select(patient, everything())
patient
specific missing ctPurpose : identify how often patients appear in
our data to better understand missing values.
diet_patients <-
diet |>
group_by(patient) |>
summarize(
n_rows = n(),
total_missing =
sum(across(everything(), ~ sum(is.na(.x)))),
.groups = "drop"
) |>
mutate(
pct =
round(
total_missing /
(n_rows * (ncol(diet) - 1)),
2
)
) |>
arrange(desc(pct))
diet_patients <-
diet_patients |> mutate(
p_rank = paste0("patient", row_number())
) |> select(p_rank, everything(), patient)
plot(diet_patients$n_rows, diet_patients$total_missing)
blood_patients <-
blood |>
group_by(patient) |>
summarize(
n_rows = n(),
total_missing =
sum(across(everything(), ~ sum(is.na(.x)))),
total_cells =
n_rows * ncol(pick(everything())),
pct =
round(total_missing / total_cells, 2),
.groups = "drop"
) |>
arrange(desc(pct))
blood_patients <-
blood_patients |> mutate(
p_rank = paste0("patient", row_number())
) |> select(p_rank, everything(), patient)
plot(blood_patients$n_rows,blood_patients$total_missing)
Cognitive Features – ALLNote :
we have repeated observations, so we are taking people who had more visits more into account
Relevant factor analysis Features^
Memory_Tr35_60y,
FluidIQ_Tr35_60y,
CrystIQ_Tr35_60y
Memory <- all$Memory_Tr35_60y |> as.numeric()
fluidIQ <- all$FluidIQ_Tr35_60y |> as.numeric()
CrystIQ <- all$CrystIQ_Tr35_60y |> as.numeric()
Memory,
fluidIQ dot-plt# Memory ~ fluidIQ
Memory_fluidIQ_mdl <- lm(Memory ~ fluidIQ)
plot(Memory, fluidIQ)
abline(Memory_fluidIQ_mdl, col = "red")
Memory,
CrystIQ dot-plt# Memory ~ CrystIQ
Memory_CrystIQ_mdl <- lm(Memory ~ CrystIQ)
plot(Memory, CrystIQ)
abline(Memory_CrystIQ_mdl, col = "red")
fluidIQ,
CrystIQ dot-plt# fluidIQ ~ CrystIQ
fluidIQ_CrystIQ_mdl <- lm(fluidIQ ~ CrystIQ)
plot(fluidIQ, CrystIQ)
abline(fluidIQ_CrystIQ_mdl, col = "red")
diet_num <- diet |> select(where(is.numeric))
blood_num <- blood |> select(where(is.numeric))
diet_num <- cbind(Memory, fluidIQ, CrystIQ, diet_num)
blood_num <- cbind(Memory, fluidIQ, CrystIQ, blood_num)
Memorymem_cor <-
cor(diet_num$Memory, diet_num[4:length(diet_num)],
use = "complete.obs")
mem_cor <-
mem_cor |>
t() |> as.data.frame()
mem_cor |>
rename(Memory = V1) |>
round(3) |>
mutate(abs = abs(Memory)) |>
arrange(desc(abs)) |>
head(5) |> select(Memory)
## Memory
## asa24_fibe 0.168
## asa24_numfoods 0.156
## asa24_mois 0.156
## asa24_ff 0.148
## asa24_magn 0.139
fluidIQfluid_cor <-
cor(diet_num$fluidIQ, diet_num[4:length(diet_num)],
use = "complete.obs")
fluid_cor <-
fluid_cor |>
t() |> as.data.frame()
fluid_cor |>
rename(fluidIQ = V1) |>
round(3) |>
mutate(abs = abs(fluidIQ)) |>
arrange(desc(abs)) |>
head(5) |> select(fluidIQ)
## fluidIQ
## asa24_fibe 0.191
## asa24_magn 0.180
## asa24_ff 0.176
## asa24_fola 0.171
## asa24_phos 0.170
CrystIQcryst_cor <-
cor(diet_num$CrystIQ, diet_num[4:length(diet_num)],
use = "complete.obs")
cryst_cor <-
cryst_cor |>
t() |> as.data.frame()
cryst_cor |>
rename(CrystIQ = V1) |>
round(3) |>
mutate(abs = abs(CrystIQ)) |>
arrange(desc(abs)) |>
head(5) |> select(CrystIQ)
## CrystIQ
## asa24_fibe 0.214
## asa24_numfoods 0.188
## asa24_ff 0.178
## asa24_magn 0.175
## asa24_copp 0.167
Overall interpretation
Extremely weak correlations for all variables in Diet Data
Possible Causes
underlying relationship isnt linear
Missing values may be biasing variables
Cognitive Featureshere we generate a report of correlations for each individual patient
Suppose we just average out their values across all
Study and event
diet_patients
## # A tibble: 1,396 × 5
## p_rank patient n_rows total_missing pct
## <chr> <chr> <int> <int> <dbl>
## 1 patient1 6000030 1 105 0.96
## 2 patient2 6012744 2 210 0.96
## 3 patient3 6030645 4 420 0.96
## 4 patient4 6053758 4 420 0.96
## 5 patient5 6058162 5 525 0.96
## 6 patient6 6061757 4 420 0.96
## 7 patient7 6075263 4 420 0.96
## 8 patient8 6086975 4 420 0.96
## 9 patient9 6089779 4 420 0.96
## 10 patient10 6098073 4 420 0.96
## # ℹ 1,386 more rows