all |> dplyr::distinct(event, study)
notice some peoples first visit ( V1 ) may be in
HCA or, AABC – same being true for
V2
all <- all |>
dplyr::mutate(
event = factor(
event,
levels = c(
"V1", "F1",
"V2", "F2",
"CR",
"V3", "F3",
"V4",
"AF1"
),
ordered = TRUE
),
study = factor(
study,
levels = c("HCA", "AABC"),
ordered = TRUE
)
)
patient_paths <-
all |>
filter(age_open != "90 or older") |>
mutate(age_open = as.numeric(age_open)) |>
mutate(
patient = str_remove(id, "^[A-Z]+")
) |>
distinct(patient, study, event, age_open) |>
arrange(patient, study, event) |>
group_by(patient) |>
arrange(study, event) |>
summarise(
n_studies = n_distinct(study),
n_events = n(),
years_passed = if_else(
n() > 1,
max(age_open, na.rm = TRUE) - min(age_open, na.rm = TRUE),
NA_real_
),
study_event_path = paste(
paste(study, event, sep = ":"),
collapse = " → "
),
.groups = "drop"
) |>
select(patient, years_passed, study_event_path, years_passed, everything()) |>
arrange(desc(years_passed))
patient_paths
plot(patient_paths$years_passed, patient_paths$n_events, xlab = "Yrs Passed", ylab = "n_events")
patient_paths
library(tidyverse)
patient_steps <-
patient_paths |>
separate_rows(study_event_path, sep = " → ") |>
separate(
study_event_path,
into = c("study", "event"),
sep = ":"
) |>
mutate(
study = factor(
study,
levels = c("HCA", "AABC"),
ordered = TRUE
),
event = factor(
event,
levels = c(
"V1", "F1",
"V2", "F2",
"CR",
"V3", "F3",
"V4",
"AF1"
),
ordered = TRUE
)
) |>
arrange(patient, study, event) |>
group_by(patient) |>
mutate(step = row_number()) |>
ungroup()
patient_steps <-
patient_steps |>
mutate(
study_event = interaction(
study,
event,
sep = ":",
lex.order = TRUE
)
)
patient_edges <-
patient_steps |>
arrange(patient, step) |>
group_by(patient) |>
mutate(
next_study_event = lead(study_event),
next_step = lead(step)
) |>
filter(!is.na(next_study_event)) |>
ungroup()
transition_counts <-
patient_edges |>
count(
step,
next_step,
study_event,
next_study_event,
name = "n_patients"
)
library(grid)
max_step <- max(patient_steps$step)
path_plot <-
ggplot(transition_counts) +
geom_segment(
aes(
x = step,
xend = next_step,
y = study_event,
yend = next_study_event,
color = n_patients,
linewidth = n_patients
),
arrow = arrow(length = unit(0.15, "cm")),
alpha = 0.8
) +
geom_point(
data = patient_steps |> distinct(step, study_event),
aes(x = step, y = study_event),
size = 3
) +
scale_x_continuous(
breaks = seq(1, max_step, by = 1),
limits = c(1, max_step)
) +
scale_linewidth(range = c(0.5, 3)) +
scale_color_gradient(
low = "mistyrose",
high = "red4"
) +
labs(
x = "Sequence Step",
y = "Study:Event",
color = "Number of Patients",
linewidth = "Number of Patients",
title = "Patient Study/Event Paths"
) +
theme_minimal()
path_plot
A great deal of data is missing ( NA ) – therefore our
data is heavily biased. We would like to determine in
what way its biased. Perhaps by adjusting for
this bias we see strong associations for each variable to the cognitive
markers in studying dimentia.
library(dplyr)
library(naniar)
blood |>
miss_var_summary() |>
as.data.frame() |>
select(variable, pct_miss)
direct_ldl is entirely missing, we may discard this feature
Many repeated values
diet |>
miss_var_summary() |>
as.data.frame() |>
select(variable, pct_miss)
interpretation :
Notice the only features without missingness
Notice the large amount of missingness is the
| Participant info/Demographics |
data_categoryNotice that the missingness appears to repeat – showing
runs of the same pct_miss for both
features
ct <-
diet |>
miss_var_summary() |>
mutate(
ct = ifelse(pct_miss == 0, 0, 1)
) |> count(ct)
ct
Interpretation :
84.6 percent missing – which is a HUGE
amt.ct <-
blood |>
miss_var_summary()
library(ggplot2)
ct |>
ggplot(aes(pct_miss)) +
geom_histogram(binwidth = 10)
interpretation
this one is slightly more varied
0, about 60, 90 100% percent missing – with 60, 90 happening quite frequently
ct <- ct |>
mutate(
char = format(pct_miss)
)
ct <- as.vector(ct$pct_miss)/100
ct <- ct |> round(1)
ct <- ct |> as.character()
barplot(table(ct))
patient
study & events ctPurpose : identify how often a patients appear in
our data.
For example :
Are they in both studies?
How many unique events?
unique_id, Patient Variablelibrary(dplyr)
diet <-
diet |>
mutate(
patient = str_remove(id, "^[A-Z]+")
) |> select(patient, everything())
blood <-
blood |>
mutate(
patient = str_remove(id, "^[A-Z]+")
) |> select(patient, everything())
patient
specific missing ctPurpose : identify how often patients appear in
our data to better understand missing values.
diet_patients <-
diet |>
group_by(patient) |>
summarize(
n_rows = n(),
total_missing =
sum(across(everything(), ~ sum(is.na(.x)))),
.groups = "drop"
) |>
mutate(
pct =
round(
total_missing /
(n_rows * (ncol(diet) - 1)),
2
)
) |>
arrange(desc(pct))
diet_patients <-
diet_patients |> mutate(
p_rank = paste0("patient", row_number())
) |> select(p_rank, everything(), patient)
plot(diet_patients$n_rows, diet_patients$total_missing)
blood_patients <-
blood |>
group_by(patient) |>
summarize(
n_rows = n(),
total_missing =
sum(across(everything(), ~ sum(is.na(.x)))),
total_cells =
n_rows * ncol(pick(everything())),
pct =
round(total_missing / total_cells, 2),
.groups = "drop"
) |>
arrange(desc(pct))
blood_patients <-
blood_patients |> mutate(
p_rank = paste0("patient", row_number())
) |> select(p_rank, everything(), patient)
plot(blood_patients$n_rows,blood_patients$total_missing)
Cognitive Features – ALLNote :
we have repeated observations, so we are taking people who had more visits more into account
Relevant factor analysis Features^
Memory_Tr35_60y,
FluidIQ_Tr35_60y,
CrystIQ_Tr35_60y
Memory <- all$Memory_Tr35_60y |> as.numeric()
fluidIQ <- all$FluidIQ_Tr35_60y |> as.numeric()
CrystIQ <- all$CrystIQ_Tr35_60y |> as.numeric()
Memory,
fluidIQ dot-plt# Memory ~ fluidIQ
Memory_fluidIQ_mdl <- lm(Memory ~ fluidIQ)
plot(Memory, fluidIQ)
abline(Memory_fluidIQ_mdl, col = "red")
Memory,
CrystIQ dot-plt# Memory ~ CrystIQ
Memory_CrystIQ_mdl <- lm(Memory ~ CrystIQ)
plot(Memory, CrystIQ)
abline(Memory_CrystIQ_mdl, col = "red")
fluidIQ,
CrystIQ dot-plt# fluidIQ ~ CrystIQ
fluidIQ_CrystIQ_mdl <- lm(fluidIQ ~ CrystIQ)
plot(fluidIQ, CrystIQ)
abline(fluidIQ_CrystIQ_mdl, col = "red")
diet_num <- diet |> select(where(is.numeric))
blood_num <- blood |> select(where(is.numeric))
diet_num <- cbind(Memory, fluidIQ, CrystIQ, diet_num)
blood_num <- cbind(Memory, fluidIQ, CrystIQ, blood_num)
Memorymem_cor <-
cor(diet_num$Memory, diet_num[4:length(diet_num)],
use = "complete.obs")
mem_cor <-
mem_cor |>
t() |> as.data.frame()
mem_cor <-
mem_cor |>
rename(Memory = V1) |>
round(3) |>
select(Memory)
mem_cor
fluidIQfluid_cor <-
cor(diet_num$fluidIQ, diet_num[4:length(diet_num)],
use = "complete.obs")
fluid_cor <-
fluid_cor |>
t() |> as.data.frame()
fluid_cor |>
rename(fluidIQ = V1) |>
round(3) |>
select(fluidIQ)
CrystIQcryst_cor <-
cor(diet_num$CrystIQ, diet_num[4:length(diet_num)],
use = "complete.obs")
cryst_cor <-
cryst_cor |>
t() |> as.data.frame()
cryst_cor <-
cryst_cor |>
rename(CrystIQ = V1) |>
round(3) |> select(CrystIQ)
cryst_cor
Overall interpretation
Extremely weak correlations for all variables in Diet Data
Possible Causes
underlying relationship isnt linear
Missing values may be biasing variables
diet_cor_lst <-
list(
cryst_cor=cryst_cor,
fluid_cor=fluid_cor,
mem_cor=mem_cor
)
blood |>
group_by(patient) |>
select(patient, where(is.numeric)) |>
summarize(
across(everything(), ~ mean(.x, na.rm = TRUE))
)
Cognitive Featureshere we generate a report of correlations for each individual patient
Suppose we just average out their values across all
Study and event
diet_patients
cognitive_df <-
all |>
select(id_event, id, event, study,
Memory_Tr35_60y,
FluidIQ_Tr35_60y,
CrystIQ_Tr35_60y) |>
mutate(
patient = str_remove(id, "^[A-Z]+")
) |> select(patient, everything()) |>
mutate(
across(
-c(patient, id_event, id, event, study),
as.numeric
)
)
par(mfrow = c(1, 3))
hist(
cognitive_df$Memory_Tr35_60y,
main = "Memory",
xlab = ""
)
hist(
cognitive_df$FluidIQ_Tr35_60y,
main = "FluidIQ",
xlab = ""
)
hist(
cognitive_df$CrystIQ_Tr35_60y,
main = "CrystIQ",
xlab = ""
)
cognitive_df |>
select(where(is.numeric)) |>
summarize(
mean_Memory = mean(Memory_Tr35_60y, na.rm = TRUE),
sd_Memory = sd(Memory_Tr35_60y, na.rm = TRUE),
mean_FluidIQ_y = mean(FluidIQ_Tr35_60y, na.rm = TRUE),
sd_FluidIQ = sd(FluidIQ_Tr35_60y, na.rm = TRUE),
mean_CrystIQ = mean(CrystIQ_Tr35_60y, na.rm = TRUE),
sd_CrystIQ = sd(CrystIQ_Tr35_60y, na.rm = TRUE)
)