library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# One row per (study, id): the number of distinct id_event values (n_events)
# and the sorted, comma-separated event codes observed (events).
all_structure <- all |>
  group_by(study, id) |>
  summarise(
    n_events = n_distinct(id_event),
    events = toString(sort(unique(event))),
    .groups = "drop"
  )
# Largest event counts first.
arrange(all_structure, desc(n_events))
## # A tibble: 2,117 × 4
## study id n_events events
## <chr> <chr> <int> <chr>
## 1 HCA HCA6037457 6 CR, F1, F2, F3, V1, V2
## 2 HCA HCA6228767 6 CR, F1, F2, F3, V1, V2
## 3 HCA HCA6249977 6 CR, F1, F2, F3, V1, V2
## 4 HCA HCA6290368 6 CR, F1, F2, F3, V1, V2
## 5 HCA HCA6374576 6 CR, F1, F2, F3, V1, V2
## 6 HCA HCA6461066 6 CR, F1, F2, F3, V1, V2
## 7 HCA HCA6498190 6 CR, F1, F2, F3, V1, V2
## 8 HCA HCA6606571 6 CR, F1, F2, F3, V1, V2
## 9 HCA HCA6618679 6 CR, F1, F2, F3, V1, V2
## 10 HCA HCA6645985 6 CR, F1, F2, F3, V1, V2
## # ℹ 2,107 more rows
# Bar chart: how many rows of all_structure have 1, 2, ... events.
# One integer tick per possible event count, up to the observed maximum.
ggplot(all_structure, aes(x = n_events)) +
  geom_bar() +
  scale_x_continuous(breaks = seq_len(max(all_structure$n_events))) +
  xlab("Number of visits/events per participant") +
  ylab("Number of participants")
# Numeric summary of the same distribution.
summary(all_structure$n_events)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 3.054 4.000 6.000
# Same bar chart as above, drawn in one panel per study.
# The tick positions use the overall maximum so both panels share breaks.
ggplot(all_structure, aes(x = n_events)) +
  geom_bar() +
  facet_wrap(~study) +
  scale_x_continuous(breaks = seq_len(max(all_structure$n_events))) +
  xlab("Number of visits/events per participant") +
  ylab("Number of participants")
# Per study: how many distinct event codes exist and which ones they are.
all |>
  group_by(study) |>
  summarise(
    n_events = n_distinct(event),
    events = toString(sort(unique(event))),
    .groups = "drop"
  )
## # A tibble: 2 × 3
## study n_events events
## <chr> <int> <chr>
## 1 AABC 5 AF1, V1, V2, V3, V4
## 2 HCA 6 CR, F1, F2, F3, V1, V2
library(tidyr)
# Split the combined "variable name - description" column into two columns.
# extra = "merge" splits only as many times as there are target columns, so
# any further " - " stays inside `description`.
relevant_dict <- separate(
  relevant_dict,
  col = `variable name - description`,
  into = c("variable", "description"),
  sep = " - ",
  extra = "merge"
)
library(dplyr)
# Lookup table taken from `all`: the distinct (id_event, age_open) pairs.
age_df <- distinct(all, id_event, age_open)
# Attach age_open to blood and diet, matching on the visit key id_event.
# NOTE(review): if any id_event carries more than one age_open value, these
# joins will duplicate rows -- confirm id_event is unique in age_df.
blood <- left_join(blood, age_df, by = "id_event")
diet <- left_join(diet, age_df, by = "id_event")
# Which studies appear in blood?
unique(blood$study)
## [1] "HCA" "AABC"
HCA: focused on brain connectivity, cognition, and healthy aging (the earlier study).
AABC: focused on modifiable lifestyle factors (the later study).
So we order the studies chronologically: HCA < AABC.
# Encode study as an ordered factor (HCA < AABC) in both tables.
blood <- blood |>
  mutate(study = ordered(study, levels = c("HCA", "AABC")))
diet <- diet |>
  mutate(study = ordered(study, levels = c("HCA", "AABC")))
# Working copy of diet with numeric ages.
# age_open is stored as text because the oldest ages are top-coded as
# "90 or older"; those rows are excluded so the rest can be converted.
# filter() keeps only rows where the condition is TRUE, so a missing age_open
# is dropped instead of turning into an all-NA row (which is what logical `[`
# indexing does with an NA index).
d <- diet |>
  filter(age_open != "90 or older") |>
  mutate(age_open = as.numeric(age_open))
summary(d$age_open)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 36.00 50.00 62.00 62.31 75.00 89.00
# Distribution of the numeric ages.
hist(d$age_open)
# Per id: youngest and oldest recorded age, number of rows in d (n_visits),
# and the age span between them (diff). Widest spans first.
time_span <- d |>
  group_by(id) |>
  summarise(
    youngest_age = min(age_open, na.rm = TRUE),
    oldest_age = max(age_open, na.rm = TRUE),
    n_visits = n(),
    diff = oldest_age - youngest_age
  ) |>
  arrange(desc(diff))
time_span
## # A tibble: 1,350 × 5
## id youngest_age oldest_age n_visits diff
## <chr> <dbl> <dbl> <int> <dbl>
## 1 HCA6010538 64 72 8 8
## 2 HCA6042046 58 66 5 8
## 3 HCA6131449 66 74 7 8
## 4 HCA6228767 50 58 8 8
## 5 HCA6249977 66 74 9 8
## 6 HCA6367781 67 75 8 8
## 7 HCA6397992 65 73 8 8
## 8 HCA6399087 51 59 7 8
## 9 HCA6429272 59 67 8 8
## 10 HCA6461066 65 73 9 8
## # ℹ 1,340 more rows
# Same per-id age span table, recomputed and sorted by the number of rows per
# id (n_visits) instead of by the span.
d |>
  group_by(id) |>
  summarise(
    youngest_age = min(age_open, na.rm = TRUE),
    oldest_age = max(age_open, na.rm = TRUE),
    n_visits = n(),
    diff = oldest_age - youngest_age
  ) |>
  arrange(desc(n_visits))
## # A tibble: 1,350 × 5
## id youngest_age oldest_age n_visits diff
## <chr> <dbl> <dbl> <int> <dbl>
## 1 HCA6249977 66 74 9 8
## 2 HCA6290368 60 67 9 7
## 3 HCA6461066 65 73 9 8
## 4 HCA6645985 64 72 9 8
## 5 HCA6937998 63 70 9 7
## 6 HCA7530670 38 45 9 7
## 7 HCA7902782 58 65 9 7
## 8 HCA7987312 73 80 9 7
## 9 HCA8934497 60 66 9 6
## 10 HCA9217575 70 78 9 8
## # ℹ 1,340 more rows
# Scatter of rows per id against the age span covered.
plot(x = time_span$n_visits, y = time_span$diff)
# Age range and row count per study.
d |>
  group_by(study) |>
  summarise(
    youngest_age = min(age_open, na.rm = TRUE),
    oldest_age = max(age_open, na.rm = TRUE),
    n_visits = n()
  ) |>
  arrange(desc(oldest_age))
## # A tibble: 2 × 4
## study youngest_age oldest_age n_visits
## <ord> <dbl> <dbl> <int>
## 1 HCA 36 89 4631
## 2 AABC 36 89 1544
# Distinct participant ids seen in each study.
hca_ids <- all |>
  filter(study == "HCA") |>
  distinct(id)
aabc_ids <- all |>
  filter(study == "AABC") |>
  distinct(id)
# Denominator for the shares below: the number of distinct ids in `all`,
# derived from the data so it cannot drift out of sync with it.
n_participants <- all |> distinct(id) |> nrow()
# Share of participants seen only in HCA.
(nrow(anti_join(hca_ids, aabc_ids, by = "id")) / n_participants) |> round(2)
## [1] 0.34
# Share of participants seen only in AABC.
(nrow(anti_join(aabc_ids, hca_ids, by = "id")) / n_participants) |> round(2)
## [1] 0.14
# Share of participants seen in both studies.
(nrow(inner_join(hca_ids, aabc_ids, by = "id")) / n_participants) |> round(2)
## [1] 0.52
n_participants
## [1] 1396
# Distinct (study, event) combinations present in diet, grouped by study.
arrange(distinct(diet, study, event), study)
## # A tibble: 11 × 2
## study event
## <ord> <chr>
## 1 HCA V1
## 2 HCA F1
## 3 HCA V2
## 4 HCA F2
## 5 HCA CR
## 6 HCA F3
## 7 AABC V3
## 8 AABC V2
## 9 AABC AF1
## 10 AABC V4
## 11 AABC V1
# Look up the dictionary description of the `event` variable.
relevant_dict |>
  filter(variable == "event") |>
  select(description)
## # A tibble: 1 × 1
## description
## <chr>
## 1 Study-specific visit/event short name (V1, In-person visit 1 | V2, In-person …
# Event codes present in blood, in order of first appearance.
unique(blood$event)
## [1] "V1" "F1" "V2" "F2" "CR" "V3" "F3" "AF1" "V4"
Event descriptions (from the data dictionary):
| Study-specific visit/event short name (V1, In-person visit 1 | V2, In-person visit 2 | V3, In-person visit 3 | V4, In-person visit 4 | F1, First followup survey one year after first in-person visit in HCA | F2, Second followup survey in HCA | F3, Third followup survey in HCA | CR, Surveys collected remotely during Covid lock-down without regard to visit timing | AF1, First followup survey one year after first in-person visit in AABC) |
# Number of distinct ids observed at each (study, event) combination.
tbl <- diet |>
  distinct(study, event, id) |>
  count(study, event, name = "n_ids") |>
  arrange(study, event)
# pct is n_ids divided by the number of rows in diet, rounded to 2 decimals;
# it is a proportion between 0 and 1, not a value on the 0-100 scale.
mutate(tbl, pct = round(n_ids / nrow(diet), 2))
## # A tibble: 11 × 4
## study event n_ids pct
## <ord> <chr> <int> <dbl>
## 1 HCA CR 491 0.08
## 2 HCA F1 1136 0.18
## 3 HCA F2 843 0.13
## 4 HCA F3 550 0.09
## 5 HCA V1 1198 0.19
## 6 HCA V2 609 0.09
## 7 AABC AF1 567 0.09
## 8 AABC V1 198 0.03
## 9 AABC V2 306 0.05
## 10 AABC V3 471 0.07
## 11 AABC V4 96 0.01
# Level order used when event is turned into an ordered factor.
# NOTE(review): V1 and V2 occur in both HCA and AABC, so one global order
# treats them as the same level in both studies -- confirm this is intended.
event_levels <- c("V1", "F1", "V2", "F2", "CR", "F3", "V3", "AF1", "V4")
# Encode event as an ordered factor in both tables, using event_levels as the
# level order.
blood <- blood |>
  mutate(event = ordered(event, levels = event_levels))
diet <- diet |>
  mutate(event = ordered(event, levels = event_levels))
# Column names of each table, excluding the first column.
col_diet <- names(diet)[-1]
col_blood <- names(blood)[-1]
library(dplyr)
library(tibble)
# Per-column missingness in diet: NA count and NA percentage, with the most
# incomplete columns first.
diet_missing <- arrange(
  tibble(
    variable = names(diet),
    n_missing = colSums(is.na(diet)),
    pct_missing = 100 * colMeans(is.na(diet))
  ),
  desc(n_missing)
)
diet_missing
## # A tibble: 110 × 3
## variable n_missing pct_missing
## <chr> <dbl> <dbl>
## 1 asa24_numfoods 5468 84.6
## 2 asa24_numcodes 5468 84.6
## 3 asa24_kcal 5468 84.6
## 4 asa24_prot 5468 84.6
## 5 asa24_tfat 5468 84.6
## 6 asa24_carb 5468 84.6
## 7 asa24_mois 5468 84.6
## 8 asa24_alc 5468 84.6
## 9 asa24_caff 5468 84.6
## 10 asa24_theo 5468 84.6
## # ℹ 100 more rows
# Per-column missingness in blood: NA count and NA percentage, with the most
# incomplete columns first.
blood_missing <- arrange(
  tibble(
    variable = names(blood),
    n_missing = colSums(is.na(blood)),
    pct_missing = 100 * colMeans(is.na(blood))
  ),
  desc(n_missing)
)
blood_missing
## # A tibble: 82 × 3
## variable n_missing pct_missing
## <chr> <dbl> <dbl>
## 1 direct_ldl 6465 100
## 2 aldosterone 6236 96.5
## 3 IL13 6019 93.1
## 4 IL2 6017 93.1
## 5 IL17A 6010 93.0
## 6 CCL8 5995 92.7
## 7 TNFSF10 5995 92.7
## 8 IL4 5983 92.5
## 9 IL33 5968 92.3
## 10 CCL13 5959 92.2
## # ℹ 72 more rows
# Distribution of per-column missingness percentages: blood ("B") ...
print("B")
## [1] "B"
summary(blood_missing$pct_missing)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 60.22 92.11 76.29 92.11 100.00
sd(blood_missing$pct_missing)
## [1] 24.41539
# ... and diet ("D").
print("D")
## [1] "D"
summary(diet_missing$pct_missing)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 84.58 84.58 80.73 84.58 84.58
sd(diet_missing$pct_missing)
## [1] 17.69824
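Both tables have a large share of missing values.
The median per-variable missingness is about 92% for blood and 85% for diet.
However, missingness in diet is constant (84.58% for every variable other than the five fully observed key columns), while in blood it comes in blocks of variables that share the same missingness rate.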
# Blood columns from least to most missing.
arrange(blood_missing, pct_missing)
## # A tibble: 82 × 3
## variable n_missing pct_missing
## <chr> <dbl> <dbl>
## 1 id_event 0 0
## 2 id 0 0
## 3 event 0 0
## 4 study 0 0
## 5 age_open 0 0
## 6 insulin 3891 60.2
## 7 albumin 3892 60.2
## 8 chloride 3892 60.2
## 9 co2content 3892 60.2
## 10 creatinine 3892 60.2
## # ℹ 72 more rows
# Diet columns from least to most missing.
arrange(diet_missing, pct_missing)
## # A tibble: 110 × 3
## variable n_missing pct_missing
## <chr> <dbl> <dbl>
## 1 id_event 0 0
## 2 id 0 0
## 3 event 0 0
## 4 study 0 0
## 5 age_open 0 0
## 6 asa24_numfoods 5468 84.6
## 7 asa24_numcodes 5468 84.6
## 8 asa24_kcal 5468 84.6
## 9 asa24_prot 5468 84.6
## 10 asa24_tfat 5468 84.6
## # ℹ 100 more rows
# Print the blood table to inspect its columns and where the NA values fall.
blood
## # A tibble: 6,465 × 82
## id_event id event study hba1c hscrp insulin vitamind albumin alkphos_total
## <chr> <chr> <ord> <ord> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 HCA6000… HCA6… V1 HCA NA NA NA NA NA NA
## 2 HCA6002… HCA6… V1 HCA 5.1 2.86 19.3 35.3 4 57
## 3 HCA6002… HCA6… F1 HCA NA NA NA NA NA NA
## 4 HCA6002… HCA6… V2 HCA 5.2 8.03 18.1 62.7 4.2 58
## 5 HCA6002… HCA6… F2 HCA NA NA NA NA NA NA
## 6 HCA6002… HCA6… CR HCA NA NA NA NA NA NA
## 7 HCA6002… HCA6… V3 AABC 4.8 3 16.7 154. 4.2 69
## 8 HCA6003… HCA6… V1 HCA 5.9 1.82 29.5 25.1 4.2 55
## 9 HCA6003… HCA6… F1 HCA NA NA NA NA NA NA
## 10 HCA6003… HCA6… F2 HCA NA NA NA NA NA NA
## # ℹ 6,455 more rows
## # ℹ 72 more variables: alt_sgpt <dbl>, ast_sgot <dbl>, calcium <dbl>,
## # chloride <dbl>, co2content <dbl>, creatinine <dbl>, glucose <dbl>,
## # potassium <dbl>, sodium <dbl>, totalbilirubin <dbl>, totalprotein <dbl>,
## # ureanitrogen <dbl>, friedewald_ldl <dbl>, hdl <dbl>, cholesterol <dbl>,
## # triglyceride <dbl>, direct_ldl <lgl>, estradiol <dbl>, testosterone <dbl>,
## # lh <dbl>, fsh <dbl>, aldosterone <dbl>, dheas <dbl>, cortisol <dbl>, …
# Rows of diet whose age is top-coded as "90 or older".
# filter() keeps only rows where the comparison is TRUE, so a missing age_open
# cannot appear as an all-NA row the way it does with logical `[` indexing.
filter(diet, age_open == "90 or older")
## # A tibble: 290 × 110
## id_event id event study asa24_numfoods asa24_numcodes asa24_kcal
## <chr> <chr> <ord> <ord> <dbl> <dbl> <dbl>
## 1 HCA6012744_V1 HCA60127… V1 HCA NA NA NA
## 2 HCA6012744_F1 HCA60127… F1 HCA NA NA NA
## 3 HCA6062456_V2 HCA60624… V2 HCA NA NA NA
## 4 HCA6062456_CR HCA60624… CR HCA NA NA NA
## 5 HCA6062456_V3 HCA60624… V3 AABC 21 21 2061.
## 6 HCA6062456_AF1 HCA60624… AF1 AABC NA NA NA
## 7 HCA6068670_V2 HCA60686… V2 HCA NA NA NA
## 8 HCA6068670_CR HCA60686… CR HCA NA NA NA
## 9 HCA6068670_V3 HCA60686… V3 AABC 13 13 2207.
## 10 HCA6102038_F1 HCA61020… F1 HCA NA NA NA
## # ℹ 280 more rows
## # ℹ 103 more variables: asa24_prot <dbl>, asa24_tfat <dbl>, asa24_carb <dbl>,
## # asa24_mois <dbl>, asa24_alc <dbl>, asa24_caff <dbl>, asa24_theo <dbl>,
## # asa24_sugr <dbl>, asa24_fibe <dbl>, asa24_calc <dbl>, asa24_iron <dbl>,
## # asa24_magn <dbl>, asa24_phos <dbl>, asa24_pota <dbl>, asa24_sodi <dbl>,
## # asa24_zinc <dbl>, asa24_copp <dbl>, asa24_sele <dbl>, asa24_vc <dbl>,
## # asa24_vb1 <dbl>, asa24_vb2 <dbl>, asa24_niac <dbl>, asa24_vb6 <dbl>, …
# Percentage of diet rows whose age is top-coded as "90 or older".
# Counts come from the data rather than typed-in numbers, and the proportion
# is scaled by 100 so the printed value agrees with the "%" suffix.
pct_top_coded <-
  100 * sum(diet$age_open == "90 or older", na.rm = TRUE) / nrow(diet)
paste0(round(pct_top_coded, 2), "%", " old mfs")
## [1] "4.49% old mfs"