Import data
# Load the sample metadata, keep the first six columns and the subtypes of
# interest, then derive a readable progression label and a sample source.
metadata <-
  read_csv("metadata.csv") |>
  select(1:6) |>
  filter(
    subtype %in% c("HER2+", "luminal A", "luminal B", "TNBC", "healthy")
  ) |>
  mutate(
    # Spell out the progression codes; any other value is kept unchanged.
    # The explicit level order puts "healthy" first and "> 5 metastases" last.
    progression_verbose = fct_relevel(
      case_when(
        progression == "high MBC" ~ "> 5 metastases",
        progression == "OMBC" ~ "< 5 metastases",
        progression == "EBC" ~ "early 0 metastases",
        TRUE ~ progression
      ),
      c(
        "healthy", "early 0 metastases", "In remission",
        "local recurrance (no mets)", "< 5 metastases", "> 5 metastases"
      )
    ),
    # Label the sample source from patterns in `id1`; identifiers matching
    # none of the patterns get NA because there is no default branch.
    source = case_when(
      str_detect(id1, "^BCB") ~ "internal",
      str_detect(id1, "AH") ~ "?",
      str_detect(id1, "^CRP") ~ "victoria biobank?"
    )
  )
## Rows: 84 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): id1, id2, subtype, progression, is_treated, notes
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Define cohort across all subtypes
# Count the samples available for every subtype / progression / treatment /
# source combination, excluding local recurrences and early-stage disease.
full_cohort <-
  metadata |>
  # Filter
  filter(
    progression_verbose != "local recurrance (no mets)",
    progression_verbose != "early 0 metastases"
  ) |>
  droplevels() |>
  # Count: one row per combination, member samples nested in `data`
  nest(data = -c(subtype, progression_verbose, is_treated, source)) |>
  mutate(n = map_int(data, nrow)) |>
  # Complete only for cancer: add the missing progression x subtype pairs
  # with n = 0, then keep "healthy" paired only with "healthy"
  complete(progression_verbose, subtype, fill = list(n = 0)) |>
  filter(
    (subtype == "healthy" & progression_verbose == "healthy") |
      (subtype != "healthy" & progression_verbose != "healthy")
  ) |>
  # Count: total number of samples per progression category
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

full_cohort
## # A tibble: 13 × 7
## progression_verbose subtype is_treated source data n count_of_progre…
## <fct> <chr> <chr> <chr> <list> <int> <int>
## 1 healthy healthy untreated inter… <tibble> 1 1
## 2 In remission HER2+ treated victo… <tibble> 2 9
## 3 In remission lumina… treated victo… <tibble> 5 9
## 4 In remission lumina… treated victo… <tibble> 1 9
## 5 In remission TNBC treated inter… <tibble> 1 9
## 6 < 5 metastases HER2+ treated inter… <tibble> 3 10
## 7 < 5 metastases lumina… treated inter… <tibble> 3 10
## 8 < 5 metastases lumina… treated inter… <tibble> 4 10
## 9 < 5 metastases TNBC <NA> <NA> <NULL> 0 10
## 10 > 5 metastases HER2+ treated inter… <tibble> 9 26
## 11 > 5 metastases lumina… treated inter… <tibble> 9 26
## 12 > 5 metastases lumina… treated inter… <tibble> 5 26
## 13 > 5 metastases TNBC treated inter… <tibble> 3 26
Define cohort across ER+ subtypes
# Same counting as for the full cohort, restricted to the luminal subtypes
# plus the healthy controls.
er_positive_cohort <-
  metadata |>
  # Filter
  filter(
    str_detect(subtype, "luminal") | subtype == "healthy",
    progression_verbose != "local recurrance (no mets)",
    progression_verbose != "early 0 metastases"
  ) |>
  droplevels() |>
  # Count: one row per combination, member samples nested in `data`
  nest(data = -c(subtype, progression_verbose, is_treated, source)) |>
  mutate(n = map_int(data, nrow)) |>
  # Complete only for cancer: add the missing progression x subtype pairs
  # with n = 0, then keep "healthy" paired only with "healthy"
  complete(progression_verbose, subtype, fill = list(n = 0)) |>
  filter(
    (subtype == "healthy" & progression_verbose == "healthy") |
      (subtype != "healthy" & progression_verbose != "healthy")
  ) |>
  # Count: total number of samples per progression category
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

er_positive_cohort
## # A tibble: 7 × 7
## progression_verbose subtype is_treated source data n count_of_progre…
## <fct> <chr> <chr> <chr> <list> <int> <int>
## 1 healthy healthy untreated inter… <tibble> 1 1
## 2 In remission luminal… treated victo… <tibble> 5 6
## 3 In remission luminal… treated victo… <tibble> 1 6
## 4 < 5 metastases luminal… treated inter… <tibble> 3 7
## 5 < 5 metastases luminal… treated inter… <tibble> 4 7
## 6 > 5 metastases luminal… treated inter… <tibble> 9 14
## 7 > 5 metastases luminal… treated inter… <tibble> 5 14
Now, let’s suppose we want to select the first batch: 4 healthy and 34 unhealthy samples. This results in about 11.3 samples (34 / 3) for each of the three cancer progression categories.
For the whole cohort, we are limited by the in-remission and oligometastatic (< 5 metastases) groups, so we divide the cohort as 9 + 10 + 14.
# Select batch 1 from the full cohort by randomly subsampling the
# "> 5 metastases" combinations; all other combinations are kept whole.
# NOTE(review): no seed is set in this block, so the sampled rows may differ
# between runs unless a seed is set elsewhere.
full_cohort_batch_1 <-
  full_cohort |>
  mutate(data = pmap(
    list(data, n, progression_verbose),
    function(samples, n_samples, stage) {
      # The sizes are keyed on the exact counts of the two larger
      # "> 5 metastases" combinations (5 -> 3 and 9 -> 4)
      if (stage == "> 5 metastases" && n_samples == 5) {
        sample_n(samples, 3)
      } else if (stage == "> 5 metastases" && n_samples == 9) {
        sample_n(samples, 4)
      } else {
        samples
      }
    }
  )) |>
  # Remove rows containing missing values (e.g. combinations without samples)
  drop_na() |>
  # Refresh the counts after subsampling
  mutate(n = map_int(data, nrow)) |>
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

full_cohort_batch_1
## # A tibble: 12 × 7
## progression_verbose subtype is_treated source data n count_of_progre…
## <fct> <chr> <chr> <chr> <list> <int> <int>
## 1 healthy healthy untreated inter… <tibble> 1 1
## 2 In remission HER2+ treated victo… <tibble> 2 9
## 3 In remission lumina… treated victo… <tibble> 5 9
## 4 In remission lumina… treated victo… <tibble> 1 9
## 5 In remission TNBC treated inter… <tibble> 1 9
## 6 < 5 metastases HER2+ treated inter… <tibble> 3 10
## 7 < 5 metastases lumina… treated inter… <tibble> 3 10
## 8 < 5 metastases lumina… treated inter… <tibble> 4 10
## 9 > 5 metastases HER2+ treated inter… <tibble> 4 14
## 10 > 5 metastases lumina… treated inter… <tibble> 4 14
## 11 > 5 metastases lumina… treated inter… <tibble> 3 14
## 12 > 5 metastases TNBC treated inter… <tibble> 3 14
How many samples support each category combination
# Histogram of the per-combination sample counts (`n`) in the full-cohort batch
ggplot(full_cohort_batch_1, aes(x = n)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The ER+ cohort
# Select batch 1 from the ER+ cohort by randomly subsampling the
# "> 5 metastases" combinations; all other combinations are kept whole.
# NOTE(review): no seed is set in this block, so the sampled rows may differ
# between runs unless a seed is set elsewhere.
er_positive_cohort_batch_1 <-
  er_positive_cohort |>
  mutate(data = pmap(
    list(data, n, progression_verbose),
    function(samples, n_samples, stage) {
      # The sizes are keyed on the exact counts of the two
      # "> 5 metastases" combinations (5 -> 3 and 9 -> 4)
      if (stage == "> 5 metastases" && n_samples == 5) {
        sample_n(samples, 3)
      } else if (stage == "> 5 metastases" && n_samples == 9) {
        sample_n(samples, 4)
      } else {
        samples
      }
    }
  )) |>
  # Remove rows containing missing values
  drop_na() |>
  # Refresh the counts after subsampling
  mutate(n = map_int(data, nrow)) |>
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

er_positive_cohort_batch_1
## # A tibble: 7 × 7
## progression_verbose subtype is_treated source data n count_of_progre…
## <fct> <chr> <chr> <chr> <list> <int> <int>
## 1 healthy healthy untreated inter… <tibble> 1 1
## 2 In remission luminal… treated victo… <tibble> 5 6
## 3 In remission luminal… treated victo… <tibble> 1 6
## 4 < 5 metastases luminal… treated inter… <tibble> 3 7
## 5 < 5 metastases luminal… treated inter… <tibble> 4 7
## 6 > 5 metastases luminal… treated inter… <tibble> 4 7
## 7 > 5 metastases luminal… treated inter… <tibble> 3 7
How many samples support each category combination
# Histogram of the per-combination sample counts (`n`) in the ER+ batch
ggplot(er_positive_cohort_batch_1, aes(x = n)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Define cohort across all subtypes
# Redefine the full cohort so that early-stage samples are included, but only
# when untreated; local recurrences are still excluded.
full_cohort <-
  metadata |>
  # Filter
  filter(
    progression_verbose != "local recurrance (no mets)",
    (progression_verbose == "early 0 metastases" & is_treated == "untreated") |
      progression_verbose != "early 0 metastases"
  ) |>
  droplevels() |>
  # Count: one row per combination, member samples nested in `data`
  nest(data = -c(subtype, progression_verbose, is_treated, source)) |>
  mutate(n = map_int(data, nrow)) |>
  # Complete only for cancer: add the missing progression x subtype pairs
  # with n = 0, then keep "healthy" paired only with "healthy"
  complete(progression_verbose, subtype, fill = list(n = 0)) |>
  filter(
    (subtype == "healthy" & progression_verbose == "healthy") |
      (subtype != "healthy" & progression_verbose != "healthy")
  ) |>
  # Count: total number of samples per progression category
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

full_cohort
## # A tibble: 19 × 7
## progression_verbose subtype is_treated source data n count_of_progre…
## <fct> <chr> <chr> <chr> <list> <int> <int>
## 1 healthy healthy untreated inter… <tibble> 1 1
## 2 early 0 metastases HER2+ untreated inter… <tibble> 2 14
## 3 early 0 metastases HER2+ untreated ? <tibble> 1 14
## 4 early 0 metastases lumina… untreated inter… <tibble> 3 14
## 5 early 0 metastases lumina… untreated ? <tibble> 5 14
## 6 early 0 metastases lumina… untreated ? <tibble> 2 14
## 7 early 0 metastases TNBC untreated ? <tibble> 1 14
## 8 In remission HER2+ treated victo… <tibble> 2 9
## 9 In remission lumina… treated victo… <tibble> 5 9
## 10 In remission lumina… treated victo… <tibble> 1 9
## 11 In remission TNBC treated inter… <tibble> 1 9
## 12 < 5 metastases HER2+ treated inter… <tibble> 3 10
## 13 < 5 metastases lumina… treated inter… <tibble> 3 10
## 14 < 5 metastases lumina… treated inter… <tibble> 4 10
## 15 < 5 metastases TNBC <NA> <NA> <NULL> 0 10
## 16 > 5 metastases HER2+ treated inter… <tibble> 9 26
## 17 > 5 metastases lumina… treated inter… <tibble> 9 26
## 18 > 5 metastases lumina… treated inter… <tibble> 5 26
## 19 > 5 metastases TNBC treated inter… <tibble> 3 26
Define cohort across ER+ subtypes
# Redefine the ER+ cohort (luminal subtypes plus healthy controls) so that
# early-stage samples are included, but only when untreated.
er_positive_cohort <-
  metadata |>
  # Filter
  filter(
    str_detect(subtype, "luminal") | subtype == "healthy",
    (progression_verbose == "early 0 metastases" & is_treated == "untreated") |
      progression_verbose != "early 0 metastases",
    progression_verbose != "local recurrance (no mets)"
  ) |>
  droplevels() |>
  # Count: one row per combination, member samples nested in `data`
  nest(data = -c(subtype, progression_verbose, is_treated, source)) |>
  mutate(n = map_int(data, nrow)) |>
  # Complete only for cancer: add the missing progression x subtype pairs
  # with n = 0, then keep "healthy" paired only with "healthy"
  complete(progression_verbose, subtype, fill = list(n = 0)) |>
  filter(
    (subtype == "healthy" & progression_verbose == "healthy") |
      (subtype != "healthy" & progression_verbose != "healthy")
  ) |>
  # Count: total number of samples per progression category
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

er_positive_cohort
## # A tibble: 10 × 7
## progression_verbose subtype is_treated source data n count_of_progre…
## <fct> <chr> <chr> <chr> <list> <int> <int>
## 1 healthy healthy untreated inter… <tibble> 1 1
## 2 early 0 metastases lumina… untreated inter… <tibble> 3 10
## 3 early 0 metastases lumina… untreated ? <tibble> 5 10
## 4 early 0 metastases lumina… untreated ? <tibble> 2 10
## 5 In remission lumina… treated victo… <tibble> 5 6
## 6 In remission lumina… treated victo… <tibble> 1 6
## 7 < 5 metastases lumina… treated inter… <tibble> 3 7
## 8 < 5 metastases lumina… treated inter… <tibble> 4 7
## 9 > 5 metastases lumina… treated inter… <tibble> 9 14
## 10 > 5 metastases lumina… treated inter… <tibble> 5 14
Now, let’s suppose we want to select the first batch: 4 healthy and 34 unhealthy samples. This results in about 11.3 samples for each of the three cancer progression categories.
For the whole cohort, we are limited by the in-remission and oligometastatic (< 5 metastases) groups, so we divide the cohort as 9 + 10 + 14.
# Select batch 1 from the full cohort by random subsampling: at most 2 samples
# per combination for "> 5 metastases" and at most 3 per combination for every
# other progression category.
# NOTE(review): no seed is set in this block, so the sampled rows may differ
# between runs unless a seed is set elsewhere.
full_cohort_batch_1 <-
  full_cohort |>
  # Remove rows containing missing values before sampling, so that every
  # remaining `data` entry is a tibble that can be sampled from
  drop_na() |>
  mutate(data = map2(
    data, progression_verbose,
    function(samples, stage) {
      max_size <- if (stage == "> 5 metastases") 2 else 3
      # Cap the size at the rows available: sample_n() without replacement
      # errors when asked for more rows than the group contains. The original
      # code applied this cap only to the non-"> 5 metastases" branch.
      sample_n(samples, min(max_size, nrow(samples)))
    }
  )) |>
  # Refresh the counts after subsampling
  mutate(n = map_int(data, nrow)) |>
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

full_cohort_batch_1
## # A tibble: 18 × 7
## progression_verbose subtype is_treated source data n count_of_progre…
## <fct> <chr> <chr> <chr> <list> <int> <int>
## 1 healthy healthy untreated inter… <tibble> 1 1
## 2 early 0 metastases HER2+ untreated inter… <tibble> 2 12
## 3 early 0 metastases HER2+ untreated ? <tibble> 1 12
## 4 early 0 metastases lumina… untreated inter… <tibble> 3 12
## 5 early 0 metastases lumina… untreated ? <tibble> 3 12
## 6 early 0 metastases lumina… untreated ? <tibble> 2 12
## 7 early 0 metastases TNBC untreated ? <tibble> 1 12
## 8 In remission HER2+ treated victo… <tibble> 2 7
## 9 In remission lumina… treated victo… <tibble> 3 7
## 10 In remission lumina… treated victo… <tibble> 1 7
## 11 In remission TNBC treated inter… <tibble> 1 7
## 12 < 5 metastases HER2+ treated inter… <tibble> 3 9
## 13 < 5 metastases lumina… treated inter… <tibble> 3 9
## 14 < 5 metastases lumina… treated inter… <tibble> 3 9
## 15 > 5 metastases HER2+ treated inter… <tibble> 2 8
## 16 > 5 metastases lumina… treated inter… <tibble> 2 8
## 17 > 5 metastases lumina… treated inter… <tibble> 2 8
## 18 > 5 metastases TNBC treated inter… <tibble> 2 8
How many samples support each category combination
# Histogram of the per-combination sample counts (`n`) in the full-cohort batch
ggplot(full_cohort_batch_1, aes(x = n)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Select batch 1 from the ER+ cohort by randomly subsampling the
# "> 5 metastases" combinations; all other combinations (including the
# early-stage ones) are kept whole.
# NOTE(review): no seed is set in this block, so the sampled rows may differ
# between runs unless a seed is set elsewhere.
er_positive_cohort_batch_1 <-
  er_positive_cohort |>
  mutate(data = pmap(
    list(data, n, progression_verbose),
    function(samples, n_samples, stage) {
      # The sizes are keyed on the exact counts of the two
      # "> 5 metastases" combinations (5 -> 3 and 9 -> 4)
      if (stage == "> 5 metastases" && n_samples == 5) {
        sample_n(samples, 3)
      } else if (stage == "> 5 metastases" && n_samples == 9) {
        sample_n(samples, 4)
      } else {
        samples
      }
    }
  )) |>
  # Remove rows containing missing values
  drop_na() |>
  # Refresh the counts after subsampling
  mutate(n = map_int(data, nrow)) |>
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

er_positive_cohort_batch_1
## # A tibble: 10 × 7
## progression_verbose subtype is_treated source data n count_of_progre…
## <fct> <chr> <chr> <chr> <list> <int> <int>
## 1 healthy healthy untreated inter… <tibble> 1 1
## 2 early 0 metastases lumina… untreated inter… <tibble> 3 10
## 3 early 0 metastases lumina… untreated ? <tibble> 5 10
## 4 early 0 metastases lumina… untreated ? <tibble> 2 10
## 5 In remission lumina… treated victo… <tibble> 5 6
## 6 In remission lumina… treated victo… <tibble> 1 6
## 7 < 5 metastases lumina… treated inter… <tibble> 3 7
## 8 < 5 metastases lumina… treated inter… <tibble> 4 7
## 9 > 5 metastases lumina… treated inter… <tibble> 4 7
## 10 > 5 metastases lumina… treated inter… <tibble> 3 7
How many samples support each category combination
# Histogram of the per-combination sample counts (`n`) in the ER+ batch
ggplot(er_positive_cohort_batch_1, aes(x = n)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.