Import data

# Load the sample sheet and derive two helper columns:
#   * progression_verbose: readable progression label, stored as a factor
#   * source: label assigned from the pattern found in id1
metadata <-
  read_csv("metadata.csv") |>
  # Keep the first six columns and only the subtypes listed here
  select(1:6) |>
  filter(subtype %in% c("HER2+", "luminal A", "luminal B", "TNBC", "healthy")) |>
  mutate(
    # Spell out the abbreviated progression codes; any other value is kept as is
    progression_verbose = case_when(
      progression == "high MBC" ~ "> 5 metastases",
      progression == "OMBC" ~ "< 5 metastases",
      progression == "EBC" ~ "early 0 metastases",
      TRUE ~ progression
    ),
    # Put the listed levels first, in this order
    progression_verbose = fct_relevel(
      progression_verbose,
      c("healthy", "early 0 metastases", "In remission", "local recurrance (no mets)", "< 5 metastases", "> 5 metastases")
    ),
    # No fallback branch: ids matching none of the patterns get NA
    source = case_when(
      str_detect(id1, "^BCB") ~ "internal",
      str_detect(id1, "AH") ~ "?",
      str_detect(id1, "^CRP") ~ "victoria biobank?"
    )
  )
## Rows: 84 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): id1, id2, subtype, progression, is_treated, notes
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Initially excluding early as their potential is not expressed

Define cohort across all subtypes

# Cohort over all subtypes, without local recurrences and early cases.
# One row per subtype / progression / treatment / source combination.
full_cohort <-
  metadata |>
  # Drop the two progression categories that are out of scope here
  filter(
    progression_verbose != "local recurrance (no mets)",
    progression_verbose != "early 0 metastases"
  ) |>
  droplevels() |>
  # Pack the samples of each combination into `data` and count them
  nest(data = -c(subtype, progression_verbose, is_treated, source)) |>
  mutate(n = map_int(data, nrow)) |>
  # Add zero-count rows for unobserved progression x subtype pairs, then keep
  # only the pairs where either both sides or neither side is "healthy"
  complete(progression_verbose, subtype, fill = list(n = 0)) |>
  filter((subtype == "healthy") == (progression_verbose == "healthy")) |>
  # Total number of samples per progression category
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

full_cohort
## # A tibble: 13 × 7
##    progression_verbose subtype is_treated source data         n count_of_progre…
##    <fct>               <chr>   <chr>      <chr>  <list>   <int>            <int>
##  1 healthy             healthy untreated  inter… <tibble>     1                1
##  2 In remission        HER2+   treated    victo… <tibble>     2                9
##  3 In remission        lumina… treated    victo… <tibble>     5                9
##  4 In remission        lumina… treated    victo… <tibble>     1                9
##  5 In remission        TNBC    treated    inter… <tibble>     1                9
##  6 < 5 metastases      HER2+   treated    inter… <tibble>     3               10
##  7 < 5 metastases      lumina… treated    inter… <tibble>     3               10
##  8 < 5 metastases      lumina… treated    inter… <tibble>     4               10
##  9 < 5 metastases      TNBC    <NA>       <NA>   <NULL>       0               10
## 10 > 5 metastases      HER2+   treated    inter… <tibble>     9               26
## 11 > 5 metastases      lumina… treated    inter… <tibble>     9               26
## 12 > 5 metastases      lumina… treated    inter… <tibble>     5               26
## 13 > 5 metastases      TNBC    treated    inter… <tibble>     3               26

Define cohort across ER+ subtypes

# Same cohort definition, restricted to luminal subtypes plus healthy samples;
# local recurrences and early cases are left out.
er_positive_cohort <-
  metadata |>
  filter(
    # Subtypes whose name contains "luminal", plus the healthy samples
    str_detect(subtype, "luminal") | subtype == "healthy",
    progression_verbose != "local recurrance (no mets)",
    progression_verbose != "early 0 metastases"
  ) |>
  droplevels() |>
  # Pack the samples of each combination into `data` and count them
  nest(data = -c(subtype, progression_verbose, is_treated, source)) |>
  mutate(n = map_int(data, nrow)) |>
  # Add zero-count rows for unobserved progression x subtype pairs, then keep
  # only the pairs where either both sides or neither side is "healthy"
  complete(progression_verbose, subtype, fill = list(n = 0)) |>
  filter((subtype == "healthy") == (progression_verbose == "healthy")) |>
  # Total number of samples per progression category
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

er_positive_cohort
## # A tibble: 7 × 7
##   progression_verbose subtype  is_treated source data         n count_of_progre…
##   <fct>               <chr>    <chr>      <chr>  <list>   <int>            <int>
## 1 healthy             healthy  untreated  inter… <tibble>     1                1
## 2 In remission        luminal… treated    victo… <tibble>     5                6
## 3 In remission        luminal… treated    victo… <tibble>     1                6
## 4 < 5 metastases      luminal… treated    inter… <tibble>     3                7
## 5 < 5 metastases      luminal… treated    inter… <tibble>     4                7
## 6 > 5 metastases      luminal… treated    inter… <tibble>     9               14
## 7 > 5 metastases      luminal… treated    inter… <tibble>     5               14

Now, let’s suppose we want to select the first batch. 4 healthy and 34 unhealthy. This results in 11.3 samples for the three cancer progression categories

For the whole cohort, we are limited by the in-remission and oligometastatic groups, so we divide the cohort as 9 + 10 + 14

# First batch from the full cohort: only the "> 5 metastases" strata are
# down-sampled (5 samples -> 3, 9 samples -> 4); every other stratum is kept whole.
full_cohort_batch_1 <-
  full_cohort |>
  mutate(data = pmap(
    list(data, n, progression_verbose),
    function(samples, stratum_size, stage) {
      high_burden <- stage == "> 5 metastases"
      if (high_burden && stratum_size == 5) {
        sample_n(samples, 3)
      } else if (high_burden && stratum_size == 9) {
        sample_n(samples, 4)
      } else {
        samples
      }
    }
  )) |>
  # Remove rows holding any missing value (this includes the zero-count
  # combinations added by complete())
  drop_na() |>
  # Refresh the per-stratum and per-progression counts after sampling
  mutate(n = map_int(data, nrow)) |>
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

full_cohort_batch_1
## # A tibble: 12 × 7
##    progression_verbose subtype is_treated source data         n count_of_progre…
##    <fct>               <chr>   <chr>      <chr>  <list>   <int>            <int>
##  1 healthy             healthy untreated  inter… <tibble>     1                1
##  2 In remission        HER2+   treated    victo… <tibble>     2                9
##  3 In remission        lumina… treated    victo… <tibble>     5                9
##  4 In remission        lumina… treated    victo… <tibble>     1                9
##  5 In remission        TNBC    treated    inter… <tibble>     1                9
##  6 < 5 metastases      HER2+   treated    inter… <tibble>     3               10
##  7 < 5 metastases      lumina… treated    inter… <tibble>     3               10
##  8 < 5 metastases      lumina… treated    inter… <tibble>     4               10
##  9 > 5 metastases      HER2+   treated    inter… <tibble>     4               14
## 10 > 5 metastases      lumina… treated    inter… <tibble>     4               14
## 11 > 5 metastases      lumina… treated    inter… <tibble>     3               14
## 12 > 5 metastases      TNBC    treated    inter… <tibble>     3               14

How many samples support each category combination

# Histogram of the per-combination sample counts (column n)
ggplot(full_cohort_batch_1, aes(x = n)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The ER+ cohort

# First batch from the ER+ cohort: only the "> 5 metastases" strata are
# down-sampled (5 samples -> 3, 9 samples -> 4); every other stratum is kept whole.
er_positive_cohort_batch_1 <-
  er_positive_cohort |>
  mutate(data = pmap(
    list(data, n, progression_verbose),
    function(samples, stratum_size, stage) {
      high_burden <- stage == "> 5 metastases"
      if (high_burden && stratum_size == 5) {
        sample_n(samples, 3)
      } else if (high_burden && stratum_size == 9) {
        sample_n(samples, 4)
      } else {
        samples
      }
    }
  )) |>
  # Remove rows holding any missing value
  drop_na() |>
  # Refresh the per-stratum and per-progression counts after sampling
  mutate(n = map_int(data, nrow)) |>
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

er_positive_cohort_batch_1
## # A tibble: 7 × 7
##   progression_verbose subtype  is_treated source data         n count_of_progre…
##   <fct>               <chr>    <chr>      <chr>  <list>   <int>            <int>
## 1 healthy             healthy  untreated  inter… <tibble>     1                1
## 2 In remission        luminal… treated    victo… <tibble>     5                6
## 3 In remission        luminal… treated    victo… <tibble>     1                6
## 4 < 5 metastases      luminal… treated    inter… <tibble>     3                7
## 5 < 5 metastases      luminal… treated    inter… <tibble>     4                7
## 6 > 5 metastases      luminal… treated    inter… <tibble>     4                7
## 7 > 5 metastases      luminal… treated    inter… <tibble>     3                7

How many samples support each category combination

# Histogram of the per-combination sample counts (column n)
ggplot(er_positive_cohort_batch_1, aes(x = n)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Including early as their potential is not expressed

Define cohort across all subtypes

# Cohort over all subtypes, now keeping early cases when they are untreated.
# One row per subtype / progression / treatment / source combination.
full_cohort <-
  metadata |>
  filter(
    # Local recurrences stay out of scope
    progression_verbose != "local recurrance (no mets)",
    # Early cases are kept only if untreated; other progressions pass through
    progression_verbose != "early 0 metastases" | is_treated == "untreated"
  ) |>
  droplevels() |>
  # Pack the samples of each combination into `data` and count them
  nest(data = -c(subtype, progression_verbose, is_treated, source)) |>
  mutate(n = map_int(data, nrow)) |>
  # Add zero-count rows for unobserved progression x subtype pairs, then keep
  # only the pairs where either both sides or neither side is "healthy"
  complete(progression_verbose, subtype, fill = list(n = 0)) |>
  filter((subtype == "healthy") == (progression_verbose == "healthy")) |>
  # Total number of samples per progression category
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

full_cohort
## # A tibble: 19 × 7
##    progression_verbose subtype is_treated source data         n count_of_progre…
##    <fct>               <chr>   <chr>      <chr>  <list>   <int>            <int>
##  1 healthy             healthy untreated  inter… <tibble>     1                1
##  2 early 0 metastases  HER2+   untreated  inter… <tibble>     2               14
##  3 early 0 metastases  HER2+   untreated  ?      <tibble>     1               14
##  4 early 0 metastases  lumina… untreated  inter… <tibble>     3               14
##  5 early 0 metastases  lumina… untreated  ?      <tibble>     5               14
##  6 early 0 metastases  lumina… untreated  ?      <tibble>     2               14
##  7 early 0 metastases  TNBC    untreated  ?      <tibble>     1               14
##  8 In remission        HER2+   treated    victo… <tibble>     2                9
##  9 In remission        lumina… treated    victo… <tibble>     5                9
## 10 In remission        lumina… treated    victo… <tibble>     1                9
## 11 In remission        TNBC    treated    inter… <tibble>     1                9
## 12 < 5 metastases      HER2+   treated    inter… <tibble>     3               10
## 13 < 5 metastases      lumina… treated    inter… <tibble>     3               10
## 14 < 5 metastases      lumina… treated    inter… <tibble>     4               10
## 15 < 5 metastases      TNBC    <NA>       <NA>   <NULL>       0               10
## 16 > 5 metastases      HER2+   treated    inter… <tibble>     9               26
## 17 > 5 metastases      lumina… treated    inter… <tibble>     9               26
## 18 > 5 metastases      lumina… treated    inter… <tibble>     5               26
## 19 > 5 metastases      TNBC    treated    inter… <tibble>     3               26

Define cohort across ER+ subtypes

# ER+ cohort (luminal subtypes plus healthy), now keeping early cases when
# they are untreated.
er_positive_cohort <-
  metadata |>
  filter(
    # Subtypes whose name contains "luminal", plus the healthy samples
    str_detect(subtype, "luminal") | subtype == "healthy",
    # Early cases are kept only if untreated; other progressions pass through
    progression_verbose != "early 0 metastases" | is_treated == "untreated",
    # Local recurrences stay out of scope
    progression_verbose != "local recurrance (no mets)"
  ) |>
  droplevels() |>
  # Pack the samples of each combination into `data` and count them
  nest(data = -c(subtype, progression_verbose, is_treated, source)) |>
  mutate(n = map_int(data, nrow)) |>
  # Add zero-count rows for unobserved progression x subtype pairs, then keep
  # only the pairs where either both sides or neither side is "healthy"
  complete(progression_verbose, subtype, fill = list(n = 0)) |>
  filter((subtype == "healthy") == (progression_verbose == "healthy")) |>
  # Total number of samples per progression category
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

er_positive_cohort
## # A tibble: 10 × 7
##    progression_verbose subtype is_treated source data         n count_of_progre…
##    <fct>               <chr>   <chr>      <chr>  <list>   <int>            <int>
##  1 healthy             healthy untreated  inter… <tibble>     1                1
##  2 early 0 metastases  lumina… untreated  inter… <tibble>     3               10
##  3 early 0 metastases  lumina… untreated  ?      <tibble>     5               10
##  4 early 0 metastases  lumina… untreated  ?      <tibble>     2               10
##  5 In remission        lumina… treated    victo… <tibble>     5                6
##  6 In remission        lumina… treated    victo… <tibble>     1                6
##  7 < 5 metastases      lumina… treated    inter… <tibble>     3                7
##  8 < 5 metastases      lumina… treated    inter… <tibble>     4                7
##  9 > 5 metastases      lumina… treated    inter… <tibble>     9               14
## 10 > 5 metastases      lumina… treated    inter… <tibble>     5               14

Now, let’s suppose we want to select the first batch. 4 healthy and 34 unhealthy. This results in 11.3 samples for the three cancer progression categories

For the whole cohort, we are limited by the in-remission and oligometastatic groups, so we divide the cohort as 9 + 10 + 14

# First batch from the full cohort (early cases included): draw at most 2
# samples per stratum for "> 5 metastases" and at most 3 per stratum for every
# other progression category.
# NOTE(review): no seed is set in this chunk, so the drawn samples can differ
# between runs unless set.seed() is called elsewhere.
full_cohort_batch_1 <-
  full_cohort |>
  # Remove rows holding any missing value (this includes the zero-count
  # combinations added by complete(), whose `data` is empty)
  drop_na() |>
  mutate(data = map2(
    data, progression_verbose,
    function(samples, stage) {
      cap <- if (stage == "> 5 metastases") 2 else 3
      # Never request more rows than the stratum holds: sample_n() without
      # replacement errors when size exceeds the number of rows
      sample_n(samples, min(cap, nrow(samples)))
    }
  )) |>
  # Refresh the per-stratum and per-progression counts after sampling
  mutate(n = map_int(data, nrow)) |>
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

full_cohort_batch_1
## # A tibble: 18 × 7
##    progression_verbose subtype is_treated source data         n count_of_progre…
##    <fct>               <chr>   <chr>      <chr>  <list>   <int>            <int>
##  1 healthy             healthy untreated  inter… <tibble>     1                1
##  2 early 0 metastases  HER2+   untreated  inter… <tibble>     2               12
##  3 early 0 metastases  HER2+   untreated  ?      <tibble>     1               12
##  4 early 0 metastases  lumina… untreated  inter… <tibble>     3               12
##  5 early 0 metastases  lumina… untreated  ?      <tibble>     3               12
##  6 early 0 metastases  lumina… untreated  ?      <tibble>     2               12
##  7 early 0 metastases  TNBC    untreated  ?      <tibble>     1               12
##  8 In remission        HER2+   treated    victo… <tibble>     2                7
##  9 In remission        lumina… treated    victo… <tibble>     3                7
## 10 In remission        lumina… treated    victo… <tibble>     1                7
## 11 In remission        TNBC    treated    inter… <tibble>     1                7
## 12 < 5 metastases      HER2+   treated    inter… <tibble>     3                9
## 13 < 5 metastases      lumina… treated    inter… <tibble>     3                9
## 14 < 5 metastases      lumina… treated    inter… <tibble>     3                9
## 15 > 5 metastases      HER2+   treated    inter… <tibble>     2                8
## 16 > 5 metastases      lumina… treated    inter… <tibble>     2                8
## 17 > 5 metastases      lumina… treated    inter… <tibble>     2                8
## 18 > 5 metastases      TNBC    treated    inter… <tibble>     2                8

How many samples support each category combination

# Histogram of the per-combination sample counts (column n)
ggplot(full_cohort_batch_1, aes(x = n)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What about the ER+ cohort

# First batch from the ER+ cohort (early cases included): only the
# "> 5 metastases" strata are down-sampled (5 samples -> 3, 9 samples -> 4);
# every other stratum is kept whole.
er_positive_cohort_batch_1 <-
  er_positive_cohort |>
  mutate(data = pmap(
    list(data, n, progression_verbose),
    function(samples, stratum_size, stage) {
      high_burden <- stage == "> 5 metastases"
      if (high_burden && stratum_size == 5) {
        sample_n(samples, 3)
      } else if (high_burden && stratum_size == 9) {
        sample_n(samples, 4)
      } else {
        samples
      }
    }
  )) |>
  # Remove rows holding any missing value
  drop_na() |>
  # Refresh the per-stratum and per-progression counts after sampling
  mutate(n = map_int(data, nrow)) |>
  with_groups(progression_verbose, mutate, count_of_progression = sum(n))

er_positive_cohort_batch_1
## # A tibble: 10 × 7
##    progression_verbose subtype is_treated source data         n count_of_progre…
##    <fct>               <chr>   <chr>      <chr>  <list>   <int>            <int>
##  1 healthy             healthy untreated  inter… <tibble>     1                1
##  2 early 0 metastases  lumina… untreated  inter… <tibble>     3               10
##  3 early 0 metastases  lumina… untreated  ?      <tibble>     5               10
##  4 early 0 metastases  lumina… untreated  ?      <tibble>     2               10
##  5 In remission        lumina… treated    victo… <tibble>     5                6
##  6 In remission        lumina… treated    victo… <tibble>     1                6
##  7 < 5 metastases      lumina… treated    inter… <tibble>     3                7
##  8 < 5 metastases      lumina… treated    inter… <tibble>     4                7
##  9 > 5 metastases      lumina… treated    inter… <tibble>     4                7
## 10 > 5 metastases      lumina… treated    inter… <tibble>     3                7

How many samples support each category combination

# Histogram of the per-combination sample counts (column n)
ggplot(er_positive_cohort_batch_1, aes(x = n)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.