Original list of SNPs

snps_orig <- read_tsv(file = "APM_CT2_pilot_SNP_assay_custom_plate_information.txt")
snps_orig %>%
  mutate(n = row_number()) %>%
  relocate(n)

Annotated SNPs list

dbSNP (version 151) information was pulled from UCSC genome browser using the Table Browser tool: https://genome.ucsc.edu/cgi-bin/hgTables.

snps <- read_tsv(file = "149SNPs_dbSNP.txt")
snps <- snps %>%
  filter(!grepl("alt", chrom)) %>%
  rename(rsID = name) %>%
  arrange(as.integer(substr(chrom, 4, 6))) %>%
  dplyr::select(c("rsID", "chrom", "chromStart", "chromEnd", "refNCBI", "observed", "class", "func", "alleles", "alleleNs", "alleleFreqs"))
snps <- snps_orig %>%
  inner_join(snps, by = "rsID") %>%
  mutate(n = row_number()) %>%
  relocate(n)
write_tsv(snps,"snps_annotated.tsv")
save(snps, file = 'snps.RData')
snps

Genotype for 48 samples

genotypes <- read_tsv(file = "48samples_149SNPs_genotypes_by_rsID.txt") %>%
  mutate(
    rsID = if_else(rsID == ".", "rs28691231", rsID),
    GT = gsub("|", "/", GT, fixed = T)
  )
genotypes_ann <- genotypes %>%
  inner_join(snps, by = "rsID") %>%
  rename(chrom = chrom.x, chromEnd = chromEnd.x, chromStart.y = chromStart) %>%
  relocate(n) %>%
  dplyr::select(-chrom.y) %>%
  relocate(chromStart.y, chromEnd.y, .after = chromEnd) %>%
  arrange(n, sample)
write_tsv(genotypes_ann %>% select (sample, AssayID, GT), "genotypes_WGS.tsv")
genotypes_ann

SNPs which are not present in any of the 48 samples

# setdiff(unique(snps$n), unique(genotypes_ann$n))
# setdiff(unique(snps$rsID), unique(genotypes_ann$rsID))
# setdiff(unique(genotypes_ann$rsID), unique(snps$rsID))
# keys0 <- snps %>% mutate(key=paste(chrom, as.character(chromEnd))) %>% select(key) %>% unique() %>% unlist()
# keys1 <- genotypes %>% mutate(key=paste(chrom, as.character(chromEnd))) %>% select(key) %>% unique() %>% unlist()
# keys2 <- genotypes_ann %>% mutate(key=paste(chrom, as.character(chromEnd))) %>% select(key) %>% unique() %>% unlist()
keys0 <- snps %>%
  dplyr::select(rsID) %>%
  unique() %>%
  unlist()
keys1 <- genotypes %>%
  dplyr::select(rsID) %>%
  unique() %>%
  unlist()
# keys2 <- genotypes_ann %>% select(rsID) %>% unique() %>% unlist()
# setdiff(keys0, keys1)
# setdiff(keys1, keys0)
# setdiff(keys0, keys2)
# setdiff(keys2, keys0)
# setdiff(keys1, keys2)
# setdiff(keys2, keys1)
missing_rsIDs <- setdiff(keys0, keys1)
snps %>%
  filter(rsID %in% missing_rsIDs)

Special cases

  • SNP rs72846765 occurs just as homozygous reference (0/0) and homozygous alt (1/1) - this reduces the maximum number of SNPs that occur as hets to 139
  • SNPs rs112957100 and rs75214905 occur as 2 different allele hets - for the purpose of this analysis both alleles will be considered as hets (but they are not guaranteed to be both in the test set)
a <- genotypes_ann %>%
  dplyr::select(rsID) %>%
  unique() %>%
  unlist()
b <- genotypes_ann %>%
  group_by(rsID) %>%
  dplyr::select("GT") %>%
  count(GT) %>%
  filter(GT %in% c("0/1", "0/2")) %>%
  dplyr::select(rsID) %>%
  unique() %>%
  unlist()
setdiff(a, b)
## [1] "rs72846765"
genotypes_ann %>%
  filter(rsID == "rs72846765") %>%
  dplyr::select("GT") %>%
  count(GT)
genotypes_ann %>%
  filter(rsID == "rs112957100") %>%
  dplyr::select("GT") %>%
  count(GT)
genotypes_ann %>%
  filter(rsID == "rs75214905") %>%
  dplyr::select("GT") %>%
  count(GT)
missing_rsIDs <- c(missing_rsIDs, setdiff(a, b))

The matrix

  • Each row represents one of the 139 SNPs that have at least one Het call
  • Each column represents a sample
  • Blue means homozygous reference or homozygous alt, light green means heterozygous alt
  • The samples (columns) are hierarchically clustered, according to how similar they are in their het alt composition
  • The total number of hets across the samples for each SNP is shown on the right as bars
m <- genotypes %>%
  dplyr::select(c("rsID", "sample", "GT")) %>%
  filter(!rsID %in% missing_rsIDs) %>%
  mutate(GT = if_else((GT %in% c("0/0", "1/1")) |
    (rsID == "rs112957100" & GT == "0/1") |
    (rsID == "rs75214905" & GT == "0/2"), 0, 1)) %>%
  spread(sample, GT) %>%
  column_to_rownames("rsID")
m1 <- m %>%
  rowwise() %>%
  mutate(total = sum(c_across(where(is.numeric))))
superheat(m,
  pretty.order.rows = T,
  pretty.order.cols = T,
  # row.dendrogram = T,
  col.dendrogram = T,
  bottom.label.text.angle = 90,
  bottom.label.text.alignment = "right",
  bottom.label.size = 0.3,
  bottom.label.text.size = 4,
  yr = m1$total,
  yr.axis.name = "# of hets",
  yr.plot.type = "bar"
)

Clustering the samples

create a one-letter dictionary for the samples

# make the sample names one letter
intToLetter <- function(n) {
  return(as.character(ifelse(n < 27, intToUtf8(n + 64), intToUtf8(n + 70))))
}

dict <- colnames(m)
names(dict) <- map(1:length(dict), intToLetter)
colnames(m) <- names(dict)

dict_df <- tibble(letter = names(dict), sample = dict)
write_tsv(dict_df, "48samples_dict.tsv")
dict_df

separate “singleton” SNPs and samples

SNPs that occur just once in this 48 samples dataset and the corresponding samples:

singlem <- m[which(m1$total == 1), ] %>% dplyr::select_if(~ any(. == 1))
singletonSNP_samples <- dict[colnames(singlem)]
mm <- m %>% dplyr::select(names(singletonSNP_samples), "I")
mm1 <- mm %>%
  rowwise() %>%
  mutate(total = sum(c_across(where(is.numeric))))
covered_snps <- tibble(rsID = rownames(mm[which(mm1$total > 0), ]))
singlem
covered_snps

with hclust

mm <- m %>%
  dplyr::select(-names(singletonSNP_samples), -I) %>%
  rownames_to_column("rsID") %>%
  filter(!rsID %in% covered_snps$rsID) %>%
  column_to_rownames("rsID")
data <- t(mm)
d <- dist(data, method = "euclidean")
hc1 <- hclust(d, method = "complete")
n1 <- table(cutree(hc1, k = 14)) %>% reduce(`*`)
plot(hc1, cex = 0.6, hang = -1, xlab = paste(as.character(n1), "combinations"))
rect.hclust(hc1, k = 14, border = 2:15)

with agnes, method=complete

hc2 <- agnes(d, method = "complete")
n2 <- table(cutree(hc2, k = 14)) %>% reduce(`*`)
pltree(hc2, cex = 0.6, hang = -1, xlab = paste(as.character(n2), "combinations"))
rect.hclust(hc2, k = 14, border = 2:15)

with agnes, method=ward

hc3 <- agnes(d, method = "ward")
n3 <- table(cutree(hc3, k = 14)) %>% reduce(`*`)
pltree(hc3, cex = 0.6, hang = -1, xlab = paste(as.character(n3), "combinations"))
rect.hclust(hc3, k = 14, border = 2:15)

with diana

hc4 <- diana(d, metric = "euclidean")
n4 <- table(cutree(hc4, k = 14)) %>% reduce(`*`)
pltree(hc4, cex = 0.6, hang = -1, xlab = paste(as.character(n4), "combinations"))
rect.hclust(hc4, k = 14, border = 2:18)

Optimizing the het SNP coverage - the “winning” combination

l1 <- cutree(hc4, k = 14)
d1 <- 1:14 %>% map(~ names(mm)[which(l1 == .x)])

process_combination <- function(x) {
  p()
  return(tibble(
    comb = paste0(unlist(x), collapse = ""),
    het = (mm %>%
      dplyr::select(unlist(x)) %>%
      filter_all(any_vars(. == 1)) %>%
      nrow()
    )
  ))
}

all_comb <- d1 %>%
  cross()

N <- 100
set.seed(42)
my_sample <- sample(1:n4, N)
handlers("rstudio")
with_progress({
  p <- progressor(steps = N)
  count_hets <- my_sample %>%
    map(~ all_comb[[.x]]) %>%
    map_df(process_combination)
})
count_hets
# save(list=c('all_comb', 'count_hets'), file="combinations2.RData")

# load("combinations.RData")
# count_hets %>% filter(het == max(count_hets$het) & grepl('I', comb))
winner <- count_hets %>%
  filter(het == max(count_hets$het)) %>%
  head(1) %>%
  mutate(comb = paste0(comb, "I", names(singletonSNP_samples) %>% reduce(paste0))) %>%
  dplyr::select("comb") %>%
  str_split("") %>%
  map(~ dict[.x]) %>%
  unlist()
winner_df <- tibble(letter = names(winner), sample = winner) %>% arrange(sample)
winner_df

The reduced matrix

m_reduced <- m %>% dplyr::select(names(winner))
colnames(m_reduced) <- winner
m1_reduced <- m_reduced %>%
  rowwise() %>%
  mutate(totalHets = sum(c_across(where(is.numeric))))
superheat(m_reduced,
  pretty.order.rows = T,
  pretty.order.cols = T,
  # row.dendrogram = T,
  col.dendrogram = T,
  bottom.label.text.angle = 90,
  bottom.label.text.alignment = "right",
  bottom.label.size = 0.3,
  bottom.label.text.size = 4,
  yr = m1_reduced$totalHets,
  yr.axis.name = "# of hets",
  yr.plot.type = "bar"
)

Final selection of 20 samples covering 139 SNPs

This table is also available in Trello as selected_samples.csv

rownames(m1_reduced) <- rownames(m_reduced)
selected_samples <- m1_reduced %>%
  rownames_to_column("rsID") %>%
  inner_join(snps, by = "rsID") %>%
  arrange(n) %>%
  relocate(n, AssayID, TargetInformation, rsID, totalHets)
write_csv(selected_samples, "selected_samples.csv")
selected_samples