TaqMan Select

Original list of SNPs

snps_orig <- read_tsv(file = "APM_CT2_pilot_SNP_assay_custom_plate_information.txt")
snps_orig %>%
  mutate(n = row_number()) %>%
  relocate(n)

Annotated SNPs list

dbSNP (version 151) information was pulled from UCSC genome browser using the Table Browser tool: https://genome.ucsc.edu/cgi-bin/hgTables.

snps <- read_tsv(file = "149SNPs_dbSNP.txt")
snps <- snps %>%
  filter(!grepl("alt", chrom)) %>%
  rename(rsID = name) %>%
  arrange(as.integer(substr(chrom, 4, 6))) %>%
  dplyr::select(c("rsID", "chrom", "chromStart", "chromEnd", "refNCBI", "observed", "class", "func", "alleles", "alleleNs", "alleleFreqs"))
snps <- snps_orig %>%
  inner_join(snps, by = "rsID") %>%
  mutate(n = row_number()) %>%
  relocate(n)
write_tsv(snps,"snps_annotated.tsv")
save(snps, file = 'snps.RData')
snps

Genotype for 48 samples

genotypes <- read_tsv(file = "48samples_149SNPs_genotypes_by_rsID.txt") %>%
  mutate(
    rsID = if_else(rsID == ".", "rs28691231", rsID),
    GT = gsub("|", "/", GT, fixed = T)
  )
genotypes_ann <- genotypes %>%
  inner_join(snps, by = "rsID") %>%
  rename(chrom = chrom.x, chromEnd = chromEnd.x, chromStart.y = chromStart) %>%
  relocate(n) %>%
  dplyr::select(-chrom.y) %>%
  relocate(chromStart.y, chromEnd.y, .after = chromEnd) %>%
  arrange(n, sample)
write_tsv(genotypes_ann %>% select (sample, AssayID, GT), "genotypes_WGS.tsv")
genotypes_ann

SNPs which are not present in any of the 48 samples

# setdiff(unique(snps$n), unique(genotypes_ann$n))
# setdiff(unique(snps$rsID), unique(genotypes_ann$rsID))
# setdiff(unique(genotypes_ann$rsID), unique(snps$rsID))
# keys0 <- snps %>% mutate(key=paste(chrom, as.character(chromEnd))) %>% select(key) %>% unique() %>% unlist()
# keys1 <- genotypes %>% mutate(key=paste(chrom, as.character(chromEnd))) %>% select(key) %>% unique() %>% unlist()
# keys2 <- genotypes_ann %>% mutate(key=paste(chrom, as.character(chromEnd))) %>% select(key) %>% unique() %>% unlist()
keys0 <- snps %>%
  dplyr::select(rsID) %>%
  unique() %>%
  unlist()
keys1 <- genotypes %>%
  dplyr::select(rsID) %>%
  unique() %>%
  unlist()
# keys2 <- genotypes_ann %>% select(rsID) %>% unique() %>% unlist()
# setdiff(keys0, keys1)
# setdiff(keys1, keys0)
# setdiff(keys0, keys2)
# setdiff(keys2, keys0)
# setdiff(keys1, keys2)
# setdiff(keys2, keys1)
missing_rsIDs <- setdiff(keys0, keys1)
snps %>%
  filter(rsID %in% missing_rsIDs)

Special cases

SNP rs72846765 occurs just as homozygous reference (0/0) and homozygous alt (1/1) - this reduces the maximum number of SNPs that occur as hets to 139
SNPs rs112957100 and rs75214905 occur as 2 different allele hets - for the purpose of this analysis both alleles will be considered as hets (but they are not guaranteed to be both in the test set)

a <- genotypes_ann %>%
  dplyr::select(rsID) %>%
  unique() %>%
  unlist()
b <- genotypes_ann %>%
  group_by(rsID) %>%
  dplyr::select("GT") %>%
  count(GT) %>%
  filter(GT %in% c("0/1", "0/2")) %>%
  dplyr::select(rsID) %>%
  unique() %>%
  unlist()
setdiff(a, b)

## [1] "rs72846765"

genotypes_ann %>%
  filter(rsID == "rs72846765") %>%
  dplyr::select("GT") %>%
  count(GT)

genotypes_ann %>%
  filter(rsID == "rs112957100") %>%
  dplyr::select("GT") %>%
  count(GT)

genotypes_ann %>%
  filter(rsID == "rs75214905") %>%
  dplyr::select("GT") %>%
  count(GT)

missing_rsIDs <- c(missing_rsIDs, setdiff(a, b))

The matrix

Each row represents one of the 139 SNPs that have at least one Het call
Each column represents a sample
Blue means homozygous reference or homozygous alt, light green means heterozygous alt
The samples (columns) are hierarchically clustered, according to how similar they are in their het alt composition
The total number of hets across the samples for each SNP is shown on the right as bars

m <- genotypes %>%
  dplyr::select(c("rsID", "sample", "GT")) %>%
  filter(!rsID %in% missing_rsIDs) %>%
  mutate(GT = if_else((GT %in% c("0/0", "1/1")) |
    (rsID == "rs112957100" & GT == "0/1") |
    (rsID == "rs75214905" & GT == "0/2"), 0, 1)) %>%
  spread(sample, GT) %>%
  column_to_rownames("rsID")
m1 <- m %>%
  rowwise() %>%
  mutate(total = sum(c_across(where(is.numeric))))
superheat(m,
  pretty.order.rows = T,
  pretty.order.cols = T,
  # row.dendrogram = T,
  col.dendrogram = T,
  bottom.label.text.angle = 90,
  bottom.label.text.alignment = "right",
  bottom.label.size = 0.3,
  bottom.label.text.size = 4,
  yr = m1$total,
  yr.axis.name = "# of hets",
  yr.plot.type = "bar"
)

Clustering the samples

create a one-letter dictionary for the samples

# make the sample names one letter
intToLetter <- function(n) {
  return(as.character(ifelse(n < 27, intToUtf8(n + 64), intToUtf8(n + 70))))
}

dict <- colnames(m)
names(dict) <- map(1:length(dict), intToLetter)
colnames(m) <- names(dict)

dict_df <- tibble(letter = names(dict), sample = dict)
write_tsv(dict_df, "48samples_dict.tsv")
dict_df

separate “singleton” SNPs and samples

SNPs that occur just once in this 48 samples dataset and the corresponding samples:

singlem <- m[which(m1$total == 1), ] %>% dplyr::select_if(~ any(. == 1))
singletonSNP_samples <- dict[colnames(singlem)]
mm <- m %>% dplyr::select(names(singletonSNP_samples), "I")
mm1 <- mm %>%
  rowwise() %>%
  mutate(total = sum(c_across(where(is.numeric))))
covered_snps <- tibble(rsID = rownames(mm[which(mm1$total > 0), ]))
singlem

covered_snps

with `hclust`

mm <- m %>%
  dplyr::select(-names(singletonSNP_samples), -I) %>%
  rownames_to_column("rsID") %>%
  filter(!rsID %in% covered_snps$rsID) %>%
  column_to_rownames("rsID")
data <- t(mm)
d <- dist(data, method = "euclidean")
hc1 <- hclust(d, method = "complete")
n1 <- table(cutree(hc1, k = 14)) %>% reduce(`*`)
plot(hc1, cex = 0.6, hang = -1, xlab = paste(as.character(n1), "combinations"))
rect.hclust(hc1, k = 14, border = 2:15)

with `agnes`, method=`complete`

hc2 <- agnes(d, method = "complete")
n2 <- table(cutree(hc2, k = 14)) %>% reduce(`*`)
pltree(hc2, cex = 0.6, hang = -1, xlab = paste(as.character(n2), "combinations"))
rect.hclust(hc2, k = 14, border = 2:15)

with `agnes`, method=`ward`

hc3 <- agnes(d, method = "ward")
n3 <- table(cutree(hc3, k = 14)) %>% reduce(`*`)
pltree(hc3, cex = 0.6, hang = -1, xlab = paste(as.character(n3), "combinations"))
rect.hclust(hc3, k = 14, border = 2:15)

with `diana`

hc4 <- diana(d, metric = "euclidean")
n4 <- table(cutree(hc4, k = 14)) %>% reduce(`*`)
pltree(hc4, cex = 0.6, hang = -1, xlab = paste(as.character(n4), "combinations"))
rect.hclust(hc4, k = 14, border = 2:18)

Optimizing the het SNP coverage - the “winning” combination

l1 <- cutree(hc4, k = 14)
d1 <- 1:14 %>% map(~ names(mm)[which(l1 == .x)])

process_combination <- function(x) {
  p()
  return(tibble(
    comb = paste0(unlist(x), collapse = ""),
    het = (mm %>%
      dplyr::select(unlist(x)) %>%
      filter_all(any_vars(. == 1)) %>%
      nrow()
    )
  ))
}

all_comb <- d1 %>%
  cross()

N <- 100
set.seed(42)
my_sample <- sample(1:n4, N)
handlers("rstudio")
with_progress({
  p <- progressor(steps = N)
  count_hets <- my_sample %>%
    map(~ all_comb[[.x]]) %>%
    map_df(process_combination)
})
count_hets

# save(list=c('all_comb', 'count_hets'), file="combinations2.RData")

# load("combinations.RData")
# count_hets %>% filter(het == max(count_hets$het) & grepl('I', comb))
winner <- count_hets %>%
  filter(het == max(count_hets$het)) %>%
  head(1) %>%
  mutate(comb = paste0(comb, "I", names(singletonSNP_samples) %>% reduce(paste0))) %>%
  dplyr::select("comb") %>%
  str_split("") %>%
  map(~ dict[.x]) %>%
  unlist()
winner_df <- tibble(letter = names(winner), sample = winner) %>% arrange(sample)
winner_df

The reduced matrix

m_reduced <- m %>% dplyr::select(names(winner))
colnames(m_reduced) <- winner
m1_reduced <- m_reduced %>%
  rowwise() %>%
  mutate(totalHets = sum(c_across(where(is.numeric))))
superheat(m_reduced,
  pretty.order.rows = T,
  pretty.order.cols = T,
  # row.dendrogram = T,
  col.dendrogram = T,
  bottom.label.text.angle = 90,
  bottom.label.text.alignment = "right",
  bottom.label.size = 0.3,
  bottom.label.text.size = 4,
  yr = m1_reduced$totalHets,
  yr.axis.name = "# of hets",
  yr.plot.type = "bar"
)

Final selection of 20 samples covering 139 SNPs

This table is also available in Trello as selected_samples.csv

rownames(m1_reduced) <- rownames(m_reduced)
selected_samples <- m1_reduced %>%
  rownames_to_column("rsID") %>%
  inner_join(snps, by = "rsID") %>%
  arrange(n) %>%
  relocate(n, AssayID, TargetInformation, rsID, totalHets)
write_csv(selected_samples, "selected_samples.csv")
selected_samples

TaqMan Select

Razvan Sultana

2022-05-04

Original list of SNPs

Annotated SNPs list

Genotype for 48 samples

SNPs which are not present in any of the 48 samples

Special cases

The matrix

Clustering the samples

create a one-letter dictionary for the samples

separate “singleton” SNPs and samples

with `hclust`

with `agnes`, method=`complete`

with `agnes`, method=`ward`

with `diana`

Optimizing the het SNP coverage - the “winning” combination

The reduced matrix

Final selection of 20 samples covering 139 SNPs

TaqMan Select

Razvan Sultana

2022-05-04

Original list of SNPs

Annotated SNPs list

Genotype for 48 samples

SNPs which are not present in any of the 48 samples

Special cases

The matrix

Clustering the samples

create a one-letter dictionary for the samples

separate “singleton” SNPs and samples

with hclust

with agnes, method=complete

with agnes, method=ward

with diana

Optimizing the het SNP coverage - the “winning” combination

The reduced matrix

Final selection of 20 samples covering 139 SNPs

with `hclust`

with `agnes`, method=`complete`

with `agnes`, method=`ward`

with `diana`