snps_orig <- read_tsv(file = "APM_CT2_pilot_SNP_assay_custom_plate_information.txt")
snps_orig %>%
mutate(n = row_number()) %>%
relocate(n)
dbSNP (version 151) information was pulled from UCSC genome browser using the Table Browser tool: https://genome.ucsc.edu/cgi-bin/hgTables.
snps <- read_tsv(file = "149SNPs_dbSNP.txt")
snps <- snps %>%
filter(!grepl("alt", chrom)) %>%
rename(rsID = name) %>%
arrange(as.integer(substr(chrom, 4, 6))) %>%
dplyr::select(c("rsID", "chrom", "chromStart", "chromEnd", "refNCBI", "observed", "class", "func", "alleles", "alleleNs", "alleleFreqs"))
snps <- snps_orig %>%
inner_join(snps, by = "rsID") %>%
mutate(n = row_number()) %>%
relocate(n)
write_tsv(snps,"snps_annotated.tsv")
save(snps, file = 'snps.RData')
snps
genotypes <- read_tsv(file = "48samples_149SNPs_genotypes_by_rsID.txt") %>%
mutate(
rsID = if_else(rsID == ".", "rs28691231", rsID),
GT = gsub("|", "/", GT, fixed = T)
)
genotypes_ann <- genotypes %>%
inner_join(snps, by = "rsID") %>%
rename(chrom = chrom.x, chromEnd = chromEnd.x, chromStart.y = chromStart) %>%
relocate(n) %>%
dplyr::select(-chrom.y) %>%
relocate(chromStart.y, chromEnd.y, .after = chromEnd) %>%
arrange(n, sample)
write_tsv(genotypes_ann %>% select (sample, AssayID, GT), "genotypes_WGS.tsv")
genotypes_ann
# setdiff(unique(snps$n), unique(genotypes_ann$n))
# setdiff(unique(snps$rsID), unique(genotypes_ann$rsID))
# setdiff(unique(genotypes_ann$rsID), unique(snps$rsID))
# keys0 <- snps %>% mutate(key=paste(chrom, as.character(chromEnd))) %>% select(key) %>% unique() %>% unlist()
# keys1 <- genotypes %>% mutate(key=paste(chrom, as.character(chromEnd))) %>% select(key) %>% unique() %>% unlist()
# keys2 <- genotypes_ann %>% mutate(key=paste(chrom, as.character(chromEnd))) %>% select(key) %>% unique() %>% unlist()
keys0 <- snps %>%
dplyr::select(rsID) %>%
unique() %>%
unlist()
keys1 <- genotypes %>%
dplyr::select(rsID) %>%
unique() %>%
unlist()
# keys2 <- genotypes_ann %>% select(rsID) %>% unique() %>% unlist()
# setdiff(keys0, keys1)
# setdiff(keys1, keys0)
# setdiff(keys0, keys2)
# setdiff(keys2, keys0)
# setdiff(keys1, keys2)
# setdiff(keys2, keys1)
missing_rsIDs <- setdiff(keys0, keys1)
snps %>%
filter(rsID %in% missing_rsIDs)
rs72846765 occurs just as homozygous reference
(0/0) and homozygous alt (1/1) - this reduces the maximum number of SNPs
that occur as hets to 139rs112957100 and rs75214905 occur as 2
different allele hets - for the purpose of this analysis both alleles
will be considered as hets (but they are not guaranteed to be both in
the test set)a <- genotypes_ann %>%
dplyr::select(rsID) %>%
unique() %>%
unlist()
b <- genotypes_ann %>%
group_by(rsID) %>%
dplyr::select("GT") %>%
count(GT) %>%
filter(GT %in% c("0/1", "0/2")) %>%
dplyr::select(rsID) %>%
unique() %>%
unlist()
setdiff(a, b)
## [1] "rs72846765"
genotypes_ann %>%
filter(rsID == "rs72846765") %>%
dplyr::select("GT") %>%
count(GT)
genotypes_ann %>%
filter(rsID == "rs112957100") %>%
dplyr::select("GT") %>%
count(GT)
genotypes_ann %>%
filter(rsID == "rs75214905") %>%
dplyr::select("GT") %>%
count(GT)
missing_rsIDs <- c(missing_rsIDs, setdiff(a, b))
m <- genotypes %>%
dplyr::select(c("rsID", "sample", "GT")) %>%
filter(!rsID %in% missing_rsIDs) %>%
mutate(GT = if_else((GT %in% c("0/0", "1/1")) |
(rsID == "rs112957100" & GT == "0/1") |
(rsID == "rs75214905" & GT == "0/2"), 0, 1)) %>%
spread(sample, GT) %>%
column_to_rownames("rsID")
m1 <- m %>%
rowwise() %>%
mutate(total = sum(c_across(where(is.numeric))))
superheat(m,
pretty.order.rows = T,
pretty.order.cols = T,
# row.dendrogram = T,
col.dendrogram = T,
bottom.label.text.angle = 90,
bottom.label.text.alignment = "right",
bottom.label.size = 0.3,
bottom.label.text.size = 4,
yr = m1$total,
yr.axis.name = "# of hets",
yr.plot.type = "bar"
)
# make the sample names one letter
intToLetter <- function(n) {
return(as.character(ifelse(n < 27, intToUtf8(n + 64), intToUtf8(n + 70))))
}
dict <- colnames(m)
names(dict) <- map(1:length(dict), intToLetter)
colnames(m) <- names(dict)
dict_df <- tibble(letter = names(dict), sample = dict)
write_tsv(dict_df, "48samples_dict.tsv")
dict_df
SNPs that occur just once in this 48 samples dataset and the corresponding samples:
singlem <- m[which(m1$total == 1), ] %>% dplyr::select_if(~ any(. == 1))
singletonSNP_samples <- dict[colnames(singlem)]
mm <- m %>% dplyr::select(names(singletonSNP_samples), "I")
mm1 <- mm %>%
rowwise() %>%
mutate(total = sum(c_across(where(is.numeric))))
covered_snps <- tibble(rsID = rownames(mm[which(mm1$total > 0), ]))
singlem
covered_snps
hclustmm <- m %>%
dplyr::select(-names(singletonSNP_samples), -I) %>%
rownames_to_column("rsID") %>%
filter(!rsID %in% covered_snps$rsID) %>%
column_to_rownames("rsID")
data <- t(mm)
d <- dist(data, method = "euclidean")
hc1 <- hclust(d, method = "complete")
n1 <- table(cutree(hc1, k = 14)) %>% reduce(`*`)
plot(hc1, cex = 0.6, hang = -1, xlab = paste(as.character(n1), "combinations"))
rect.hclust(hc1, k = 14, border = 2:15)
agnes, method=completehc2 <- agnes(d, method = "complete")
n2 <- table(cutree(hc2, k = 14)) %>% reduce(`*`)
pltree(hc2, cex = 0.6, hang = -1, xlab = paste(as.character(n2), "combinations"))
rect.hclust(hc2, k = 14, border = 2:15)
agnes, method=wardhc3 <- agnes(d, method = "ward")
n3 <- table(cutree(hc3, k = 14)) %>% reduce(`*`)
pltree(hc3, cex = 0.6, hang = -1, xlab = paste(as.character(n3), "combinations"))
rect.hclust(hc3, k = 14, border = 2:15)
dianahc4 <- diana(d, metric = "euclidean")
n4 <- table(cutree(hc4, k = 14)) %>% reduce(`*`)
pltree(hc4, cex = 0.6, hang = -1, xlab = paste(as.character(n4), "combinations"))
rect.hclust(hc4, k = 14, border = 2:18)
l1 <- cutree(hc4, k = 14)
d1 <- 1:14 %>% map(~ names(mm)[which(l1 == .x)])
process_combination <- function(x) {
p()
return(tibble(
comb = paste0(unlist(x), collapse = ""),
het = (mm %>%
dplyr::select(unlist(x)) %>%
filter_all(any_vars(. == 1)) %>%
nrow()
)
))
}
all_comb <- d1 %>%
cross()
N <- 100
set.seed(42)
my_sample <- sample(1:n4, N)
handlers("rstudio")
with_progress({
p <- progressor(steps = N)
count_hets <- my_sample %>%
map(~ all_comb[[.x]]) %>%
map_df(process_combination)
})
count_hets
# save(list=c('all_comb', 'count_hets'), file="combinations2.RData")
# load("combinations.RData")
# count_hets %>% filter(het == max(count_hets$het) & grepl('I', comb))
winner <- count_hets %>%
filter(het == max(count_hets$het)) %>%
head(1) %>%
mutate(comb = paste0(comb, "I", names(singletonSNP_samples) %>% reduce(paste0))) %>%
dplyr::select("comb") %>%
str_split("") %>%
map(~ dict[.x]) %>%
unlist()
winner_df <- tibble(letter = names(winner), sample = winner) %>% arrange(sample)
winner_df
m_reduced <- m %>% dplyr::select(names(winner))
colnames(m_reduced) <- winner
m1_reduced <- m_reduced %>%
rowwise() %>%
mutate(totalHets = sum(c_across(where(is.numeric))))
superheat(m_reduced,
pretty.order.rows = T,
pretty.order.cols = T,
# row.dendrogram = T,
col.dendrogram = T,
bottom.label.text.angle = 90,
bottom.label.text.alignment = "right",
bottom.label.size = 0.3,
bottom.label.text.size = 4,
yr = m1_reduced$totalHets,
yr.axis.name = "# of hets",
yr.plot.type = "bar"
)
This table is also available in Trello as selected_samples.csv
rownames(m1_reduced) <- rownames(m_reduced)
selected_samples <- m1_reduced %>%
rownames_to_column("rsID") %>%
inner_join(snps, by = "rsID") %>%
arrange(n) %>%
relocate(n, AssayID, TargetInformation, rsID, totalHets)
write_csv(selected_samples, "selected_samples.csv")
selected_samples