load("snps.RData") # read original snps data
snps <- snps %>%
mutate(AssayID = ifelse(AssayID == "C__10029783_10", "AN2XP2N", AssayID))
metadata <- read_csv("metadata.csv") %>%
mutate(AssayID = ifelse(AssayID == "C__10029783_10", "AN2XP2N", AssayID))
genotypes_WGS <- read_tsv("genotypes_WGS.tsv") %>%
mutate(AssayID = ifelse(AssayID == "C__10029783_10", "AN2XP2N", AssayID))
shifted_labels <- unique(genotypes_WGS$sample)
true_labels <- c(shifted_labels[24],shifted_labels[1:23], shifted_labels[25:48])
map_wgs <- true_labels
names(map_wgs) <- shifted_labels
genotypes_WGS <- genotypes_WGS %>%
mutate(sample = map_wgs[sample])
taqman0 <- read_csv("TaqMan_genotypes/2022-05-09-strict/Genotype Matrix.csv", comment = "#") %>%
dplyr::rename(sample=`Sample/Assay`) %>%
filter(!str_detect(sample, regex("Empty|AMP|Sample")))
taqman <- taqman0 %>% select(-`...152`) %>%
pivot_longer(cols = 2:150, names_to = "AssayID", values_to = "genotype") %>%
mutate(genotype = str_replace_all(genotype, fixed(paste0(AssayID,"-Allele ")), "")) %>%
mutate(genotype = str_replace_all(str_replace_all(genotype, "1", "0"), "2", "1"))
taqman1 <- taqman %>%
inner_join(metadata, by='AssayID') %>%
mutate(genotype = replace_na(genotype, "NULL")) %>%
mutate(genotype = ifelse(dbSNP_ref_allele == QC_alt_allele & dbSNP_alt_alleles != "-",
case_when(
genotype == "0/0" ~ "1/1",
genotype == "1/1" ~ "0/0",
TRUE ~ genotype),
genotype)
) %>%
mutate(genotype = factor(genotype, levels = c( "0/0", "0/1", "1/1", "NOAMP", "UNDET", "INVALID", "NULL"))) %>%
left_join(genotypes_WGS, by=c('AssayID', 'sample')) %>%
mutate(GT = ifelse(is.na(GT), '0/0', GT)) %>%
mutate(GT = factor(GT, levels = c( "0/0", "0/1", "1/1", "NOAMP", "UNDET", "INVALID", "NULL"))) %>%
mutate(fill_color = ifelse(str_detect(as.character(genotype), regex("\\d|\\d")), as.character(GT), as.character(genotype))) %>%
mutate(fill_color = factor(fill_color, levels = c( "0/0", "0/1", "1/1", "NOAMP", "UNDET", "INVALID", "NULL"))) %>%
mutate(correct = ifelse(str_detect(as.character(genotype), regex("\\d|\\d")),
as.character(genotype) == as.character(GT), FALSE)) %>%
mutate(correct = ifelse(is.na(correct), FALSE, correct)) %>%
mutate(rowlabel = fct_rev(factor(paste0(AssayID, "\n(",str_pad(n, 3, pad="0"), ": ", rsID, ")")))) %>%
arrange(AssayID, sample)
pseudosequences <- taqman1 %>%
mutate(gt1 = case_when(genotype == "0/1" ~ "A", genotype == "1/1" ~ "C", genotype == "0/0" ~ "G", TRUE ~ "T"),
gt2 = case_when(GT == "0/1" ~ "A", GT == "1/1" ~ "C", GT == "0/0" ~ "G", TRUE ~ "T")) %>%
group_by(sample) %>%
summarize(taqman = paste0(gt1, collapse=""), wgs = paste0(gt2, collapse="")) %>%
pivot_longer(cols=2:3, names_to = "type", values_to = "sequence") %>%
mutate(defline = paste(sample, type, sep="_")) %>%
select(defline, sequence)
seqs <- pseudosequences$sequence
names(seqs) <- pseudosequences$defline
bs <- DNAStringSet(seqs)
writeXStringSet(bs, "pseudosequences.fa")
taqman1 %>% select(n, AssayID, rsID, sample, genotype, GT, correct, no_matches) %>%
rename(taqman_genotype = genotype, wgs_genotype = GT) %>%
write_excel_csv("success_matrix_strict.csv")
# run clustalo on pseudosequences and use the guide tree to re-shuflle the sampels
# s <- taqman0$sample
# s1 <- c( s[2], s[3], s[9], s[5], s[6], s[7], s[8], s[4], s[10], s[11],
# s[1], s[12], s[13], s[14], s[15], s[16], s[17], s[18], s[19], s[20])
# taqman0$sample <- s1
#
# taqman <- taqman0 %>% select(-`...151`) %>%
# pivot_longer(cols = 2:150, names_to = "AssayID", values_to = "genotype") %>%
# mutate(genotype = str_replace_all(genotype, fixed(paste0(AssayID,"-Allele ")), "")) %>%
# mutate(genotype = str_replace_all(str_replace_all(genotype, "1", "0"), "2", "1"))
#
# taqman1 <- taqman %>%
# inner_join(metadata, by='AssayID') %>%
# mutate(genotype = replace_na(genotype, "NULL")) %>%
# mutate(genotype = ifelse(dbSNP_ref_allele == QC_alt_allele,
# case_when(
# genotype == "0/0" ~ "1/1",
# genotype == "1/1" ~ "0/0",
# TRUE ~ genotype),
# genotype)
# ) %>%
# mutate(genotype = factor(genotype, levels = c( "0/0", "0/1", "1/1", "NOAMP", "UND", "INV", "NULL"))) %>%
# inner_join(genotypes_WGS, by=c('AssayID', 'sample')) %>%
# mutate(GT = factor(GT, levels = c( "0/0", "0/1", "1/1", "NOAMP", "UND", "INV", "NULL"))) %>%
# mutate(fill_color = ifelse(str_detect(as.character(genotype), regex("\\d|\\d")), as.character(GT), as.character(genotype))) %>%
# mutate(fill_color = factor(fill_color, levels = c( "0/0", "0/1", "1/1", "NOAMP", "UND", "INV", "NULL"))) %>%
# mutate(correct = ifelse(str_detect(as.character(genotype), regex("\\d|\\d")),
# as.character(genotype) == as.character(GT), NA)) %>%
# mutate(rowlabel = fct_rev(factor(paste0(str_pad(n, 3, pad="0"), ": ", AssayID, "\n(", rsID, ")")))) %>%
# arrange(-n, sample)
Description:
mypal <- scales::brewer_pal(type = "qual", palette = "Paired")(8)[2:8]
p1 <- ggplot(taqman1, aes(sample, rowlabel)) +
geom_raster(aes(fill = fill_color)) +
geom_text(aes(label = paste0(genotype,"\n", GT)), color = "black", size = 3.5 ) +
coord_fixed(expand = FALSE) +
scale_fill_manual(values = mypal) +
scale_x_discrete(position = "top") +
labs(x = NULL, y = NULL) +
theme(panel.border = element_rect(color = NA, fill = NA),
axis.text.x = element_text(angle = 50, vjust = 1, hjust = 0, size = 8),
legend.position = "none")
p2 <- p1 + geom_point(aes(colour = correct, shape=correct), size = 12) +
scale_color_manual(values = c('firebrick', 'black')) +
scale_shape_manual(values = c(4, 1))
ggplotly(p2,tooltip=c('sample', 'label', 'correct'))
taqman1 %>% filter(correct == FALSE) %>% group_by(sample) %>% count(correct)
snps_dict_rsID <- metadata %>% select("n", "rsID")
snps_dict_AssayID <- metadata %>% select("n", "AssayID")
snps_dict_bothIDs <- metadata %>% select("n", "AssayID", "rsID")
incorrect <- taqman1 %>% filter(correct == FALSE) %>% group_by(AssayID) %>% count(correct) %>%
select(-correct) %>% filter(n>0) %>% arrange(-n) %>% rename(count=n) %>%
inner_join(snps_dict_bothIDs) %>%
inner_join(metadata %>% select(AssayID, no_matches), by="AssayID") %>%
rename(epcr_matches=no_matches) %>%
relocate(n, AssayID, rsID)
incorrect %>% write_excel_csv("incorrect_by_assay_strict.csv")
incorrect