Evaluate TaqMan success rate

Read Genotype Matrix and SNPs information

load("snps.RData") # read original snps data
snps <- snps %>% 
  mutate(AssayID = ifelse(AssayID == "C__10029783_10", "AN2XP2N", AssayID))
metadata <- read_csv("metadata.csv") %>% 
  mutate(AssayID = ifelse(AssayID == "C__10029783_10", "AN2XP2N", AssayID))
genotypes_WGS <- read_tsv("genotypes_WGS.tsv") %>% 
  mutate(AssayID = ifelse(AssayID == "C__10029783_10", "AN2XP2N", AssayID))
shifted_labels <- unique(genotypes_WGS$sample)
true_labels <- c(shifted_labels[24],shifted_labels[1:23], shifted_labels[25:48])
map_wgs <- true_labels
names(map_wgs) <- shifted_labels
  
genotypes_WGS <- genotypes_WGS %>%
  mutate(sample = map_wgs[sample])

taqman0 <- read_csv("TaqMan_genotypes/2022-05-09-strict/Genotype Matrix.csv", comment = "#")  %>%
  dplyr::rename(sample=`Sample/Assay`) %>% 
  filter(!str_detect(sample, regex("Empty|AMP|Sample")))
taqman <- taqman0 %>% select(-`...152`) %>%
  pivot_longer(cols = 2:150, names_to = "AssayID", values_to = "genotype") %>% 
  mutate(genotype = str_replace_all(genotype, fixed(paste0(AssayID,"-Allele ")), "")) %>%
  mutate(genotype = str_replace_all(str_replace_all(genotype, "1", "0"), "2", "1"))

taqman1 <- taqman %>% 
  inner_join(metadata, by='AssayID') %>% 
  mutate(genotype = replace_na(genotype, "NULL")) %>% 
  mutate(genotype = ifelse(dbSNP_ref_allele == QC_alt_allele & dbSNP_alt_alleles != "-", 
                           case_when(
                               genotype == "0/0" ~ "1/1",
                               genotype == "1/1" ~ "0/0", 
                               TRUE ~ genotype),
                           genotype) 
         ) %>% 
  mutate(genotype = factor(genotype, levels = c( "0/0", "0/1", "1/1", "NOAMP", "UNDET", "INVALID", "NULL"))) %>% 
  left_join(genotypes_WGS, by=c('AssayID', 'sample')) %>% 
  mutate(GT = ifelse(is.na(GT), '0/0', GT)) %>% 
  mutate(GT = factor(GT, levels = c( "0/0", "0/1", "1/1", "NOAMP", "UNDET", "INVALID", "NULL"))) %>% 
  mutate(fill_color = ifelse(str_detect(as.character(genotype), regex("\\d|\\d")), as.character(GT), as.character(genotype))) %>% 
  mutate(fill_color = factor(fill_color, levels = c( "0/0", "0/1", "1/1", "NOAMP", "UNDET", "INVALID", "NULL"))) %>% 
  mutate(correct = ifelse(str_detect(as.character(genotype), regex("\\d|\\d")), 
                          as.character(genotype) == as.character(GT), FALSE)) %>% 
  mutate(correct = ifelse(is.na(correct), FALSE, correct)) %>% 
  mutate(rowlabel = fct_rev(factor(paste0(AssayID, "\n(",str_pad(n, 3, pad="0"), ": ", rsID, ")")))) %>% 
  arrange(AssayID, sample)

pseudosequences <- taqman1 %>% 
  mutate(gt1 = case_when(genotype == "0/1" ~ "A", genotype == "1/1" ~ "C", genotype == "0/0" ~ "G", TRUE ~ "T"), 
         gt2 = case_when(GT == "0/1" ~ "A", GT == "1/1" ~ "C", GT == "0/0" ~ "G", TRUE ~ "T")) %>% 
  group_by(sample) %>% 
  summarize(taqman = paste0(gt1, collapse=""), wgs = paste0(gt2, collapse="")) %>% 
  pivot_longer(cols=2:3, names_to = "type", values_to = "sequence") %>% 
  mutate(defline = paste(sample, type, sep="_")) %>% 
  select(defline, sequence)
seqs <- pseudosequences$sequence
names(seqs) <- pseudosequences$defline
bs <- DNAStringSet(seqs)
writeXStringSet(bs, "pseudosequences.fa")

taqman1 %>% select(n, AssayID, rsID, sample, genotype, GT, correct, no_matches) %>% 
  rename(taqman_genotype = genotype, wgs_genotype = GT) %>% 
  write_excel_csv("success_matrix_strict.csv")

# run clustalo on pseudosequences and use the guide tree to re-shuflle the sampels
# s <- taqman0$sample
# s1 <- c( s[2],  s[3],  s[9],  s[5],  s[6],  s[7],  s[8],  s[4], s[10], s[11],
#          s[1], s[12], s[13], s[14], s[15], s[16], s[17], s[18], s[19], s[20])
# taqman0$sample <- s1
# 
# taqman <- taqman0 %>% select(-`...151`) %>%
#   pivot_longer(cols = 2:150, names_to = "AssayID", values_to = "genotype") %>% 
#   mutate(genotype = str_replace_all(genotype, fixed(paste0(AssayID,"-Allele ")), "")) %>%
#   mutate(genotype = str_replace_all(str_replace_all(genotype, "1", "0"), "2", "1"))
# 
# taqman1 <- taqman %>% 
#   inner_join(metadata, by='AssayID') %>% 
#   mutate(genotype = replace_na(genotype, "NULL")) %>% 
#   mutate(genotype = ifelse(dbSNP_ref_allele == QC_alt_allele, 
#                            case_when(
#                                genotype == "0/0" ~ "1/1",
#                                genotype == "1/1" ~ "0/0", 
#                                TRUE ~ genotype),
#                            genotype) 
#          ) %>% 
#   mutate(genotype = factor(genotype, levels = c( "0/0", "0/1", "1/1", "NOAMP", "UND", "INV", "NULL"))) %>% 
#   inner_join(genotypes_WGS, by=c('AssayID', 'sample')) %>% 
#   mutate(GT = factor(GT, levels = c( "0/0", "0/1", "1/1", "NOAMP", "UND", "INV", "NULL"))) %>% 
#   mutate(fill_color = ifelse(str_detect(as.character(genotype), regex("\\d|\\d")), as.character(GT), as.character(genotype))) %>% 
#   mutate(fill_color = factor(fill_color, levels = c( "0/0", "0/1", "1/1", "NOAMP", "UND", "INV", "NULL"))) %>% 
#   mutate(correct = ifelse(str_detect(as.character(genotype), regex("\\d|\\d")), 
#                           as.character(genotype) == as.character(GT), NA)) %>% 
#   mutate(rowlabel = fct_rev(factor(paste0(str_pad(n, 3, pad="0"), ": ", AssayID, "\n(", rsID, ")")))) %>% 
#   arrange(-n, sample)

Detailed success rate and comparison with WGS

Description:

the top genotype in each square is the TaqMan genotype
the bottom genotype is the WGS genotype (“gold standard”)

mypal <- scales::brewer_pal(type = "qual", palette = "Paired")(8)[2:8]
p1 <- ggplot(taqman1, aes(sample, rowlabel)) + 
  geom_raster(aes(fill = fill_color)) + 
  geom_text(aes(label = paste0(genotype,"\n", GT)), color = "black", size = 3.5 ) + 
  coord_fixed(expand = FALSE) + 
  scale_fill_manual(values =  mypal) +
  scale_x_discrete(position = "top") +
  labs(x = NULL, y = NULL) +
  theme(panel.border = element_rect(color = NA, fill = NA),
        axis.text.x = element_text(angle = 50, vjust = 1, hjust = 0, size = 8), 
        legend.position = "none")
p2 <- p1 + geom_point(aes(colour = correct, shape=correct), size = 12) +
  scale_color_manual(values = c('firebrick', 'black')) +
  scale_shape_manual(values = c(4, 1)) 

ggplotly(p2,tooltip=c('sample', 'label', 'correct'))

Number of discrepancies per sample

taqman1 %>% filter(correct == FALSE) %>% group_by(sample) %>% count(correct)

Number of discrepancies per assay

snps_dict_rsID <- metadata %>% select("n", "rsID")
snps_dict_AssayID <- metadata %>% select("n", "AssayID")
snps_dict_bothIDs <- metadata %>% select("n", "AssayID", "rsID")

incorrect <- taqman1 %>% filter(correct == FALSE) %>% group_by(AssayID) %>% count(correct) %>% 
  select(-correct) %>% filter(n>0) %>% arrange(-n) %>% rename(count=n) %>% 
  inner_join(snps_dict_bothIDs) %>% 
  inner_join(metadata %>% select(AssayID, no_matches), by="AssayID") %>% 
  rename(epcr_matches=no_matches) %>% 
  relocate(n, AssayID, rsID)
incorrect %>% write_excel_csv("incorrect_by_assay_strict.csv")
incorrect

TaqMan success

Razvan Sultana

2022-05-09

Evaluate TaqMan success rate

Read Genotype Matrix and SNPs information

Detailed success rate and comparison with WGS

Number of discrepancies per sample

Number of discrepancies per assay