Read in all the ThermoFisher order QC files

qc1 <- read_delim("ThermoFisher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_1.txt",
                  delim = "\t", escape_double = FALSE,
                  comment = "*", trim_ws = TRUE)
qc2 <- read_delim("ThermoFisher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_2.txt",
                  delim = "\t", escape_double = FALSE,
                  comment = "*", trim_ws = TRUE)
qc3 <- read_delim("ThermoFisher_QC/Assay_Info_Custom_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_3.txt",
                  delim = "\t", escape_double = FALSE,
                  comment = "*", trim_ws = TRUE)
qc5 <- read_delim("ThermoFisher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_5.txt",
                  delim = "\t", escape_double = FALSE,
                  comment = "*", trim_ws = TRUE)
qc6 <- read_delim("ThermoFisher_QC/Assay_Info_Custom_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_6.txt",
                  delim = "\t", escape_double = FALSE,
                  comment = "*", trim_ws = TRUE)
qc7 <- read_delim("ThermoFisher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_7.txt",
                  delim = "\t", escape_double = FALSE,
                  comment = "*", trim_ws = TRUE)
qc8 <- read_delim("ThermoFisher_QC/Assay_Info_Custom_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_8.txt",
                  delim = "\t", escape_double = FALSE,
                  comment = "*", trim_ws = TRUE)
qc9 <- read_delim("ThermoFisher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_9.txt",
                  delim = "\t", escape_double = FALSE,
                  comment = "*", trim_ws = TRUE)
qc10 <- read_delim("ThermoFisher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_10.txt",
                   delim = "\t", escape_double = FALSE,
                   comment = "*", trim_ws = TRUE)
qc11 <- read_delim("ThermoFisher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_11.txt",
                   delim = "\t", escape_double = FALSE,
                   comment = "*", trim_ws = TRUE)
qc12 <- read_delim("ThermoFisher_QC/Assay_Info_Custom_TaqMan_SNP_SalesOrder_76929576_RackID_7765174_1.txt",
                   delim = "\t", escape_double = FALSE,
                   comment = "*", trim_ws = TRUE)

q1 <- qc1 %>% rbind(qc2) %>% rbind(qc5) %>% rbind(qc7) %>% rbind(qc9) %>% rbind(qc10) %>% rbind(qc11) %>% 
  dplyr::rename(rsID=`NCBI SNP Reference`) %>% select(-`Celera ID`, -`...56`)
q2 <- qc3 %>% rbind(qc6) %>% rbind(qc8) %>% rbind(qc12) %>% 
  dplyr::rename(rsID=`Assay Name`) %>% select(-`NCBI SNP Reference`, -`...56`)

qc <- q1 %>% rbind(q2) %>%
  mutate(x = as.data.frame(str_match(`Context Sequence`, regex("\\w+\\[(\\w+)/(\\w+)\\]")))) %>% 
  unnest_wider(x) %>% 
  dplyr::select(-V1) %>% 
  dplyr::rename(ref_allele_QC = V2, alt_allele_QC = V3) %>% 
  mutate(chrpos_QC = paste0(`Chromosome`, ':',`Location on NCBI Assembly` )) %>% 
  dplyr::select(`Assay ID`, `rsID`, `ref_allele_QC`, `alt_allele_QC`, `chrpos_QC`)

qc[139:149,2] <- c('rs1372420', 'rs8064765', 'rs3806760', 'rs10448130', 'rs62075803', 'rs7515370', 'rs9330264', 'rs79531911', 'rs11097213', 'rs536718528', 'rs1859223')
                #`Chromosome`, `Location on NCBI Assembly`, `Forward Primer Seq.`, `Reverse Primer Seq.`, `Context Sequence`, `Design Strand`)
datatable(qc, options=list(pageLength=10))

Pull dbSNP information through Entrez

library(rentrez)
library(XML)

rsids <- names(table(qc$rsID)) # eliminate NAs
search_snps <- rsids %>% map(~ entrez_search('snp', paste0(., '[RS]')))
snps_df <- search_snps %>% map_df(~xmlToDataFrame(entrez_fetch(db = "snp", id = .$ids, rettype="", retmode = "xml", parsed = TRUE)))

chrom_acc <- names(table(snps_df$ACC))
search_chrom <- chrom_acc %>% map(~ entrez_search('nuccore', .))
chrom_df <- search_chrom %>% map_df(~xmlToDataFrame(entrez_fetch(db = "nuccore", id = .$ids, rettype="docsum", retmode = "xml", parsed = TRUE)))
colnames(chrom_df) <- c('id', 'acc0', 'desc', 'fasta_defline', 'id1', 'date_created', 'date_modified', 'x1', 'taxon_id', 'length', 'status', 'x2', 'x3', 'ACC')

snps <- snps_df %>% 
  filter(MERGED_SORT == 0) %>% 
  inner_join(chrom_df, by="ACC") %>% 
  mutate(rsID = paste0('rs', SNP_ID)) %>% 
  relocate(rsID) %>% 
  select(-SNP_ID, -ALLELE_ORIGIN, -GLOBAL_POPULATION, -GLOBAL_SAMPLESIZE, -SUSPECTED, -TEXT, -CITED_SORT, -MERGED_SORT, -acc0, -id1, -x1, -x2, -x3)

snps

dbSNP QC

snps_dbsnp <- snps %>% as_tibble() %>% 
  mutate(ref = as.data.frame(str_match(SPDI, regex("\\w+:\\d+:(\\w+):"))), 
         alt_alleles_dbsnp = map_chr(str_match_all(SPDI, regex("\\w+:\\d+:\\w+:(\\w+)")), ~paste(.x[,2], collapse=","))) %>% 
  unnest_wider(ref) %>% rename(ref_allele_dbsnp = V2) %>% select(-V1) %>%
  dplyr::select('rsID', 'ref_allele_dbsnp', 'alt_alleles_dbsnp', 'ALLELE', 'CHRPOS') %>%
  rename(chrpos_dbsnp = CHRPOS, allele_dbsnp= ALLELE)
  
snps_dbsnp

WGS QC

qc_wgs <- read_tsv("48samples_149SNPs_genotypes_by_rsID.txt") %>% 
  select('chrom', 'chromEnd', 'rsID', 'ref', 'alt') %>% unique() %>%
  mutate(chrpos_wgs=paste0(chrom, ':', chromEnd)) %>%
  select(-chrom, -chromEnd) %>%
  rename(ref_wgs=ref, alt_wgs=alt)
qc_wgs

TaqMan vs dbSNP vs WGS

qc_all <- qc %>% 
  inner_join(snps_dbsnp, by='rsID') %>% 
  inner_join(qc_wgs) %>%
  relocate(chrpos_QC, .after = last_col()) %>% 
  relocate(chrpos_dbsnp, .after = last_col()) %>% 
  relocate(chrpos_wgs, .after = last_col())
write_excel_csv(qc_all, "QC_all.csv")
qc_all