snps_orig <- read_tsv(file = "APM_CT2_pilot_SNP_assay_custom_plate_information.txt") %>%
mutate(n = row_number()) %>%
relocate(n)
write_excel_csv(snps_orig, "table_snps_orig.csv")
datatable(snps_orig, options=list(pageLength=10))
Download this table: table_snps_orig.csv
rs115828298 with
rs10001988snps1 <- snps_orig %>%
mutate(rsID = if_else(rsID == "rs115828298", "rs10001988", rsID)) %>%
mutate(AssayID = ifelse(AssayID == "C__10029783_10", "AN2XP2N", AssayID))
snps_dict_rsID <- snps1 %>% select("n", "rsID")
snps_dict_AssayID <- snps1 %>% select("n", "AssayID") %>%
mutate(AssayID = ifelse(AssayID == "C__10029783_10", "AN2XP2N", AssayID))
snps_dict_bothIDs <- snps1 %>% select("n", "AssayID", "rsID") %>%
mutate(AssayID = ifelse(AssayID == "C__10029783_10", "AN2XP2N", AssayID))
write_excel_csv(snps1, "table_snps1.csv")
datatable(snps1, options=list(pageLength=10))
Download this table: table_snps1.csv
There are 53 custom assays - add the sequence information from
Copy_of_K_s_SNP_TaqMan_ordering.xlsx
snps_design <- read_xlsx("Copy_of_K_s_SNP_TaqMan_ordering.xlsx", sheet = 1, range = "A3:M58") %>%
select(-`...11`) %>%
rename(AssayID = `Order code`, rsID = snp, ContextSequence = `sequence from dbSNP`) %>%
filter(!is.na(AssayID)) %>%
mutate(x = as.data.frame(str_match(`ContextSequence`, regex("(\\w+)\\[(\\w+)/(.+)\\](\\w+)")))) %>%
unnest_wider(x) %>%
select(-V1) %>%
mutate(design_forw_primer=str_trunc(V2, 25, side="left", ellipsis=""), design_rev_primer=str_trunc(V5, 25, side="right", ellipsis="")) %>%
select(-V2, -V5) %>%
rename(design_ref=V3, design_alt=V4)
write_excel_csv(snps_orig, "table_snps_design.csv")
datatable(snps_design, options=list(pageLength=10))
Download this table: table_snps_design.csv
rsids <- names(table(snps1$rsID))
search_snps <- rsids %>% map(~ entrez_search('snp', paste0(., '[RS]')))
snps_df <- search_snps %>% map_df(~xmlToDataFrame(entrez_fetch(db = "snp", id = .$ids, rettype="", retmode = "xml", parsed = TRUE)))
chrom_acc <- names(table(snps_df$ACC))
search_chrom <- chrom_acc %>% map(~ entrez_search('nuccore', .))
chrom_df <- search_chrom %>% map_df(~xmlToDataFrame(entrez_fetch(db = "nuccore", id = .$ids, rettype="docsum", retmode = "xml", parsed = TRUE)))
colnames(chrom_df) <- c('id', 'acc0', 'desc', 'fasta_defline', 'id1', 'date_created', 'date_modified', 'x1', 'taxon_id', 'length', 'status', 'x2', 'x3', 'ACC')
snps_entrez <- snps_df %>%
filter(MERGED_SORT == 0) %>%
inner_join(chrom_df, by="ACC") %>%
mutate(rsID = paste0('rs', SNP_ID)) %>%
relocate(rsID) %>%
select(-SNP_ID, -ALLELE_ORIGIN, -GLOBAL_POPULATION, -GLOBAL_SAMPLESIZE, -SUSPECTED, -TEXT, -CITED_SORT, -MERGED_SORT, -acc0, -id1, -x1, -x2, -x3)
snps_entrez <- snps_entrez %>%
inner_join(snps_dict_rsID, by="rsID") %>%
relocate(n) %>%
arrange(n)
write_excel_csv(snps_entrez, "table_snps_entrez.csv")
datatable(snps_entrez, options=list(pageLength=10))
Download this table: table_snps_entrez.csv
snps_dbsnp <- snps_entrez %>% as_tibble() %>%
mutate(ref = as.data.frame(str_match(SPDI, regex("\\w+:\\d+:(\\w+):"))),
dbSNP_alt_alleles = map_chr(str_match_all(SPDI, regex("\\w+:\\d+:\\w+:(\\w+)")), ~paste(.x[,2], collapse=","))) %>%
unnest_wider(ref) %>% rename(dbSNP_ref_allele = V2) %>% select(-V1) %>%
dplyr::select("rsID", "dbSNP_ref_allele", "dbSNP_alt_alleles", "CHRPOS", "GENES", "FXN_CLASS", "ORIG_BUILD", "UPD_BUILD") %>%
rename(dbSNP_chrpos = CHRPOS, dbSNP_genes = GENES, dbSNP_fxn_class = FXN_CLASS, dbSNP_orig_build = ORIG_BUILD, dbSNP_upd_build = UPD_BUILD )
snps2 <- snps1 %>%
inner_join(snps_dbsnp, by="rsID")
write_excel_csv(snps2, "table_snps_dbsnp.csv")
datatable(snps2, options=list(pageLength=10))
Download this table: table_snps_dbsnp.csv
qc1 <- read_delim("ThermoFIsher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_1.txt",
delim = "\t", escape_double = FALSE,
comment = "*", trim_ws = TRUE)
qc2 <- read_delim("ThermoFIsher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_2.txt",
delim = "\t", escape_double = FALSE,
comment = "*", trim_ws = TRUE)
qc3 <- read_delim("ThermoFIsher_QC/Assay_Info_Custom_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_3.txt",
delim = "\t", escape_double = FALSE,
comment = "*", trim_ws = TRUE)
qc5 <- read_delim("ThermoFIsher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_5.txt",
delim = "\t", escape_double = FALSE,
comment = "*", trim_ws = TRUE)
qc6 <- read_delim("ThermoFIsher_QC/Assay_Info_Custom_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_6.txt",
delim = "\t", escape_double = FALSE,
comment = "*", trim_ws = TRUE)
qc7 <- read_delim("ThermoFIsher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_7.txt",
delim = "\t", escape_double = FALSE,
comment = "*", trim_ws = TRUE)
qc8 <- read_delim("ThermoFIsher_QC/Assay_Info_Custom_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_8.txt",
delim = "\t", escape_double = FALSE,
comment = "*", trim_ws = TRUE)
qc9 <- read_delim("ThermoFIsher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_9.txt",
delim = "\t", escape_double = FALSE,
comment = "*", trim_ws = TRUE)
qc10 <- read_delim("ThermoFIsher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_10.txt",
delim = "\t", escape_double = FALSE,
comment = "*", trim_ws = TRUE)
qc11 <- read_delim("ThermoFIsher_QC/Assay_Info_TaqMan_SNP_SalesOrder_76879698_RackID_7755886_11.txt",
delim = "\t", escape_double = FALSE,
comment = "*", trim_ws = TRUE)
q1 <- qc1 %>% rbind(qc2) %>% rbind(qc5) %>% rbind(qc7) %>% rbind(qc9) %>% rbind(qc10) %>% rbind(qc11) %>%
rename(rsID=`NCBI SNP Reference`) %>% select(-`Celera ID`, -`...56`)
q2 <- qc3 %>% rbind(qc6) %>% rbind(qc8) %>% # rbind(qc12) %>%
rename(rsID=`Assay Name`) %>% select(-`NCBI SNP Reference`, -`...56`)
qc <- q1 %>% rbind(q2) %>%
mutate(x = as.data.frame(str_match(`Context Sequence`, regex("\\w+\\[(\\w+)/(\\w+)\\]")))) %>%
unnest_wider(x) %>%
dplyr::select(-V1) %>%
rename(QC_ref_allele = V2, QC_alt_allele = V3) %>%
mutate(QC_chrpos = paste0(`Chromosome`, ':',`Location on NCBI Assembly` ))
qc <- qc %>%
right_join(snps_dict_rsID, by="rsID") %>%
relocate(n) %>%
arrange(n)
# dplyr::select(`Assay ID`, `rsID`, `QC_ref_allele`, `QC_alt_allele`, `QC_chrpos`, `Forward Primer Seq.`, `Reverse Primer Seq.`, `Context Sequence`, `Design Strand`)
write_excel_csv(qc, "table_snps_qc.csv")
datatable(qc, options=list(pageLength=10))
Download this table: table_snps_qc.csv
library(xml2)
library(rvest)
library(curl)
forw_seq = "CAGAGGAGGGGGCGTGGGGGGGGCT"
rev_seq = "GTTTTCCCCGAGACTGAGAAATGAA"
test_html <- curl_fetch_memory(paste0("https://genome.ucsc.edu/cgi-bin/hgPcr?org=Human&db=hg38&wp_target=genome&wp_f=", forw_seq, "&wp_r=",rev_seq, "&Submit=submit&wp_size=400&wp_perfect=15&wp_good=15&wp_flipReverse=on&boolshad.wp_flipReverse=0"))
test <- xml2::read_html(rawToChar(test_html$content))
t <- test %>%
html_node("table.hgInside") %>%
html_node("pre") %>%
html_text() %>%
str_split(">") %>%
unlist() %>%
map_dfr(~as.data.frame(str_split_fixed(., "\n", 2))) %>%
filter(V1!="") %>%
mutate(a=str_split(V1, " ")) %>%
unnest_wider(col='a', names_sep = "_") %>%
mutate(matchseq = str_remove_all(V2, "\n")) %>%
select(-V1, -V2) %>%
rename(chrpos = a_1, match_len = a_2, forw_primer = a_3, rev_primer = a_4)
datatable(t)
ucsc_epcr <- function(forw, rev) {
response <- curl::curl_fetch_memory(paste0("https://genome.ucsc.edu/cgi-bin/hgPcr?org=Human&db=hg38&wp_target=genome&wp_f=", forw, "&wp_r=", rev, "&Submit=submit&wp_size=100&wp_perfect=15&wp_good=15&wp_flipReverse=on&boolshad.wp_flipReverse=0"))
return (xml2::read_html(rawToChar(response$content)))
}
parse_ucsc <- function(html_content) {
l <- html_content %>%
html_node("table.hgInside") %>%
html_node("pre") %>%
html_text() %>%
str_split(">") %>%
unlist()
if (length(l)>1) {
df <- l %>%
map_dfr(~as.data.frame(str_split_fixed(., "\n", 2))) %>%
filter(V1!="") %>%
mutate(a=str_split(V1, " ")) %>%
unnest_wider(col='a', names_sep = "_") %>%
mutate(matchseq = str_remove_all(V2, "\n")) %>%
select(-V1, -V2) %>%
rename(chrpos = a_1, match_len = a_2, forw_primer = a_3, rev_primer = a_4) %>%
filter(!str_detect(chrpos, regex("alt|fix")))
} else {
df <- tibble(chrpos=character(), match_len=character(), forw_primer=character(), rev_primer=character(), matchseq=character())
}
return(df)
}
design1 <- snps_design %>% mutate(epcr_result = map2(design_forw_primer, design_rev_primer, ucsc_epcr))
design2 <- design1 %>% mutate(x = map(epcr_result, parse_ucsc)) %>% select(AssayID, rsID, x) %>% unnest_longer(x) %>% unnest_wider(x)
design2_detail <- design2 %>% inner_join(snps_dict_AssayID) %>%
relocate(n) %>% arrange(n) %>%
mutate(designed_rsID_chrpos = paste0(str_extract(chrpos, "[^:]+:"), as.numeric(str_sub(str_extract(chrpos, ":\\d+[+-]"), 2, -2)) + length(forw_primer))) %>%
relocate(designed_rsID_chrpos, .after = rsID)
write_excel_csv(design2_detail, "table_customdesign_epcr_detail.csv")
datatable(design2_detail, options=list(pageLength=10))
Download this table: table_customdesign_epcr_detail.csv
custom_assays <- snps_dict_bothIDs %>% inner_join(snps_design %>% select(rsID))
design2_summary <- design2 %>% count(AssayID, rsID) %>% rename(no_matches=n) %>% right_join(custom_assays) %>% relocate(n) %>% arrange(n) %>% replace(is.na(.), 0)
write_excel_csv(design2_summary, "table_customdesign_epcr_summary.csv")
datatable(design2_summary, options=list(pageLength=10))
Download this table: table_customdesign_epcr_summary.csv
ucsc_epcr_noflip <- function(forw, rev) {
response <- curl::curl_fetch_memory(paste0("https://genome.ucsc.edu/cgi-bin/hgPcr?org=Human&db=hg38&wp_target=genome&wp_f=", forw, "&wp_r=", rev, "&Submit=submit&wp_size=100&wp_perfect=15&wp_good=15"))
return (xml2::read_html(rawToChar(response$content)))
}
qc_custom <- qc %>%
filter(str_detect(`Product Type`, 'Custom')) %>%
select(n, `Assay ID`, `Forward Primer Seq.`, `Reverse Primer Seq.`) %>%
rename(AssayID=`Assay ID`, qc_forw_primer=`Forward Primer Seq.`, qc_rev_primer=`Reverse Primer Seq.`)
qc_custom1 <- qc_custom %>% mutate(epcr_result = map2(qc_forw_primer, qc_rev_primer, ucsc_epcr_noflip))
qc_custom2 <- qc_custom1 %>% mutate(x = map(epcr_result, parse_ucsc)) %>% select(AssayID, x) %>% unnest_longer(x) %>% unnest_wider(x)
qc_custom2_detail <- qc_custom2 %>% inner_join(snps_dict_AssayID) %>% relocate(n) %>% arrange(n) %>%
mutate(qc_rsID_chrpos = paste0(str_extract(chrpos, "[^:]+:"), as.numeric(str_sub(str_extract(chrpos, ":\\d+[+-]"), 2, -2)) + length(forw_primer))) %>%
relocate(qc_rsID_chrpos, .after = AssayID)
write_excel_csv(qc_custom2_detail, "table_qc_custom_epcr_detail.csv")
datatable(qc_custom2_detail, options=list(pageLength=10))
Download this table: table_qc_custom_epcr_detail.csv
custom_assays <- snps_dict_bothIDs %>% inner_join(qc_custom %>% select(AssayID))
qc_custom2_summary <- qc_custom2 %>% count(AssayID) %>% rename(no_matches=n) %>% right_join(custom_assays) %>% relocate(n) %>% arrange(n) %>% replace(is.na(.), 0) %>% relocate(no_matches, .after=rsID)
write_excel_csv(qc_custom2_summary, "table_qc_custom_epcr_summary.csv")
datatable(qc_custom2_summary, options=list(pageLength=10))
Download this table: table_qc_custom_epcr_summary.csv
qc_pre <- qc %>%
filter(str_detect(`Assay ID`, 'C_')) %>%
select(n, `Assay ID`, `Context Sequence`) %>%
rename(AssayID=`Assay ID`, ContextSequence=`Context Sequence`) %>%
mutate(x = as.data.frame(str_match(`ContextSequence`, regex("(\\w+)\\[(\\w+)/(.+)\\](\\w+)")))) %>%
unnest_wider(x) %>%
select(-V1) %>%
rename(qc_forw_primer=V2, qc_rev_primer=V5, qc_ref=V3, qc_alt=V4)
qc_pre1 <- qc_pre %>% mutate(epcr_result = map2(qc_forw_primer, qc_rev_primer, ucsc_epcr))
qc_pre2 <- qc_pre1 %>% mutate(x = map(epcr_result, parse_ucsc)) %>% select(AssayID, x) %>% unnest_longer(x) %>% unnest_wider(x)
qc_pre2_detail <- qc_pre2 %>% inner_join(snps_dict_AssayID) %>% relocate(n) %>% arrange(n) %>%
mutate(qc_rsID_chrpos = paste0(str_extract(chrpos, "[^:]+:"), as.numeric(str_sub(str_extract(chrpos, ":\\d+[+-]"), 2, -2)) + length(forw_primer))) %>%
relocate(qc_rsID_chrpos, .after = AssayID)
write_excel_csv(qc_pre2_detail, "table_qc_predesigned_epcr_detail.csv")
datatable(qc_pre2_detail, options=list(pageLength=10))
Download this table: table_qc_predesigned_epcr_detail.csv
pre_assays <- snps_dict_bothIDs %>% inner_join(qc_pre %>% select(AssayID))
qc_pre2_summary <- qc_pre2 %>% count(AssayID) %>% rename(no_matches=n) %>% right_join(pre_assays) %>% relocate(n) %>% arrange(n) %>% replace(is.na(.), 0) %>% relocate(no_matches, .after=rsID)
write_excel_csv(qc_pre2_summary, "table_qc_predesigned_epcr_summary.csv")
datatable(qc_pre2_summary, options=list(pageLength=10))
Download this table: table_qc_predesigned_epcr_summary.csv
# qc12 <- read_delim("ThermoFIsher_QC/Assay_Info_Custom_TaqMan_SNP_SalesOrder_76929576_RackID_7765174_1.txt",
# delim = "\t", escape_double = FALSE,
# comment = "*", trim_ws = TRUE)
# qc13 <- read_delim("ThermoFIsher_QC/Assay_Info_Custom_TaqMan_SNP_SalesOrder_76929576_RackID_7765174_2.txt",
# delim = "\t", escape_double = FALSE,
# comment = "*", trim_ws = TRUE)
# q3 <- qc12 %>% rbind(qc13) %>%
# select(-`NCBI SNP Reference`, -`...56`)
revcomp <- function(x) {
return(as.character(Biostrings::reverseComplement(Biostrings::DNAString(x))))
}
qc_extra <- read_excel("ThermoFisher_QC/Garvan Institute of Medical Research Sequencing.xlsx", sheet=1, range="A1:I13") %>%
mutate(x = as.data.frame(str_match(`Context Sequence`, regex("(\\w+)\\[(\\w+)/(.+)\\](\\w+)")))) %>%
unnest_wider(x) %>%
select(-V1) %>%
rename(qc_forw_primer=V2, qc_rev_primer=V5, qc_ref=V3, qc_alt=V4) %>%
mutate(qc_forw_primer = ifelse(is.na(qc_forw_primer), `Forward Primer Seq.`, qc_forw_primer),
qc_rev_primer = map2_chr(qc_rev_primer, `Reverse Primer Seq.`, ~ifelse(is.na(.x), revcomp(.y), .x))) %>%
left_join((snps_design %>% select(rsID, design_ref, design_alt)), by="rsID") %>%
mutate(QC_ref_allele = if_else(is.na(qc_ref), design_ref, qc_ref),
QC_alt_allele = if_else(is.na(qc_alt), design_alt, qc_alt)) %>%
select(-design_ref, -design_alt, -qc_ref, -qc_alt)
datatable(qc_extra, options=list(pageLength=12))
qc_extra1 <- qc_extra %>% mutate(epcr_result = map2(qc_forw_primer, qc_rev_primer, ucsc_epcr))
qc_extra2 <- qc_extra1 %>% mutate(x = map(epcr_result, parse_ucsc)) %>% select(AssayID, x) %>% unnest_longer(x) %>% unnest_wider(x) %>%
mutate(extra_rsID_chrpos = paste0(str_extract(chrpos, "[^:]+:"), as.numeric(str_sub(str_extract(chrpos, ":\\d+[+-]"), 2, -2)) + length(forw_primer))) %>%
relocate(extra_rsID_chrpos, .after = AssayID)
#qc_extra2_detail <- qc_extra2 %>% inner_join(snps_dict_AssayID) %>% relocate(n) %>% arrange(n)
write_excel_csv(qc_extra2, "table_qc_extra_epcr_detail.csv")
datatable(qc_extra2, options=list(pageLength=10))
Download this table: table_qc_extra_epcr_detail.csv
extra_assays <- snps_dict_bothIDs %>% filter(rsID %in% qc_extra$rsID)
qc_extra2_summary <- qc_extra2 %>% count(AssayID) %>% rename(no_matches=n) %>%
right_join(extra_assays, by='AssayID') %>%
inner_join(qc_extra, by=c('AssayID', 'rsID')) %>%
mutate(no_matches = ifelse(is.na(no_matches), 0, no_matches)) %>%
relocate(n) %>% arrange(n)
# relocate(no_matches, .after=rsID)
write_excel_csv(qc_extra2_summary, "table_qc_extra_epcr_summary.csv")
datatable(qc_extra2_summary, options=list(pageLength=10))
Download this table: table_qc_extra_epcr_summary.csv
epcr_results <- qc_custom2_summary %>% select(AssayID, rsID, no_matches) %>%
rbind(qc_pre2_summary %>% select(AssayID, rsID, no_matches)) %>%
mutate(AssayID = if_else(AssayID == "C__10029783_10", "AN2XP2N", AssayID)) %>%
rbind(qc_extra2_summary %>% select(AssayID, rsID, no_matches))
qc_all <- qc %>%
rename(AssayID=`Assay ID`, qc_rsID=rsID) %>%
mutate(AssayID = if_else(AssayID == "C__10029783_10", "AN2XP2N", AssayID)) %>%
filter(!is.na(`Customer Name`))
qc_extra1 <- qc_extra %>% filter(AssayID != "AN2XP2N") %>% inner_join(snps_dict_rsID, by='rsID') %>% relocate(n)
qc_extra1[setdiff(colnames(qc_all), colnames(qc_extra1))] <- NA
qc_extra1 <- qc_extra1 %>%
select(colnames(qc_all))
qc_all <- qc_all %>% rbind(qc_extra1) %>% arrange(n) %>% select(-n)
metadata_all <- snps2 %>%
mutate(AssayID = if_else(AssayID == "C__10029783_10", "AN2XP2N", AssayID)) %>%
left_join(qc_all, by="AssayID") %>%
left_join(epcr_results, by=c("AssayID", "rsID")) %>%
unique() %>%
mutate(dbSNP_ref_allele = case_when(AssayID=="C__26765406_10" ~ "A", AssayID=="ANPR3AH" ~ "GT", TRUE ~ dbSNP_ref_allele),
dbSNP_alt_alleles = case_when(AssayID %in% c("C__26765406_10", "C__34117344_10","ANPR3AH") ~ "-", TRUE ~ dbSNP_alt_alleles)) %>%
arrange(n)
write_excel_csv(metadata_all, "metadata_all.csv")
datatable(metadata_all, options=list(pageLength=10))
metadata <- metadata_all %>%
select(n, AssayID, rsID, AssayType, dbSNP_ref_allele, dbSNP_alt_alleles, dbSNP_chrpos, qc_rsID, QC_ref_allele, QC_alt_allele, QC_chrpos, no_matches, `Order Number`, `Ship Date`, `Plate ID`, `Forward Primer Seq.`, `Reverse Primer Seq.`, `Context Sequence`, `SNP Type`)
write_excel_csv(metadata, "metadata.csv")
datatable(metadata, options=list(pageLength=10))