dndscv_dir = "~/data2/PureCN_manuscript/Revision/dNdSCV"
sig_genes = read.table(file.path(dndscv_dir, "data/SignificantGenes.csv"), sep = ",", header = TRUE)
t_ind = which(sig_genes$Tumor.only < 0.01)
sig_genes_t = sig_genes$Gene[t_ind] %>% as.character
sig_genes_t
## [1] "KRAS" "TP53" "KEAP1" "ATF7IP" "ARID1A" "STK11"
## [7] "OR5F1" "OR5D13" "OR2AK2" "CD8B" "COX10" "HLA-DRB1"
## [13] "IGLV5-48" "KIR2DL3" "KIR3DL2" "LILRB3" "POLRMT" "PRAMEF18"
## [19] "PRSS3" "RBMXL1" "TEK" "TRAPPC4"
g = genes(Homo.sapiens, columns = "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
g_sig_genes_t = subset(g, SYMBOL %in% sig_genes_t)
bedfiles.dir = "/home/sehyun/reference/bedfiles"
ch = import.chain(file.path(bedfiles.dir, "hg19ToHg38.over.chain"))
g_sig_genes_t_hg38 = liftOver(g_sig_genes_t, ch) %>% unlist
dNdSCV-called significant genes not found in Homo.Sapiens db
setdiff(sig_genes_t, unique(g_sig_genes_t$SYMBOL))
## [1] "HLA-DRB1" "IGLV5-48" "PRAMEF18"
MUTECT_OUT = file.path("/nobackup/16tb_a/CNVworkflow_LUAD/mutect_output")
out.dir = file.path(MUTECT_OUT, "normal_panel")
pon_size = 250
file_path = dir(out.dir, pattern="^.*_pon_no_none\\.vcf$", full.names = TRUE)[1: pon_size]
snp_gr = list()
for (i in seq_along(file_path)) {
vcf = readVcf(file_path[i], "hg38") # import VCF file
snp = rowRanges(vcf) # Convert normal VCFs to GRange
ol = findOverlaps(snp, g_sig_genes_t_hg38) # overlapping region
ol_ind = queryHits(ol) # SNPs in significantGenes_tumor
snps = snp[ol_ind,]
sample_name = stringr::str_extract(file_path[i], "TCGA.{21}")
df = data.frame(seqnames = seqnames(snps), start = start(snps), sample = 1)
snp_gr[[sample_name]] = df
}
df = snp_gr[[1]]
colnames(df)[3] = names(snp_gr[1])
for (i in 2:length(snp_gr)) {
new_df = snp_gr[[i]]
colnames(new_df)[3] = names(snp_gr[i])
df = dplyr::full_join(df, new_df, by = c("seqnames", "start"))
}
The number of samples containing each SNP
numSamples = rowSums(df[,3:ncol(df)], na.rm = TRUE)
hist(numSamples, breaks = seq(0,250,5))
abline(v = 5, col = "red")
sum(numSamples < 5)
## [1] 3801
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(org.Hs.eg.db)
df_lt5 = df[which(numSamples < 5),]
df_lt5$end = df_lt5$start # add 'end' column to create GRanges: all are SNP, so end is same as start
gr_lt5 = makeGRangesFromDataFrame(df_lt5, keep.extra.columns = TRUE)
gr_lt5 = PureCN::annotateTargets(gr_lt5, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, org.Hs.eg.db)
## 'select()' returned many:1 mapping between keys and columns
## 'select()' returned many:1 mapping between keys and columns
## WARN [2020-01-05 00:40:52] Attempted adding gene symbols to intervals. Heuristics have been used to pick symbols for overlapping genes.
Why 7 different genes are here? How PureCN::annotateTargets handle multiple genes
at the same site?
setdiff(gr_lt5$Gene, unique(g_sig_genes_t$SYMBOL))
## [1] "LILRA6" "KIR2DS5" "KIR2DL1" "RPS9" "RPS25" "ETFRF1" "KYAT3"
## [8] "."
setdiff(unique(g_sig_genes_t$SYMBOL), gr_lt5$Gene)
## list()