1 Significant genes from tumor-only dNdSCV

dndscv_dir = "~/data2/PureCN_manuscript/Revision/dNdSCV"
sig_genes = read.table(file.path(dndscv_dir, "data/SignificantGenes.csv"), sep = ",", header = TRUE)
t_ind = which(sig_genes$Tumor.only < 0.01)
sig_genes_t = sig_genes$Gene[t_ind] %>% as.character
sig_genes_t
##  [1] "KRAS"     "TP53"     "KEAP1"    "ATF7IP"   "ARID1A"   "STK11"   
##  [7] "OR5F1"    "OR5D13"   "OR2AK2"   "CD8B"     "COX10"    "HLA-DRB1"
## [13] "IGLV5-48" "KIR2DL3"  "KIR3DL2"  "LILRB3"   "POLRMT"   "PRAMEF18"
## [19] "PRSS3"    "RBMXL1"   "TEK"      "TRAPPC4"

1.1 GRanges of significant genes

g = genes(Homo.sapiens, columns = "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
g_sig_genes_t = subset(g, SYMBOL %in% sig_genes_t)

bedfiles.dir = "/home/sehyun/reference/bedfiles"
ch = import.chain(file.path(bedfiles.dir, "hg19ToHg38.over.chain"))
g_sig_genes_t_hg38 = liftOver(g_sig_genes_t, ch) %>% unlist

1.2 Trouble-shooting

dNdSCV-called significant genes not found in Homo.Sapiens db

setdiff(sig_genes_t, unique(g_sig_genes_t$SYMBOL))
## [1] "HLA-DRB1" "IGLV5-48" "PRAMEF18"

2 All the normal VCFs

MUTECT_OUT = file.path("/nobackup/16tb_a/CNVworkflow_LUAD/mutect_output")
out.dir = file.path(MUTECT_OUT, "normal_panel")
pon_size = 250

file_path = dir(out.dir, pattern="^.*_pon_no_none\\.vcf$", full.names = TRUE)[1: pon_size]

2.1 Find SNPs in normal VCFs

snp_gr = list()
for (i in seq_along(file_path)) {
    
    vcf = readVcf(file_path[i], "hg38")   # import VCF file
    snp = rowRanges(vcf)   # Convert normal VCFs to GRange
    
    ol = findOverlaps(snp, g_sig_genes_t_hg38)   # overlapping region
    ol_ind = queryHits(ol)   # SNPs in significantGenes_tumor
    snps = snp[ol_ind,]
    sample_name = stringr::str_extract(file_path[i], "TCGA.{21}")
    
    df = data.frame(seqnames = seqnames(snps), start = start(snps), sample = 1)
    snp_gr[[sample_name]] = df
}

3 Summarize SNPs

df = snp_gr[[1]]
colnames(df)[3] = names(snp_gr[1])

for (i in 2:length(snp_gr)) {
    new_df = snp_gr[[i]]
    colnames(new_df)[3] = names(snp_gr[i])
    df = dplyr::full_join(df, new_df, by = c("seqnames", "start"))
}

The number of samples containing each SNP

numSamples = rowSums(df[,3:ncol(df)], na.rm = TRUE)
hist(numSamples, breaks = seq(0,250,5))
abline(v = 5, col = "red")

sum(numSamples < 5)
## [1] 3801

3.1 GRanges of SNVs in <5 normals

library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(org.Hs.eg.db)

df_lt5 = df[which(numSamples < 5),]
df_lt5$end = df_lt5$start   # add 'end' column to create GRanges: all are SNP, so end is same as start
gr_lt5 = makeGRangesFromDataFrame(df_lt5, keep.extra.columns = TRUE)
gr_lt5 = PureCN::annotateTargets(gr_lt5, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, org.Hs.eg.db)
## 'select()' returned many:1 mapping between keys and columns
## 'select()' returned many:1 mapping between keys and columns
## WARN [2020-01-05 00:40:52] Attempted adding gene symbols to intervals. Heuristics have been used to pick symbols for overlapping genes.

3.2 Trouble-shooting

Why 7 different genes are here? How PureCN::annotateTargets handle multiple genes at the same site?

setdiff(gr_lt5$Gene, unique(g_sig_genes_t$SYMBOL))
## [1] "LILRA6"  "KIR2DS5" "KIR2DL1" "RPS9"    "RPS25"   "ETFRF1"  "KYAT3"  
## [8] "."
setdiff(unique(g_sig_genes_t$SYMBOL), gr_lt5$Gene)
## list()