dndscv_dir = "~/data2/PureCN_manuscript/Revision/dNdSCV"
sig_genes = read.table(file.path(dndscv_dir, "data/SignificantGenes.csv"), sep = ",", header = TRUE)
t_ind = which(sig_genes$Tumor.only < 0.01)
sig_genes_t = sig_genes$Gene[t_ind] %>% as.character
sig_genes_t
## [1] "KRAS" "TP53" "KEAP1" "ATF7IP" "ARID1A" "STK11"
## [7] "OR5F1" "OR5D13" "OR2AK2" "CD8B" "COX10" "HLA-DRB1"
## [13] "IGLV5-48" "KIR2DL3" "KIR3DL2" "LILRB3" "POLRMT" "PRAMEF18"
## [19] "PRSS3" "RBMXL1" "TEK" "TRAPPC4"
n_ind = which(sig_genes$Matching.normal < 0.01)
sig_genes_n = sig_genes$Gene[n_ind] %>% as.character
sig_genes_n
## [1] "KRAS" "TP53" "KEAP1" "ATF7IP" "ARID1A" "STK11" "BRAF"
## [8] "RB1" "CMTR2" "OR5F1" "SMARCA4"
tumor = readRDS(file.path(dndscv_dir, "data/LUAD_mutTable_tumor_only_rescalePriors.rds"))
normal = readRDS(file.path(dndscv_dir, "data/LUAD_mutTable_matching_normal.rds"))
df_tumor = tumor[[1]][,c("chr", "start", "end")]
df_tumor$sample = 1
colnames(df_tumor)[4] = stringr::str_extract(names(tumor[1]), "TCGA.{11}")
for (i in 2:length(tumor)) {
new_df = tumor[[i]][,c("chr", "start", "end")]
new_df$sample = 1
colnames(new_df)[4] = stringr::str_extract(names(tumor[i]), "TCGA.{11}")
df_tumor = dplyr::full_join(df_tumor, new_df, by = c("chr", "start", "end"))
}
df_normal = normal[[1]][,c("chr", "start", "end")]
df_normal$sample = 1
colnames(df_normal)[4] = stringr::str_extract(names(normal[1]), "TCGA.{11}")
for (i in 2:length(normal)) {
new_df = normal[[i]][,c("chr", "start", "end")]
new_df$sample = 1
colnames(new_df)[4] = stringr::str_extract(names(normal[i]), "TCGA.{11}")
df_normal = dplyr::full_join(df_normal, new_df, by = c("chr", "start", "end"))
}
df_tumor$numSNVs = rowSums(df_tumor[,4:ncol(df_tumor)], na.rm = TRUE)
df_normal$numSNVs = rowSums(df_normal[,4:ncol(df_normal)], na.rm = TRUE)
# df_tumor = df_tumor[,c("chr", "start", "end", "numSNVs")]
# df_normal = df_normal[,c("chr", "start", "end", "numSNVs")]
snv_normal = paste0(df_normal$chr, "_", df_normal$start)
snv_tumor = paste0(df_tumor$chr, "_", df_tumor$start)
ind = which(!snv_tumor %in% snv_normal)
10283 SNVs are in tumor, but not in normal.
snv_onlyTumor = df_tumor[ind,]
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(org.Hs.eg.db)
snv_onlyTumor = makeGRangesFromDataFrame(snv_onlyTumor, keep.extra.columns = TRUE)
snv_onlyTumor = PureCN::annotateTargets(snv_onlyTumor, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, org.Hs.eg.db)
## 'select()' returned many:1 mapping between keys and columns
## 'select()' returned many:1 mapping between keys and columns
## WARN [2020-01-05 20:06:52] Attempted adding gene symbols to intervals. Heuristics have been used to pick symbols for overlapping genes.
saveRDS(snv_onlyTumor, file.path(dndscv_dir, "data/SNVs_onlyInTumor.rds"))
FPs = setdiff(sig_genes_t, sig_genes_n)
sapply(FPs, function(x) {x %in% unique(snv_onlyTumor$Gene)})
## OR5D13 OR2AK2 CD8B COX10 HLA-DRB1 IGLV5-48 KIR2DL3 KIR3DL2
## FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## LILRB3 POLRMT PRAMEF18 PRSS3 RBMXL1 TEK TRAPPC4
## TRUE FALSE TRUE FALSE FALSE TRUE FALSE
sapply(FPs, function(x) {table(snv_onlyTumor$numSNVs[which(snv_onlyTumor$Gene == x)])})
## $OR5D13
## < table of extent 0 >
##
## $OR2AK2
##
## 1
## 2
##
## $CD8B
##
## 108
## 1
##
## $COX10
##
## 74
## 1
##
## $`HLA-DRB1`
##
## 1 37
## 1 1
##
## $`IGLV5-48`
## < table of extent 0 >
##
## $KIR2DL3
##
## 1 85
## 3 1
##
## $KIR3DL2
##
## 1 101
## 1 1
##
## $LILRB3
##
## 1 7 30
## 2 1 1
##
## $POLRMT
## < table of extent 0 >
##
## $PRAMEF18
##
## 1 2 184
## 6 3 1
##
## $PRSS3
## < table of extent 0 >
##
## $RBMXL1
## < table of extent 0 >
##
## $TEK
##
## 1
## 1
##
## $TRAPPC4
## < table of extent 0 >