1 Significant genes from tumor-only dNdSCV

dndscv_dir = "~/data2/PureCN_manuscript/Revision/dNdSCV"
sig_genes = read.table(file.path(dndscv_dir, "data/SignificantGenes.csv"), sep = ",", header = TRUE)
t_ind = which(sig_genes$Tumor.only < 0.01)
sig_genes_t = sig_genes$Gene[t_ind] %>% as.character
sig_genes_t

##  [1] "KRAS"     "TP53"     "KEAP1"    "ATF7IP"   "ARID1A"   "STK11"   
##  [7] "OR5F1"    "OR5D13"   "OR2AK2"   "CD8B"     "COX10"    "HLA-DRB1"
## [13] "IGLV5-48" "KIR2DL3"  "KIR3DL2"  "LILRB3"   "POLRMT"   "PRAMEF18"
## [19] "PRSS3"    "RBMXL1"   "TEK"      "TRAPPC4"

n_ind = which(sig_genes$Matching.normal < 0.01)
sig_genes_n = sig_genes$Gene[n_ind] %>% as.character
sig_genes_n

##  [1] "KRAS"    "TP53"    "KEAP1"   "ATF7IP"  "ARID1A"  "STK11"   "BRAF"   
##  [8] "RB1"     "CMTR2"   "OR5F1"   "SMARCA4"

2 Compare dNdSCV inputs

tumor = readRDS(file.path(dndscv_dir, "data/LUAD_mutTable_tumor_only_rescalePriors.rds"))
normal = readRDS(file.path(dndscv_dir, "data/LUAD_mutTable_matching_normal.rds"))

2.1 Combine all SNVs

2.1.1 tumor-only

df_tumor = tumor[[1]][,c("chr", "start", "end")]
df_tumor$sample = 1
colnames(df_tumor)[4] = stringr::str_extract(names(tumor[1]), "TCGA.{11}")

for (i in 2:length(tumor)) {
    new_df = tumor[[i]][,c("chr", "start", "end")]
    new_df$sample = 1
    colnames(new_df)[4] = stringr::str_extract(names(tumor[i]), "TCGA.{11}")
    df_tumor = dplyr::full_join(df_tumor, new_df, by = c("chr", "start", "end"))
}

2.1.2 matched-normal

df_normal = normal[[1]][,c("chr", "start", "end")]
df_normal$sample = 1
colnames(df_normal)[4] = stringr::str_extract(names(normal[1]), "TCGA.{11}")

for (i in 2:length(normal)) {
    new_df = normal[[i]][,c("chr", "start", "end")]
    new_df$sample = 1
    colnames(new_df)[4] = stringr::str_extract(names(normal[i]), "TCGA.{11}")
    df_normal = dplyr::full_join(df_normal, new_df, by = c("chr", "start", "end"))
}

2.2 Count SNVs

df_tumor$numSNVs = rowSums(df_tumor[,4:ncol(df_tumor)], na.rm = TRUE)
df_normal$numSNVs = rowSums(df_normal[,4:ncol(df_normal)], na.rm = TRUE)

# df_tumor = df_tumor[,c("chr", "start", "end", "numSNVs")]
# df_normal = df_normal[,c("chr", "start", "end", "numSNVs")]

3 SNVs only in tumor-only

snv_normal = paste0(df_normal$chr, "_", df_normal$start)
snv_tumor = paste0(df_tumor$chr, "_", df_tumor$start)

ind = which(!snv_tumor %in% snv_normal)

10283 SNVs are in tumor, but not in normal.

snv_onlyTumor = df_tumor[ind,]

3.1 Annotate with gene symbol

library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(org.Hs.eg.db)

snv_onlyTumor = makeGRangesFromDataFrame(snv_onlyTumor, keep.extra.columns = TRUE)
snv_onlyTumor = PureCN::annotateTargets(snv_onlyTumor, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, org.Hs.eg.db)

## 'select()' returned many:1 mapping between keys and columns
## 'select()' returned many:1 mapping between keys and columns

## WARN [2020-01-05 20:06:52] Attempted adding gene symbols to intervals. Heuristics have been used to pick symbols for overlapping genes.

3.2 Save SNVs only in tumor

saveRDS(snv_onlyTumor, file.path(dndscv_dir, "data/SNVs_onlyInTumor.rds"))

3.3 How many 1~4 SNVs are there for cancer genes from tumor-only analysis?

FPs = setdiff(sig_genes_t, sig_genes_n)
sapply(FPs, function(x) {x %in% unique(snv_onlyTumor$Gene)})

##   OR5D13   OR2AK2     CD8B    COX10 HLA-DRB1 IGLV5-48  KIR2DL3  KIR3DL2 
##    FALSE     TRUE     TRUE     TRUE     TRUE    FALSE     TRUE     TRUE 
##   LILRB3   POLRMT PRAMEF18    PRSS3   RBMXL1      TEK  TRAPPC4 
##     TRUE    FALSE     TRUE    FALSE    FALSE     TRUE    FALSE

sapply(FPs, function(x) {table(snv_onlyTumor$numSNVs[which(snv_onlyTumor$Gene == x)])})

## $OR5D13
## < table of extent 0 >
## 
## $OR2AK2
## 
## 1 
## 2 
## 
## $CD8B
## 
## 108 
##   1 
## 
## $COX10
## 
## 74 
##  1 
## 
## $`HLA-DRB1`
## 
##  1 37 
##  1  1 
## 
## $`IGLV5-48`
## < table of extent 0 >
## 
## $KIR2DL3
## 
##  1 85 
##  3  1 
## 
## $KIR3DL2
## 
##   1 101 
##   1   1 
## 
## $LILRB3
## 
##  1  7 30 
##  2  1  1 
## 
## $POLRMT
## < table of extent 0 >
## 
## $PRAMEF18
## 
##   1   2 184 
##   6   3   1 
## 
## $PRSS3
## < table of extent 0 >
## 
## $RBMXL1
## < table of extent 0 >
## 
## $TEK
## 
## 1 
## 1 
## 
## $TRAPPC4
## < table of extent 0 >

Germline variant (PoN) filtering condition

2020-01-05

1 Significant genes from tumor-only dNdSCV

2 Compare dNdSCV inputs

2.1 Combine all SNVs

2.1.1 tumor-only

2.1.2 matched-normal

2.2 Count SNVs

3 SNVs only in tumor-only

3.1 Annotate with gene symbol

3.2 Save SNVs only in tumor

3.3 How many 1~4 SNVs are there for cancer genes from tumor-only analysis?