library(tidyverse)
library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(preprocessCore)# read in TCGA subtype information
TCGA_SKCM <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/TCGA_SKCM_SubtypeInformation_Terrence_10-31-17.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
head(TCGA_SKCM)## Name TCGA_ID SampleID MelCore.Sample.Number
## 1 TCGA-D3-A5GL-06 TCGA-D3-A5GL M822 MelCore_12822
## 2 TCGA-D3-A2JH-06 TCGA-D3-A2JH M233 MelCore_56233
## 3 TCGA-D3-A2JP-06 TCGA-D3-A2JP M807 MelCore_54807
## 4 TCGA-D3-A2JK-06 TCGA-D3-A2JK M721 MelCore_14721
## 5 TCGA-D3-A2JG-06 TCGA-D3-A2JG M137 MelCore_56137
## 6 TCGA-D3-A2J7-06 TCGA-D3-A2J7 M263 MelCore_13263
## MelCore.Sample.Number.1 TUMOR_TISSUE_SITE1
## 1 12822 Regional Lymph Node
## 2 56233 Regional Cutaneous or Subcutaneous Tissue
## 3 54807 Regional Lymph Node
## 4 14721 Regional Lymph Node
## 5 56137 Regional Lymph Node
## 6 13263 Regional Lymph Node
## DAYS_TO_SAMPLE_PROCUREMENT2 OS_INITIALDX_DAYS
## 1 496 3826
## 2 197 1280
## 3 22 1812
## 4 176 368
## 5 3246 3453
## 6 553 3136
## POST_ACCESSION_SURVIVAL_DAYS OS_STATUS PURITY..ABS.OR.CPE. BRAF
## 1 3330 0 0.914 -
## 2 1083 0 0.300 p.K601E
## 3 1790 0 0.580 p.V600E
## 4 192 1 0.660 p.V600E
## 5 207 1 0.790 p.V600E
## 6 2583 1 0.420 p.V600E
## NRAS NF1 MUT_TYPE TOTAL_MUT UV_MUT NONSILENT_MUT
## 1 - - Triple_WT 1695 1278 381
## 2 - - BRAF_Hotspot_Mutant 2694 2453 812
## 3 p.H131N p.P458T BRAF_Hotspot_Mutant 12397 12251 4832
## 4 - - BRAF_Hotspot_Mutant 3355 3162 974
## 5 - - BRAF_Hotspot_Mutant 3166 3046 832
## 6 - - BRAF_Hotspot_Mutant 1478 1329 424
## UV_RATE RNA Meth MIR
## 1 0.754 keratin hyper-methylated MIR.type.2
## 2 0.911 immune normal-like MIR.type.4
## 3 0.988 immune hypo-methylated MIR.type.3
## 4 0.942 keratin hypo-methylated MIR.type.1
## 5 0.962 keratin CpG island-methylated MIR.type.3
## 6 0.899 immune hyper-methylated MIR.type.2
TCGA_Melcore <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/Melcore.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
head(TCGA_Melcore)## MelCore.Sample.Number TCGA_ID_RaiLab MelCore.Sample.Number.1
## 1 MelCore_9303 TCGA-D3-A1Q3 9303
## 2 MelCore_11142 TCGA-D3-A2J9 11142
## 3 MelCore_11527 TCGA-D3-A1Q4 11527
## 4 MelCore_11559 TCGA-D3-A1Q5 11559
## 5 MelCore_12026 TCGA-D3-A3BZ 12026
## 6 MelCore_12484 TCGA-D3-A3MR 12484
## TCGA_ID_Cell_2015 Name ALL_SAMPLES MUTATIONSUBTYPES
## 1 TCGA-D3-A1Q3 TCGA-D3-A1Q3-06 Yes Triple_WT
## 2 TCGA-D3-A2J9 TCGA-D3-A2J9-06 Yes Triple_WT
## 3 TCGA-D3-A1Q4 TCGA-D3-A1Q4-06 Yes BRAF_Hotspot_Mutants_V600E
## 4 TCGA-D3-A1Q5 TCGA-D3-A1Q5-06 Yes BRAF_Hotspot_Mutants_V600E
## 5 TCGA-D3-A3BZ TCGA-D3-A3BZ-06 Yes Triple_WT
## 6 TCGA-D3-A3MR TCGA-D3-A3MR-06 Yes BRAF_Hotspot_Mutants_V600E
## ALL_PRIMARY_VS_METASTATIC REGIONAL_VS_PRIMARY UV_VS_NOUV
## 1 All_Metastases Regional_Lymph_Node No_UV_Signature
## 2 All_Metastases Regional_Lymph_Node UV_Signature
## 3 All_Metastases Regional_Lymph_Node UV_Signature
## 4 All_Metastases Regional_Lymph_Node UV_Signature
## 5 All_Metastases Regional_Lymph_Node No_UV_Signature
## 6 All_Metastases Regional_Skin_or_Soft_Tissue UV_Signature
## RNASEQ.CLUSTER_CONSENHIER MethTypes.201408 MIRCluster
## 1 immune CpG island-methylated MIR.type.4
## 2 immune normal-like MIR.type.2
## 3 MITF-low hyper-methylated MIR.type.4
## 4 MITF-low hyper-methylated MIR.type.4
## 5 immune normal-like MIR.type.1
## 6 immune hypo-methylated MIR.type.1
## ProteinCluster OncoSignCluster CDKN2A.cg13601799._meth
## 1 - OSC.1 1
## 2 - - 0
## 3 PROT.type.3 OSC.6 0
## 4 PROT.type.3 OSC.4 0
## 5 - OSC.4 0
## 6 PROT.type.2 OSC.6 0
## KIT.cg10087973._meth BRAF_cna NRAS_cna MITF_cna KIT_cna PDGFRA_cna
## 1 0 1 0 0 2 2
## 2 0 1 0 0 0 0
## 3 0 0 -1 -1 -1 -1
## 4 0 1 0 -1 0 0
## 5 0 0 1 0 0 0
## 6 0 1 1 1 -1 -1
## KDR_cna MYC_cna CCND1_cna CDK4_cna MDM2_cna TERT_cna EP300_cna
## 1 2 0 0 0 0 2 0
## 2 0 0 0 0 0 0 0
## 3 -1 1 0 0 0 0 0
## 4 0 0 -1 0 0 0 0
## 5 0 0 0 2 2 0 0
## 6 -1 1 0 0 0 0 0
## NOTCH2_cna CDKN2A_cna PTEN_cna TP53_cna PPP6C_cna ARID2_cna IDH1_cna
## 1 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 3 -1 -1 -1 0 0 0 0
## 4 0 -2 -1 0 -1 0 0
## 5 1 0 0 1 0 0 0
## 6 1 -1 -1 0 -1 0 -1
## MAP2K1_cna DDX3X_cna RAC1_cna NF1_cna CASP8_cna CTNNB1_cna PCDHGA1_cna
## 1 0 1 1 0 0 0 -2
## 2 1 0 1 0 0 0 0
## 3 0 -1 1 0 0 -1 0
## 4 1 -1 1 0 0 -1 0
## 5 0 0 1 1 0 2 0
## 6 0 0 1 0 -1 1 0
## SERPINB3_cna IRF7_cna HRAS_cna PTPN11_cna KRAS_cna GNA11_cna GNAQ_cna
## 1 0 0 0 0 0 0 0
## 2 1 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 1 -1 -1 0 0 1 -1
## 5 0 0 0 1 0 0 0
## 6 -1 0 0 0 0 0 -1
## AKT1_cna APC_cna EZH2_cna FBXW7_cna GNAS_cna PIK3CA_cna ARID2_mut
## 1 0 1 1 0 0 0 wt
## 2 0 0 1 0 0 0 wt
## 3 -1 0 0 -1 0 0 wt
## 4 -1 0 1 0 1 -1 wt
## 5 0 0 0 0 0 0 wt
## 6 -1 0 1 -1 1 1 Q894*;
## BRAF_mut NRAS_mut KRAS_mut HRAS_mut NF1_mut IDH1_mut GNA11_mut GNAQ_mut
## 1 wt wt wt wt wt wt wt wt
## 2 wt wt wt wt wt wt wt wt
## 3 V600E; wt wt wt wt wt wt wt
## 4 V600E; wt wt wt wt wt wt wt
## 5 wt wt wt wt wt wt wt wt
## 6 V600E; wt wt wt wt wt wt wt
## CDK4_mut CDKN2A_mut DDX3X_mut KIT_mut MAP2K1_mut PPP6C_mut PTEN_mut
## 1 wt wt wt K642E; wt wt wt
## 2 wt wt wt wt wt wt wt
## 3 wt W110*; wt wt wt wt G165_splice;
## 4 wt wt wt wt wt wt wt
## 5 wt wt wt wt wt wt wt
## 6 wt wt wt wt wt wt Q298*;
## RAC1_mut RB1_mut TP53_mut TERT_MUT rnaseq_TERT.RSEM.NORM_DATA
## 1 wt wt wt - 59.0884
## 2 wt wt wt - 7.7988
## 3 wt wt wt - 10.8446
## 4 wt wt wt - 21.7019
## 5 wt wt wt - 87.1098
## 6 P29L; wt wt - 10.6432
## rppa_LCK.LCK.R.V rppa_SYK.SYK.M.V PIGMENT.SCORE NECROSIS
## 1 - - 0 0
## 2 - - 0 0
## 3 -0.820475058 -1.112486938 0 0
## 4 -0.681480984 -1.427398525 0 0
## 5 - - 3 0
## 6 0.802682144 1.351974657 0 10
## LYMPHOCYTE.DISTRIBUTION LYMPHOCYTE.DENSITY LYMPHOCYTE.SCORE PURITY
## 1 3 3 6 0.65
## 2 3 2 5 0.2
## 3 3 2 5 0.52
## 4 1 1 2 0.91
## 5 2 1 3 0.19
## 6 2 2 4 0.23
## TOTAL.MUTATIONS UV.RATE VITAL_STATUS Overall_Survival
## 1 165 0.115151515 1 507
## 2 103 0.893203883 1 723
## 3 330 0.863636364 0 3408
## 4 452 0.89380531 1 3423
## 5 88 0.329545455 0 3516
## 6 1084 0.884686347 0 3151
## Post_Accession_Survival
## 1 248
## 2 636
## 3 3367
## 4 932
## 5 3004
## 6 3097
# merging TCGA subtype information - sanity check
TCGA_SKCM <- inner_join(x = TCGA_SKCM, y = TCGA_Melcore[,c(1:2)], by = c("TCGA_ID" = "TCGA_ID_RaiLab"))
head(TCGA_SKCM)## Name TCGA_ID SampleID MelCore.Sample.Number.x
## 1 TCGA-D3-A5GL-06 TCGA-D3-A5GL M822 MelCore_12822
## 2 TCGA-D3-A2JH-06 TCGA-D3-A2JH M233 MelCore_56233
## 3 TCGA-D3-A2JP-06 TCGA-D3-A2JP M807 MelCore_54807
## 4 TCGA-D3-A2JK-06 TCGA-D3-A2JK M721 MelCore_14721
## 5 TCGA-D3-A2JG-06 TCGA-D3-A2JG M137 MelCore_56137
## 6 TCGA-D3-A2J7-06 TCGA-D3-A2J7 M263 MelCore_13263
## MelCore.Sample.Number.1 TUMOR_TISSUE_SITE1
## 1 12822 Regional Lymph Node
## 2 56233 Regional Cutaneous or Subcutaneous Tissue
## 3 54807 Regional Lymph Node
## 4 14721 Regional Lymph Node
## 5 56137 Regional Lymph Node
## 6 13263 Regional Lymph Node
## DAYS_TO_SAMPLE_PROCUREMENT2 OS_INITIALDX_DAYS
## 1 496 3826
## 2 197 1280
## 3 22 1812
## 4 176 368
## 5 3246 3453
## 6 553 3136
## POST_ACCESSION_SURVIVAL_DAYS OS_STATUS PURITY..ABS.OR.CPE. BRAF
## 1 3330 0 0.914 -
## 2 1083 0 0.300 p.K601E
## 3 1790 0 0.580 p.V600E
## 4 192 1 0.660 p.V600E
## 5 207 1 0.790 p.V600E
## 6 2583 1 0.420 p.V600E
## NRAS NF1 MUT_TYPE TOTAL_MUT UV_MUT NONSILENT_MUT
## 1 - - Triple_WT 1695 1278 381
## 2 - - BRAF_Hotspot_Mutant 2694 2453 812
## 3 p.H131N p.P458T BRAF_Hotspot_Mutant 12397 12251 4832
## 4 - - BRAF_Hotspot_Mutant 3355 3162 974
## 5 - - BRAF_Hotspot_Mutant 3166 3046 832
## 6 - - BRAF_Hotspot_Mutant 1478 1329 424
## UV_RATE RNA Meth MIR MelCore.Sample.Number.y
## 1 0.754 keratin hyper-methylated MIR.type.2 MelCore_12822
## 2 0.911 immune normal-like MIR.type.4 MelCore_56233
## 3 0.988 immune hypo-methylated MIR.type.3 MelCore_54807
## 4 0.942 keratin hypo-methylated MIR.type.1 MelCore_14721
## 5 0.962 keratin CpG island-methylated MIR.type.3 MelCore_56137
## 6 0.899 immune hyper-methylated MIR.type.2 MelCore_13263
# read in SKCM RNA-seq data - already TPM transformed
SKCM_TPM <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/SKCM_TPM_sub-2.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE,
quote = "")
# clean SKCM RNA-seq data
colnames(SKCM_TPM) <- gsub(pattern = "\\.", replacement = "-", colnames(SKCM_TPM))
colnames(SKCM_TPM) <- gsub(pattern = "TCGA-(.*)-(.*)-(.*)-(.*)-(.*)-(01|06|07)", replacement = "TCGA-\\1-\\2", colnames(SKCM_TPM))
ind.col <- which(colnames(SKCM_TPM) %in% TCGA_SKCM$Name)
SKCM_TPM_1 <- SKCM_TPM[ind.col]
SKCM_TPM_1 <- as.data.frame(t(SKCM_TPM_1))
SKCM_TPM <- inner_join(x = TCGA_SKCM, y = SKCM_TPM_1 %>% add_rownames(), by = c("Name" = "rowname"))
rownames(SKCM_TPM) <- SKCM_TPM[,3]
SKCM_TPM <- SKCM_TPM[,-3]
SKCM_TPM <- SKCM_TPM[-c(1:22)]
SKCM_TPM <- t(SKCM_TPM)
SKCM_TPM <- SKCM_TPM[,c(16,17,18,19,2,3,4,5,6,7,8,9,10,11,12,13,14,1,15,20)]
# read in roadmap RNA-seq data
RoadMap_counts <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/57epigenomes.N.pc.gz", sep = '\t', header = TRUE, stringsAsFactors = FALSE,
quote = "")
colnames(RoadMap_counts)[1:57] <- colnames(RoadMap_counts)[2:58]
RoadMap_counts <- RoadMap_counts[,c(1:57)]
RoadMap_counts_ESC_MEL <- RoadMap_counts[,c(24,25)]
# change raw counts to TPM
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75
genes_ensemble <- genes(edb)
gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)
gene_length_in_mat<- left_join(data.frame(gene_id = rownames(RoadMap_counts_ESC_MEL)), gene_length) %>% dplyr::filter(!is.na(width))
RoadMap_counts_ESC_MEL_sub <- RoadMap_counts_ESC_MEL[rownames(RoadMap_counts_ESC_MEL) %in% gene_length_in_mat$gene_id, ]
countToTpm <- function(counts, effLen)
{
rate <- log(counts + 1) - log(effLen)
denom <- log(sum(exp(rate)))
exp(rate - denom + log(1e6))
}
RoadMap_counts_ESC_MEL_TPM <- apply(RoadMap_counts_ESC_MEL_sub, 2, countToTpm, effLen = gene_length_in_mat$width)
# get ENSGenes using biomaRt for roadmap data
ENSGenes <- useEnsembl(biomart = "ensembl", dataset="hsapiens_gene_ensembl", GRCh=37)
ENSGenes <- getBM(attributes = c('ensembl_gene_id', 'hgnc_symbol'), mart = ENSGenes)
RoadMap_counts_ESC_MEL_ENSGenes <- inner_join(x = ENSGenes, y = as.data.frame(RoadMap_counts_ESC_MEL_TPM) %>% add_rownames(), by = c("ensembl_gene_id" = "rowname"))
# combine SKCM and roadmap data
counts.all.2 <- inner_join(x = RoadMap_counts_ESC_MEL_ENSGenes, y = as.data.frame(SKCM_TPM) %>% add_rownames(), by = c("ensembl_gene_id" = "rowname"))
# Quantile Normalization
counts.all.2.matrix <- as.matrix(counts.all.2[,-c(1,2)])
counts.all.2.matrix.normalized <- normalize.quantiles(counts.all.2.matrix)
rownames(counts.all.2.matrix.normalized) <- counts.all.2$hgnc_symbol
colnames(counts.all.2.matrix.normalized) <- colnames(counts.all.2[,-c(1,2)])
counts.all.2.matrix.normalized <- as.data.frame(counts.all.2.matrix.normalized)
counts.all.2.matrix.normalized <- rownames_to_column(counts.all.2.matrix.normalized, var = "hgnc_symbol")
# set gene for boxplot
genes <- c("TWIST1")for(i in 1:length(genes)){
ind <- which(counts.all.2.matrix.normalized$hgnc_symbol == genes[i])
genotypes <- c("Melanocytes", "Melanocytes", rep("RAS",4), rep("BRAF",13), rep("WT",3))
mat <- melt(counts.all.2[ind,])
mat$value <- log2(mat$value + 1)
mat$genotypes <- factor(genotypes, levels = c("Melanocytes", "RAS", "BRAF", "WT"))
p<- ggplot(mat, aes(x = genotypes, y = value)) +
geom_boxplot(aes(fill = genotypes)) +
scale_fill_manual(values = c("purple", "blue", "red", "green")) +
theme_minimal(base_size = 15) + ylab(paste("Log2",genes[i],"TPM+1"))
print(p)
test_value<- wilcox.test(value ~ genotypes, data = mat, subset = genotypes %in% c("Melanocytes", "RAS"), alternative = "less")
print(test_value)
}##
## Wilcoxon rank sum test
##
## data: value by genotypes
## W = 0, p-value = 0.06667
## alternative hypothesis: true location shift is less than 0
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.