BoxPlot displaying RNA-seq expression levels from TCGA SKCM tumors (20 matching samples from chromatin state anaylsis) and melanocyte cell lines from Roadmap for a single gene

Christopher Terranova May 2019

Load necessary programs

library(tidyverse)
library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(preprocessCore)

Representative Example for Single Gene BoxPlot in SKCM vs Melanocyte samples

# read in TCGA subtype information
TCGA_SKCM <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/TCGA_SKCM_SubtypeInformation_Terrence_10-31-17.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
head(TCGA_SKCM)
##              Name      TCGA_ID SampleID MelCore.Sample.Number
## 1 TCGA-D3-A5GL-06 TCGA-D3-A5GL    M822          MelCore_12822
## 2 TCGA-D3-A2JH-06 TCGA-D3-A2JH    M233          MelCore_56233
## 3 TCGA-D3-A2JP-06 TCGA-D3-A2JP    M807          MelCore_54807
## 4 TCGA-D3-A2JK-06 TCGA-D3-A2JK    M721          MelCore_14721
## 5 TCGA-D3-A2JG-06 TCGA-D3-A2JG    M137          MelCore_56137
## 6 TCGA-D3-A2J7-06 TCGA-D3-A2J7    M263          MelCore_13263
##   MelCore.Sample.Number.1                        TUMOR_TISSUE_SITE1
## 1                   12822                       Regional Lymph Node
## 2                   56233 Regional Cutaneous or Subcutaneous Tissue
## 3                   54807                       Regional Lymph Node
## 4                   14721                       Regional Lymph Node
## 5                   56137                       Regional Lymph Node
## 6                   13263                       Regional Lymph Node
##   DAYS_TO_SAMPLE_PROCUREMENT2 OS_INITIALDX_DAYS
## 1                         496              3826
## 2                         197              1280
## 3                          22              1812
## 4                         176               368
## 5                        3246              3453
## 6                         553              3136
##   POST_ACCESSION_SURVIVAL_DAYS OS_STATUS PURITY..ABS.OR.CPE.    BRAF
## 1                         3330         0               0.914       -
## 2                         1083         0               0.300 p.K601E
## 3                         1790         0               0.580 p.V600E
## 4                          192         1               0.660 p.V600E
## 5                          207         1               0.790 p.V600E
## 6                         2583         1               0.420 p.V600E
##      NRAS     NF1            MUT_TYPE TOTAL_MUT UV_MUT NONSILENT_MUT
## 1       -       -           Triple_WT      1695   1278           381
## 2       -       - BRAF_Hotspot_Mutant      2694   2453           812
## 3 p.H131N p.P458T BRAF_Hotspot_Mutant     12397  12251          4832
## 4       -       - BRAF_Hotspot_Mutant      3355   3162           974
## 5       -       - BRAF_Hotspot_Mutant      3166   3046           832
## 6       -       - BRAF_Hotspot_Mutant      1478   1329           424
##   UV_RATE     RNA                  Meth        MIR
## 1   0.754 keratin      hyper-methylated MIR.type.2
## 2   0.911  immune           normal-like MIR.type.4
## 3   0.988  immune       hypo-methylated MIR.type.3
## 4   0.942 keratin       hypo-methylated MIR.type.1
## 5   0.962 keratin CpG island-methylated MIR.type.3
## 6   0.899  immune      hyper-methylated MIR.type.2
TCGA_Melcore <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/Melcore.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
head(TCGA_Melcore)
##   MelCore.Sample.Number TCGA_ID_RaiLab MelCore.Sample.Number.1
## 1          MelCore_9303   TCGA-D3-A1Q3                    9303
## 2         MelCore_11142   TCGA-D3-A2J9                   11142
## 3         MelCore_11527   TCGA-D3-A1Q4                   11527
## 4         MelCore_11559   TCGA-D3-A1Q5                   11559
## 5         MelCore_12026   TCGA-D3-A3BZ                   12026
## 6         MelCore_12484   TCGA-D3-A3MR                   12484
##   TCGA_ID_Cell_2015            Name ALL_SAMPLES           MUTATIONSUBTYPES
## 1      TCGA-D3-A1Q3 TCGA-D3-A1Q3-06         Yes                  Triple_WT
## 2      TCGA-D3-A2J9 TCGA-D3-A2J9-06         Yes                  Triple_WT
## 3      TCGA-D3-A1Q4 TCGA-D3-A1Q4-06         Yes BRAF_Hotspot_Mutants_V600E
## 4      TCGA-D3-A1Q5 TCGA-D3-A1Q5-06         Yes BRAF_Hotspot_Mutants_V600E
## 5      TCGA-D3-A3BZ TCGA-D3-A3BZ-06         Yes                  Triple_WT
## 6      TCGA-D3-A3MR TCGA-D3-A3MR-06         Yes BRAF_Hotspot_Mutants_V600E
##   ALL_PRIMARY_VS_METASTATIC          REGIONAL_VS_PRIMARY      UV_VS_NOUV
## 1            All_Metastases          Regional_Lymph_Node No_UV_Signature
## 2            All_Metastases          Regional_Lymph_Node    UV_Signature
## 3            All_Metastases          Regional_Lymph_Node    UV_Signature
## 4            All_Metastases          Regional_Lymph_Node    UV_Signature
## 5            All_Metastases          Regional_Lymph_Node No_UV_Signature
## 6            All_Metastases Regional_Skin_or_Soft_Tissue    UV_Signature
##   RNASEQ.CLUSTER_CONSENHIER      MethTypes.201408 MIRCluster
## 1                    immune CpG island-methylated MIR.type.4
## 2                    immune           normal-like MIR.type.2
## 3                  MITF-low      hyper-methylated MIR.type.4
## 4                  MITF-low      hyper-methylated MIR.type.4
## 5                    immune           normal-like MIR.type.1
## 6                    immune       hypo-methylated MIR.type.1
##   ProteinCluster OncoSignCluster CDKN2A.cg13601799._meth
## 1              -           OSC.1                       1
## 2              -               -                       0
## 3    PROT.type.3           OSC.6                       0
## 4    PROT.type.3           OSC.4                       0
## 5              -           OSC.4                       0
## 6    PROT.type.2           OSC.6                       0
##   KIT.cg10087973._meth BRAF_cna NRAS_cna MITF_cna KIT_cna PDGFRA_cna
## 1                    0        1        0        0       2          2
## 2                    0        1        0        0       0          0
## 3                    0        0       -1       -1      -1         -1
## 4                    0        1        0       -1       0          0
## 5                    0        0        1        0       0          0
## 6                    0        1        1        1      -1         -1
##   KDR_cna MYC_cna CCND1_cna CDK4_cna MDM2_cna TERT_cna EP300_cna
## 1       2       0         0        0        0        2         0
## 2       0       0         0        0        0        0         0
## 3      -1       1         0        0        0        0         0
## 4       0       0        -1        0        0        0         0
## 5       0       0         0        2        2        0         0
## 6      -1       1         0        0        0        0         0
##   NOTCH2_cna CDKN2A_cna PTEN_cna TP53_cna PPP6C_cna ARID2_cna IDH1_cna
## 1          1          0        0        0         0         0        0
## 2          0          0        0        0         0         0        0
## 3         -1         -1       -1        0         0         0        0
## 4          0         -2       -1        0        -1         0        0
## 5          1          0        0        1         0         0        0
## 6          1         -1       -1        0        -1         0       -1
##   MAP2K1_cna DDX3X_cna RAC1_cna NF1_cna CASP8_cna CTNNB1_cna PCDHGA1_cna
## 1          0         1        1       0         0          0          -2
## 2          1         0        1       0         0          0           0
## 3          0        -1        1       0         0         -1           0
## 4          1        -1        1       0         0         -1           0
## 5          0         0        1       1         0          2           0
## 6          0         0        1       0        -1          1           0
##   SERPINB3_cna IRF7_cna HRAS_cna PTPN11_cna KRAS_cna GNA11_cna GNAQ_cna
## 1            0        0        0          0        0         0        0
## 2            1        0        0          0        0         0        0
## 3            0        0        0          0        0         0        0
## 4            1       -1       -1          0        0         1       -1
## 5            0        0        0          1        0         0        0
## 6           -1        0        0          0        0         0       -1
##   AKT1_cna APC_cna EZH2_cna FBXW7_cna GNAS_cna PIK3CA_cna ARID2_mut
## 1        0       1        1         0        0          0        wt
## 2        0       0        1         0        0          0        wt
## 3       -1       0        0        -1        0          0        wt
## 4       -1       0        1         0        1         -1        wt
## 5        0       0        0         0        0          0        wt
## 6       -1       0        1        -1        1          1    Q894*;
##   BRAF_mut NRAS_mut KRAS_mut HRAS_mut NF1_mut IDH1_mut GNA11_mut GNAQ_mut
## 1       wt       wt       wt       wt      wt       wt        wt       wt
## 2       wt       wt       wt       wt      wt       wt        wt       wt
## 3   V600E;       wt       wt       wt      wt       wt        wt       wt
## 4   V600E;       wt       wt       wt      wt       wt        wt       wt
## 5       wt       wt       wt       wt      wt       wt        wt       wt
## 6   V600E;       wt       wt       wt      wt       wt        wt       wt
##   CDK4_mut CDKN2A_mut DDX3X_mut KIT_mut MAP2K1_mut PPP6C_mut     PTEN_mut
## 1       wt         wt        wt  K642E;         wt        wt           wt
## 2       wt         wt        wt      wt         wt        wt           wt
## 3       wt     W110*;        wt      wt         wt        wt G165_splice;
## 4       wt         wt        wt      wt         wt        wt           wt
## 5       wt         wt        wt      wt         wt        wt           wt
## 6       wt         wt        wt      wt         wt        wt       Q298*;
##   RAC1_mut RB1_mut TP53_mut TERT_MUT rnaseq_TERT.RSEM.NORM_DATA
## 1       wt      wt       wt        -                    59.0884
## 2       wt      wt       wt        -                     7.7988
## 3       wt      wt       wt        -                    10.8446
## 4       wt      wt       wt        -                    21.7019
## 5       wt      wt       wt        -                    87.1098
## 6    P29L;      wt       wt        -                    10.6432
##   rppa_LCK.LCK.R.V rppa_SYK.SYK.M.V PIGMENT.SCORE NECROSIS
## 1                -                -             0        0
## 2                -                -             0        0
## 3     -0.820475058     -1.112486938             0        0
## 4     -0.681480984     -1.427398525             0        0
## 5                -                -             3        0
## 6      0.802682144      1.351974657             0       10
##   LYMPHOCYTE.DISTRIBUTION LYMPHOCYTE.DENSITY LYMPHOCYTE.SCORE PURITY
## 1                       3                  3                6   0.65
## 2                       3                  2                5    0.2
## 3                       3                  2                5   0.52
## 4                       1                  1                2   0.91
## 5                       2                  1                3   0.19
## 6                       2                  2                4   0.23
##   TOTAL.MUTATIONS     UV.RATE VITAL_STATUS Overall_Survival
## 1             165 0.115151515            1              507
## 2             103 0.893203883            1              723
## 3             330 0.863636364            0             3408
## 4             452  0.89380531            1             3423
## 5              88 0.329545455            0             3516
## 6            1084 0.884686347            0             3151
##   Post_Accession_Survival
## 1                     248
## 2                     636
## 3                    3367
## 4                     932
## 5                    3004
## 6                    3097
# merging TCGA subtype information - sanity check
TCGA_SKCM <- inner_join(x = TCGA_SKCM, y = TCGA_Melcore[,c(1:2)], by = c("TCGA_ID" = "TCGA_ID_RaiLab"))
head(TCGA_SKCM)
##              Name      TCGA_ID SampleID MelCore.Sample.Number.x
## 1 TCGA-D3-A5GL-06 TCGA-D3-A5GL    M822            MelCore_12822
## 2 TCGA-D3-A2JH-06 TCGA-D3-A2JH    M233            MelCore_56233
## 3 TCGA-D3-A2JP-06 TCGA-D3-A2JP    M807            MelCore_54807
## 4 TCGA-D3-A2JK-06 TCGA-D3-A2JK    M721            MelCore_14721
## 5 TCGA-D3-A2JG-06 TCGA-D3-A2JG    M137            MelCore_56137
## 6 TCGA-D3-A2J7-06 TCGA-D3-A2J7    M263            MelCore_13263
##   MelCore.Sample.Number.1                        TUMOR_TISSUE_SITE1
## 1                   12822                       Regional Lymph Node
## 2                   56233 Regional Cutaneous or Subcutaneous Tissue
## 3                   54807                       Regional Lymph Node
## 4                   14721                       Regional Lymph Node
## 5                   56137                       Regional Lymph Node
## 6                   13263                       Regional Lymph Node
##   DAYS_TO_SAMPLE_PROCUREMENT2 OS_INITIALDX_DAYS
## 1                         496              3826
## 2                         197              1280
## 3                          22              1812
## 4                         176               368
## 5                        3246              3453
## 6                         553              3136
##   POST_ACCESSION_SURVIVAL_DAYS OS_STATUS PURITY..ABS.OR.CPE.    BRAF
## 1                         3330         0               0.914       -
## 2                         1083         0               0.300 p.K601E
## 3                         1790         0               0.580 p.V600E
## 4                          192         1               0.660 p.V600E
## 5                          207         1               0.790 p.V600E
## 6                         2583         1               0.420 p.V600E
##      NRAS     NF1            MUT_TYPE TOTAL_MUT UV_MUT NONSILENT_MUT
## 1       -       -           Triple_WT      1695   1278           381
## 2       -       - BRAF_Hotspot_Mutant      2694   2453           812
## 3 p.H131N p.P458T BRAF_Hotspot_Mutant     12397  12251          4832
## 4       -       - BRAF_Hotspot_Mutant      3355   3162           974
## 5       -       - BRAF_Hotspot_Mutant      3166   3046           832
## 6       -       - BRAF_Hotspot_Mutant      1478   1329           424
##   UV_RATE     RNA                  Meth        MIR MelCore.Sample.Number.y
## 1   0.754 keratin      hyper-methylated MIR.type.2           MelCore_12822
## 2   0.911  immune           normal-like MIR.type.4           MelCore_56233
## 3   0.988  immune       hypo-methylated MIR.type.3           MelCore_54807
## 4   0.942 keratin       hypo-methylated MIR.type.1           MelCore_14721
## 5   0.962 keratin CpG island-methylated MIR.type.3           MelCore_56137
## 6   0.899  immune      hyper-methylated MIR.type.2           MelCore_13263
# read in SKCM RNA-seq data - already TPM transformed
SKCM_TPM <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/SKCM_TPM_sub-2.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE, 
                      quote = "")

# clean SKCM RNA-seq data
colnames(SKCM_TPM) <- gsub(pattern = "\\.", replacement = "-", colnames(SKCM_TPM))
colnames(SKCM_TPM) <- gsub(pattern = "TCGA-(.*)-(.*)-(.*)-(.*)-(.*)-(01|06|07)", replacement = "TCGA-\\1-\\2", colnames(SKCM_TPM))
ind.col <- which(colnames(SKCM_TPM) %in% TCGA_SKCM$Name)
SKCM_TPM_1 <- SKCM_TPM[ind.col]
SKCM_TPM_1 <- as.data.frame(t(SKCM_TPM_1))
SKCM_TPM <- inner_join(x = TCGA_SKCM, y = SKCM_TPM_1 %>% add_rownames(), by = c("Name" = "rowname"))

rownames(SKCM_TPM) <- SKCM_TPM[,3]
SKCM_TPM <- SKCM_TPM[,-3]
SKCM_TPM <- SKCM_TPM[-c(1:22)]
SKCM_TPM <- t(SKCM_TPM)
SKCM_TPM <- SKCM_TPM[,c(16,17,18,19,2,3,4,5,6,7,8,9,10,11,12,13,14,1,15,20)]

# read in roadmap RNA-seq data
RoadMap_counts <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/57epigenomes.N.pc.gz", sep = '\t', header = TRUE, stringsAsFactors = FALSE, 
                             quote = "")
colnames(RoadMap_counts)[1:57] <- colnames(RoadMap_counts)[2:58]
RoadMap_counts <- RoadMap_counts[,c(1:57)]
RoadMap_counts_ESC_MEL <- RoadMap_counts[,c(24,25)]
  
# change raw counts to TPM
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75

genes_ensemble <- genes(edb)

gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)

gene_length_in_mat<- left_join(data.frame(gene_id = rownames(RoadMap_counts_ESC_MEL)), gene_length) %>% dplyr::filter(!is.na(width))

RoadMap_counts_ESC_MEL_sub <- RoadMap_counts_ESC_MEL[rownames(RoadMap_counts_ESC_MEL) %in% gene_length_in_mat$gene_id, ]

countToTpm <- function(counts, effLen)
{
  rate <- log(counts + 1) - log(effLen)
  denom <- log(sum(exp(rate)))
  exp(rate - denom + log(1e6))
}

RoadMap_counts_ESC_MEL_TPM <- apply(RoadMap_counts_ESC_MEL_sub, 2, countToTpm, effLen = gene_length_in_mat$width)

# get ENSGenes using biomaRt for roadmap data 
ENSGenes <- useEnsembl(biomart = "ensembl", dataset="hsapiens_gene_ensembl", GRCh=37)
ENSGenes <- getBM(attributes = c('ensembl_gene_id', 'hgnc_symbol'), mart = ENSGenes)
RoadMap_counts_ESC_MEL_ENSGenes <- inner_join(x = ENSGenes, y = as.data.frame(RoadMap_counts_ESC_MEL_TPM) %>% add_rownames(), by = c("ensembl_gene_id" = "rowname"))

# combine SKCM and roadmap data
counts.all.2 <- inner_join(x = RoadMap_counts_ESC_MEL_ENSGenes, y = as.data.frame(SKCM_TPM) %>% add_rownames(), by = c("ensembl_gene_id" = "rowname"))

# Quantile Normalization
counts.all.2.matrix <- as.matrix(counts.all.2[,-c(1,2)])
counts.all.2.matrix.normalized <- normalize.quantiles(counts.all.2.matrix)
rownames(counts.all.2.matrix.normalized) <- counts.all.2$hgnc_symbol
colnames(counts.all.2.matrix.normalized) <- colnames(counts.all.2[,-c(1,2)])
counts.all.2.matrix.normalized <- as.data.frame(counts.all.2.matrix.normalized)
counts.all.2.matrix.normalized <- rownames_to_column(counts.all.2.matrix.normalized, var = "hgnc_symbol")

# set gene for boxplot
genes <- c("TWIST1")

BoxPlot for TWIST1

for(i in 1:length(genes)){
  ind <- which(counts.all.2.matrix.normalized$hgnc_symbol == genes[i])
  genotypes <- c("Melanocytes", "Melanocytes", rep("RAS",4), rep("BRAF",13), rep("WT",3))
  mat <- melt(counts.all.2[ind,])
  mat$value <- log2(mat$value + 1)
  mat$genotypes <- factor(genotypes, levels = c("Melanocytes", "RAS", "BRAF", "WT"))

  p<- ggplot(mat, aes(x = genotypes, y = value)) + 
    geom_boxplot(aes(fill = genotypes)) +
    scale_fill_manual(values = c("purple", "blue", "red", "green")) +
    theme_minimal(base_size = 15) + ylab(paste("Log2",genes[i],"TPM+1")) 
  print(p)
  test_value<- wilcox.test(value ~ genotypes, data = mat, subset = genotypes %in% c("Melanocytes", "RAS"), alternative = "less")
  print(test_value)
}

## 
##  Wilcoxon rank sum test
## 
## data:  value by genotypes
## W = 0, p-value = 0.06667
## alternative hypothesis: true location shift is less than 0

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.