BoxPlot displaying RNA-seq expression levels from TCGA SKCM tumors (all metastatic samples) and melanocyte cell lines from Roadmap for a single gene

Christopher Terranova May 2019

Load necessary programs

library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(tidyverse)
library(gplots)
library(genefilter)
library(RColorBrewer)
library(vegan)
library(preprocessCore)

Representative Example for Single Gene BoxPlot in SKCM vs Melanocyte samples

# read in SKCM RNA-seq data - already TPM transformed
SKCM_TPM <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/SKCM_TPM-2.txt", header = TRUE, stringsAsFactors = FALSE, 
                       quote = "")
colnames(SKCM_TPM) <- gsub(pattern = "\\.", replacement = "-", colnames(SKCM_TPM))
colnames(SKCM_TPM) <- gsub(pattern = "TCGA-(.*)-(.*)-(.*)-(.*)-(.*)-(01|06|07)", replacement = "TCGA-\\1-\\2", colnames(SKCM_TPM))

# read in roadmap RNA-seq data
epi_count<- read.table("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/57epigenomes.N.pc.gz", header = T, stringsAsFactors = F, quote = "")

epi_count_sub<- epi_count[, c("gene_id", "E003", "E059", "E061")]
epi_mat<- as.matrix(epi_count_sub[, -1])
rownames(epi_mat)<- epi_count_sub$gene_id

# change raw counts to TPM
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75

genes_ensemble <- genes(edb)

gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)

gene_length_in_mat<- left_join(data.frame(gene_id = rownames(epi_mat)), gene_length) %>% dplyr::filter(!is.na(width))

epi_mat_sub<- epi_mat[rownames(epi_mat) %in% gene_length_in_mat$gene_id, ]

countToTpm <- function(counts, effLen)
{
  rate <- log(counts + 1) - log(effLen)
  denom <- log(sum(exp(rate)))
  exp(rate - denom + log(1e6))
}

epi_TPM<- apply(epi_mat_sub, 2, countToTpm, effLen = gene_length_in_mat$width)

# combine SKCM and roadmap data 
epi_SKCM_TPM<- merge(epi_TPM, SKCM_TPM, by="row.names", all = FALSE)

# read in TCGA subtype information
TCGA_info<- read_tsv("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/TCGA_Melanoma_ID_Metadata_MetastaticTumors_only.txt", col_names = T)
table(TCGA_info$MUTATIONSUBTYPES)
## 
##                    - BRAF_Hotspot_Mutants      NF1_Any_Mutants 
##                    6                  118                   23 
##  RAS_Hotspot_Mutants            Triple_WT 
##                   81                   38
# subset based on mutation 
Mut_subtype_info<- TCGA_info %>% 
  dplyr::filter(MUTATIONSUBTYPES %in% c("RAS_Hotspot_Mutants", "BRAF_Hotspot_Mutants", "Triple_WT")) %>%
  dplyr::select(Name, MUTATIONSUBTYPES)

# check how many RNAseq samples have mutation subtype info
table(colnames(SKCM_TPM) %in% Mut_subtype_info$Name)
## 
## FALSE  TRUE 
##   235   237
# subset the dataframe
epi_SKCM_TPM<- epi_SKCM_TPM %>% dplyr::rename(gene_id = Row.names, H1 = E003, melanoctye_1 = E059, melanoctye_2 = E061)
sample_idx<- colnames(epi_SKCM_TPM) %in% c("gene_id", "H1", "melanoctye_1", "melanoctye_2", Mut_subtype_info$Name)
epi_SKCM_TPM<- epi_SKCM_TPM[, sample_idx]

# Quantile Normalization
epi_SKCM_TPM.matrix <- as.matrix(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- normalize.quantiles(epi_SKCM_TPM.matrix)
rownames(epi_SKCM_TPM.normalized) <- epi_SKCM_TPM$gene_id
colnames(epi_SKCM_TPM.normalized) <- colnames(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- as.data.frame(epi_SKCM_TPM.normalized)
epi_SKCM_TPM.normalized <- rownames_to_column(epi_SKCM_TPM.normalized, var = "gene_id")

# set gene for boxplot - TWIST1=ENSG00000122691 
gene_ids<- "ENSG00000122691"

# set levels for boxplot - can include ESCs here
epi_SKCM_TPM_long<- epi_SKCM_TPM.normalized %>% dplyr::filter(gene_id %in% gene_ids) %>% gather(sample, TPM, 2: ncol(epi_SKCM_TPM.normalized)) %>% left_join(Mut_subtype_info, by = c("sample" = "Name"))

epi_SKCM_TPM_long<- epi_SKCM_TPM_long %>% 
  dplyr::rename(sample_type = MUTATIONSUBTYPES) %>%
  mutate(sample_type = case_when(
    sample == "H1" ~ "ESC",
    sample == "melanoctye_1" | sample == "melanoctye_2" ~ "melanoctye",
    TRUE ~ sample_type
  ))

epi_SKCM_TPM_long$sample_type<- factor(epi_SKCM_TPM_long$sample_type, levels = c("ESC", "melanoctye", "RAS_Hotspot_Mutants", "BRAF_Hotspot_Mutants", "Triple_WT"))

BoxPlot for TWIST1

ggplot(epi_SKCM_TPM_long %>% dplyr::filter(gene_id == "ENSG00000122691", !sample_type %in% c("ESC")), aes(x = sample_type, y = log2(TPM +1))) + 
  geom_boxplot(aes(fill = sample_type)) +
  scale_fill_manual(values = c("purple", "blue", "red", "green")) +
  theme_bw(base_size = 10) +
  theme(axis.text.x = element_text(angle = 90))

df_test<- epi_SKCM_TPM_long %>% dplyr::filter(gene_id == "ENSG00000122691", !sample_type %in% c("ESC"))

wilcox.test(TPM ~ sample_type, data = df_test, subset = sample_type %in% c("melanoctye", "RAS_Hotspot_Mutants"), alternative = "less")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  TPM by sample_type
## W = 7, p-value = 0.01453
## alternative hypothesis: true location shift is less than 0

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.