library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(tidyverse)
library(gplots)
library(genefilter)
library(RColorBrewer)
library(vegan)
library(preprocessCore)

Representative Example for Single Gene BoxPlot in SKCM vs Melanocyte samples

# read in SKCM RNA-seq data - already TPM transformed
SKCM_TPM <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/SKCM_TPM-2.txt", header = TRUE, stringsAsFactors = FALSE, 
                       quote = "")
colnames(SKCM_TPM) <- gsub(pattern = "\\.", replacement = "-", colnames(SKCM_TPM))
colnames(SKCM_TPM) <- gsub(pattern = "TCGA-(.*)-(.*)-(.*)-(.*)-(.*)-(01|06|07)", replacement = "TCGA-\\1-\\2", colnames(SKCM_TPM))

# read in roadmap RNA-seq data
epi_count<- read.table("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/57epigenomes.N.pc.gz", header = T, stringsAsFactors = F, quote = "")

epi_count_sub<- epi_count[, c("gene_id", "E003", "E059", "E061")]
epi_mat<- as.matrix(epi_count_sub[, -1])
rownames(epi_mat)<- epi_count_sub$gene_id

# change raw counts to TPM
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75

genes_ensemble <- genes(edb)

gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)

gene_length_in_mat<- left_join(data.frame(gene_id = rownames(epi_mat)), gene_length) %>% dplyr::filter(!is.na(width))

epi_mat_sub<- epi_mat[rownames(epi_mat) %in% gene_length_in_mat$gene_id, ]

countToTpm <- function(counts, effLen)
{
  rate <- log(counts + 1) - log(effLen)
  denom <- log(sum(exp(rate)))
  exp(rate - denom + log(1e6))
}

epi_TPM<- apply(epi_mat_sub, 2, countToTpm, effLen = gene_length_in_mat$width)

# combine SKCM and roadmap data 
epi_SKCM_TPM<- merge(epi_TPM, SKCM_TPM, by="row.names", all = FALSE)

# read in TCGA subtype information
TCGA_info<- read_tsv("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/TCGA_Melanoma_ID_Metadata_MetastaticTumors_only.txt", col_names = T)
table(TCGA_info$MUTATIONSUBTYPES)

# subset based on mutation 
Mut_subtype_info<- TCGA_info %>% 
  dplyr::filter(MUTATIONSUBTYPES %in% c("RAS_Hotspot_Mutants", "BRAF_Hotspot_Mutants", "Triple_WT")) %>%
  dplyr::select(Name, MUTATIONSUBTYPES)

# check how many RNAseq samples have mutation subtype info
table(colnames(SKCM_TPM) %in% Mut_subtype_info$Name)

# subset the dataframe
epi_SKCM_TPM<- epi_SKCM_TPM %>% dplyr::rename(gene_id = Row.names, H1 = E003, melanoctye_1 = E059, melanoctye_2 = E061)
sample_idx<- colnames(epi_SKCM_TPM) %in% c("gene_id", "H1", "melanoctye_1", "melanoctye_2", Mut_subtype_info$Name)
epi_SKCM_TPM<- epi_SKCM_TPM[, sample_idx]

# Quantile Normalization
epi_SKCM_TPM.matrix <- as.matrix(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- normalize.quantiles(epi_SKCM_TPM.matrix)
rownames(epi_SKCM_TPM.normalized) <- epi_SKCM_TPM$gene_id
colnames(epi_SKCM_TPM.normalized) <- colnames(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- as.data.frame(epi_SKCM_TPM.normalized)
epi_SKCM_TPM.normalized <- rownames_to_column(epi_SKCM_TPM.normalized, var = "gene_id")

# set gene for boxplot - TWIST1=ENSG00000122691 
gene_ids<- "ENSG00000122691"

# set levels for boxplot - can include ESCs here
epi_SKCM_TPM_long<- epi_SKCM_TPM.normalized %>% dplyr::filter(gene_id %in% gene_ids) %>% gather(sample, TPM, 2: ncol(epi_SKCM_TPM.normalized)) %>% left_join(Mut_subtype_info, by = c("sample" = "Name"))

epi_SKCM_TPM_long<- epi_SKCM_TPM_long %>% 
  dplyr::rename(sample_type = MUTATIONSUBTYPES) %>%
  mutate(sample_type = case_when(
    sample == "H1" ~ "ESC",
    sample == "melanoctye_1" | sample == "melanoctye_2" ~ "melanoctye",
    TRUE ~ sample_type
  ))

epi_SKCM_TPM_long$sample_type<- factor(epi_SKCM_TPM_long$sample_type, levels = c("ESC", "melanoctye", "RAS_Hotspot_Mutants", "BRAF_Hotspot_Mutants", "Triple_WT"))

BoxPlot for TWIST1

ggplot(epi_SKCM_TPM_long %>% dplyr::filter(gene_id == "ENSG00000122691", !sample_type %in% c("ESC")), aes(x = sample_type, y = log2(TPM +1))) + 
  geom_boxplot(aes(fill = sample_type)) +
  scale_fill_manual(values = c("purple", "blue", "red", "green")) +
  theme_bw(base_size = 10) +
  theme(axis.text.x = element_text(angle = 90))

df_test<- epi_SKCM_TPM_long %>% dplyr::filter(gene_id == "ENSG00000122691", !sample_type %in% c("ESC"))

wilcox.test(TPM ~ sample_type, data = df_test, subset = sample_type %in% c("melanoctye", "RAS_Hotspot_Mutants"), alternative = "less")

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

BoxPlot displaying RNA-seq expression levels from TCGA SKCM tumors (all metastatic samples) and melanocyte cell lines from Roadmap for a single gene

Christopher Terranova May 2019

Load necessary programs

Representative Example for Single Gene BoxPlot in SKCM vs Melanocyte samples

BoxPlot for TWIST1