BoxPlot of SKCM RNA-seq expression levels (all metastatic samples) based on shortening or lengthening of broad H3K4me3 Domains (-/+2kb) in TCGA SKCM Tumors relative to melanocytes

Christopher Terranova May 2019

Load necessary programs

library(tidyverse)
library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(preprocessCore)

Shortening or Lengthening of Broad H3K4me3 Domains in SKCM Tumors relative to Melanocytes

# read in SKCM RNA-seq data - already TPM transformed
SKCM_TPM <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/SKCM_TPM-2.txt", header = TRUE, stringsAsFactors = FALSE, 
                       quote = "")
colnames(SKCM_TPM) <- gsub(pattern = "\\.", replacement = "-", colnames(SKCM_TPM))
colnames(SKCM_TPM) <- gsub(pattern = "TCGA-(.*)-(.*)-(.*)-(.*)-(.*)-(01|06|07)", replacement = "TCGA-\\1-\\2", colnames(SKCM_TPM))

# read in roadmap RNA-seq data
epi_count<- read.table("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/57epigenomes.N.pc.gz", header = T, stringsAsFactors = F, quote = "")

epi_count_sub<- epi_count[, c("gene_id","E059", "E061")]
epi_mat<- as.matrix(epi_count_sub[, -1])
rownames(epi_mat)<- epi_count_sub$gene_id

# change raw counts to TPM
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75

genes_ensemble <- genes(edb)

gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)

gene_length_in_mat<- left_join(data.frame(gene_id = rownames(epi_mat)), gene_length) %>% dplyr::filter(!is.na(width))

epi_mat_sub<- epi_mat[rownames(epi_mat) %in% gene_length_in_mat$gene_id, ]

countToTpm <- function(counts, effLen)
{
  rate <- log(counts + 1) - log(effLen)
  denom <- log(sum(exp(rate)))
  exp(rate - denom + log(1e6))
}

epi_TPM<- apply(epi_mat_sub, 2, countToTpm, effLen = gene_length_in_mat$width)

# combine SKCM and roadmap data
epi_SKCM_TPM<- merge(epi_TPM, SKCM_TPM, by="row.names", all = FALSE)

# read in TCGA subtype information
TCGA_info<- read_tsv("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/TCGA_Melanoma_ID_Metadata_MetastaticTumors_only.txt", col_names = T)
table(TCGA_info$MUTATIONSUBTYPES)
## 
##                    - BRAF_Hotspot_Mutants      NF1_Any_Mutants 
##                    6                  118                   23 
##  RAS_Hotspot_Mutants            Triple_WT 
##                   81                   38
# subset based on mutation -- subtype can be changed or all subtypes can be used
Mut_subtype_info<- TCGA_info %>% 
  dplyr::filter(MUTATIONSUBTYPES %in% c("RAS_Hotspot_Mutants")) %>%
  dplyr::select(Name, MUTATIONSUBTYPES)

# check how many RNAseq samples have mutation subtype info
table(colnames(SKCM_TPM) %in% Mut_subtype_info$Name)
## 
## FALSE  TRUE 
##   391    81
# subset the combined df
epi_SKCM_TPM<- epi_SKCM_TPM %>% dplyr::rename(gene_id = Row.names, melanoctye_1 = E059, melanoctye_2 = E061)
sample_idx<- colnames(epi_SKCM_TPM) %in% c("gene_id", "melanoctye_1", "melanoctye_2", Mut_subtype_info$Name)
epi_SKCM_TPM<- epi_SKCM_TPM[, sample_idx]

# Quantile Normalization
epi_SKCM_TPM.matrix <- as.matrix(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- normalize.quantiles(epi_SKCM_TPM.matrix)
rownames(epi_SKCM_TPM.normalized) <- epi_SKCM_TPM$gene_id
colnames(epi_SKCM_TPM.normalized) <- colnames(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- as.data.frame(epi_SKCM_TPM.normalized)
epi_SKCM_TPM.normalized <- rownames_to_column(epi_SKCM_TPM.normalized, var = "gene_id")

# read in shortening/lengthening H3K4me3 domains (-/+2kb) overlapping -/+10kbTSS in SKCM tumors -- peaks were obtained using intersectBed -wo
bivalent_genes <- readxl::read_xlsx("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/TCGA_SKCM_Broad_Sharp_SizeComparison_melanocytes_Broad_to_Allpeaks_RAS_RFile.xlsx", sheet = 1, col_names = T)
head(bivalent_genes)
## # A tibble: 6 x 4
##   X__1            X__2   X__3  X__4      
##   <chr>           <chr>  <chr> <chr>     
## 1 ENSG00000245532 NEAT1  RAS   Shortening
## 2 ENSG00000123374 CDK2   RAS   Shortening
## 3 ENSG00000185664 PMEL   RAS   Shortening
## 4 ENSG00000137203 TFAP2A RAS   Shortening
## 5 ENSG00000065357 DGKA   RAS   Shortening
## 6 ENSG00000135903 PAX3   RAS   Shortening
#subset by the gene list  
gene_ids <- bivalent_genes$`X__1`

epi_SKCM_TPM_long<- epi_SKCM_TPM.normalized %>% 
  dplyr::filter(gene_id %in% gene_ids) %>% 
  gather(sample, TPM, 2: ncol(epi_SKCM_TPM.normalized)) %>% 
  left_join(Mut_subtype_info, by = c("sample" = "Name")) %>%
  left_join(bivalent_genes, by = c("gene_id" = "X__1")) %>%
  dplyr::rename(symbol = X__2, gene_type = X__3, bivalent_loss_type = X__4)

epi_SKCM_TPM_long<- epi_SKCM_TPM_long %>% 
  dplyr::rename(sample_type = MUTATIONSUBTYPES) %>%
  mutate(sample_type = case_when(
    sample == "melanoctye_1" | sample == "melanoctye_2" ~ "melanoctye",
    TRUE ~ sample_type
  ))

epi_SKCM_TPM_long$sample_type<- factor(epi_SKCM_TPM_long$sample_type, levels = c("melanoctye", "RAS_Hotspot_Mutants", "BRAF_Hotspot_Mutants", "Triple_WT"))

BoxPlot for Mean RNA-seq expression from shortening or lengthening of broad H3K4me3 domains in melanoma tumors relative to melanocytes

ggplot(epi_SKCM_TPM_long, aes(x = sample_type, y = log2(TPM+1))) +
  geom_boxplot(aes(col = sample_type)) +
  facet_wrap(bivalent_loss_type ~ gene_type) +
  theme_bw(base_size = 10) +
  theme(axis.text.x = element_text(angle = 90)) +
  coord_cartesian(ylim = c(0, 15))

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.