Single Gene BoxPlot using CCLE and Roadmap Data

Christopher Terranova May 2019

Load necessary programs

library(tidyverse)
library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(preprocessCore)
library(EnsDb.Hsapiens.v75)

Representative Example for Single Gene BoxPlot in Breast vs BRCA samples

# read in CCLE data -- already TPM transformed
TPM_df_sub<- read_tsv("~/TCGA_SKCM_RNA_Files/CCLE_chris_TPM.tsv")

# read in roadmap data
RoadMap_counts <- read.table(file = "~/TCGA_SKCM_RNA_Files/57epigenomes.N.pc.gz", sep = '\t', header = TRUE, stringsAsFactors = FALSE, 
                             quote = "")
colnames(RoadMap_counts)[1:57] <- colnames(RoadMap_counts)[2:58]
RoadMap_counts <- RoadMap_counts[,c(1:57)]
RoadMap_counts_ESC_MEL <- RoadMap_counts[,c(2:5,7:9,12,13,24,25,29,30,38,39,44,45)]

# transform raw counts to TPM
edb <- EnsDb.Hsapiens.v75

genes_ensemble <- genes(edb)

gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)

gene_length_in_mat<- left_join(data.frame(gene_id = rownames(RoadMap_counts_ESC_MEL)), gene_length) %>% dplyr::filter(!is.na(width))

RoadMap_counts_ESC_MEL_sub <- RoadMap_counts_ESC_MEL[rownames(RoadMap_counts_ESC_MEL) %in% gene_length_in_mat$gene_id, ]

countToTpm <- function(counts, effLen)
{
  rate <- log(counts + 1) - log(effLen)
  denom <- log(sum(exp(rate)))
  exp(rate - denom + log(1e6))
}

RoadMap_counts_ESC_MEL_TPM <- apply(RoadMap_counts_ESC_MEL_sub, 2, countToTpm, effLen = gene_length_in_mat$width)

# Quantile normalization for roadmap data
RoadMap_counts_ESC_MEL_TPM.normalized <- normalize.quantiles(RoadMap_counts_ESC_MEL_TPM)
colnames(RoadMap_counts_ESC_MEL_TPM.normalized) <- colnames(RoadMap_counts_ESC_MEL_TPM)
rownames(RoadMap_counts_ESC_MEL_TPM.normalized) <- rownames(RoadMap_counts_ESC_MEL_TPM)
RoadMap_counts_ESC_MEL_TPM.normalized <- as.data.frame(RoadMap_counts_ESC_MEL_TPM.normalized)

# Get ENSGenes using biomaRt for roadmap data
ENSGenes <- useEnsembl(biomart = "ensembl", dataset="hsapiens_gene_ensembl", GRCh=37)
ENSGenes <- getBM(attributes = c('ensembl_gene_id', 'hgnc_symbol'), mart = ENSGenes)
RoadMap_counts_ESC_MEL_ENSGenes <- inner_join(x = ENSGenes, y = as.data.frame(RoadMap_counts_ESC_MEL_TPM.normalized) %>% add_rownames(), by = c("ensembl_gene_id" = "rowname"))

# Join CCLE and roadmap data
counts.all.2 <- inner_join(x = RoadMap_counts_ESC_MEL_ENSGenes, y = as.data.frame(TPM_df_sub), by = c("ensembl_gene_id" = "gene_id"))

# Quantile normalization for CCLE and roadmap data
counts.all.2.matrix <- as.matrix(counts.all.2[,-c(1,2,73,74)])
counts.all.2.matrix.normalized <- normalize.quantiles(counts.all.2.matrix)
rownames(counts.all.2.matrix.normalized) <- counts.all.2$hgnc_symbol
colnames(counts.all.2.matrix.normalized) <- colnames(counts.all.2[,-c(1,2,73,74)])
counts.all.2.matrix.normalized <- as.data.frame(counts.all.2.matrix.normalized)

# Extract BRCA and breast samples for boxplot
counts.all.2.matrix.normalized <- counts.all.2.matrix.normalized[,c(8,9,18:28,30:33)]
counts.all.2.matrix.normalized <- rownames_to_column(counts.all.2.matrix.normalized, var = "hgnc_symbol")

BoxPlot for Sox9

counts.all.2.matrix.normalized %>% 
  dplyr::filter(hgnc_symbol == "SOX9") %>%
  gather(-hgnc_symbol, key = "sample", value = "TPM") %>%
  mutate(type = case_when(
      sample %in% c("E027", "E028") ~ "Breast",
      grepl("BREAST$", sample) ~ "BRCA")) %>%
  mutate(type = factor(type, levels = c("Breast", "BRCA"))) %>%
  ggplot(aes(x = type, y = log2(TPM +1))) +
  geom_boxplot(aes(col = type), outlier.color = NA) +
  geom_jitter(width = 0.2, aes(col = type)) +
  theme_classic(base_size = 14) +
  ggtitle(paste0("expression level of ", "SOX9"))

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.