BoxPlot displaying RNA-seq expression levels from SKCM TCGA tumors (20 matching samples from chromatin state anaylsis), MSTC, CCLE cell lines and Normal ESCs, Germ Stem Cells, breast, brain, colon, lung, ovary and melanocyte samples from Roadmap

Christopher Terranova May 2019

Load necessary programs

library(tidyverse)
library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(preprocessCore)

Representative Example for Single Gene BoxPlot in SKCM, MSTC, CCLE, ESCs, Germ Stem Cells and Normal samples

# read in SKCM RNA-seq data - already TPM transformed
TPM_df_sub<- read_tsv("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/SKCM_CCLE_STC_melanocyte_RNA_quantile_normalized_TPM.tsv")

# read in roadmap RNA-seq data
RoadMap_counts <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/57epigenomes.N.pc.gz", sep = '\t', header = TRUE, stringsAsFactors = FALSE, 
                             quote = "")
colnames(RoadMap_counts)[1:57] <- colnames(RoadMap_counts)[2:58]
RoadMap_counts <- RoadMap_counts[,c(1:57)]
RoadMap_counts_ESC_MEL <- RoadMap_counts[,c(2:5,7:9,12,13,29,30,38,39,44,45)]

# change raw counts to TPM
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75

genes_ensemble <- genes(edb)

gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)

gene_length_in_mat<- left_join(data.frame(gene_id = rownames(RoadMap_counts_ESC_MEL)), gene_length) %>% dplyr::filter(!is.na(width))

RoadMap_counts_ESC_MEL_sub <- RoadMap_counts_ESC_MEL[rownames(RoadMap_counts_ESC_MEL) %in% gene_length_in_mat$gene_id, ]

countToTpm <- function(counts, effLen)
{
  rate <- log(counts + 1) - log(effLen)
  denom <- log(sum(exp(rate)))
  exp(rate - denom + log(1e6))
}

RoadMap_counts_ESC_MEL_TPM <- apply(RoadMap_counts_ESC_MEL_sub, 2, countToTpm, effLen = gene_length_in_mat$width)

# Quantile Normalization
RoadMap_counts_ESC_MEL_TPM.normalized <- normalize.quantiles(RoadMap_counts_ESC_MEL_TPM)
colnames(RoadMap_counts_ESC_MEL_TPM.normalized) <- colnames(RoadMap_counts_ESC_MEL_TPM)
rownames(RoadMap_counts_ESC_MEL_TPM.normalized) <- rownames(RoadMap_counts_ESC_MEL_TPM)
RoadMap_counts_ESC_MEL_TPM.normalized <- as.data.frame(RoadMap_counts_ESC_MEL_TPM.normalized)

# get ENSGenes using biomaRt for roadmap data 
ENSGenes <- useEnsembl(biomart = "ensembl", dataset="hsapiens_gene_ensembl", GRCh=37)
ENSGenes <- getBM(attributes = c('ensembl_gene_id', 'hgnc_symbol'), mart = ENSGenes)
RoadMap_counts_ESC_MEL_ENSGenes <- inner_join(x = ENSGenes, y = as.data.frame(RoadMap_counts_ESC_MEL_TPM.normalized) %>% add_rownames(), by = c("ensembl_gene_id" = "rowname"))

# combine Cancer and Normal datsets data
counts.all.2 <- inner_join(x = RoadMap_counts_ESC_MEL_ENSGenes, y = as.data.frame(TPM_df_sub), by = c("ensembl_gene_id" = "gene_id"))

# Quantile Normalization
counts.all.2.matrix <- as.matrix(counts.all.2[,-c(1,2,61,62)])
counts.all.2.matrix.normalized <- normalize.quantiles(counts.all.2.matrix)
rownames(counts.all.2.matrix.normalized) <- counts.all.2$hgnc_symbol
colnames(counts.all.2.matrix.normalized) <- colnames(counts.all.2[,-c(1,2,61,62)])
counts.all.2.matrix.normalized <- as.data.frame(counts.all.2.matrix.normalized)
counts.all.2.matrix.normalized <- rownames_to_column(counts.all.2.matrix.normalized, var = "gene_name")

BoxPlot for TWIST1

counts.all.2.matrix.normalized %>% 
  dplyr::filter(gene_name == "TWIST1") %>%
  gather(-gene_name, key = "sample", value = "TPM") %>%
  mutate(type = case_when(
    sample %in% c("E003") ~ "ESC",
    sample %in% c("E005") ~ "Trophoblast",
    sample %in% c("E011") ~ "Endoderm",
    sample %in% c("E012") ~ "Ectoderm",
    sample %in% c("E013") ~ "Mesoderm",
    sample %in% c("E004") ~ "Mesendoderm",
    sample %in% c("E006") ~ "Mesenchymal SC",
    sample %in% c("E027", "E028") ~ "Breast",
    sample %in% c("E070", "E071") ~ "Brain",
    sample %in% c("E106", "E109") ~ "Colon",
    sample %in% c("E097") ~ "Ovary",
    sample %in% c("E096") ~ "Lung",
    sample %in% c("E059", "E061") ~ "melanocyte",
    grepl("06$", sample) ~ "TCGA",
    grepl("STC1$", sample) ~ "STC",
    grepl("SKIN$", sample) ~ "CCLE")) %>%
  mutate(type = factor(type, levels = c("ESC", "Trophoblast", "Endoderm", "Ectoderm", "Mesoderm", "Mesendoderm", "Mesenchymal SC", "Breast", "Brain", "Colon","Lung", "Ovary", "melanocyte", "CCLE", "STC", "TCGA"))) %>%
  ggplot(aes(x = type, y = log2(TPM +1))) +
  geom_boxplot(aes(col = type)) +
  geom_jitter(width = 0.2, aes(col = type)) +
  theme_classic(base_size = 10) +
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle(paste0("expression level of ", "TWIST1"))

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.