Christopher Terranova May 2019

Load necessary programs

library(tidyverse)
library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(preprocessCore)

Representative Example for Single Gene BoxPlot for TWIST1 in Cancer and Normal samples

#Read in dataset - TPM normalized
TPM_df_sub <- read_tsv("~/Desktop/TCGA_RNA-seq_files/CCLE_chris_TPM.tsv")
TPM_df_sub <- TPM_df_sub[,-c(55,56)]
TPM_df_sub2 <- read_tsv("~/Desktop/TCGA_RNA-seq_files/SKCM_CCLE_STC_melanocyte_RNA_quantile_normalized_TPM.tsv")
TPM_df_sub2 <- TPM_df_sub2[,c(1:14)]
TPM_df_sub_final <- inner_join(x = TPM_df_sub, y = as.data.frame(TPM_df_sub2), by = c("gene_id" = "gene_id"))

#Read in Roadmap samples
RoadMap_counts <- read.table(file = "~/Desktop/TCGA_RNA-seq_files/57epigenomes.N.pc.gz", sep = '\t', header = TRUE, stringsAsFactors = FALSE, 
                             quote = "")
colnames(RoadMap_counts)[1:57] <- colnames(RoadMap_counts)[2:58]
RoadMap_counts <- RoadMap_counts[,c(1:57)]
RoadMap_counts_ESC_MEL <- RoadMap_counts[,c(2:5,7:9,12,13,24,25,29,30,38,39,44,45)]

#Change raw counts to TPM
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75

genes_ensemble <- genes(edb)

gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)

gene_length_in_mat<- left_join(data.frame(gene_id = rownames(RoadMap_counts_ESC_MEL)), gene_length) %>% dplyr::filter(!is.na(width))

RoadMap_counts_ESC_MEL_sub <- RoadMap_counts_ESC_MEL[rownames(RoadMap_counts_ESC_MEL) %in% gene_length_in_mat$gene_id, ]

#Use gene_length instead of effective length. effective length = gene length - fragment_length

all.equal(rownames(RoadMap_counts_ESC_MEL_sub), gene_length_in_mat$gene_id)
## [1] TRUE
countToTpm <- function(counts, effLen)
{
  rate <- log(counts + 1) - log(effLen)
  denom <- log(sum(exp(rate)))
  exp(rate - denom + log(1e6))
}

RoadMap_counts_ESC_MEL_TPM <- apply(RoadMap_counts_ESC_MEL_sub, 2, countToTpm, effLen = gene_length_in_mat$width)

#Quantile Normalization
RoadMap_counts_ESC_MEL_TPM.normalized <- normalize.quantiles(RoadMap_counts_ESC_MEL_TPM)
colnames(RoadMap_counts_ESC_MEL_TPM.normalized) <- colnames(RoadMap_counts_ESC_MEL_TPM)
rownames(RoadMap_counts_ESC_MEL_TPM.normalized) <- rownames(RoadMap_counts_ESC_MEL_TPM)
RoadMap_counts_ESC_MEL_TPM.normalized <- as.data.frame(RoadMap_counts_ESC_MEL_TPM.normalized)

#Get ENSGenes using biomaRt ##
ENSGenes <- useEnsembl(biomart = "ensembl", dataset="hsapiens_gene_ensembl", GRCh=37)
ENSGenes <- getBM(attributes = c('ensembl_gene_id', 'hgnc_symbol'), mart = ENSGenes)
RoadMap_counts_ESC_MEL_ENSGenes <- inner_join(x = ENSGenes, y = as.data.frame(RoadMap_counts_ESC_MEL_TPM.normalized) %>% add_rownames(), by = c("ensembl_gene_id" = "rowname"))

#Join cancer and normal samples and quantile noramlize
counts.all.2 <- inner_join(x = RoadMap_counts_ESC_MEL_ENSGenes, y = as.data.frame(TPM_df_sub_final), by = c("ensembl_gene_id" = "gene_id"))

counts.all.2.matrix <- as.matrix(counts.all.2[,-c(1,2)])
counts.all.2.matrix.normalized <- normalize.quantiles(counts.all.2.matrix)
rownames(counts.all.2.matrix.normalized) <- counts.all.2$hgnc_symbol
colnames(counts.all.2.matrix.normalized) <- colnames(counts.all.2[,-c(1,2)])
counts.all.2.matrix.normalized <- as.data.frame(counts.all.2.matrix.normalized)
counts.all.2.matrix.normalized <- rownames_to_column(counts.all.2.matrix.normalized, var = "gene_name")

BoxPlot for TWIST1

# Boxplot for mean of total samples
counts.all.2.matrix.normalized %>% 
  dplyr::filter(gene_name == "TWIST1") %>%
  gather(-gene_name, key = "sample", value = "TPM") %>%
  mutate(type = case_when(
    sample %in% c("E003") ~ "ESC",
    sample %in% c("E005") ~ "Trophoblast",
    sample %in% c("E004") ~ "Mesendoderm",
    sample %in% c("E013") ~ "Mesoderm",
    sample %in% c("E006") ~ "Mesenchymal SC",
    sample %in% c("E011") ~ "Endoderm",
    sample %in% c("E012") ~ "Ectoderm",
    sample %in% c("E027", "E028") ~ "Breast",
    sample %in% c("E070", "E071") ~ "Brain",
    sample %in% c("E106", "E109") ~ "Colon",
    sample %in% c("E096") ~ "Lung",
    sample %in% c("E097") ~ "Ovary",
    sample %in% c("E059", "E061") ~ "Melanocyte",
    grepl("BREAST$", sample) ~ "BRCA",
    grepl("LARGE_INTESTINE$", sample) ~ "COAD",
    grepl("CENTRAL_NERVOUS_SYSTEM$", sample) ~ "GBM",
    grepl("LUNG$", sample) ~ "LUNG",
    grepl("OVARY$", sample) ~ "OVCA",
    grepl("SKIN$", sample) ~ "SKCM")) %>%
  mutate(type = factor(type, levels = c("ESC", "Trophoblast", "Mesendoderm","Mesoderm", "Mesenchymal SC", "Endoderm", "Ectoderm", 
                                        "Breast","Brain","Colon","Lung","Ovary","Melanocyte", "BRCA", "GBM","COAD", "LUNG", "OVCA","SKCM"))) %>%
  ggplot(aes(x = type, y = log2(TPM +1))) +
  geom_boxplot(aes(col = type)) +
  geom_jitter(width = 0.2, aes(col = type)) +
  theme_classic(base_size = 14) +
  theme(axis.text = element_text(angle = 90)) +
  ggtitle(paste0("expression level of ", "TWIST1"))

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.