library(tidyverse)
library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(preprocessCore)
#Read in dataset - TPM normalized
TPM_df_sub <- read_tsv("~/Desktop/TCGA_RNA-seq_files/CCLE_chris_TPM.tsv")
TPM_df_sub <- TPM_df_sub[,-c(55,56)]
TPM_df_sub2 <- read_tsv("~/Desktop/TCGA_RNA-seq_files/SKCM_CCLE_STC_melanocyte_RNA_quantile_normalized_TPM.tsv")
TPM_df_sub2 <- TPM_df_sub2[,c(1:14)]
TPM_df_sub_final <- inner_join(x = TPM_df_sub, y = as.data.frame(TPM_df_sub2), by = c("gene_id" = "gene_id"))
#Read in Roadmap samples
RoadMap_counts <- read.table(file = "~/Desktop/TCGA_RNA-seq_files/57epigenomes.N.pc.gz", sep = '\t', header = TRUE, stringsAsFactors = FALSE,
quote = "")
colnames(RoadMap_counts)[1:57] <- colnames(RoadMap_counts)[2:58]
RoadMap_counts <- RoadMap_counts[,c(1:57)]
RoadMap_counts_ESC_MEL <- RoadMap_counts[,c(2:5,7:9,12,13,24,25,29,30,38,39,44,45)]
#Change raw counts to TPM
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75
genes_ensemble <- genes(edb)
gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)
gene_length_in_mat<- left_join(data.frame(gene_id = rownames(RoadMap_counts_ESC_MEL)), gene_length) %>% dplyr::filter(!is.na(width))
RoadMap_counts_ESC_MEL_sub <- RoadMap_counts_ESC_MEL[rownames(RoadMap_counts_ESC_MEL) %in% gene_length_in_mat$gene_id, ]
#Use gene_length instead of effective length. effective length = gene length - fragment_length
all.equal(rownames(RoadMap_counts_ESC_MEL_sub), gene_length_in_mat$gene_id)
## [1] TRUE
countToTpm <- function(counts, effLen)
{
rate <- log(counts + 1) - log(effLen)
denom <- log(sum(exp(rate)))
exp(rate - denom + log(1e6))
}
RoadMap_counts_ESC_MEL_TPM <- apply(RoadMap_counts_ESC_MEL_sub, 2, countToTpm, effLen = gene_length_in_mat$width)
#Quantile Normalization
RoadMap_counts_ESC_MEL_TPM.normalized <- normalize.quantiles(RoadMap_counts_ESC_MEL_TPM)
colnames(RoadMap_counts_ESC_MEL_TPM.normalized) <- colnames(RoadMap_counts_ESC_MEL_TPM)
rownames(RoadMap_counts_ESC_MEL_TPM.normalized) <- rownames(RoadMap_counts_ESC_MEL_TPM)
RoadMap_counts_ESC_MEL_TPM.normalized <- as.data.frame(RoadMap_counts_ESC_MEL_TPM.normalized)
#Get ENSGenes using biomaRt ##
ENSGenes <- useEnsembl(biomart = "ensembl", dataset="hsapiens_gene_ensembl", GRCh=37)
ENSGenes <- getBM(attributes = c('ensembl_gene_id', 'hgnc_symbol'), mart = ENSGenes)
RoadMap_counts_ESC_MEL_ENSGenes <- inner_join(x = ENSGenes, y = as.data.frame(RoadMap_counts_ESC_MEL_TPM.normalized) %>% add_rownames(), by = c("ensembl_gene_id" = "rowname"))
#Join cancer and normal samples and quantile noramlize
counts.all.2 <- inner_join(x = RoadMap_counts_ESC_MEL_ENSGenes, y = as.data.frame(TPM_df_sub_final), by = c("ensembl_gene_id" = "gene_id"))
counts.all.2.matrix <- as.matrix(counts.all.2[,-c(1,2)])
counts.all.2.matrix.normalized <- normalize.quantiles(counts.all.2.matrix)
rownames(counts.all.2.matrix.normalized) <- counts.all.2$hgnc_symbol
colnames(counts.all.2.matrix.normalized) <- colnames(counts.all.2[,-c(1,2)])
counts.all.2.matrix.normalized <- as.data.frame(counts.all.2.matrix.normalized)
counts.all.2.matrix.normalized <- rownames_to_column(counts.all.2.matrix.normalized, var = "gene_name")
# Boxplot for mean of total samples
counts.all.2.matrix.normalized %>%
dplyr::filter(gene_name == "TWIST1") %>%
gather(-gene_name, key = "sample", value = "TPM") %>%
mutate(type = case_when(
sample %in% c("E003") ~ "ESC",
sample %in% c("E005") ~ "Trophoblast",
sample %in% c("E004") ~ "Mesendoderm",
sample %in% c("E013") ~ "Mesoderm",
sample %in% c("E006") ~ "Mesenchymal SC",
sample %in% c("E011") ~ "Endoderm",
sample %in% c("E012") ~ "Ectoderm",
sample %in% c("E027", "E028") ~ "Breast",
sample %in% c("E070", "E071") ~ "Brain",
sample %in% c("E106", "E109") ~ "Colon",
sample %in% c("E096") ~ "Lung",
sample %in% c("E097") ~ "Ovary",
sample %in% c("E059", "E061") ~ "Melanocyte",
grepl("BREAST$", sample) ~ "BRCA",
grepl("LARGE_INTESTINE$", sample) ~ "COAD",
grepl("CENTRAL_NERVOUS_SYSTEM$", sample) ~ "GBM",
grepl("LUNG$", sample) ~ "LUNG",
grepl("OVARY$", sample) ~ "OVCA",
grepl("SKIN$", sample) ~ "SKCM")) %>%
mutate(type = factor(type, levels = c("ESC", "Trophoblast", "Mesendoderm","Mesoderm", "Mesenchymal SC", "Endoderm", "Ectoderm",
"Breast","Brain","Colon","Lung","Ovary","Melanocyte", "BRCA", "GBM","COAD", "LUNG", "OVCA","SKCM"))) %>%
ggplot(aes(x = type, y = log2(TPM +1))) +
geom_boxplot(aes(col = type)) +
geom_jitter(width = 0.2, aes(col = type)) +
theme_classic(base_size = 14) +
theme(axis.text = element_text(angle = 90)) +
ggtitle(paste0("expression level of ", "TWIST1"))
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.