library(tidyverse)
library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(preprocessCore)# read in SKCM RNA-seq data - already TPM transformed
TPM_df_sub<- read_tsv("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/SKCM_CCLE_STC_melanocyte_RNA_quantile_normalized_TPM.tsv")
# read in roadmap RNA-seq data
RoadMap_counts <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/57epigenomes.N.pc.gz", sep = '\t', header = TRUE, stringsAsFactors = FALSE,
quote = "")
colnames(RoadMap_counts)[1:57] <- colnames(RoadMap_counts)[2:58]
RoadMap_counts <- RoadMap_counts[,c(1:57)]
RoadMap_counts_ESC_MEL <- RoadMap_counts[,c(2:5,7:9,12,13,29,30,38,39,44,45)]
# change raw counts to TPM
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75
genes_ensemble <- genes(edb)
gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)
gene_length_in_mat<- left_join(data.frame(gene_id = rownames(RoadMap_counts_ESC_MEL)), gene_length) %>% dplyr::filter(!is.na(width))
RoadMap_counts_ESC_MEL_sub <- RoadMap_counts_ESC_MEL[rownames(RoadMap_counts_ESC_MEL) %in% gene_length_in_mat$gene_id, ]
countToTpm <- function(counts, effLen)
{
rate <- log(counts + 1) - log(effLen)
denom <- log(sum(exp(rate)))
exp(rate - denom + log(1e6))
}
RoadMap_counts_ESC_MEL_TPM <- apply(RoadMap_counts_ESC_MEL_sub, 2, countToTpm, effLen = gene_length_in_mat$width)
# Quantile Normalization
RoadMap_counts_ESC_MEL_TPM.normalized <- normalize.quantiles(RoadMap_counts_ESC_MEL_TPM)
colnames(RoadMap_counts_ESC_MEL_TPM.normalized) <- colnames(RoadMap_counts_ESC_MEL_TPM)
rownames(RoadMap_counts_ESC_MEL_TPM.normalized) <- rownames(RoadMap_counts_ESC_MEL_TPM)
RoadMap_counts_ESC_MEL_TPM.normalized <- as.data.frame(RoadMap_counts_ESC_MEL_TPM.normalized)
# get ENSGenes using biomaRt for roadmap data
ENSGenes <- useEnsembl(biomart = "ensembl", dataset="hsapiens_gene_ensembl", GRCh=37)
ENSGenes <- getBM(attributes = c('ensembl_gene_id', 'hgnc_symbol'), mart = ENSGenes)
RoadMap_counts_ESC_MEL_ENSGenes <- inner_join(x = ENSGenes, y = as.data.frame(RoadMap_counts_ESC_MEL_TPM.normalized) %>% add_rownames(), by = c("ensembl_gene_id" = "rowname"))
# combine Cancer and Normal datsets data
counts.all.2 <- inner_join(x = RoadMap_counts_ESC_MEL_ENSGenes, y = as.data.frame(TPM_df_sub), by = c("ensembl_gene_id" = "gene_id"))
# Quantile Normalization
counts.all.2.matrix <- as.matrix(counts.all.2[,-c(1,2,61,62)])
counts.all.2.matrix.normalized <- normalize.quantiles(counts.all.2.matrix)
rownames(counts.all.2.matrix.normalized) <- counts.all.2$hgnc_symbol
colnames(counts.all.2.matrix.normalized) <- colnames(counts.all.2[,-c(1,2,61,62)])
counts.all.2.matrix.normalized <- as.data.frame(counts.all.2.matrix.normalized)
counts.all.2.matrix.normalized <- rownames_to_column(counts.all.2.matrix.normalized, var = "gene_name")counts.all.2.matrix.normalized %>%
dplyr::filter(gene_name == "TWIST1") %>%
gather(-gene_name, key = "sample", value = "TPM") %>%
mutate(type = case_when(
sample %in% c("E003") ~ "ESC",
sample %in% c("E005") ~ "Trophoblast",
sample %in% c("E011") ~ "Endoderm",
sample %in% c("E012") ~ "Ectoderm",
sample %in% c("E013") ~ "Mesoderm",
sample %in% c("E004") ~ "Mesendoderm",
sample %in% c("E006") ~ "Mesenchymal SC",
sample %in% c("E027", "E028") ~ "Breast",
sample %in% c("E070", "E071") ~ "Brain",
sample %in% c("E106", "E109") ~ "Colon",
sample %in% c("E097") ~ "Ovary",
sample %in% c("E096") ~ "Lung",
sample %in% c("E059", "E061") ~ "melanocyte",
grepl("06$", sample) ~ "TCGA",
grepl("STC1$", sample) ~ "STC",
grepl("SKIN$", sample) ~ "CCLE")) %>%
mutate(type = factor(type, levels = c("ESC", "Trophoblast", "Endoderm", "Ectoderm", "Mesoderm", "Mesendoderm", "Mesenchymal SC", "Breast", "Brain", "Colon","Lung", "Ovary", "melanocyte", "CCLE", "STC", "TCGA"))) %>%
ggplot(aes(x = type, y = log2(TPM +1))) +
geom_boxplot(aes(col = type)) +
geom_jitter(width = 0.2, aes(col = type)) +
theme_classic(base_size = 10) +
theme(axis.text.x = element_text(angle = 90)) +
ggtitle(paste0("expression level of ", "TWIST1"))Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.