library(tidyverse)
library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(preprocessCore)
library(EnsDb.Hsapiens.v75)# read in CCLE data -- already TPM transformed
TPM_df_sub<- read_tsv("~/TCGA_SKCM_RNA_Files/CCLE_chris_TPM.tsv")
# read in roadmap data
RoadMap_counts <- read.table(file = "~/TCGA_SKCM_RNA_Files/57epigenomes.N.pc.gz", sep = '\t', header = TRUE, stringsAsFactors = FALSE,
quote = "")
colnames(RoadMap_counts)[1:57] <- colnames(RoadMap_counts)[2:58]
RoadMap_counts <- RoadMap_counts[,c(1:57)]
RoadMap_counts_ESC_MEL <- RoadMap_counts[,c(2:5,7:9,12,13,24,25,29,30,38,39,44,45)]
# transform raw counts to TPM
edb <- EnsDb.Hsapiens.v75
genes_ensemble <- genes(edb)
gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)
gene_length_in_mat<- left_join(data.frame(gene_id = rownames(RoadMap_counts_ESC_MEL)), gene_length) %>% dplyr::filter(!is.na(width))
RoadMap_counts_ESC_MEL_sub <- RoadMap_counts_ESC_MEL[rownames(RoadMap_counts_ESC_MEL) %in% gene_length_in_mat$gene_id, ]
countToTpm <- function(counts, effLen)
{
rate <- log(counts + 1) - log(effLen)
denom <- log(sum(exp(rate)))
exp(rate - denom + log(1e6))
}
RoadMap_counts_ESC_MEL_TPM <- apply(RoadMap_counts_ESC_MEL_sub, 2, countToTpm, effLen = gene_length_in_mat$width)
# Quantile normalization for roadmap data
RoadMap_counts_ESC_MEL_TPM.normalized <- normalize.quantiles(RoadMap_counts_ESC_MEL_TPM)
colnames(RoadMap_counts_ESC_MEL_TPM.normalized) <- colnames(RoadMap_counts_ESC_MEL_TPM)
rownames(RoadMap_counts_ESC_MEL_TPM.normalized) <- rownames(RoadMap_counts_ESC_MEL_TPM)
RoadMap_counts_ESC_MEL_TPM.normalized <- as.data.frame(RoadMap_counts_ESC_MEL_TPM.normalized)
# Get ENSGenes using biomaRt for roadmap data
ENSGenes <- useEnsembl(biomart = "ensembl", dataset="hsapiens_gene_ensembl", GRCh=37)
ENSGenes <- getBM(attributes = c('ensembl_gene_id', 'hgnc_symbol'), mart = ENSGenes)
RoadMap_counts_ESC_MEL_ENSGenes <- inner_join(x = ENSGenes, y = as.data.frame(RoadMap_counts_ESC_MEL_TPM.normalized) %>% add_rownames(), by = c("ensembl_gene_id" = "rowname"))
# Join CCLE and roadmap data
counts.all.2 <- inner_join(x = RoadMap_counts_ESC_MEL_ENSGenes, y = as.data.frame(TPM_df_sub), by = c("ensembl_gene_id" = "gene_id"))
# Quantile normalization for CCLE and roadmap data
counts.all.2.matrix <- as.matrix(counts.all.2[,-c(1,2,73,74)])
counts.all.2.matrix.normalized <- normalize.quantiles(counts.all.2.matrix)
rownames(counts.all.2.matrix.normalized) <- counts.all.2$hgnc_symbol
colnames(counts.all.2.matrix.normalized) <- colnames(counts.all.2[,-c(1,2,73,74)])
counts.all.2.matrix.normalized <- as.data.frame(counts.all.2.matrix.normalized)
# Extract BRCA and breast samples for boxplot
counts.all.2.matrix.normalized <- counts.all.2.matrix.normalized[,c(8,9,18:28,30:33)]
counts.all.2.matrix.normalized <- rownames_to_column(counts.all.2.matrix.normalized, var = "hgnc_symbol")counts.all.2.matrix.normalized %>%
dplyr::filter(hgnc_symbol == "SOX9") %>%
gather(-hgnc_symbol, key = "sample", value = "TPM") %>%
mutate(type = case_when(
sample %in% c("E027", "E028") ~ "Breast",
grepl("BREAST$", sample) ~ "BRCA")) %>%
mutate(type = factor(type, levels = c("Breast", "BRCA"))) %>%
ggplot(aes(x = type, y = log2(TPM +1))) +
geom_boxplot(aes(col = type), outlier.color = NA) +
geom_jitter(width = 0.2, aes(col = type)) +
theme_classic(base_size = 14) +
ggtitle(paste0("expression level of ", "SOX9"))Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.