library(tidyverse)
library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(preprocessCore)# read in SKCM RNA-seq data - already TPM transformed
SKCM_TPM <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/SKCM_TPM-2.txt", header = TRUE, stringsAsFactors = FALSE,
quote = "")
colnames(SKCM_TPM) <- gsub(pattern = "\\.", replacement = "-", colnames(SKCM_TPM))
colnames(SKCM_TPM) <- gsub(pattern = "TCGA-(.*)-(.*)-(.*)-(.*)-(.*)-(01|06|07)", replacement = "TCGA-\\1-\\2", colnames(SKCM_TPM))
# read in roadmap RNA-seq data
epi_count<- read.table("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/57epigenomes.N.pc.gz", header = T, stringsAsFactors = F, quote = "")
epi_count_sub<- epi_count[, c("gene_id","E059", "E061")]
epi_mat<- as.matrix(epi_count_sub[, -1])
rownames(epi_mat)<- epi_count_sub$gene_id
# change raw counts to TPM
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75
genes_ensemble <- genes(edb)
gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)
gene_length_in_mat<- left_join(data.frame(gene_id = rownames(epi_mat)), gene_length) %>% dplyr::filter(!is.na(width))
epi_mat_sub<- epi_mat[rownames(epi_mat) %in% gene_length_in_mat$gene_id, ]
countToTpm <- function(counts, effLen)
{
rate <- log(counts + 1) - log(effLen)
denom <- log(sum(exp(rate)))
exp(rate - denom + log(1e6))
}
epi_TPM<- apply(epi_mat_sub, 2, countToTpm, effLen = gene_length_in_mat$width)
# combine SKCM and roadmap data
epi_SKCM_TPM<- merge(epi_TPM, SKCM_TPM, by="row.names", all = FALSE)
# read in TCGA subtype information
TCGA_info<- read_tsv("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/TCGA_Melanoma_ID_Metadata_MetastaticTumors_only.txt", col_names = T)
table(TCGA_info$MUTATIONSUBTYPES)##
## - BRAF_Hotspot_Mutants NF1_Any_Mutants
## 6 118 23
## RAS_Hotspot_Mutants Triple_WT
## 81 38
# subset based on mutation -- subtype can be changed or all subtypes can be used
Mut_subtype_info<- TCGA_info %>%
dplyr::filter(MUTATIONSUBTYPES %in% c("RAS_Hotspot_Mutants")) %>%
dplyr::select(Name, MUTATIONSUBTYPES)
# check how many RNAseq samples have mutation subtype info
table(colnames(SKCM_TPM) %in% Mut_subtype_info$Name)##
## FALSE TRUE
## 391 81
# subset the combined df
epi_SKCM_TPM<- epi_SKCM_TPM %>% dplyr::rename(gene_id = Row.names, melanoctye_1 = E059, melanoctye_2 = E061)
sample_idx<- colnames(epi_SKCM_TPM) %in% c("gene_id", "melanoctye_1", "melanoctye_2", Mut_subtype_info$Name)
epi_SKCM_TPM<- epi_SKCM_TPM[, sample_idx]
# Quantile Normalization
epi_SKCM_TPM.matrix <- as.matrix(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- normalize.quantiles(epi_SKCM_TPM.matrix)
rownames(epi_SKCM_TPM.normalized) <- epi_SKCM_TPM$gene_id
colnames(epi_SKCM_TPM.normalized) <- colnames(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- as.data.frame(epi_SKCM_TPM.normalized)
epi_SKCM_TPM.normalized <- rownames_to_column(epi_SKCM_TPM.normalized, var = "gene_id")
# read in shortening/lengthening H3K4me3 domains (-/+2kb) overlapping -/+10kbTSS in SKCM tumors -- peaks were obtained using intersectBed -wo
bivalent_genes <- readxl::read_xlsx("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/TCGA_SKCM_Broad_Sharp_SizeComparison_melanocytes_Broad_to_Allpeaks_RAS_RFile.xlsx", sheet = 1, col_names = T)
head(bivalent_genes)## # A tibble: 6 x 4
## X__1 X__2 X__3 X__4
## <chr> <chr> <chr> <chr>
## 1 ENSG00000245532 NEAT1 RAS Shortening
## 2 ENSG00000123374 CDK2 RAS Shortening
## 3 ENSG00000185664 PMEL RAS Shortening
## 4 ENSG00000137203 TFAP2A RAS Shortening
## 5 ENSG00000065357 DGKA RAS Shortening
## 6 ENSG00000135903 PAX3 RAS Shortening
#subset by the gene list
gene_ids <- bivalent_genes$`X__1`
epi_SKCM_TPM_long<- epi_SKCM_TPM.normalized %>%
dplyr::filter(gene_id %in% gene_ids) %>%
gather(sample, TPM, 2: ncol(epi_SKCM_TPM.normalized)) %>%
left_join(Mut_subtype_info, by = c("sample" = "Name")) %>%
left_join(bivalent_genes, by = c("gene_id" = "X__1")) %>%
dplyr::rename(symbol = X__2, gene_type = X__3, bivalent_loss_type = X__4)
epi_SKCM_TPM_long<- epi_SKCM_TPM_long %>%
dplyr::rename(sample_type = MUTATIONSUBTYPES) %>%
mutate(sample_type = case_when(
sample == "melanoctye_1" | sample == "melanoctye_2" ~ "melanoctye",
TRUE ~ sample_type
))
epi_SKCM_TPM_long$sample_type<- factor(epi_SKCM_TPM_long$sample_type, levels = c("melanoctye", "RAS_Hotspot_Mutants", "BRAF_Hotspot_Mutants", "Triple_WT"))ggplot(epi_SKCM_TPM_long, aes(x = sample_type, y = log2(TPM+1))) +
geom_boxplot(aes(col = sample_type)) +
facet_wrap(bivalent_loss_type ~ gene_type) +
theme_bw(base_size = 10) +
theme(axis.text.x = element_text(angle = 90)) +
coord_cartesian(ylim = c(0, 15))Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.