library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(tidyverse)
library(gplots)
library(genefilter)
library(RColorBrewer)
library(vegan)
library(preprocessCore)# read in SKCM RNA-seq data - already TPM transformed
SKCM_TPM <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/SKCM_TPM-2.txt", header = TRUE, stringsAsFactors = FALSE,
quote = "")
colnames(SKCM_TPM) <- gsub(pattern = "\\.", replacement = "-", colnames(SKCM_TPM))
colnames(SKCM_TPM) <- gsub(pattern = "TCGA-(.*)-(.*)-(.*)-(.*)-(.*)-(01|06|07)", replacement = "TCGA-\\1-\\2", colnames(SKCM_TPM))
# read in roadmap RNA-seq data
epi_count<- read.table("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/57epigenomes.N.pc.gz", header = T, stringsAsFactors = F, quote = "")
epi_count_sub<- epi_count[, c("gene_id", "E003", "E059", "E061")]
epi_mat<- as.matrix(epi_count_sub[, -1])
rownames(epi_mat)<- epi_count_sub$gene_id
# change raw counts to TPM
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75
genes_ensemble <- genes(edb)
gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)
gene_length_in_mat<- left_join(data.frame(gene_id = rownames(epi_mat)), gene_length) %>% dplyr::filter(!is.na(width))
epi_mat_sub<- epi_mat[rownames(epi_mat) %in% gene_length_in_mat$gene_id, ]
countToTpm <- function(counts, effLen)
{
rate <- log(counts + 1) - log(effLen)
denom <- log(sum(exp(rate)))
exp(rate - denom + log(1e6))
}
epi_TPM<- apply(epi_mat_sub, 2, countToTpm, effLen = gene_length_in_mat$width)
# combine SKCM and roadmap data
epi_SKCM_TPM<- merge(epi_TPM, SKCM_TPM, by="row.names", all = FALSE)
# read in TCGA subtype information
TCGA_info<- read_tsv("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/TCGA_Melanoma_ID_Metadata_MetastaticTumors_only.txt", col_names = T)
table(TCGA_info$MUTATIONSUBTYPES)##
## - BRAF_Hotspot_Mutants NF1_Any_Mutants
## 6 118 23
## RAS_Hotspot_Mutants Triple_WT
## 81 38
# subset based on mutation
Mut_subtype_info<- TCGA_info %>%
dplyr::filter(MUTATIONSUBTYPES %in% c("RAS_Hotspot_Mutants", "BRAF_Hotspot_Mutants", "Triple_WT")) %>%
dplyr::select(Name, MUTATIONSUBTYPES)
# check how many RNAseq samples have mutation subtype info
table(colnames(SKCM_TPM) %in% Mut_subtype_info$Name)##
## FALSE TRUE
## 235 237
# subset the dataframe
epi_SKCM_TPM<- epi_SKCM_TPM %>% dplyr::rename(gene_id = Row.names, H1 = E003, melanoctye_1 = E059, melanoctye_2 = E061)
sample_idx<- colnames(epi_SKCM_TPM) %in% c("gene_id", "H1", "melanoctye_1", "melanoctye_2", Mut_subtype_info$Name)
epi_SKCM_TPM<- epi_SKCM_TPM[, sample_idx]
# Quantile Normalization
epi_SKCM_TPM.matrix <- as.matrix(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- normalize.quantiles(epi_SKCM_TPM.matrix)
rownames(epi_SKCM_TPM.normalized) <- epi_SKCM_TPM$gene_id
colnames(epi_SKCM_TPM.normalized) <- colnames(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- as.data.frame(epi_SKCM_TPM.normalized)
epi_SKCM_TPM.normalized <- rownames_to_column(epi_SKCM_TPM.normalized, var = "gene_id")
# set gene for boxplot - TWIST1=ENSG00000122691
gene_ids<- "ENSG00000122691"
# set levels for boxplot - can include ESCs here
epi_SKCM_TPM_long<- epi_SKCM_TPM.normalized %>% dplyr::filter(gene_id %in% gene_ids) %>% gather(sample, TPM, 2: ncol(epi_SKCM_TPM.normalized)) %>% left_join(Mut_subtype_info, by = c("sample" = "Name"))
epi_SKCM_TPM_long<- epi_SKCM_TPM_long %>%
dplyr::rename(sample_type = MUTATIONSUBTYPES) %>%
mutate(sample_type = case_when(
sample == "H1" ~ "ESC",
sample == "melanoctye_1" | sample == "melanoctye_2" ~ "melanoctye",
TRUE ~ sample_type
))
epi_SKCM_TPM_long$sample_type<- factor(epi_SKCM_TPM_long$sample_type, levels = c("ESC", "melanoctye", "RAS_Hotspot_Mutants", "BRAF_Hotspot_Mutants", "Triple_WT"))ggplot(epi_SKCM_TPM_long %>% dplyr::filter(gene_id == "ENSG00000122691", !sample_type %in% c("ESC")), aes(x = sample_type, y = log2(TPM +1))) +
geom_boxplot(aes(fill = sample_type)) +
scale_fill_manual(values = c("purple", "blue", "red", "green")) +
theme_bw(base_size = 10) +
theme(axis.text.x = element_text(angle = 90))df_test<- epi_SKCM_TPM_long %>% dplyr::filter(gene_id == "ENSG00000122691", !sample_type %in% c("ESC"))
wilcox.test(TPM ~ sample_type, data = df_test, subset = sample_type %in% c("melanoctye", "RAS_Hotspot_Mutants"), alternative = "less")##
## Wilcoxon rank sum test with continuity correction
##
## data: TPM by sample_type
## W = 7, p-value = 0.01453
## alternative hypothesis: true location shift is less than 0
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.