# read in SKCM RNA-seq data - already TPM transformed
SKCM_TPM <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/SKCM_TPM-2.txt", header = TRUE, stringsAsFactors = FALSE, 
                       quote = "")
colnames(SKCM_TPM) <- gsub(pattern = "\\.", replacement = "-", colnames(SKCM_TPM))
colnames(SKCM_TPM) <- gsub(pattern = "TCGA-(.*)-(.*)-(.*)-(.*)-(.*)-(01|06|07)", replacement = "TCGA-\\1-\\2", colnames(SKCM_TPM))

# read in roadmap RNA-seq data
epi_count<- read.table("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/57epigenomes.N.pc.gz", header = T, stringsAsFactors = F, quote = "")

epi_count_sub<- epi_count[, c("gene_id","E059", "E061")]
epi_mat<- as.matrix(epi_count_sub[, -1])
rownames(epi_mat)<- epi_count_sub$gene_id

# change raw counts to TPM
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75

genes_ensemble <- genes(edb)

gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)

gene_length_in_mat<- left_join(data.frame(gene_id = rownames(epi_mat)), gene_length) %>% dplyr::filter(!is.na(width))

epi_mat_sub<- epi_mat[rownames(epi_mat) %in% gene_length_in_mat$gene_id, ]

countToTpm <- function(counts, effLen)
{
  rate <- log(counts + 1) - log(effLen)
  denom <- log(sum(exp(rate)))
  exp(rate - denom + log(1e6))
}

epi_TPM<- apply(epi_mat_sub, 2, countToTpm, effLen = gene_length_in_mat$width)

# combine SKCM and roadmap data
epi_SKCM_TPM<- merge(epi_TPM, SKCM_TPM, by="row.names", all = FALSE)

# read in TCGA subtype information
TCGA_info<- read_tsv("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/TCGA_Melanoma_ID_Metadata_MetastaticTumors_only.txt", col_names = T)
table(TCGA_info$MUTATIONSUBTYPES)

# subset based on mutation -- subtype can be changed or all subtypes can be used
Mut_subtype_info<- TCGA_info %>% 
  dplyr::filter(MUTATIONSUBTYPES %in% c("RAS_Hotspot_Mutants")) %>%
  dplyr::select(Name, MUTATIONSUBTYPES)

# check how many RNAseq samples have mutation subtype info
table(colnames(SKCM_TPM) %in% Mut_subtype_info$Name)

# subset the combined df
epi_SKCM_TPM<- epi_SKCM_TPM %>% dplyr::rename(gene_id = Row.names, melanoctye_1 = E059, melanoctye_2 = E061)
sample_idx<- colnames(epi_SKCM_TPM) %in% c("gene_id", "melanoctye_1", "melanoctye_2", Mut_subtype_info$Name)
epi_SKCM_TPM<- epi_SKCM_TPM[, sample_idx]

# Quantile Normalization
epi_SKCM_TPM.matrix <- as.matrix(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- normalize.quantiles(epi_SKCM_TPM.matrix)
rownames(epi_SKCM_TPM.normalized) <- epi_SKCM_TPM$gene_id
colnames(epi_SKCM_TPM.normalized) <- colnames(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- as.data.frame(epi_SKCM_TPM.normalized)
epi_SKCM_TPM.normalized <- rownames_to_column(epi_SKCM_TPM.normalized, var = "gene_id")

# read in shortening/lengthening H3K4me3 domains (-/+2kb) overlapping -/+10kbTSS in SKCM tumors -- peaks were obtained using intersectBed -wo
bivalent_genes <- readxl::read_xlsx("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/TCGA_SKCM_Broad_Sharp_SizeComparison_melanocytes_Broad_to_Allpeaks_RAS_RFile.xlsx", sheet = 1, col_names = T)
head(bivalent_genes)

#subset by the gene list  
gene_ids <- bivalent_genes$`X__1`

epi_SKCM_TPM_long<- epi_SKCM_TPM.normalized %>% 
  dplyr::filter(gene_id %in% gene_ids) %>% 
  gather(sample, TPM, 2: ncol(epi_SKCM_TPM.normalized)) %>% 
  left_join(Mut_subtype_info, by = c("sample" = "Name")) %>%
  left_join(bivalent_genes, by = c("gene_id" = "X__1")) %>%
  dplyr::rename(symbol = X__2, gene_type = X__3, bivalent_loss_type = X__4)

epi_SKCM_TPM_long<- epi_SKCM_TPM_long %>% 
  dplyr::rename(sample_type = MUTATIONSUBTYPES) %>%
  mutate(sample_type = case_when(
    sample == "melanoctye_1" | sample == "melanoctye_2" ~ "melanoctye",
    TRUE ~ sample_type
  ))

epi_SKCM_TPM_long$sample_type<- factor(epi_SKCM_TPM_long$sample_type, levels = c("melanoctye", "RAS_Hotspot_Mutants", "BRAF_Hotspot_Mutants", "Triple_WT"))

BoxPlot for Mean RNA-seq expression from shortening or lengthening of broad H3K4me3 domains in melanoma tumors relative to melanocytes

ggplot(epi_SKCM_TPM_long, aes(x = sample_type, y = log2(TPM+1))) +
  geom_boxplot(aes(col = sample_type)) +
  facet_wrap(bivalent_loss_type ~ gene_type) +
  theme_bw(base_size = 10) +
  theme(axis.text.x = element_text(angle = 90)) +
  coord_cartesian(ylim = c(0, 15))

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

BoxPlot of SKCM RNA-seq expression levels (all metastatic samples) based on shortening or lengthening of broad H3K4me3 Domains (-/+2kb) in TCGA SKCM Tumors relative to melanocytes

Christopher Terranova May 2019

Load necessary programs

Shortening or Lengthening of Broad H3K4me3 Domains in SKCM Tumors relative to Melanocytes

BoxPlot for Mean RNA-seq expression from shortening or lengthening of broad H3K4me3 domains in melanoma tumors relative to melanocytes