library(tidyverse)
library("dplyr")
library(ggplot2)
require(reshape2)
library(biomaRt)
library(ggpubr)
library(ggrepel)
library(preprocessCore)# read in broad/sharp H3K4me3 domains overlapping -/+10kbTSS in SKCM tumors -- peaks were obtained using intersectBed -wo
Broad_Domains <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/RAS_Broad_10kbTSS.interval", sep = "\t", header = FALSE, stringsAsFactors = FALSE)
Broad_Domains <- Broad_Domains[-c(8:15)]
head(Broad_Domains)## V1 V2 V3 V4 V5 V6 V7
## 1 chr1 1095745 1112106 chr1 1104935 1114935 ENST00000379317
## 2 chr1 1095745 1112106 chr1 1093242 1103242 ENST00000384875
## 3 chr1 1095745 1112106 chr1 1092483 1102483 ENST00000384997
## 4 chr1 1095745 1112106 chr1 1103242 1113242 ENST00000384875
## 5 chr1 1095745 1112106 chr1 1094736 1104736 ENST00000606993
## 6 chr1 1095745 1112106 chr1 1094384 1104384 ENST00000362106
# read in bivalent domains overlapping -/+10kbTSS in melanocytes -- peaks were obtained using intersectBed -wo
Bivalent_Melanocytes <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/Melanoctyte_Bivalent_Total_10KBTSS.interval", sep = "\t", header = FALSE, stringsAsFactors = FALSE)
Bivalent_Melanocytes <- Bivalent_Melanocytes[-c(8:15)]
head(Bivalent_Melanocytes)## V1 V2 V3 V4 V5 V6 V7
## 1 chr1 858514 861282 chr1 861263 866445 ENST00000598827
## 2 chr1 858514 861282 chr1 861117 879955 ENST00000342066
## 3 chr1 858514 861282 chr1 860259 874671 ENST00000420190
## 4 chr1 858514 861282 chr1 860529 871173 ENST00000437963
## 5 chr1 875023 878365 chr1 861117 879955 ENST00000342066
## 6 chr1 875023 878365 chr1 874654 879639 ENST00000455979
# read in H3K79me2 domains overlapping -/+10kbTSS in melanocytes -- peaks were obtained using intersectBed -wo
H3K79me2 <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/All_H3K79me2_10kbTSS.interval", sep = "\t", header = FALSE, stringsAsFactors = FALSE)
H3K79me2 <- H3K79me2[-c(8:15)]
head(H3K79me2)## V1 V2 V3 V4 V5 V6 V7
## 1 chr1 886927 888155 chr1 879583 894670 ENST00000327044
## 2 chr1 886927 888155 chr1 879584 893838 ENST00000477976
## 3 chr1 888310 890867 chr1 879583 894670 ENST00000327044
## 4 chr1 888310 890867 chr1 879584 893838 ENST00000477976
## 5 chr1 888310 890867 chr1 889805 894689 ENST00000487214
## 6 chr1 891762 894541 chr1 879583 894670 ENST00000327044
# read in gene names
ENSGeneNames <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/UCSC_Main_on_Human__ensemblToGeneName_genome.tabular", sep = "\t", header = FALSE, stringsAsFactors = FALSE)
head(ENSGeneNames)## V1 V2
## 1 ENST00000400776 PNRC2
## 2 ENST00000374457 SRSF10
## 3 ENST00000433682 SRSF10
## 4 ENST00000374449 SRSF10
## 5 ENST00000459035 U6atac
## 6 ENST00000516583 SNORD112
ENST_ENSG <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/ENST_ENSG.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
head(ENST_ENSG)## ensembl_gene_id hgnc_symbol ensembl_transcript_id ensembl_gene_id.1
## 1 ENSG00000261657 SLC25A26 ENST00000566782 ENSG00000261657
## 2 ENSG00000261657 SLC25A26 ENST00000562780 ENSG00000261657
## 3 ENSG00000261657 SLC25A26 ENST00000569579 ENSG00000261657
## 4 ENSG00000261657 SLC25A26 ENST00000568242 ENSG00000261657
## 5 ENSG00000261657 SLC25A26 ENST00000565530 ENSG00000261657
## 6 ENSG00000223116 ENST00000411184 ENSG00000223116
# join broad H3K4me3 with gene names
Broad_Domains_Gene <- inner_join(x = Broad_Domains, y = ENSGeneNames, by = c("V7" = "V1"))
Broad_Domains_Gene <- distinct(Broad_Domains_Gene, V2.y, .keep_all = TRUE)
head(Broad_Domains_Gene)## V1 V2.x V3 V4 V5 V6 V7 V2.y
## 1 chr1 1095745 1112106 chr1 1104935 1114935 ENST00000379317 TTLL10-AS1
## 2 chr1 1095745 1112106 chr1 1093242 1103242 ENST00000384875 MIR200A
## 3 chr1 1095745 1112106 chr1 1092483 1102483 ENST00000384997 MIR200B
## 4 chr1 1095745 1112106 chr1 1094736 1104736 ENST00000606993 RP11-465B22.8
## 5 chr1 1095745 1112106 chr1 1094384 1104384 ENST00000362106 MIR429
## 6 chr1 1095745 1112106 chr1 1099285 1109285 ENST00000379289 TTLL10
# join bivalent_melanocytes with gene names
Bivalent_Melanocytes_Gene <- inner_join(x = Bivalent_Melanocytes, y = ENSGeneNames, by = c("V7" = "V1"))
Bivalent_Melanocytes_Gene <- distinct(Bivalent_Melanocytes_Gene, V2.y, .keep_all = TRUE)
head(Bivalent_Melanocytes_Gene)## V1 V2.x V3 V4 V5 V6 V7 V2.y
## 1 chr1 858514 861282 chr1 861263 866445 ENST00000598827 AL645608.1
## 2 chr1 858514 861282 chr1 861117 879955 ENST00000342066 SAMD11
## 3 chr1 1289248 1290366 chr1 1288071 1297157 ENST00000477278 MXRA8
## 4 chr1 1980901 1982284 chr1 1981923 2077547 ENST00000468310 PRKCZ
## 5 chr1 1980901 1982284 chr1 1980742 1981509 ENST00000449154 RP11-547D24.3
## 6 chr1 2985026 2987766 chr1 2985746 3103155 ENST00000607632 PRDM16
## join H3K79me2 with gene names
H3K79me2_Gene <- inner_join(x = H3K79me2, y = ENSGeneNames, by = c("V7" = "V1"))
H3K79me2_Gene <- distinct(H3K79me2_Gene, V2.y, .keep_all = TRUE)
head(H3K79me2_Gene)## V1 V2.x V3 V4 V5 V6 V7 V2.y
## 1 chr1 886927 888155 chr1 879583 894670 ENST00000327044 NOC2L
## 2 chr1 934518 934919 chr1 934341 935552 ENST00000428771 HES4
## 3 chr1 948991 951006 chr1 948802 949920 ENST00000379389 ISG15
## 4 chr1 956682 957119 chr1 955502 991496 ENST00000379370 AGRN
## 5 chr1 1038661 1039044 chr1 1017197 1051461 ENST00000379339 C1orf159
## 6 chr1 1146781 1149041 chr1 1146719 1149518 ENST00000497869 TNFRSF4
# join bivalent with broad
Bivalent_to_Broad <- inner_join(x = Broad_Domains_Gene, y = Bivalent_Melanocytes_Gene, by = c("V2.y" = "V2.y"))
head(Bivalent_to_Broad)## V1.x V2.x.x V3.x V4.x V5.x V6.x V7.x V2.y
## 1 chr1 92945800 92953739 chr1 92940239 92950239 ENST00000483490 GFI1
## 2 chr1 155050571 155058735 chr1 155051384 155061384 ENST00000418360 EFNA3
## 3 chr1 156890311 156897813 chr1 156891889 156901889 ENST00000472465 LRRC71
## 4 chr1 156890311 156897813 chr1 156883323 156893323 ENST00000465101 PEAR1
## 5 chr1 156890311 156897813 chr1 156896036 156906036 ENST00000390226 MIR765
## 6 chr1 200003013 200012292 chr1 199996729 200006729 ENST00000367362 NR5A2
## V1.y V2.x.y V3.y V4.y V5.y V6.y V7.y
## 1 chr1 92946187 92952879 chr1 92948546 92950239 ENST00000483490
## 2 chr1 155050666 155055687 chr1 155051384 155059382 ENST00000418360
## 3 chr1 156889739 156891071 chr1 156890441 156902886 ENST00000337428
## 4 chr1 156889739 156891071 chr1 156883323 156893323 ENST00000465101
## 5 chr1 156896972 156898132 chr1 156896036 156906036 ENST00000390226
## 6 chr1 199997834 199999066 chr1 199996741 200144731 ENST00000236914
# join bivalent_broad with H3K79me2
Bivalent_to_Broad_H3K79me2 <- inner_join(x = Bivalent_to_Broad, y = H3K79me2_Gene, by = c("V2.y" = "V2.y"))
head(Bivalent_to_Broad_H3K79me2)## V1.x V2.x.x V3.x V4.x V5.x V6.x V7.x
## 1 chr1 92945800 92953739 chr1 92940239 92950239 ENST00000483490
## 2 chr1 155050571 155058735 chr1 155051384 155061384 ENST00000418360
## 3 chr1 245316538 245322060 chr1 245308286 245318286 ENST00000407071
## 4 chr10 102890180 102896655 chr10 102880513 102890513 ENST00000463716
## 5 chr10 102890180 102896655 chr10 102882535 102892535 ENST00000598040
## 6 chr10 102890180 102896655 chr10 102880883 102890883 ENST00000445873
## V2.y V1.y V2.x.y V3.y V4.y V5.y V6.y
## 1 GFI1 chr1 92946187 92952879 chr1 92948546 92950239
## 2 EFNA3 chr1 155050666 155055687 chr1 155051384 155059382
## 3 KIF26B chr1 245316334 245320781 chr1 245318286 245866428
## 4 TLX1 chr10 102880323 102881776 chr10 102880513 102890513
## 5 HUG1 chr10 102882496 102883632 chr10 102882535 102883624
## 6 TLX1NB chr10 102880323 102881776 chr10 102850064 102890883
## V7.y V1 V2.x V3 V4 V5 V6
## 1 ENST00000483490 chr1 92944873 92949496 chr1 92940318 92951628
## 2 ENST00000418360 chr1 155036799 155037890 chr1 155036223 155059283
## 3 ENST00000407071 chr1 245318336 245335692 chr1 245318286 245866428
## 4 ENST00000463716 chr10 102891575 102894110 chr10 102889256 102897545
## 5 ENST00000598040 chr10 102891575 102894110 chr10 102882535 102892535
## 6 ENST00000425505 chr10 102891575 102894110 chr10 102890883 102900883
## V7
## 1 ENST00000427103
## 2 ENST00000505139
## 3 ENST00000407071
## 4 ENST00000370196
## 5 ENST00000598040
## 6 ENST00000445873
# join ENST with ENSG
Broad_Domains_Gene_ENSG <- inner_join(x = Bivalent_to_Broad_H3K79me2, y = ENST_ENSG, by = c("V2.y" = "hgnc_symbol"))
head(Broad_Domains_Gene_ENSG)## V1.x V2.x.x V3.x V4.x V5.x V6.x V7.x V2.y
## 1 chr1 92945800 92953739 chr1 92940239 92950239 ENST00000483490 GFI1
## 2 chr1 92945800 92953739 chr1 92940239 92950239 ENST00000483490 GFI1
## 3 chr1 92945800 92953739 chr1 92940239 92950239 ENST00000483490 GFI1
## 4 chr1 92945800 92953739 chr1 92940239 92950239 ENST00000483490 GFI1
## 5 chr1 155050571 155058735 chr1 155051384 155061384 ENST00000418360 EFNA3
## 6 chr1 155050571 155058735 chr1 155051384 155061384 ENST00000418360 EFNA3
## V1.y V2.x.y V3.y V4.y V5.y V6.y V7.y V1
## 1 chr1 92946187 92952879 chr1 92948546 92950239 ENST00000483490 chr1
## 2 chr1 92946187 92952879 chr1 92948546 92950239 ENST00000483490 chr1
## 3 chr1 92946187 92952879 chr1 92948546 92950239 ENST00000483490 chr1
## 4 chr1 92946187 92952879 chr1 92948546 92950239 ENST00000483490 chr1
## 5 chr1 155050666 155055687 chr1 155051384 155059382 ENST00000418360 chr1
## 6 chr1 155050666 155055687 chr1 155051384 155059382 ENST00000418360 chr1
## V2.x V3 V4 V5 V6 V7
## 1 92944873 92949496 chr1 92940318 92951628 ENST00000427103
## 2 92944873 92949496 chr1 92940318 92951628 ENST00000427103
## 3 92944873 92949496 chr1 92940318 92951628 ENST00000427103
## 4 92944873 92949496 chr1 92940318 92951628 ENST00000427103
## 5 155036799 155037890 chr1 155036223 155059283 ENST00000505139
## 6 155036799 155037890 chr1 155036223 155059283 ENST00000505139
## ensembl_gene_id ensembl_transcript_id ensembl_gene_id.1
## 1 ENSG00000162676 ENST00000370332 ENSG00000162676
## 2 ENSG00000162676 ENST00000294702 ENSG00000162676
## 3 ENSG00000162676 ENST00000483490 ENSG00000162676
## 4 ENSG00000162676 ENST00000427103 ENSG00000162676
## 5 ENSG00000143590 ENST00000368408 ENSG00000143590
## 6 ENSG00000143590 ENST00000470294 ENSG00000143590
# filter based on ENSG gene name
Broad_Domains_Gene_ENSG <- distinct(Broad_Domains_Gene_ENSG, V2.y, .keep_all = TRUE)
head(Broad_Domains_Gene_ENSG)## V1.x V2.x.x V3.x V4.x V5.x V6.x V7.x
## 1 chr1 92945800 92953739 chr1 92940239 92950239 ENST00000483490
## 2 chr1 155050571 155058735 chr1 155051384 155061384 ENST00000418360
## 3 chr1 245316538 245322060 chr1 245308286 245318286 ENST00000407071
## 4 chr10 102890180 102896655 chr10 102880513 102890513 ENST00000463716
## 5 chr10 102890180 102896655 chr10 102880883 102890883 ENST00000445873
## 6 chr10 124898586 124911252 chr10 124903792 124913792 ENST00000368865
## V2.y V1.y V2.x.y V3.y V4.y V5.y V6.y
## 1 GFI1 chr1 92946187 92952879 chr1 92948546 92950239
## 2 EFNA3 chr1 155050666 155055687 chr1 155051384 155059382
## 3 KIF26B chr1 245316334 245320781 chr1 245318286 245866428
## 4 TLX1 chr10 102880323 102881776 chr10 102880513 102890513
## 5 TLX1NB chr10 102880323 102881776 chr10 102850064 102890883
## 6 BUB3 chr10 124907103 124911113 chr10 124903930 124913930
## V7.y V1 V2.x V3 V4 V5 V6
## 1 ENST00000483490 chr1 92944873 92949496 chr1 92940318 92951628
## 2 ENST00000418360 chr1 155036799 155037890 chr1 155036223 155059283
## 3 ENST00000407071 chr1 245318336 245335692 chr1 245318286 245866428
## 4 ENST00000463716 chr10 102891575 102894110 chr10 102889256 102897545
## 5 ENST00000425505 chr10 102891575 102894110 chr10 102890883 102900883
## 6 ENST00000368858 chr10 124914053 124927135 chr10 124913930 124924881
## V7 ensembl_gene_id ensembl_transcript_id ensembl_gene_id.1
## 1 ENST00000427103 ENSG00000162676 ENST00000370332 ENSG00000162676
## 2 ENST00000505139 ENSG00000143590 ENST00000368408 ENSG00000143590
## 3 ENST00000407071 ENSG00000162849 ENST00000407071 ENSG00000162849
## 4 ENST00000370196 ENSG00000107807 ENST00000370196 ENSG00000107807
## 5 ENST00000445873 ENSG00000236311 ENST00000445873 ENSG00000236311
## 6 ENST00000368858 ENSG00000154473 ENST00000368865 ENSG00000154473
# read in SKCM RNA-seq data - already TPM transformed
SKCM_TPM <- read.table(file = "/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/SKCM_TPM-2.txt", header = TRUE, stringsAsFactors = FALSE,
quote = "")
colnames(SKCM_TPM) <- gsub(pattern = "\\.", replacement = "-", colnames(SKCM_TPM))
colnames(SKCM_TPM) <- gsub(pattern = "TCGA-(.*)-(.*)-(.*)-(.*)-(.*)-(01|06|07)", replacement = "TCGA-\\1-\\2", colnames(SKCM_TPM))
# read in roadmap RNA-seq data
epi_count<- read.table("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/57epigenomes.N.pc.gz", header = T, stringsAsFactors = F, quote = "")
epi_count_sub<- epi_count[, c("gene_id","E059", "E061")]
epi_mat<- as.matrix(epi_count_sub[, -1])
rownames(epi_mat)<- epi_count_sub$gene_id
# change raw counts to TPM
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75
genes_ensemble <- genes(edb)
gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)
gene_length_in_mat<- left_join(data.frame(gene_id = rownames(epi_mat)), gene_length) %>% dplyr::filter(!is.na(width))
epi_mat_sub<- epi_mat[rownames(epi_mat) %in% gene_length_in_mat$gene_id, ]
countToTpm <- function(counts, effLen)
{
rate <- log(counts + 1) - log(effLen)
denom <- log(sum(exp(rate)))
exp(rate - denom + log(1e6))
}
epi_TPM<- apply(epi_mat_sub, 2, countToTpm, effLen = gene_length_in_mat$width)
# combine SKCM and roadmap data
epi_SKCM_TPM<- merge(epi_TPM, SKCM_TPM, by="row.names", all = FALSE)
# read in TCGA subtype information
TCGA_info<- read_tsv("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/TCGA_Melanoma_ID_Metadata_MetastaticTumors_only.txt", col_names = T)
table(TCGA_info$MUTATIONSUBTYPES)##
## - BRAF_Hotspot_Mutants NF1_Any_Mutants
## 6 118 23
## RAS_Hotspot_Mutants Triple_WT
## 81 38
# subset based on mutation -- subtype can be changed or all subtypes can be used
Mut_subtype_info<- TCGA_info %>%
dplyr::filter(MUTATIONSUBTYPES %in% c("RAS_Hotspot_Mutants")) %>%
dplyr::select(Name, MUTATIONSUBTYPES)
# check how many RNAseq samples have mutation subtype info
table(colnames(SKCM_TPM) %in% Mut_subtype_info$Name)##
## FALSE TRUE
## 391 81
# subset the combined df
epi_SKCM_TPM<- epi_SKCM_TPM %>% dplyr::rename(gene_id = Row.names, melanoctye_1 = E059, melanoctye_2 = E061)
sample_idx<- colnames(epi_SKCM_TPM) %in% c("gene_id", "melanoctye_1", "melanoctye_2", Mut_subtype_info$Name)
epi_SKCM_TPM<- epi_SKCM_TPM[, sample_idx]
# Quantile Normalization
epi_SKCM_TPM.matrix <- as.matrix(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- normalize.quantiles(epi_SKCM_TPM.matrix)
rownames(epi_SKCM_TPM.normalized) <- epi_SKCM_TPM$gene_id
colnames(epi_SKCM_TPM.normalized) <- colnames(epi_SKCM_TPM[,-c(1)])
epi_SKCM_TPM.normalized <- as.data.frame(epi_SKCM_TPM.normalized)
epi_SKCM_TPM.normalized <- rownames_to_column(epi_SKCM_TPM.normalized, var = "gene_id")
# subset by the gene list from Broad_Domains_Gene_ENSG -- this includes all subtypes and sharp H3K4me3
bivalent_genes <- readxl::read_xlsx("/TCGA_SKCM_Toshiba/TCGA_SKCM_RNA_Files/TCGA_SKCM_Broad_Sharp_Total_10kbTSS_ENS_BivalentLoss_RAS_237Tumors_Rfile.xlsx", sheet = 1, col_names = T)
head(bivalent_genes)## # A tibble: 6 x 4
## X__1 X__2 X__3 X__4
## <chr> <chr> <chr> <chr>
## 1 ENSG00000162676 GFI1 RAS Broad H3K4me3
## 2 ENSG00000143590 EFNA3 RAS Broad H3K4me3
## 3 ENSG00000162849 KIF26B RAS Broad H3K4me3
## 4 ENSG00000107807 TLX1 RAS Broad H3K4me3
## 5 ENSG00000236311 TLX1NB RAS Broad H3K4me3
## 6 ENSG00000154473 BUB3 RAS Broad H3K4me3
gene_ids <- bivalent_genes$`X__1`
epi_SKCM_TPM_long<- epi_SKCM_TPM.normalized %>%
dplyr::filter(gene_id %in% gene_ids) %>%
gather(sample, TPM, 2: ncol(epi_SKCM_TPM.normalized)) %>%
left_join(Mut_subtype_info, by = c("sample" = "Name")) %>%
left_join(bivalent_genes, by = c("gene_id" = "X__1")) %>%
dplyr::rename(symbol = X__2, gene_type = X__3, bivalent_loss_type = X__4)
epi_SKCM_TPM_long<- epi_SKCM_TPM_long %>%
dplyr::rename(sample_type = MUTATIONSUBTYPES) %>%
mutate(sample_type = case_when(
sample == "melanoctye_1" | sample == "melanoctye_2" ~ "melanoctye",
TRUE ~ sample_type
))
epi_SKCM_TPM_long$sample_type<- factor(epi_SKCM_TPM_long$sample_type, levels = c("melanoctye", "RAS_Hotspot_Mutants", "BRAF_Hotspot_Mutants", "Triple_WT"))ggplot(epi_SKCM_TPM_long, aes(x = sample_type, y = log2(TPM+1))) +
geom_boxplot(aes(col = sample_type)) +
facet_wrap(bivalent_loss_type ~ gene_type) +
theme_bw(base_size = 10) +
theme(axis.text.x = element_text(angle = 90)) +
coord_cartesian(ylim = c(0, 15))Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.