AIM: To find the HISTONE METHYLATION LANDSCAPE (regardless of cell type) which are overlapping the enhancer regions of DE genes (BAT vs WHITE) so as to find out the general expressivity of the enhancers
## Warning: package 'knitr' was built under R version 3.1.2
library(RCurl)
library(xlsx)
library(ggplot2)
library(dplyr)
library(GenomicRanges)
#reads in relavant excel sheet "Final"
NCBIdata = read.xlsx("../data/lncRNA_enhancer.xlsx", sheetName="Final", endRow=3, header=TRUE)
CUFFdata = read.xlsx("../data/lncRNA_enhancer.xlsx",sheetName="Final", startRow=4, header=TRUE)
CUFFdata = CUFFdata[complete.cases(CUFFdata),]
#Build genomicRanges obj
gr.NCBIdata = with(
NCBIdata, GRanges(seqnames = chromosome,
ranges = IRanges(Enhancer_start, Enhancer_end),
geneName = NCBI_id
))
gr.CUFFdata= with(CUFFdata, GRanges(seqnames = chromosome,
ranges = IRanges(Enhancer_start, Enhancer_end),
geneName = Gene_ID
))
gr.enhancers = c(gr.NCBIdata, gr.CUFFdata)
## Warning in .Seqinfo.mergexy(x, y): Each of the 2 combined objects has sequence levels not in the other:
## - in 'x': chr11
## - in 'y': chr1, chr10, chr2, chr4, chr5, chr6, chr8, chr9
## Make sure to always combine/compare objects based on the same reference
## genome (use suppressWarnings() to suppress this warning).
#Download narrowpeak data from encode MOUSE
url = "http://hgdownload.cse.ucsc.edu/goldenPath/mm9/encodeDCC/wgEncodeCaltechHist/"
mapping = readLines("../data/files.txt")
fileNames = do.call(c,lapply(mapping, function(x) {
name = strsplit(x, "\t")[[1]][1]
ifelse(grepl(x=name, pattern="narrowPeak"), yes=name, no=NA)
}))
fileNames = fileNames[!is.na(fileNames)]
data = do.call(rbind,lapply(fileNames, function(name){
temp <- tempfile()
download.file(paste0(url, name),temp)
data <- read.table(gzfile(temp))
unlink(temp)
data
}))
#Set Column Names
data = setNames(data, c("chrom", "chromStart", "chromEnd", "name", "score",
"strand", "signalValue", "pValue", "qValue", "peak"))
#New column histoneType
data$histoneType = as.character(gsub('^\\S+(H\\d+\\S+me)\\S+$', '\\1', as.character(data$name)))
head(data)
## chrom chromStart chromEnd name score strand
## 1 chr1 4773723 4774709 11907-C2C12-exp-H3K79me3.50mers1 0 .
## 2 chr1 4847972 4851941 11907-C2C12-exp-H3K79me3.50mers2 0 .
## 3 chr1 4852008 4853258 11907-C2C12-exp-H3K79me3.50mers3 0 .
## 4 chr1 4853339 4855298 11907-C2C12-exp-H3K79me3.50mers4 0 .
## 5 chr1 4855367 4856127 11907-C2C12-exp-H3K79me3.50mers5 0 .
## 6 chr1 4856183 4856572 11907-C2C12-exp-H3K79me3.50mers6 0 .
## signalValue pValue qValue peak histoneType
## 1 4.9 6.6576 -1 584 H3K79me3.50me
## 2 31.6 81.3665 -1 1211 H3K79me3.50me
## 3 7.4 94.2840 -1 477 H3K79me3.50me
## 4 11.3 116.3565 -1 726 H3K79me3.50me
## 5 4.1 123.0044 -1 203 H3K79me3.50me
## 6 2.3 125.9208 -1 290 H3K79me3.50me
#build gRanges obj
gr.histone = GRanges(seqnames = data$chrom,
ranges =IRanges(data$chromStart, end = data$chromEnd),
histone=data$histoneType
)
mtch = findOverlaps(
gr.histone,
gr.enhancers)
mtch
## Hits of length 98
## queryLength: 159316
## subjectLength: 14
## queryHits subjectHits
## <integer> <integer>
## 1 1389 11
## 2 1390 11
## 3 4087 2
## 4 4088 2
## 5 4089 2
## ... ... ...
## 94 142289 5
## 95 142290 5
## 96 144680 6
## 97 145368 13
## 98 145369 13
gr.histone[mtch@queryHits, ]
## GRanges object with 98 ranges and 1 metadata column:
## seqnames ranges strand | histone
## <Rle> <IRanges> <Rle> | <character>
## [1] chr10 [61111603, 61112660] * | H3K79me3.50me
## [2] chr10 [61113730, 61115072] * | H3K79me3.50me
## [3] chr12 [77645744, 77648608] * | H3K79me3.50me
## [4] chr12 [77649223, 77650780] * | H3K79me3.50me
## [5] chr12 [77650865, 77652613] * | H3K79me3.50me
## ... ... ... ... ... ...
## [94] chr2 [167514195, 167517370] * | H3K4me3.50me
## [95] chr2 [167602838, 167603548] * | H3K4me3.50me
## [96] chr5 [136719488, 136720820] * | H3K4me3.50me
## [97] chr6 [124364000, 124365133] * | H3K4me3.50me
## [98] chr6 [124365205, 124366044] * | H3K4me3.50me
## -------
## seqinfo: 20 sequences from an unspecified genome; no seqlengths
sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-apple-darwin13.1.0 (64-bit)
##
## locale:
## [1] C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] GenomicRanges_1.18.3 GenomeInfoDb_1.2.4 IRanges_2.0.1
## [4] S4Vectors_0.4.0 BiocGenerics_0.12.1 dplyr_0.3.0.2
## [7] ggplot2_1.0.0 xlsx_0.5.7 xlsxjars_0.6.1
## [10] rJava_0.9-6 RCurl_1.95-4.5 bitops_1.0-6
## [13] knitr_1.8
##
## loaded via a namespace (and not attached):
## [1] DBI_0.3.1 MASS_7.3-35 Rcpp_0.11.3 XVector_0.6.0
## [5] assertthat_0.1 codetools_0.2-9 colorspace_1.2-4 digest_0.6.7
## [9] evaluate_0.5.5 formatR_1.0 grid_3.1.1 gtable_0.1.2
## [13] htmltools_0.2.6 magrittr_1.5 munsell_0.4.2 plyr_1.8.1
## [17] proto_0.3-10 reshape2_1.4.1 rmarkdown_0.4.2 scales_0.2.4
## [21] stringr_0.6.2 tools_3.1.1