Histone methylation of eRNA in ENCODE mouse data

AIM: To find the HISTONE METHYLATION LANDSCAPE (regardless of cell type) which are overlapping the enhancer regions of DE genes (BAT vs WHITE) so as to find out the general expressivity of the enhancers

## Warning: package 'knitr' was built under R version 3.1.2
library(RCurl)
library(xlsx)
library(ggplot2)
library(dplyr)
library(GenomicRanges)
#reads in relavant excel sheet "Final"
NCBIdata = read.xlsx("../data/lncRNA_enhancer.xlsx", sheetName="Final", endRow=3, header=TRUE)
CUFFdata = read.xlsx("../data/lncRNA_enhancer.xlsx",sheetName="Final", startRow=4, header=TRUE)
CUFFdata = CUFFdata[complete.cases(CUFFdata),]
#Build genomicRanges obj
gr.NCBIdata = with(
NCBIdata, GRanges(seqnames = chromosome, 
ranges = IRanges(Enhancer_start, Enhancer_end), 
geneName = NCBI_id
))
gr.CUFFdata= with(CUFFdata, GRanges(seqnames = chromosome, 
ranges = IRanges(Enhancer_start, Enhancer_end), 
geneName = Gene_ID
))
gr.enhancers = c(gr.NCBIdata, gr.CUFFdata)
## Warning in .Seqinfo.mergexy(x, y): Each of the 2 combined objects has sequence levels not in the other:
##   - in 'x': chr11
##   - in 'y': chr1, chr10, chr2, chr4, chr5, chr6, chr8, chr9
##   Make sure to always combine/compare objects based on the same reference
##   genome (use suppressWarnings() to suppress this warning).
#Download narrowpeak data from encode MOUSE
url       = "http://hgdownload.cse.ucsc.edu/goldenPath/mm9/encodeDCC/wgEncodeCaltechHist/"
mapping   = readLines("../data/files.txt")
fileNames = do.call(c,lapply(mapping, function(x) {
name = strsplit(x, "\t")[[1]][1]
ifelse(grepl(x=name, pattern="narrowPeak"), yes=name, no=NA)
}))
fileNames = fileNames[!is.na(fileNames)]


data = do.call(rbind,lapply(fileNames, function(name){
       temp <- tempfile()
       download.file(paste0(url, name),temp)
       data <- read.table(gzfile(temp))
       unlink(temp)
       data
}))
#Set Column Names
data = setNames(data, c("chrom", "chromStart", "chromEnd", "name", "score", 
                 "strand", "signalValue", "pValue", "qValue", "peak"))
#New column histoneType
data$histoneType = as.character(gsub('^\\S+(H\\d+\\S+me)\\S+$', '\\1', as.character(data$name)))
head(data)
##   chrom chromStart chromEnd                             name score strand
## 1  chr1    4773723  4774709 11907-C2C12-exp-H3K79me3.50mers1     0      .
## 2  chr1    4847972  4851941 11907-C2C12-exp-H3K79me3.50mers2     0      .
## 3  chr1    4852008  4853258 11907-C2C12-exp-H3K79me3.50mers3     0      .
## 4  chr1    4853339  4855298 11907-C2C12-exp-H3K79me3.50mers4     0      .
## 5  chr1    4855367  4856127 11907-C2C12-exp-H3K79me3.50mers5     0      .
## 6  chr1    4856183  4856572 11907-C2C12-exp-H3K79me3.50mers6     0      .
##   signalValue   pValue qValue peak   histoneType
## 1         4.9   6.6576     -1  584 H3K79me3.50me
## 2        31.6  81.3665     -1 1211 H3K79me3.50me
## 3         7.4  94.2840     -1  477 H3K79me3.50me
## 4        11.3 116.3565     -1  726 H3K79me3.50me
## 5         4.1 123.0044     -1  203 H3K79me3.50me
## 6         2.3 125.9208     -1  290 H3K79me3.50me
#build gRanges obj
gr.histone = GRanges(seqnames = data$chrom,
ranges =IRanges(data$chromStart, end = data$chromEnd), 
histone=data$histoneType
)
mtch = findOverlaps(
    gr.histone, 
    gr.enhancers)
mtch
## Hits of length 98
## queryLength: 159316
## subjectLength: 14
##     queryHits subjectHits 
##      <integer>   <integer> 
##  1        1389          11 
##  2        1390          11 
##  3        4087           2 
##  4        4088           2 
##  5        4089           2 
##  ...       ...         ... 
##  94     142289           5 
##  95     142290           5 
##  96     144680           6 
##  97     145368          13 
##  98     145369          13
gr.histone[mtch@queryHits, ]
## GRanges object with 98 ranges and 1 metadata column:
##        seqnames                 ranges strand   |       histone
##           <Rle>              <IRanges>  <Rle>   |   <character>
##    [1]    chr10   [61111603, 61112660]      *   | H3K79me3.50me
##    [2]    chr10   [61113730, 61115072]      *   | H3K79me3.50me
##    [3]    chr12   [77645744, 77648608]      *   | H3K79me3.50me
##    [4]    chr12   [77649223, 77650780]      *   | H3K79me3.50me
##    [5]    chr12   [77650865, 77652613]      *   | H3K79me3.50me
##    ...      ...                    ...    ... ...           ...
##   [94]     chr2 [167514195, 167517370]      *   |  H3K4me3.50me
##   [95]     chr2 [167602838, 167603548]      *   |  H3K4me3.50me
##   [96]     chr5 [136719488, 136720820]      *   |  H3K4me3.50me
##   [97]     chr6 [124364000, 124365133]      *   |  H3K4me3.50me
##   [98]     chr6 [124365205, 124366044]      *   |  H3K4me3.50me
##   -------
##   seqinfo: 20 sequences from an unspecified genome; no seqlengths
sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-apple-darwin13.1.0 (64-bit)
## 
## locale:
## [1] C
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] GenomicRanges_1.18.3 GenomeInfoDb_1.2.4   IRanges_2.0.1       
##  [4] S4Vectors_0.4.0      BiocGenerics_0.12.1  dplyr_0.3.0.2       
##  [7] ggplot2_1.0.0        xlsx_0.5.7           xlsxjars_0.6.1      
## [10] rJava_0.9-6          RCurl_1.95-4.5       bitops_1.0-6        
## [13] knitr_1.8           
## 
## loaded via a namespace (and not attached):
##  [1] DBI_0.3.1        MASS_7.3-35      Rcpp_0.11.3      XVector_0.6.0   
##  [5] assertthat_0.1   codetools_0.2-9  colorspace_1.2-4 digest_0.6.7    
##  [9] evaluate_0.5.5   formatR_1.0      grid_3.1.1       gtable_0.1.2    
## [13] htmltools_0.2.6  magrittr_1.5     munsell_0.4.2    plyr_1.8.1      
## [17] proto_0.3-10     reshape2_1.4.1   rmarkdown_0.4.2  scales_0.2.4    
## [21] stringr_0.6.2    tools_3.1.1