# source("http://www.bioconductor.org/biocLite.R")
# biocLite("rtracklayer")

library(GenomicRanges)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unlist, unsplit
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library(rtracklayer)
library(AnnotationHub)

Overview

We are going to use AnnotationHub and GenomicRanges to access ENCODE data on the H3K4me3 histone modification in a specific cell line. This histone modification is believed to mark active promoters, and we are going to attempt to verify this statement. This involves

Getting the ENCODE histone data using AnnotationHub. Getting promoters using AnnotationHub. Comparing the histone data and promoters using findOverlaps in GenomicRanges.

# First we use AnnotationHub to get data on homo sapiens.

ah <- AnnotationHub()
## snapshotDate(): 2016-08-15
ah <- subset(ah, species == "Homo sapiens")
ah # This provides 50,427 files
## AnnotationHub with 24352 records
## # snapshotDate(): 2016-08-15 
## # $dataprovider: BroadInstitute, UCSC, Ensembl, Gencode, NIH Pathway In...
## # $species: Homo sapiens
## # $rdataclass: GRanges, BigWigFile, ChainFile, FaFile, data.frame, VcfF...
## # additional mcols(): taxonomyid, genome, description, tags,
## #   sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH133"]]' 
## 
##             title                                         
##   AH133   | Homo_sapiens.GRCh37.69.cdna.all.fa            
##   AH134   | Homo_sapiens.GRCh37.69.dna.toplevel.fa        
##   AH135   | Homo_sapiens.GRCh37.69.dna_rm.toplevel.fa     
##   AH136   | Homo_sapiens.GRCh37.69.dna_sm.toplevel.fa     
##   AH137   | Homo_sapiens.GRCh37.69.ncrna.fa               
##   ...       ...                                           
##   AH50423 | common_no_known_medical_impact_20160203.vcf.gz
##   AH50424 | clinvar_20160203.vcf.gz                       
##   AH50425 | clinvar_20160203_papu.vcf.gz                  
##   AH50426 | common_and_clinical_20160203.vcf.gz           
##   AH50427 | common_no_known_medical_impact_20160203.vcf.gz

Next we search for two keywords: H3K4me3 and Gm12878 which is the name of the cell line we are interested in.

qhs <- query(ah, "H3K4me3")
qhs <- query(qhs, "Gm12878")
qhs # Provides 11 records
## AnnotationHub with 11 records
## # snapshotDate(): 2016-08-15 
## # $dataprovider: BroadInstitute, UCSC
## # $species: Homo sapiens
## # $rdataclass: GRanges, BigWigFile
## # additional mcols(): taxonomyid, genome, description, tags,
## #   sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH23256"]]' 
## 
##             title                                                      
##   AH23256 | wgEncodeBroadHistoneGm12878H3k4me3StdPk.broadPeak.gz       
##   AH27075 | wgEncodeUwHistoneGm12878H3k4me3StdHotspotsRep1.broadPeak.gz
##   AH27076 | wgEncodeUwHistoneGm12878H3k4me3StdHotspotsRep2.broadPeak.gz
##   AH27077 | wgEncodeUwHistoneGm12878H3k4me3StdPkRep1.narrowPeak.gz     
##   AH27078 | wgEncodeUwHistoneGm12878H3k4me3StdPkRep2.narrowPeak.gz     
##   ...       ...                                                        
##   AH30747 | E116-H3K4me3.narrowPeak.gz                                 
##   AH31690 | E116-H3K4me3.gappedPeak.gz                                 
##   AH32869 | E116-H3K4me3.fc.signal.bigwig                              
##   AH33901 | E116-H3K4me3.pval.signal.bigwig                            
##   AH40294 | E116-H3K4me3.imputed.pval.signal.bigwig
qhs$title # Shows the titles of the 11 docs
##  [1] "wgEncodeBroadHistoneGm12878H3k4me3StdPk.broadPeak.gz"       
##  [2] "wgEncodeUwHistoneGm12878H3k4me3StdHotspotsRep1.broadPeak.gz"
##  [3] "wgEncodeUwHistoneGm12878H3k4me3StdHotspotsRep2.broadPeak.gz"
##  [4] "wgEncodeUwHistoneGm12878H3k4me3StdPkRep1.narrowPeak.gz"     
##  [5] "wgEncodeUwHistoneGm12878H3k4me3StdPkRep2.narrowPeak.gz"     
##  [6] "E116-H3K4me3.broadPeak.gz"                                  
##  [7] "E116-H3K4me3.narrowPeak.gz"                                 
##  [8] "E116-H3K4me3.gappedPeak.gz"                                 
##  [9] "E116-H3K4me3.fc.signal.bigwig"                              
## [10] "E116-H3K4me3.pval.signal.bigwig"                            
## [11] "E116-H3K4me3.imputed.pval.signal.bigwig"
qhs$dataprovider # This just tells us where the titles came from, UCSC and the Broad
##  [1] "UCSC"           "UCSC"           "UCSC"           "UCSC"          
##  [5] "UCSC"           "BroadInstitute" "BroadInstitute" "BroadInstitute"
##  [9] "BroadInstitute" "BroadInstitute" "BroadInstitute"

This result is a great illustration of the mess of public data. It turns our that E116 is a Roadmap Epigenetics code for the Gm12878 cell line. The first 5 hits are from ENCODE, hosted at UCSC and the last 6 hits are from Roadmap Epigenomics hosted at the Broad Institute. The Roadmap data is different representation (and peaks) from the same underlying data. For the ENCODE data, two different centers did the same experiment in the same cell line (Broad, hit 1) and (Uw, hit 2-5), where Uw exposed data on two replicates (whatever that means). These two experiments seems to be analyzed using different algorithms. It is even possible that the Roadmap data is from the same raw data but just analyzed using different algorithms.

Lets take a look at the narrowPeak data:

gr1 <- subset(qhs, title == "wgEncodeUwHistoneGm12878H3k4me3StdPkRep1.narrowPeak.gz")[[1]]
## loading from cache '/home/mcc/.AnnotationHub/32505'
gr1
## GRanges object with 74470 ranges and 6 metadata columns:
##           seqnames                 ranges strand   |        name     score
##              <Rle>              <IRanges>  <Rle>   | <character> <numeric>
##       [1]     chr1       [713301, 713450]      *   |        <NA>         0
##       [2]     chr1       [713501, 713650]      *   |        <NA>         0
##       [3]     chr1       [713881, 714030]      *   |        <NA>         0
##       [4]     chr1       [714181, 714330]      *   |        <NA>         0
##       [5]     chr1       [714481, 714630]      *   |        <NA>         0
##       ...      ...                    ...    ... ...         ...       ...
##   [74466]     chrX [154492741, 154492890]      *   |        <NA>         0
##   [74467]     chrX [154493401, 154493550]      *   |        <NA>         0
##   [74468]     chrX [154560441, 154560590]      *   |        <NA>         0
##   [74469]     chrX [154562121, 154562270]      *   |        <NA>         0
##   [74470]     chrX [154842061, 154842210]      *   |        <NA>         0
##           signalValue    pValue    qValue      peak
##             <numeric> <numeric> <numeric> <numeric>
##       [1]          91  112.7680        -1        -1
##       [2]          25   26.7181        -1        -1
##       [3]          81   77.4798        -1        -1
##       [4]          32  106.5650        -1        -1
##       [5]         122  153.8320        -1        -1
##       ...         ...       ...       ...       ...
##   [74466]          43  52.20930        -1        -1
##   [74467]         122 203.16800        -1        -1
##   [74468]           8   4.49236        -1        -1
##   [74469]           8   4.41978        -1        -1
##   [74470]         125 170.20100        -1        -1
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
# Let's look at the second narrow peek
gr2 <- subset(qhs, title == "E116-H3K4me3.narrowPeak.gz")[[1]]
gr2
## GRanges object with 76188 ranges and 6 metadata columns:
##           seqnames                 ranges strand   |        name     score
##              <Rle>              <IRanges>  <Rle>   | <character> <numeric>
##       [1]     chr9 [123553464, 123557122]      *   |      Rank_1       623
##       [2]     chr3 [ 53196213,  53197995]      *   |      Rank_2       622
##       [3]    chr18 [  9137534,   9142676]      *   |      Rank_3       583
##       [4]    chr11 [ 75110593,  75111943]      *   |      Rank_4       548
##       [5]    chr13 [ 41343776,  41345943]      *   |      Rank_5       543
##       ...      ...                    ...    ... ...         ...       ...
##   [76184]     chr8 [ 98881411,  98881613]      *   |  Rank_76184        20
##   [76185]     chr9 [ 26812360,  26812533]      *   |  Rank_76185        20
##   [76186]     chrX [ 49019816,  49020016]      *   |  Rank_76186        20
##   [76187]     chrX [ 55932872,  55933045]      *   |  Rank_76187        20
##   [76188]     chr5 [118322235, 118322408]      *   |  Rank_76188        20
##           signalValue    pValue    qValue      peak
##             <numeric> <numeric> <numeric> <numeric>
##       [1]    23.09216  62.34132  52.85911      1736
##       [2]    24.91976  62.22835  52.85911       439
##       [3]    22.48938  58.39180  50.67302       442
##       [4]    22.47056  54.84914  47.77176       752
##       [5]    20.82804  54.31837  47.29083       909
##       ...         ...       ...       ...       ...
##   [76184]     2.63471   2.02539   0.27458        94
##   [76185]     2.63471   2.02539   0.27458        32
##   [76186]     2.63471   2.02539   0.27458        82
##   [76187]     2.63471   2.02539   0.27458        41
##   [76188]     2.62206   2.00970   0.26790        52
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome