# source("http://www.bioconductor.org/biocLite.R")
# biocLite("rtracklayer")
library(GenomicRanges)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unlist, unsplit
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library(rtracklayer)
library(AnnotationHub)
Overview
We are going to use AnnotationHub and GenomicRanges to access ENCODE data on the H3K4me3 histone modification in a specific cell line. This histone modification is believed to mark active promoters, and we are going to attempt to verify this statement. This involves
Getting the ENCODE histone data using AnnotationHub. Getting promoters using AnnotationHub. Comparing the histone data and promoters using findOverlaps in GenomicRanges.
# First we use AnnotationHub to get data on homo sapiens.
ah <- AnnotationHub()
## snapshotDate(): 2016-08-15
ah <- subset(ah, species == "Homo sapiens")
ah # This provides 50,427 files
## AnnotationHub with 24352 records
## # snapshotDate(): 2016-08-15
## # $dataprovider: BroadInstitute, UCSC, Ensembl, Gencode, NIH Pathway In...
## # $species: Homo sapiens
## # $rdataclass: GRanges, BigWigFile, ChainFile, FaFile, data.frame, VcfF...
## # additional mcols(): taxonomyid, genome, description, tags,
## # sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH133"]]'
##
## title
## AH133 | Homo_sapiens.GRCh37.69.cdna.all.fa
## AH134 | Homo_sapiens.GRCh37.69.dna.toplevel.fa
## AH135 | Homo_sapiens.GRCh37.69.dna_rm.toplevel.fa
## AH136 | Homo_sapiens.GRCh37.69.dna_sm.toplevel.fa
## AH137 | Homo_sapiens.GRCh37.69.ncrna.fa
## ... ...
## AH50423 | common_no_known_medical_impact_20160203.vcf.gz
## AH50424 | clinvar_20160203.vcf.gz
## AH50425 | clinvar_20160203_papu.vcf.gz
## AH50426 | common_and_clinical_20160203.vcf.gz
## AH50427 | common_no_known_medical_impact_20160203.vcf.gz
Next we search for two keywords: H3K4me3 and Gm12878 which is the name of the cell line we are interested in.
qhs <- query(ah, "H3K4me3")
qhs <- query(qhs, "Gm12878")
qhs # Provides 11 records
## AnnotationHub with 11 records
## # snapshotDate(): 2016-08-15
## # $dataprovider: BroadInstitute, UCSC
## # $species: Homo sapiens
## # $rdataclass: GRanges, BigWigFile
## # additional mcols(): taxonomyid, genome, description, tags,
## # sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH23256"]]'
##
## title
## AH23256 | wgEncodeBroadHistoneGm12878H3k4me3StdPk.broadPeak.gz
## AH27075 | wgEncodeUwHistoneGm12878H3k4me3StdHotspotsRep1.broadPeak.gz
## AH27076 | wgEncodeUwHistoneGm12878H3k4me3StdHotspotsRep2.broadPeak.gz
## AH27077 | wgEncodeUwHistoneGm12878H3k4me3StdPkRep1.narrowPeak.gz
## AH27078 | wgEncodeUwHistoneGm12878H3k4me3StdPkRep2.narrowPeak.gz
## ... ...
## AH30747 | E116-H3K4me3.narrowPeak.gz
## AH31690 | E116-H3K4me3.gappedPeak.gz
## AH32869 | E116-H3K4me3.fc.signal.bigwig
## AH33901 | E116-H3K4me3.pval.signal.bigwig
## AH40294 | E116-H3K4me3.imputed.pval.signal.bigwig
qhs$title # Shows the titles of the 11 docs
## [1] "wgEncodeBroadHistoneGm12878H3k4me3StdPk.broadPeak.gz"
## [2] "wgEncodeUwHistoneGm12878H3k4me3StdHotspotsRep1.broadPeak.gz"
## [3] "wgEncodeUwHistoneGm12878H3k4me3StdHotspotsRep2.broadPeak.gz"
## [4] "wgEncodeUwHistoneGm12878H3k4me3StdPkRep1.narrowPeak.gz"
## [5] "wgEncodeUwHistoneGm12878H3k4me3StdPkRep2.narrowPeak.gz"
## [6] "E116-H3K4me3.broadPeak.gz"
## [7] "E116-H3K4me3.narrowPeak.gz"
## [8] "E116-H3K4me3.gappedPeak.gz"
## [9] "E116-H3K4me3.fc.signal.bigwig"
## [10] "E116-H3K4me3.pval.signal.bigwig"
## [11] "E116-H3K4me3.imputed.pval.signal.bigwig"
qhs$dataprovider # This just tells us where the titles came from, UCSC and the Broad
## [1] "UCSC" "UCSC" "UCSC" "UCSC"
## [5] "UCSC" "BroadInstitute" "BroadInstitute" "BroadInstitute"
## [9] "BroadInstitute" "BroadInstitute" "BroadInstitute"
This result is a great illustration of the mess of public data. It turns our that E116 is a Roadmap Epigenetics code for the Gm12878 cell line. The first 5 hits are from ENCODE, hosted at UCSC and the last 6 hits are from Roadmap Epigenomics hosted at the Broad Institute. The Roadmap data is different representation (and peaks) from the same underlying data. For the ENCODE data, two different centers did the same experiment in the same cell line (Broad, hit 1) and (Uw, hit 2-5), where Uw exposed data on two replicates (whatever that means). These two experiments seems to be analyzed using different algorithms. It is even possible that the Roadmap data is from the same raw data but just analyzed using different algorithms.
Lets take a look at the narrowPeak data:
gr1 <- subset(qhs, title == "wgEncodeUwHistoneGm12878H3k4me3StdPkRep1.narrowPeak.gz")[[1]]
## loading from cache '/home/mcc/.AnnotationHub/32505'
gr1
## GRanges object with 74470 ranges and 6 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## [1] chr1 [713301, 713450] * | <NA> 0
## [2] chr1 [713501, 713650] * | <NA> 0
## [3] chr1 [713881, 714030] * | <NA> 0
## [4] chr1 [714181, 714330] * | <NA> 0
## [5] chr1 [714481, 714630] * | <NA> 0
## ... ... ... ... ... ... ...
## [74466] chrX [154492741, 154492890] * | <NA> 0
## [74467] chrX [154493401, 154493550] * | <NA> 0
## [74468] chrX [154560441, 154560590] * | <NA> 0
## [74469] chrX [154562121, 154562270] * | <NA> 0
## [74470] chrX [154842061, 154842210] * | <NA> 0
## signalValue pValue qValue peak
## <numeric> <numeric> <numeric> <numeric>
## [1] 91 112.7680 -1 -1
## [2] 25 26.7181 -1 -1
## [3] 81 77.4798 -1 -1
## [4] 32 106.5650 -1 -1
## [5] 122 153.8320 -1 -1
## ... ... ... ... ...
## [74466] 43 52.20930 -1 -1
## [74467] 122 203.16800 -1 -1
## [74468] 8 4.49236 -1 -1
## [74469] 8 4.41978 -1 -1
## [74470] 125 170.20100 -1 -1
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
# Let's look at the second narrow peek
gr2 <- subset(qhs, title == "E116-H3K4me3.narrowPeak.gz")[[1]]
gr2
## GRanges object with 76188 ranges and 6 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## [1] chr9 [123553464, 123557122] * | Rank_1 623
## [2] chr3 [ 53196213, 53197995] * | Rank_2 622
## [3] chr18 [ 9137534, 9142676] * | Rank_3 583
## [4] chr11 [ 75110593, 75111943] * | Rank_4 548
## [5] chr13 [ 41343776, 41345943] * | Rank_5 543
## ... ... ... ... ... ... ...
## [76184] chr8 [ 98881411, 98881613] * | Rank_76184 20
## [76185] chr9 [ 26812360, 26812533] * | Rank_76185 20
## [76186] chrX [ 49019816, 49020016] * | Rank_76186 20
## [76187] chrX [ 55932872, 55933045] * | Rank_76187 20
## [76188] chr5 [118322235, 118322408] * | Rank_76188 20
## signalValue pValue qValue peak
## <numeric> <numeric> <numeric> <numeric>
## [1] 23.09216 62.34132 52.85911 1736
## [2] 24.91976 62.22835 52.85911 439
## [3] 22.48938 58.39180 50.67302 442
## [4] 22.47056 54.84914 47.77176 752
## [5] 20.82804 54.31837 47.29083 909
## ... ... ... ... ...
## [76184] 2.63471 2.02539 0.27458 94
## [76185] 2.63471 2.02539 0.27458 32
## [76186] 2.63471 2.02539 0.27458 82
## [76187] 2.63471 2.02539 0.27458 41
## [76188] 2.62206 2.00970 0.26790 52
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome