sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04 LTS
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] magrittr_1.5 formatR_1.4 tools_3.2.3 htmltools_0.3.5
## [5] yaml_2.1.13 Rcpp_0.12.7 stringi_1.1.1 rmarkdown_1.0
## [9] knitr_1.14 stringr_1.1.0 digest_0.6.10 evaluate_0.9
# ipak function: install and load multiple R packages.
# ipak checks to see if packages are installed,
# installs them if they are not,
# then load them into the R session.
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
# usage
packages = c('AnnotationHub', 'GenomicRanges', 'rtracklayer')
ipak(packages)
## AnnotationHub GenomicRanges rtracklayer
## TRUE TRUE TRUE
Q1:
Use the AnnotationHub package to obtain data on “CpG Islands” in the human genome.
Question: How many islands exists on the autosomes?
[autosomes - any chromosome that is not a sex chromosome]
ah <- AnnotationHub()
## snapshotDate(): 2016-08-15
ah_human_CpG <- query(ah, c("CpG Islands", "hg19"))
# ah_human_CpG
ah_human_CpG_data <- ah_human_CpG[["AH5086"]]
# ah_human_CpG_data
#######################################
# summary info about CpG island dataset
# Used to check output
# summary(width(ah_human_CpG_data))
# seqinfo(ah_human_CpG_data)
# seqlevels(ah_human_CpG_data)
# gaps(ah_human_CpG_data)
#######################################
# reduce data
ah_human_CpG_reduce <- reduce(ah_human_CpG_data)
# ah_human_CpG_reduce
# count number of CpG islands in autochromosome
autosome <- c(paste("chr", 1:22, sep=""))
split_data_by_chr <- split(ah_human_CpG_reduce, seqnames(ah_human_CpG_reduce))
autosome_CpG_data <- split_data_by_chr[autosome]
# seqlevels(autosome_CpG_data)
# CpG Islands on autosome
unlist(autosome_CpG_data)
## GRanges object with 26641 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## chr1 chr1 [ 28736, 29810] *
## chr1 chr1 [135125, 135563] *
## chr1 chr1 [327791, 328229] *
## chr1 chr1 [437152, 438164] *
## chr1 chr1 [449274, 450544] *
## ... ... ... ...
## chr22 chr22 [51135671, 51136118] *
## chr22 chr22 [51142803, 51143308] *
## chr22 chr22 [51158387, 51160060] *
## chr22 chr22 [51169028, 51170019] *
## chr22 chr22 [51221773, 51222317] *
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
Q2:
Question: How many CpG Islands exists on chromosome 4?
# CpG Islands on chr4
autosome_CpG_data[4]
## GRangesList object of length 1:
## $chr4
## GRanges object with 1031 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr4 [ 53199, 53672] *
## [2] chr4 [107147, 107898] *
## [3] chr4 [124333, 124841] *
## [4] chr4 [206378, 206892] *
## [5] chr4 [298804, 299312] *
## ... ... ... ...
## [1027] chr4 [190939802, 190940591] *
## [1028] chr4 [190942735, 190944898] *
## [1029] chr4 [190959045, 190960011] *
## [1030] chr4 [190962112, 190962689] *
## [1031] chr4 [190986383, 191013609] *
##
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
Q3:
Obtain the data for the H3K4me3 histone modification for the H1 cell line from Epigenomics Roadmap, using AnnotationHub. Subset these regions to only keep regions mapped to the autosomes (chromosomes 1 to 22).
Question: How many bases does these regions cover?
ah_H3K4me <- query(ah, c("H3K4me3", "E003"))
ah_H3K4me_data <- ah_H3K4me[["AH29884"]]
# seqinfo(ah_H3K4me_data)
# seqlevels(ah_H3K4me_data)
# subset autosome data
ah_H3K4me_autosome_data <- subset(ah_H3K4me_data, seqnames %in% autosome)
# count base pairs
sum(width(unlist(ah_H3K4me_autosome_data)))
## [1] 41135164
Q4:
H3K27me3 histone modification for the H1 cell line from Epigenomics Roadmap
ah_H3K27me3 <- query(ah, c("H3K27me3", "narrowPeak", "E003"))
# ah_H3K27me3
# retrieve data
ah_H3K27me3_data <- ah_H3K27me3[["AH29892"]]
# summary(width(ah_H3K27me3_data))
# seqlevels(ah_H3K27me3_data)
# seqinfo(ah_H3K27me3_data)
# subset standard chrosome data
ah_H3K27me3_autosome_data <- subset(ah_H3K27me3_data, seqnames %in% autosome)
# calculate mean signalValue
ah_H3K27me3_autosome_data_mean <- mean(ah_H3K27me3_autosome_data$signalValue)
ah_H3K27me3_autosome_data_mean
## [1] 4.770728
Q5:
Bivalent regions are bound by both H3K4me3 and H3K27me3
bivalent_data <- intersect(unlist(ah_H3K4me_autosome_data), unlist(ah_H3K27me3_autosome_data))
sum(width(reduce(bivalent_data)))
## [1] 10289096
Q6:
We will examine the extent to which bivalent regions overlap CpG Islands.
Question: how big a fraction (expressed as a number between 0 and 1) of the bivalent regions, overlap one or more CpG Islands?
CpG_bivalent_data <- findOverlaps(bivalent_data, unlist(autosome_CpG_data))
fraction_bi <- length(unique(queryHits(CpG_bivalent_data)))/length(bivalent_data)
fraction_bi
## [1] 0.5383644
Q7:
How big a fraction (expressed as a number between 0 and 1) of the bases which are part of CpG Islands, are also bivalent marked.
ov_CpG_bivalent <- intersect(bivalent_data, unlist(autosome_CpG_data))
fraction_CpG <- sum(width(reduce(ov_CpG_bivalent)))/sum(width(unlist(autosome_CpG_data)))
fraction_CpG
## [1] 0.241688
Q8:
How many bases are bivalently marked within 10kb of CpG Islands?
# autosome_CpG_data
CpG_10k <- resize(unlist(autosome_CpG_data),
width = 20000 + width(unlist(autosome_CpG_data)),
fix = "center")
CpG_10k_bivalent <- intersect(CpG_10k, bivalent_data)
sum(width(CpG_10k_bivalent))
## [1] 9782086
Q9:
Fraction of CpG
How big a fraction (expressed as a number between 0 and 1) of the human genome is contained in a CpG Island?
# calculate genome size
genome <- ah[["AH5018"]]
autosomes <- paste0("chr", 1:22, sep = "")
genome <- keepSeqlevels(genome, autosomes)
genome_size <- sum(as.numeric(seqlengths(genome)))
sum(as.numeric(width(unlist(autosome_CpG_data))))/genome_size
## [1] 0.007047481
Q10:
Calculate odds ratio from contigency table
# odds ratio
inOut = matrix(0, ncol = 2, nrow = 2)
colnames(inOut) = c("in", "out")
rownames(inOut) = c("in", "out")
# inOut
inOut[1,1] = sum(width(intersect(bivalent_data,
unlist(autosome_CpG_data),
ignore.strand=TRUE)))
inOut[1,2] = sum(width(setdiff(bivalent_data,
unlist(autosome_CpG_data),
ignore.strand=TRUE)))
inOut[2,1] = sum(width(setdiff(unlist(autosome_CpG_data),
bivalent_data,
ignore.strand=TRUE)))
inOut[2,2] = genome_size - sum(inOut)
inOut
## in out
## in 4907239 5381857
## out 15396787 2855347403
odd_ratio <- inOut[1,1]*inOut[2,2]/(inOut[1,2]*inOut[2,1])
odd_ratio
## [1] 169.0962