sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] magrittr_1.5    formatR_1.4     tools_3.2.3     htmltools_0.3.5
##  [5] yaml_2.1.13     Rcpp_0.12.7     stringi_1.1.1   rmarkdown_1.0  
##  [9] knitr_1.14      stringr_1.1.0   digest_0.6.10   evaluate_0.9
# ipak function: install and load multiple R packages.
# ipak checks to see if packages are installed, 
#    installs them if they are not, 
#    then load them into the R session.

ipak <- function(pkg){
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
    if (length(new.pkg)) 
        install.packages(new.pkg, dependencies = TRUE)
    sapply(pkg, require, character.only = TRUE)
}

# usage
packages = c('AnnotationHub', 'GenomicRanges', 'rtracklayer')
ipak(packages)
## AnnotationHub GenomicRanges   rtracklayer 
##          TRUE          TRUE          TRUE

Q1:

Use the AnnotationHub package to obtain data on “CpG Islands” in the human genome.

Question: How many islands exists on the autosomes?

[autosomes - any chromosome that is not a sex chromosome]

ah <- AnnotationHub()
## snapshotDate(): 2016-08-15
ah_human_CpG <- query(ah, c("CpG Islands", "hg19"))
# ah_human_CpG
ah_human_CpG_data <- ah_human_CpG[["AH5086"]]
# ah_human_CpG_data

#######################################
# summary info about CpG island dataset
# Used to check output
# summary(width(ah_human_CpG_data))
# seqinfo(ah_human_CpG_data)
# seqlevels(ah_human_CpG_data)
# gaps(ah_human_CpG_data)
#######################################

# reduce data
ah_human_CpG_reduce <- reduce(ah_human_CpG_data)
# ah_human_CpG_reduce

# count number of CpG islands in autochromosome
autosome <- c(paste("chr", 1:22, sep=""))
split_data_by_chr <- split(ah_human_CpG_reduce, seqnames(ah_human_CpG_reduce))
autosome_CpG_data <- split_data_by_chr[autosome]
# seqlevels(autosome_CpG_data)

# CpG Islands on autosome
unlist(autosome_CpG_data)
## GRanges object with 26641 ranges and 0 metadata columns:
##         seqnames               ranges strand
##            <Rle>            <IRanges>  <Rle>
##    chr1     chr1     [ 28736,  29810]      *
##    chr1     chr1     [135125, 135563]      *
##    chr1     chr1     [327791, 328229]      *
##    chr1     chr1     [437152, 438164]      *
##    chr1     chr1     [449274, 450544]      *
##     ...      ...                  ...    ...
##   chr22    chr22 [51135671, 51136118]      *
##   chr22    chr22 [51142803, 51143308]      *
##   chr22    chr22 [51158387, 51160060]      *
##   chr22    chr22 [51169028, 51170019]      *
##   chr22    chr22 [51221773, 51222317]      *
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

Q2:

Question: How many CpG Islands exists on chromosome 4?

# CpG Islands on chr4
autosome_CpG_data[4]
## GRangesList object of length 1:
## $chr4 
## GRanges object with 1031 ranges and 0 metadata columns:
##          seqnames                 ranges strand
##             <Rle>              <IRanges>  <Rle>
##      [1]     chr4       [ 53199,  53672]      *
##      [2]     chr4       [107147, 107898]      *
##      [3]     chr4       [124333, 124841]      *
##      [4]     chr4       [206378, 206892]      *
##      [5]     chr4       [298804, 299312]      *
##      ...      ...                    ...    ...
##   [1027]     chr4 [190939802, 190940591]      *
##   [1028]     chr4 [190942735, 190944898]      *
##   [1029]     chr4 [190959045, 190960011]      *
##   [1030]     chr4 [190962112, 190962689]      *
##   [1031]     chr4 [190986383, 191013609]      *
## 
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

Q3:

Obtain the data for the H3K4me3 histone modification for the H1 cell line from Epigenomics Roadmap, using AnnotationHub. Subset these regions to only keep regions mapped to the autosomes (chromosomes 1 to 22).

Question: How many bases does these regions cover?

ah_H3K4me <- query(ah, c("H3K4me3", "E003"))
ah_H3K4me_data <- ah_H3K4me[["AH29884"]]
# seqinfo(ah_H3K4me_data)
# seqlevels(ah_H3K4me_data)

# subset autosome data
ah_H3K4me_autosome_data <- subset(ah_H3K4me_data, seqnames %in% autosome)
# count base pairs
sum(width(unlist(ah_H3K4me_autosome_data)))
## [1] 41135164

Q4:

H3K27me3 histone modification for the H1 cell line from Epigenomics Roadmap

ah_H3K27me3 <- query(ah, c("H3K27me3", "narrowPeak", "E003"))
# ah_H3K27me3

# retrieve data
ah_H3K27me3_data <- ah_H3K27me3[["AH29892"]]

# summary(width(ah_H3K27me3_data))
# seqlevels(ah_H3K27me3_data)
# seqinfo(ah_H3K27me3_data)
# subset standard chrosome data

ah_H3K27me3_autosome_data <- subset(ah_H3K27me3_data, seqnames %in% autosome)

# calculate mean signalValue
ah_H3K27me3_autosome_data_mean <- mean(ah_H3K27me3_autosome_data$signalValue)
ah_H3K27me3_autosome_data_mean
## [1] 4.770728

Q5:

Bivalent regions are bound by both H3K4me3 and H3K27me3

bivalent_data <- intersect(unlist(ah_H3K4me_autosome_data), unlist(ah_H3K27me3_autosome_data))

sum(width(reduce(bivalent_data)))
## [1] 10289096

Q6:

We will examine the extent to which bivalent regions overlap CpG Islands.

Question: how big a fraction (expressed as a number between 0 and 1) of the bivalent regions, overlap one or more CpG Islands?

CpG_bivalent_data <- findOverlaps(bivalent_data, unlist(autosome_CpG_data))

fraction_bi <- length(unique(queryHits(CpG_bivalent_data)))/length(bivalent_data)

fraction_bi
## [1] 0.5383644

Q7:

How big a fraction (expressed as a number between 0 and 1) of the bases which are part of CpG Islands, are also bivalent marked.

ov_CpG_bivalent <- intersect(bivalent_data, unlist(autosome_CpG_data))

fraction_CpG <- sum(width(reduce(ov_CpG_bivalent)))/sum(width(unlist(autosome_CpG_data)))

fraction_CpG
## [1] 0.241688

Q8:

How many bases are bivalently marked within 10kb of CpG Islands?

# autosome_CpG_data
CpG_10k <- resize(unlist(autosome_CpG_data), 
                  width = 20000 + width(unlist(autosome_CpG_data)), 
                  fix = "center")

CpG_10k_bivalent <- intersect(CpG_10k, bivalent_data)

sum(width(CpG_10k_bivalent))
## [1] 9782086

Q9:

Fraction of CpG

How big a fraction (expressed as a number between 0 and 1) of the human genome is contained in a CpG Island?

# calculate genome size
genome <- ah[["AH5018"]]

autosomes <- paste0("chr", 1:22, sep = "")
genome <- keepSeqlevels(genome, autosomes)

genome_size <- sum(as.numeric(seqlengths(genome)))

sum(as.numeric(width(unlist(autosome_CpG_data))))/genome_size
## [1] 0.007047481

Q10:

Calculate odds ratio from contigency table

# odds ratio
inOut = matrix(0, ncol = 2, nrow = 2)
colnames(inOut) = c("in", "out")
rownames(inOut) = c("in", "out")

# inOut
inOut[1,1] = sum(width(intersect(bivalent_data, 
                                 unlist(autosome_CpG_data),
                                 ignore.strand=TRUE)))
inOut[1,2] = sum(width(setdiff(bivalent_data, 
                               unlist(autosome_CpG_data),
                               ignore.strand=TRUE)))
inOut[2,1] = sum(width(setdiff(unlist(autosome_CpG_data), 
                               bivalent_data, 
                               ignore.strand=TRUE)))
inOut[2,2] = genome_size - sum(inOut)

inOut
##           in        out
## in   4907239    5381857
## out 15396787 2855347403
odd_ratio <- inOut[1,1]*inOut[2,2]/(inOut[1,2]*inOut[2,1])

odd_ratio
## [1] 169.0962