HUMAN GENOME IN R

Question 2.9.1 Reference Genome Discovery

How many Bioconductor packages provide reference genomic sequence for zebrafish? Exclude the packages with suffix .masked, that we will discuss later.:

library(BSgenome)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: S4Vectors
## Loading required package: stats4
## Creating a generic function for 'nchar' from package 'base' in package 'S4Vectors'
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: Biostrings
## Loading required package: XVector
## Loading required package: rtracklayer
library(Biostrings)
available.genomes()
##  [1] "BSgenome.Alyrata.JGI.v1"                  
##  [2] "BSgenome.Amellifera.BeeBase.assembly4"    
##  [3] "BSgenome.Amellifera.UCSC.apiMel2"         
##  [4] "BSgenome.Amellifera.UCSC.apiMel2.masked"  
##  [5] "BSgenome.Athaliana.TAIR.04232008"         
##  [6] "BSgenome.Athaliana.TAIR.TAIR9"            
##  [7] "BSgenome.Btaurus.UCSC.bosTau3"            
##  [8] "BSgenome.Btaurus.UCSC.bosTau3.masked"     
##  [9] "BSgenome.Btaurus.UCSC.bosTau4"            
## [10] "BSgenome.Btaurus.UCSC.bosTau4.masked"     
## [11] "BSgenome.Btaurus.UCSC.bosTau6"            
## [12] "BSgenome.Btaurus.UCSC.bosTau6.masked"     
## [13] "BSgenome.Btaurus.UCSC.bosTau8"            
## [14] "BSgenome.Celegans.UCSC.ce10"              
## [15] "BSgenome.Celegans.UCSC.ce2"               
## [16] "BSgenome.Celegans.UCSC.ce6"               
## [17] "BSgenome.Cfamiliaris.UCSC.canFam2"        
## [18] "BSgenome.Cfamiliaris.UCSC.canFam2.masked" 
## [19] "BSgenome.Cfamiliaris.UCSC.canFam3"        
## [20] "BSgenome.Cfamiliaris.UCSC.canFam3.masked" 
## [21] "BSgenome.Dmelanogaster.UCSC.dm2"          
## [22] "BSgenome.Dmelanogaster.UCSC.dm2.masked"   
## [23] "BSgenome.Dmelanogaster.UCSC.dm3"          
## [24] "BSgenome.Dmelanogaster.UCSC.dm3.masked"   
## [25] "BSgenome.Dmelanogaster.UCSC.dm6"          
## [26] "BSgenome.Drerio.UCSC.danRer5"             
## [27] "BSgenome.Drerio.UCSC.danRer5.masked"      
## [28] "BSgenome.Drerio.UCSC.danRer6"             
## [29] "BSgenome.Drerio.UCSC.danRer6.masked"      
## [30] "BSgenome.Drerio.UCSC.danRer7"             
## [31] "BSgenome.Drerio.UCSC.danRer7.masked"      
## [32] "BSgenome.Ecoli.NCBI.20080805"             
## [33] "BSgenome.Gaculeatus.UCSC.gasAcu1"         
## [34] "BSgenome.Gaculeatus.UCSC.gasAcu1.masked"  
## [35] "BSgenome.Ggallus.UCSC.galGal3"            
## [36] "BSgenome.Ggallus.UCSC.galGal3.masked"     
## [37] "BSgenome.Ggallus.UCSC.galGal4"            
## [38] "BSgenome.Ggallus.UCSC.galGal4.masked"     
## [39] "BSgenome.Hsapiens.NCBI.GRCh38"            
## [40] "BSgenome.Hsapiens.UCSC.hg17"              
## [41] "BSgenome.Hsapiens.UCSC.hg17.masked"       
## [42] "BSgenome.Hsapiens.UCSC.hg18"              
## [43] "BSgenome.Hsapiens.UCSC.hg18.masked"       
## [44] "BSgenome.Hsapiens.UCSC.hg19"              
## [45] "BSgenome.Hsapiens.UCSC.hg19.masked"       
## [46] "BSgenome.Hsapiens.UCSC.hg38"              
## [47] "BSgenome.Hsapiens.UCSC.hg38.masked"       
## [48] "BSgenome.Mfascicularis.NCBI.5.0"          
## [49] "BSgenome.Mfuro.UCSC.musFur1"              
## [50] "BSgenome.Mmulatta.UCSC.rheMac2"           
## [51] "BSgenome.Mmulatta.UCSC.rheMac2.masked"    
## [52] "BSgenome.Mmulatta.UCSC.rheMac3"           
## [53] "BSgenome.Mmulatta.UCSC.rheMac3.masked"    
## [54] "BSgenome.Mmusculus.UCSC.mm10"             
## [55] "BSgenome.Mmusculus.UCSC.mm10.masked"      
## [56] "BSgenome.Mmusculus.UCSC.mm8"              
## [57] "BSgenome.Mmusculus.UCSC.mm8.masked"       
## [58] "BSgenome.Mmusculus.UCSC.mm9"              
## [59] "BSgenome.Mmusculus.UCSC.mm9.masked"       
## [60] "BSgenome.Osativa.MSU.MSU7"                
## [61] "BSgenome.Ptroglodytes.UCSC.panTro2"       
## [62] "BSgenome.Ptroglodytes.UCSC.panTro2.masked"
## [63] "BSgenome.Ptroglodytes.UCSC.panTro3"       
## [64] "BSgenome.Ptroglodytes.UCSC.panTro3.masked"
## [65] "BSgenome.Rnorvegicus.UCSC.rn4"            
## [66] "BSgenome.Rnorvegicus.UCSC.rn4.masked"     
## [67] "BSgenome.Rnorvegicus.UCSC.rn5"            
## [68] "BSgenome.Rnorvegicus.UCSC.rn5.masked"     
## [69] "BSgenome.Rnorvegicus.UCSC.rn6"            
## [70] "BSgenome.Scerevisiae.UCSC.sacCer1"        
## [71] "BSgenome.Scerevisiae.UCSC.sacCer2"        
## [72] "BSgenome.Scerevisiae.UCSC.sacCer3"        
## [73] "BSgenome.Sscrofa.UCSC.susScr3"            
## [74] "BSgenome.Sscrofa.UCSC.susScr3.masked"     
## [75] "BSgenome.Tgondii.ToxoDB.7.0"              
## [76] "BSgenome.Tguttata.UCSC.taeGut1"           
## [77] "BSgenome.Tguttata.UCSC.taeGut1.masked"
grep("Drerio", available.genomes(), value=TRUE) # exclude masked
## [1] "BSgenome.Drerio.UCSC.danRer5"       
## [2] "BSgenome.Drerio.UCSC.danRer5.masked"
## [3] "BSgenome.Drerio.UCSC.danRer6"       
## [4] "BSgenome.Drerio.UCSC.danRer6.masked"
## [5] "BSgenome.Drerio.UCSC.danRer7"       
## [6] "BSgenome.Drerio.UCSC.danRer7.masked"

Question 2.9.2 Masking Structures for Genome gaps and repetitions.

We have noted that the reference genome builds for complex organisms are works in progress. Genomic sequence “mask” structures have been defined to isolate ambiguous, unmappable, and low-complexity segments of genomes so that sequence analysis research can be targeted to reflect current knowledge of sequence regions that are more likely to be functionally informative.

Obtain BSgenome.Hsapiens.UCSC.hg19.masked (it is only a 20MB transfer.)

library(BiocInstaller)
biocLite("BSgenome.Hsapiens.UCSC.hg19.masked")

Then run the command

library(BSgenome.Hsapiens.UCSC.hg19.masked)
## Loading required package: BSgenome.Hsapiens.UCSC.hg19
c17m = BSgenome.Hsapiens.UCSC.hg19.masked$chr17

what is the class of c17m?

class(c17m)
## [1] "MaskedDNAString"
## attr(,"package")
## [1] "Biostrings"

2.9.3 Quantifying assembly gaps

When we print out the value of a MaskedDNAString c17m we get a report on types of mask present. Part of the report for chromosome 17 in hg19 is:

c17m
##   81195210-letter "MaskedDNAString" instance (# for masking)
## seq: AAGCTTCTCACCCTGTTCCTGCATAGATAATTGC...GGTGTGGGTGTGGTGTGTGGGTGTGGGTGTGGT
## masks:
##   maskedwidth maskedratio active names                               desc
## 1     3400000  0.04187439   TRUE AGAPS                      assembly gaps
## 2           0  0.00000000   TRUE   AMB   intra-contig ambiguities (empty)
## 3    38103987  0.46928861  FALSE    RM                       RepeatMasker
## 4      665087  0.00819121  FALSE   TRF Tandem Repeats Finder [period<=12]
## all masks together:
##   maskedwidth maskedratio
##      41543806   0.5116534
## all active masks together:
##   maskedwidth maskedratio
##       3400000  0.04187439

In build hg19, what percentage of the length of chromosome 22 is occupied by “assembly gaps”? Reply with an integer between 0 and 100.

c22m = BSgenome.Hsapiens.UCSC.hg19.masked$chr22
c22m
##   51304566-letter "MaskedDNAString" instance (# for masking)
## seq: ##################################...#################################
## masks:
##   maskedwidth  maskedratio active names                               desc
## 1    16410000 3.198546e-01   TRUE AGAPS                      assembly gaps
## 2          21 4.093203e-07   TRUE   AMB           intra-contig ambiguities
## 3    17230953 3.358561e-01  FALSE    RM                       RepeatMasker
## 4      377637 7.360690e-03  FALSE   TRF Tandem Repeats Finder [period<=12]
## all masks together:
##   maskedwidth maskedratio
##      33664642   0.6561724
## all active masks together:
##   maskedwidth maskedratio
##      16410021    0.319855
round(100*sum(width(masks(c22m)$AGAPS))/length(c22m),0)
## [1] 32

CONVERTING ACROSS VERSIONS OF THE HUMAN GENOME TEXT

Question 2.10.1 Result of a liftOver

Obtain the hg19 to hg38 liftover chain file, and decompress:

# Or do it in R
download.file("http://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz",
  "hg19ToHg38.over.chain.gz")
library(R.utils)
gunzip("hg19ToHg38.over.chain.gz")

We will use it to convert the HepG2 binding address to hg38.

library(ERBS)
data(HepG2)
library(rtracklayer)
ch = import.chain("hg19ToHg38.over.chain") 
nHepG2 = liftOver(HepG2, ch)

How far, in bases, has the first range in HepG2(ch2,20335378 is starting address) moved upstream in the hg38 build?:

start(HepG2[1]) - start(nHepG2[1])
## IntegerList of length 1
## [["1"]] 199761

Question 2.11.1 Parsing a gene model display

Install a recent version of the ph525x package.

library(ph525x)
## Loading required package: png
## Loading required package: grid
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

Use the modPlot function to visualize a model of ESR1, the estrogen receptor 1 gene.

modPlot("ESR1", useGeneSym = FALSE, collapse = FALSE)
## Loading required package: Homo.sapiens
## Loading required package: AnnotationDbi
## Loading required package: OrganismDbi
## Loading required package: GenomicFeatures
## Loading required package: GO.db
## Loading required package: DBI
## 
## Loading required package: org.Hs.eg.db
## 
## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene
## Loading required package: Gviz

Question 2.11.2

What are the number of transcripts comprising the model for ESR1?

Count the number of linear structures

Question 2.11.3 Programmatic Enumeration of Transcripts

What is the output of

library(Homo.sapiens)
length(transcriptsBy(Homo.sapiens,by="gene")$"2099")
## [1] 27

This may seem mysterious. We will learn about the relationship between integer codes (2099 is the NCBI Entrez Gene ID for ESR1) and “ESR1” later in this unit.

Question 2.12.1 MIRNA TARGET SITES: PRE-GRANGES

Bioconductor’s rtracklayer package supports import and export of files in common genomic data formats. The package includes a demonstration dataset of microRNA target sites.

library(rtracklayer)
data(targets)

What is the class of targets?

class(targets)
## [1] "data.frame"

Question 2.11.2 CHECKING ESSENTIAL METADATA

To what referene bild do the chromosome, start and end values in targets refer?

Unfortunately it is not typical to add metadata to data.frame instances, as one has to fill a column with relevant information

Question 2.12.3 GRANGES TO BED

We can create GRanges instance from the targets data frame as follows:

library(GenomicRanges)
mtar = with(targets,
            GRanges(chrom, IRanges(start,end), strand=strand,
            targets=target, mirname=name))

You can glimpse of exported versions of this data with

cat(export(mtar[1:5],format = "bed"), sep = "\n")
## chr12    8985196 8985217 .   0   -
## chr7 117095439   117095461   .   0   +
## chr17    23750063    23750088    .   0   +
## chr7 27187934    27187957    .   0   -
## chr17    43458622    43458643    .   0   -
cat("\n")
cat(export(mtar[1:5], format = "gff3"), sep = "\n")
## ##gff-version 3
## ##source-version rtracklayer 1.28.9
## ##date 2015-09-03
## chr12    rtracklayer sequence_feature    8985197 8985217 .   -   .   targets=ENST00000000412; mirname=hsa-miR-16
## chr7 rtracklayer sequence_feature    117095440   117095461   .   +   .   targets=ENST00000003084; mirname=hsa-miR-509-3p
## chr17    rtracklayer sequence_feature    23750064    23750088    .   +   .   targets=ENST00000003834; mirname=hsa-miR-612
## chr7 rtracklayer sequence_feature    27187935    27187957    .   -   .   targets=ENST00000006015; mirname=hsa-miR-423-3p
## chr17    rtracklayer sequence_feature    43458623    43458643    .   -   .   targets=ENST00000006101; mirname=hsa-miR-125b

How can metadata about the data origin and reference build be encoded in the bed export?

> See the specification for bed and other formats at UCSC

Question 2.13.1 Hub exploration via Query()

Use commands

library(AnnotationHub)
ah = AnnotationHub()
mah = metadata(ah)

Question 2.14.1 ENUMERATING GENES WHERE ER BINDS TO TSS

We’ll start out by generating the addresses of all genes for Homo sapiens, and then acquire the ER binding sites in liver cells.

library(Homo.sapiens)
g = genes(Homo.sapiens)
library(ERBS)
data("HepG2")

Interpret the following computation

kp = g[resize(g,1) %over% HepG2]
kp
## GRanges object with 79 ranges and 1 metadata column:
##             seqnames                 ranges strand   |       GENEID
##                <Rle>              <IRanges>  <Rle>   | <FactorList>
##   100129518     chr6 [160181291, 160183364]      -   |    100129518
##   100134015    chr20 [  3087557,   3131513]      +   |    100134015
##       10116    chr15 [ 68570141,  68583640]      +   |        10116
##       10131    chr16 [  3708038,   3767598]      -   |        10131
##       10656     chr8 [136469716, 136659848]      +   |        10656
##         ...      ...                    ...    ... ...          ...
##       90121     chrX   [54466853, 54471731]      +   |        90121
##       91252    chr11   [47377001, 47438051]      +   |        91252
##        9377    chr15   [75212617, 75230495]      -   |         9377
##          95     chr3   [52017300, 52023218]      +   |           95
##        9563     chr1   [ 9294863,  9331394]      +   |         9563
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

kp is the set of genes whose transcription start site lies in a HepG2 binding site. resize(g,1) is a strand respecting reduction of the gene interval to a single base representing the transcription start site.

Question 2.14.2. Creating and filtering an HTML5 report on Gene Annotation

The ReportingTools package provides very nice interactive tabulaton. We’ll generate a data.frame and publish it to the browser.

nn = names(kp)
m = select(Homo.sapiens, keys = nn, keytype = "ENTREZID",
           columns = c("SYMBOL", "GENENAME","TERM","GO"))

library(ReportingTools)
hrep = HTMLReport(shortName = "erhep.html")

publish(m,hrep)
finish(hrep)        

Point your browser to the generated erhep.html and use the search facility. How many entries mention apoptosis?: You enter the string in the search box and look at the bottom of the page which tells how many entries are found.