How many Bioconductor packages provide reference genomic sequence for zebrafish? Exclude the packages with suffix .masked, that we will discuss later.:
library(BSgenome)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist, unsplit
##
## Loading required package: S4Vectors
## Loading required package: stats4
## Creating a generic function for 'nchar' from package 'base' in package 'S4Vectors'
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: Biostrings
## Loading required package: XVector
## Loading required package: rtracklayer
library(Biostrings)
available.genomes()
## [1] "BSgenome.Alyrata.JGI.v1"
## [2] "BSgenome.Amellifera.BeeBase.assembly4"
## [3] "BSgenome.Amellifera.UCSC.apiMel2"
## [4] "BSgenome.Amellifera.UCSC.apiMel2.masked"
## [5] "BSgenome.Athaliana.TAIR.04232008"
## [6] "BSgenome.Athaliana.TAIR.TAIR9"
## [7] "BSgenome.Btaurus.UCSC.bosTau3"
## [8] "BSgenome.Btaurus.UCSC.bosTau3.masked"
## [9] "BSgenome.Btaurus.UCSC.bosTau4"
## [10] "BSgenome.Btaurus.UCSC.bosTau4.masked"
## [11] "BSgenome.Btaurus.UCSC.bosTau6"
## [12] "BSgenome.Btaurus.UCSC.bosTau6.masked"
## [13] "BSgenome.Btaurus.UCSC.bosTau8"
## [14] "BSgenome.Celegans.UCSC.ce10"
## [15] "BSgenome.Celegans.UCSC.ce2"
## [16] "BSgenome.Celegans.UCSC.ce6"
## [17] "BSgenome.Cfamiliaris.UCSC.canFam2"
## [18] "BSgenome.Cfamiliaris.UCSC.canFam2.masked"
## [19] "BSgenome.Cfamiliaris.UCSC.canFam3"
## [20] "BSgenome.Cfamiliaris.UCSC.canFam3.masked"
## [21] "BSgenome.Dmelanogaster.UCSC.dm2"
## [22] "BSgenome.Dmelanogaster.UCSC.dm2.masked"
## [23] "BSgenome.Dmelanogaster.UCSC.dm3"
## [24] "BSgenome.Dmelanogaster.UCSC.dm3.masked"
## [25] "BSgenome.Dmelanogaster.UCSC.dm6"
## [26] "BSgenome.Drerio.UCSC.danRer5"
## [27] "BSgenome.Drerio.UCSC.danRer5.masked"
## [28] "BSgenome.Drerio.UCSC.danRer6"
## [29] "BSgenome.Drerio.UCSC.danRer6.masked"
## [30] "BSgenome.Drerio.UCSC.danRer7"
## [31] "BSgenome.Drerio.UCSC.danRer7.masked"
## [32] "BSgenome.Ecoli.NCBI.20080805"
## [33] "BSgenome.Gaculeatus.UCSC.gasAcu1"
## [34] "BSgenome.Gaculeatus.UCSC.gasAcu1.masked"
## [35] "BSgenome.Ggallus.UCSC.galGal3"
## [36] "BSgenome.Ggallus.UCSC.galGal3.masked"
## [37] "BSgenome.Ggallus.UCSC.galGal4"
## [38] "BSgenome.Ggallus.UCSC.galGal4.masked"
## [39] "BSgenome.Hsapiens.NCBI.GRCh38"
## [40] "BSgenome.Hsapiens.UCSC.hg17"
## [41] "BSgenome.Hsapiens.UCSC.hg17.masked"
## [42] "BSgenome.Hsapiens.UCSC.hg18"
## [43] "BSgenome.Hsapiens.UCSC.hg18.masked"
## [44] "BSgenome.Hsapiens.UCSC.hg19"
## [45] "BSgenome.Hsapiens.UCSC.hg19.masked"
## [46] "BSgenome.Hsapiens.UCSC.hg38"
## [47] "BSgenome.Hsapiens.UCSC.hg38.masked"
## [48] "BSgenome.Mfascicularis.NCBI.5.0"
## [49] "BSgenome.Mfuro.UCSC.musFur1"
## [50] "BSgenome.Mmulatta.UCSC.rheMac2"
## [51] "BSgenome.Mmulatta.UCSC.rheMac2.masked"
## [52] "BSgenome.Mmulatta.UCSC.rheMac3"
## [53] "BSgenome.Mmulatta.UCSC.rheMac3.masked"
## [54] "BSgenome.Mmusculus.UCSC.mm10"
## [55] "BSgenome.Mmusculus.UCSC.mm10.masked"
## [56] "BSgenome.Mmusculus.UCSC.mm8"
## [57] "BSgenome.Mmusculus.UCSC.mm8.masked"
## [58] "BSgenome.Mmusculus.UCSC.mm9"
## [59] "BSgenome.Mmusculus.UCSC.mm9.masked"
## [60] "BSgenome.Osativa.MSU.MSU7"
## [61] "BSgenome.Ptroglodytes.UCSC.panTro2"
## [62] "BSgenome.Ptroglodytes.UCSC.panTro2.masked"
## [63] "BSgenome.Ptroglodytes.UCSC.panTro3"
## [64] "BSgenome.Ptroglodytes.UCSC.panTro3.masked"
## [65] "BSgenome.Rnorvegicus.UCSC.rn4"
## [66] "BSgenome.Rnorvegicus.UCSC.rn4.masked"
## [67] "BSgenome.Rnorvegicus.UCSC.rn5"
## [68] "BSgenome.Rnorvegicus.UCSC.rn5.masked"
## [69] "BSgenome.Rnorvegicus.UCSC.rn6"
## [70] "BSgenome.Scerevisiae.UCSC.sacCer1"
## [71] "BSgenome.Scerevisiae.UCSC.sacCer2"
## [72] "BSgenome.Scerevisiae.UCSC.sacCer3"
## [73] "BSgenome.Sscrofa.UCSC.susScr3"
## [74] "BSgenome.Sscrofa.UCSC.susScr3.masked"
## [75] "BSgenome.Tgondii.ToxoDB.7.0"
## [76] "BSgenome.Tguttata.UCSC.taeGut1"
## [77] "BSgenome.Tguttata.UCSC.taeGut1.masked"
grep("Drerio", available.genomes(), value=TRUE) # exclude masked
## [1] "BSgenome.Drerio.UCSC.danRer5"
## [2] "BSgenome.Drerio.UCSC.danRer5.masked"
## [3] "BSgenome.Drerio.UCSC.danRer6"
## [4] "BSgenome.Drerio.UCSC.danRer6.masked"
## [5] "BSgenome.Drerio.UCSC.danRer7"
## [6] "BSgenome.Drerio.UCSC.danRer7.masked"
We have noted that the reference genome builds for complex organisms are works in progress. Genomic sequence “mask” structures have been defined to isolate ambiguous, unmappable, and low-complexity segments of genomes so that sequence analysis research can be targeted to reflect current knowledge of sequence regions that are more likely to be functionally informative.
Obtain BSgenome.Hsapiens.UCSC.hg19.masked (it is only a 20MB transfer.)
library(BiocInstaller)
biocLite("BSgenome.Hsapiens.UCSC.hg19.masked")
Then run the command
library(BSgenome.Hsapiens.UCSC.hg19.masked)
## Loading required package: BSgenome.Hsapiens.UCSC.hg19
c17m = BSgenome.Hsapiens.UCSC.hg19.masked$chr17
what is the class of c17m?
class(c17m)
## [1] "MaskedDNAString"
## attr(,"package")
## [1] "Biostrings"
When we print out the value of a MaskedDNAString c17m we get a report on types of mask present. Part of the report for chromosome 17 in hg19 is:
c17m
## 81195210-letter "MaskedDNAString" instance (# for masking)
## seq: AAGCTTCTCACCCTGTTCCTGCATAGATAATTGC...GGTGTGGGTGTGGTGTGTGGGTGTGGGTGTGGT
## masks:
## maskedwidth maskedratio active names desc
## 1 3400000 0.04187439 TRUE AGAPS assembly gaps
## 2 0 0.00000000 TRUE AMB intra-contig ambiguities (empty)
## 3 38103987 0.46928861 FALSE RM RepeatMasker
## 4 665087 0.00819121 FALSE TRF Tandem Repeats Finder [period<=12]
## all masks together:
## maskedwidth maskedratio
## 41543806 0.5116534
## all active masks together:
## maskedwidth maskedratio
## 3400000 0.04187439
In build hg19, what percentage of the length of chromosome 22 is occupied by “assembly gaps”? Reply with an integer between 0 and 100.
c22m = BSgenome.Hsapiens.UCSC.hg19.masked$chr22
c22m
## 51304566-letter "MaskedDNAString" instance (# for masking)
## seq: ##################################...#################################
## masks:
## maskedwidth maskedratio active names desc
## 1 16410000 3.198546e-01 TRUE AGAPS assembly gaps
## 2 21 4.093203e-07 TRUE AMB intra-contig ambiguities
## 3 17230953 3.358561e-01 FALSE RM RepeatMasker
## 4 377637 7.360690e-03 FALSE TRF Tandem Repeats Finder [period<=12]
## all masks together:
## maskedwidth maskedratio
## 33664642 0.6561724
## all active masks together:
## maskedwidth maskedratio
## 16410021 0.319855
round(100*sum(width(masks(c22m)$AGAPS))/length(c22m),0)
## [1] 32
Obtain the hg19 to hg38 liftover chain file, and decompress:
# Or do it in R
download.file("http://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz",
"hg19ToHg38.over.chain.gz")
library(R.utils)
gunzip("hg19ToHg38.over.chain.gz")
We will use it to convert the HepG2 binding address to hg38.
library(ERBS)
data(HepG2)
library(rtracklayer)
ch = import.chain("hg19ToHg38.over.chain")
nHepG2 = liftOver(HepG2, ch)
How far, in bases, has the first range in HepG2(ch2,20335378 is starting address) moved upstream in the hg38 build?:
start(HepG2[1]) - start(nHepG2[1])
## IntegerList of length 1
## [["1"]] 199761
Install a recent version of the ph525x package.
library(ph525x)
## Loading required package: png
## Loading required package: grid
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
Use the modPlot function to visualize a model of ESR1, the estrogen receptor 1 gene.
modPlot("ESR1", useGeneSym = FALSE, collapse = FALSE)
## Loading required package: Homo.sapiens
## Loading required package: AnnotationDbi
## Loading required package: OrganismDbi
## Loading required package: GenomicFeatures
## Loading required package: GO.db
## Loading required package: DBI
##
## Loading required package: org.Hs.eg.db
##
## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene
## Loading required package: Gviz
What are the number of transcripts comprising the model for ESR1?
Count the number of linear structures
What is the output of
library(Homo.sapiens)
length(transcriptsBy(Homo.sapiens,by="gene")$"2099")
## [1] 27
Bioconductor’s rtracklayer package supports import and export of files in common genomic data formats. The package includes a demonstration dataset of microRNA target sites.
library(rtracklayer)
data(targets)
What is the class of targets?
class(targets)
## [1] "data.frame"
To what referene bild do the chromosome, start and end values in targets refer?
Unfortunately it is not typical to add metadata to data.frame instances, as one has to fill a column with relevant information
We can create GRanges instance from the targets data frame as follows:
library(GenomicRanges)
mtar = with(targets,
GRanges(chrom, IRanges(start,end), strand=strand,
targets=target, mirname=name))
You can glimpse of exported versions of this data with
cat(export(mtar[1:5],format = "bed"), sep = "\n")
## chr12 8985196 8985217 . 0 -
## chr7 117095439 117095461 . 0 +
## chr17 23750063 23750088 . 0 +
## chr7 27187934 27187957 . 0 -
## chr17 43458622 43458643 . 0 -
cat("\n")
cat(export(mtar[1:5], format = "gff3"), sep = "\n")
## ##gff-version 3
## ##source-version rtracklayer 1.28.9
## ##date 2015-09-03
## chr12 rtracklayer sequence_feature 8985197 8985217 . - . targets=ENST00000000412; mirname=hsa-miR-16
## chr7 rtracklayer sequence_feature 117095440 117095461 . + . targets=ENST00000003084; mirname=hsa-miR-509-3p
## chr17 rtracklayer sequence_feature 23750064 23750088 . + . targets=ENST00000003834; mirname=hsa-miR-612
## chr7 rtracklayer sequence_feature 27187935 27187957 . - . targets=ENST00000006015; mirname=hsa-miR-423-3p
## chr17 rtracklayer sequence_feature 43458623 43458643 . - . targets=ENST00000006101; mirname=hsa-miR-125b
How can metadata about the data origin and reference build be encoded in the bed export?
Use commands
library(AnnotationHub)
ah = AnnotationHub()
mah = metadata(ah)
We’ll start out by generating the addresses of all genes for Homo sapiens, and then acquire the ER binding sites in liver cells.
library(Homo.sapiens)
g = genes(Homo.sapiens)
library(ERBS)
data("HepG2")
Interpret the following computation
kp = g[resize(g,1) %over% HepG2]
kp
## GRanges object with 79 ranges and 1 metadata column:
## seqnames ranges strand | GENEID
## <Rle> <IRanges> <Rle> | <FactorList>
## 100129518 chr6 [160181291, 160183364] - | 100129518
## 100134015 chr20 [ 3087557, 3131513] + | 100134015
## 10116 chr15 [ 68570141, 68583640] + | 10116
## 10131 chr16 [ 3708038, 3767598] - | 10131
## 10656 chr8 [136469716, 136659848] + | 10656
## ... ... ... ... ... ...
## 90121 chrX [54466853, 54471731] + | 90121
## 91252 chr11 [47377001, 47438051] + | 91252
## 9377 chr15 [75212617, 75230495] - | 9377
## 95 chr3 [52017300, 52023218] + | 95
## 9563 chr1 [ 9294863, 9331394] + | 9563
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
kp is the set of genes whose transcription start site lies in a HepG2 binding site. resize(g,1) is a strand respecting reduction of the gene interval to a single base representing the transcription start site.
The ReportingTools package provides very nice interactive tabulaton. We’ll generate a data.frame and publish it to the browser.
nn = names(kp)
m = select(Homo.sapiens, keys = nn, keytype = "ENTREZID",
columns = c("SYMBOL", "GENENAME","TERM","GO"))
library(ReportingTools)
hrep = HTMLReport(shortName = "erhep.html")
publish(m,hrep)
finish(hrep)
Point your browser to the generated erhep.html and use the search facility. How many entries mention apoptosis?: You enter the string in the search box and look at the bottom of the page which tells how many entries are found.