Install the bioconductor packages.
# install the required packages
source("http://bioconductor.org/biocLite.R")
biocLite(c("Biostrings", "ShortRead", "ggplot2","Rsamtools","org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg19.knownGene"))
Download and install another packages.
# install course package
download.file(url=
"http://www.bioconductor.org/help/course-materials/2014/SeattleFeb2014/BiocIntro_0.0.4.tar.gz",
destfile="bioc.tar.gz")
install.packages("bioc.tar.gz", repos = NULL, type = "source")
Run the following R commands and try your best to understand what you are doing.
## ----library-----------------------------------------
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist, unsplit
##
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: XVector
library(ShortRead)
## Loading required package: BiocParallel
## Loading required package: Rsamtools
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GenomicAlignments
## ----Intro---------------------------------------------------------------
b <- BString("I store any set of characters!" ) # Creates big String object.
b
## 30-letter "BString" instance
## seq: I store any set of characters!
d <- DNAString("GCATATATA") # Creates DNAString object.
d
## 9-letter "DNAString" instance
## seq: GCATATATA
r <- RNAString("GCAUAUAUA") # Creates RNAString object.
r
## 9-letter "RNAString" instance
## seq: GCAUAUAUA
r2 <- RNAString(d) # Converts d into RNAString object.
r2
## 9-letter "RNAString" instance
## seq: GCAUAUAUA
p <- AAString("HCWYHH") # Create AAString object.
p
## 6-letter "AAString" instance
## seq: HCWYHH
aa <- translate(d) # Converts d into AAString object.
aa
## 3-letter "AAString" instance
## seq: AYI
## ----fileaccess----------------------------------------------------------
fls <- list.files(system.file("extdata", package="BiocIntro"),
pattern =".txt",full=TRUE)
fls
## [1] "C:/Users/Jia/Documents/R/win-library/3.1/BiocIntro/extdata/brca1_cds.txt"
## [2] "C:/Users/Jia/Documents/R/win-library/3.1/BiocIntro/extdata/brca2_cds.txt"
## ----data----------------------------------------------------------------
seq <- readFasta(fls) # read in the FATSA file
seq
## class: ShortRead
## length: 2 reads; width: 5592 10257 cycles
writeFasta(seq, "eg.fasta") # write FASTA file
##Lets create a DNAStringSet which is a container for storing a set of DNAString
dna <- sread(seq)
dna
## A DNAStringSet instance of length 2
## width seq
## [1] 5592 ATGGATTTATCTGCTCTTCGCGTTGAAGAAG...GATACCCCAGATCCCCCACAGCCACTACTGA
## [2] 10257 ATGCCTATTGGATCCAAAGAGAGGCCAACAT...GGACACAATTACAACTAAAAAATATATCTAA
#Approach-2
dna <- readDNAStringSet(fls)
dna
## A DNAStringSet instance of length 2
## width seq names
## [1] 5592 ATGGATTTATCTGCTCTTCGC...TCCCCCACAGCCACTACTGA lcl|NM_007294.3_c...
## [2] 10257 ATGCCTATTGGATCCAAAGAG...CAACTAAAAAATATATCTAA lcl|NM_000059.3_c...
# let us look at the first DNAString, brca1 stored at [1]
# This [[]] operation converts a DNAStringSet to DNAString
brca1 <- dna[[1]]
brca1
## 5592-letter "DNAString" instance
## seq: ATGGATTTATCTGCTCTTCGCGTTGAAGAAGTAC...CTGATACCCCAGATCCCCCACAGCCACTACTGA
brca2 <- dna[[2]]
brca2
## 10257-letter "DNAString" instance
## seq: ATGCCTATTGGATCCAAAGAGAGGCCAACATTTT...CAGGACACAATTACAACTAAAAAATATATCTAA
# making the output more understandable.
# A sequence in FASTA format is represented as a series of lines, each of which
# usually do not exceed 80 characters.
options(showHeadLines=5)
successiveViews(brca1, width=rep(50,length(dna[[1]])/50+1))
## Views on a 5592-letter DNAString subject
## subject: ATGGATTTATCTGCTCTTCGCGTTGAAGAAGT...GATACCCCAGATCCCCCACAGCCACTACTGA
## views:
## start end width
## [1] 1 50 50 [ATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGC]
## [2] 51 100 50 [TATGCAGAAAATCTTAGAGTGTCCCATCTGTCTGGAGTTGATCAAGGAAC]
## [3] 101 150 50 [CTGTCTCCACAAAGTGTGACCACATATTTTGCAAATTTTGCATGCTGAAA]
## [4] 151 200 50 [CTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTTTATGTAAGAATGA]
## [5] 201 250 50 [TATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTG]
## ... ... ... ... ...
## [108] 5351 5400 50 [TACAGCTGTGTGGTGCTTCTGTGGTGAAGGAGCTTTCATCATTCACCCTT]
## [109] 5401 5450 50 [GGCACAGGTGTCCACCCAATTGTGGTTGTGCAGCCAGATGCCTGGACAGA]
## [110] 5451 5500 50 [GGACAATGGCTTCCATGCAATTGGGCAGATGTGTGAGGCACCTGTGGTGA]
## [111] 5501 5550 50 [CCCGAGAGTGGGTGTTGGACAGTGTAGCACTCTACCAGTGCCAGGAGCTG]
## [112] 5551 5600 50 [GACACCTACCTGATACCCCAGATCCCCCACAGCCACTACTGA ]
## ----operation-------------------------------------------------------------
brca1
## 5592-letter "DNAString" instance
## seq: ATGGATTTATCTGCTCTTCGCGTTGAAGAAGTAC...CTGATACCCCAGATCCCCCACAGCCACTACTGA
reverse(brca1)
## 5592-letter "DNAString" instance
## seq: AGTCATCACCGACACCCCCTAGACCCCATAGTCC...ATGAAGAAGTTGCGCTTCTCGTCTATTTAGGTA
complement(brca1)
## 5592-letter "DNAString" instance
## seq: TACCTAAATAGACGAGAAGCGCAACTTCTTCATG...GACTATGGGGTCTAGGGGGTGTCGGTGATGACT
reverseComplement(brca1)
## 5592-letter "DNAString" instance
## seq: TCAGTAGTGGCTGTGGGGGATCTGGGGTATCAGG...TACTTCTTCAACGCGAAGAGCAGATAAATCCAT
translate(brca1)
## 1864-letter "AAString" instance
## seq: MDLSALRVEEVQNVINAMQKILECPICLELIKEP...VVTREWVLDSVALYQCQELDTYLIPQIPHSHY*
## ----predef--------------------------------------------------------------
DNA_BASES
## [1] "A" "C" "G" "T"
DNA_ALPHABET
## [1] "A" "C" "G" "T" "M" "R" "W" "S" "Y" "K" "V" "H" "D" "B" "N" "-" "+"
## [18] "."
IUPAC_CODE_MAP # IUPAC nucleotide ambiguity codes
## A C G T M R W S Y K
## "A" "C" "G" "T" "AC" "AG" "AT" "CG" "CT" "GT"
## V H D B N
## "ACG" "ACT" "AGT" "CGT" "ACGT"
## ----frequeny------------------------------------------------------------
# what are the unique letter ?
uniqueLetters(brca2)
## [1] "A" "C" "G" "T"
alphabetFrequency(brca2)
## A C G T M R W S Y K V H D B N
## 3769 1784 1882 2822 0 0 0 0 0 0 0 0 0 0 0
## - + .
## 0 0 0
alphabetFrequency(brca2, baseOnly=TRUE)
## A C G T other
## 3769 1784 1882 2822 0
dinucleotideFrequency(brca2)
## AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG
## 1562 554 805 847 774 339 74 597 765 333 313 471 667 558 690
## TT
## 907
trinucleotideFrequency(brca2)
## AAA AAC AAG AAT ACA ACC ACG ACT AGA AGC AGG AGT ATA ATC ATG ATT CAA CAC
## 687 206 334 334 233 110 31 180 336 146 133 190 225 143 216 263 278 106
## CAG CAT CCA CCC CCG CCT CGA CGC CGG CGT CTA CTC CTG CTT GAA GAC GAG GAT
## 229 161 161 64 12 102 22 15 12 25 116 107 184 190 369 102 122 172
## GCA GCC GCG GCT GGA GGC GGG GGT GTA GTC GTG GTT TAA TAC TAG TAT TCA TCC
## 146 71 12 104 152 59 36 66 124 90 109 148 228 140 120 179 234 94
## TCG TCT TGA TGC TGG TGT TTA TTC TTG TTT
## 19 211 255 113 132 190 202 218 181 306
## ----packages---------------------------------------------
library(ShortRead)
library(BiocIntro)
## No methods found in "ShortRead" for requests: quality
## ----fastq-format ----------------
sp <- SolexaPath(system.file('extdata', package='ShortRead'))
sp
## class: SolexaPath
## experimentPath: C:/Users/Jia/Documents/R/win-library/3.1/ShortRead/extdata
## dataPath: Data
## scanPath: NA
## imageAnalysisPath: C1-36Firecrest
## baseCallPath: Bustard
## analysisPath: GERALD
fl <- file.path(analysisPath(sp), "s_1_sequence.txt")
fl
## [1] "C:/Users/Jia/Documents/R/win-library/3.1/ShortRead/extdata/Data/C1-36Firecrest/Bustard/GERALD/s_1_sequence.txt"
sampler <- FastqSampler(fl, 1000000)
reads <- yield(sampler)
head( id(reads) ) # reads id
## A BStringSet instance of length 6
## width seq
## [1] 24 HWI-EAS88_1_1_1_1001_499
## [2] 23 HWI-EAS88_1_1_1_898_392
## [3] 23 HWI-EAS88_1_1_1_922_465
## [4] 23 HWI-EAS88_1_1_1_895_493
## [5] 23 HWI-EAS88_1_1_1_953_493
## [6] 23 HWI-EAS88_1_1_1_868_763
head(sread(reads) ) # reads nucleotide information
## A DNAStringSet instance of length 6
## width seq
## [1] 36 GGACTTTGTAGGATACCCTCGCTTTCCTTCTCCTGT
## [2] 36 GATTTCTTACCTATTAGTGGTTGAACAGCATCGGAC
## [3] 36 GCGGTGGTCTATAGTGTTATTAATATCAATTTGGGT
## [4] 36 GTTACCATGATGTTATTTCTTCATTTGGAGGTAAAA
## [5] 36 GTATGTTTCTCCTGCTTATCACCTTCTTGAAGGCTT
## [6] 36 GTTCTCTAAAAACCATTTTTCGTCCCCTTCGGGGCG
head(quality(reads)) # quality information
## class: SFastqQuality
## quality:
## A BStringSet instance of length 6
## width seq
## [1] 36 ]]]]]]]]]]]]Y]Y]]]]]]]]]]]]VCHVMPLAS
## [2] 36 ]]]]]]]]]]]]Y]]]]]]]]]YPV]T][PZPICCK
## [3] 36 ]]]]Y]]]]]V]T]]]]]T]]]]]V]TMJEUXEFLA
## [4] 36 ]]]]]]]]]]]]]]]]]]]]]]T]]]]RJRZTQLOA
## [5] 36 ]]]]]]]]]]]]]]]]]T]]]]]]]]]]MJUJVLSS
## [6] 36 ]]]]]]]]]]]Y]]T]]]O]]]]VO]W]VZMXVOLS
#------------matplot---------------
abc <- alphabetByCycle(sread(reads))
abc[1:4, 1:8]
## cycle
## alphabet [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## A 0 61 70 73 74 67 81 67
## C 0 42 42 33 36 63 36 52
## G 256 51 37 53 51 52 62 41
## T 0 102 107 97 95 74 77 96
matplot(t(abc[c("A","G","T","C"),]), type="l")
# A histogram of the GC content of individual reads is obtained with:
alf0 <- alphabetFrequency(sread(reads), as.prob=TRUE)
hist(rowSums(alf0[,c("G", "C")]),main = "Histogram of gc Content",
xlab="individual reads" )
## ----reads quality-----------------------------------------------
fq <- FastqSampler(fl, 100000)
srq <- yield(fq)
srq
## class: ShortReadQ
## length: 256 reads; width: 36 cycles
## ----quality assessment on short reads-----------------------
qa <- qa(fl, type="fastq")
browseURL(report(qa))
## ----packages ----------------------
library(ShortRead);
library(GenomicRanges);
## ---- define a function----------------------------------------------------
plotRanges <- function(x, xlim = x, main = deparse(substitute(x)),
col = "black", sep = 0.5, ...){
height <- 1
if (is(xlim, "Ranges"))
xlim <- c(min(start(xlim)), max(end(xlim)))
bins <- disjointBins(IRanges(start(x), end(x) + 1))
plot.new()
par(mai=c(0.6, 0.2, 0.6, 0.2))
plot.window(xlim, c(0, max(bins)*(height + sep)))
ybottom <- bins * (sep + height) - height
rect(start(x)-0.5, ybottom, end(x)+0.5, ybottom + height, col = col, ...)
title(main, cex.main=2.8, font.main=1)
axis(1)}
## ----GRanges-genes-------------------------------------------------------
genes <- GRanges(seqnames=c("chr3R", "chrX"),
ranges=IRanges(
start=c(19967117, 18962306),
end = c(19973212, 18962925)),
strand=c("+", "-"),
seqlengths=c(chr3R=27905053, chrX=22422827))
genes # how to define a gene
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr3R [19967117, 19973212] +
## [2] chrX [18962306, 18962925] -
## -------
## seqinfo: 2 sequences from an unspecified genome
## ----GRanges-basics------------------------------------------------------
genes[2]
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chrX [18962306, 18962925] -
## -------
## seqinfo: 2 sequences from an unspecified genome
strand(genes)
## factor-Rle of length 2 with 2 runs
## Lengths: 1 1
## Values : + -
## Levels(3): + - *
width(genes)
## [1] 6096 620
length(genes)
## [1] 2
names(genes) <- c("FBgn0039155", "FBgn0085359")
genes # now with names
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## FBgn0039155 chr3R [19967117, 19973212] +
## FBgn0085359 chrX [18962306, 18962925] -
## -------
## seqinfo: 2 sequences from an unspecified genome
## ----Ranges -----------------------------------------------------------
ir1 <- IRanges(start=c(7, 9, 12, 14, 22:24),end=c(15, 11, 12, 18, 26, 27, 28))
ir1
## IRanges of length 7
## start end width
## [1] 7 15 9
## [2] 9 11 3
## [3] 12 12 1
## [4] 14 18 5
## [5] 22 26 5
## [6] 23 27 5
## [7] 24 28 5
ir2 <- IRanges(start=c(6, 8),end=c(28, 12))
ir2
## IRanges of length 2
## start end width
## [1] 6 28 23
## [2] 8 12 5
findOverlaps(ir1,ir2) #
## Hits of length 10
## queryLength: 7
## subjectLength: 2
## queryHits subjectHits
## <integer> <integer>
## 1 1 1
## 2 1 2
## 3 2 1
## 4 2 2
## 5 3 1
## 6 3 2
## 7 4 1
## 8 5 1
## 9 6 1
## 10 7 1
countOverlaps(ir1,ir2)
## [1] 2 2 2 1 1 1 1
## ----shift, reduce, disjoin --------------------------
ir <- IRanges(start=c(7, 9, 12, 14, 22:24),end=c(15, 11, 12, 18, 26, 27, 28))
ir
## IRanges of length 7
## start end width
## [1] 7 15 9
## [2] 9 11 3
## [3] 12 12 1
## [4] 14 18 5
## [5] 22 26 5
## [6] 23 27 5
## [7] 24 28 5
par(mfrow= c(2,2))
plotRanges(ir, xlim=c(5, 35), main="Original")
plotRanges(shift(ir, 5), xlim=c(5, 35), main="Shift")
plotRanges(reduce(ir), xlim=c(5, 35), main="Reduce")
plotRanges(disjoin(ir), xlim=c(5, 35), main="disjoin")
## ----coverage------------------------------------------------------------
cvg <- coverage(ir)
plot(as.integer(cvg), type="s", xlab="Coordinate", ylab="Depth of coverage")
## ----GRanges-mcols, metadata-------------------------------------------------------
mcols(genes) <- DataFrame(EntrezId=c("42865", "2768869"),
Symbol=c("kal-1", "CG34330"))
metadata(genes) <- list(CreatedBy="ChinaSIP", Date=date())
metadata(genes)
## $CreatedBy
## [1] "ChinaSIP"
##
## $Date
## [1] "Thu May 14 16:43:12 2015"
library(GenomicAlignments)
library(GenomicFeatures)
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
##
## Attaching package: 'AnnotationDbi'
##
## The following object is masked from 'package:GenomeInfoDb':
##
## species
## ----SAM-----------------------------------------------------------------
fl <- system.file("extdata", "ex1.sam", package="Rsamtools")
fl
## [1] "C:/Users/Jia/Documents/R/win-library/3.1/Rsamtools/extdata/ex1.sam"
strsplit(readLines(fl, 2), "\t")[[1]]
## [1] "B7_591:4:96:693:509"
## [2] "73"
## [3] "seq1"
## [4] "1"
## [5] "99"
## [6] "36M"
## [7] "*"
## [8] "0"
## [9] "0"
## [10] "CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCG"
## [11] "<<<<<<<<<<<<<<<;<<<<<<<<<5<<<<<;:<;7"
## [12] "MF:i:18"
## [13] "Aq:i:73"
## [14] "NM:i:0"
## [15] "UQ:i:0"
## [16] "H0:i:1"
## [17] "H1:i:0"
# more details about SAM format: http://samtools.github.io/hts-specs/SAMv1.pdf
fl <- system.file("extdata", "ex1.bam", package="Rsamtools")
fl
## [1] "C:/Users/Jia/Documents/R/win-library/3.1/Rsamtools/extdata/ex1.bam"
aln <- readGAlignments(fl)
length(aln)
## [1] 3271
aln
## GAlignments object with 3271 alignments and 0 metadata columns:
## seqnames strand cigar qwidth start end
## <Rle> <Rle> <character> <integer> <integer> <integer>
## [1] seq1 + 36M 36 1 36
## [2] seq1 + 35M 35 3 37
## [3] seq1 + 35M 35 5 39
## [4] seq1 + 36M 36 6 41
## [5] seq1 + 35M 35 9 43
## ... ... ... ... ... ... ...
## [3267] seq2 + 35M 35 1524 1558
## [3268] seq2 + 35M 35 1524 1558
## [3269] seq2 - 35M 35 1528 1562
## [3270] seq2 - 35M 35 1532 1566
## [3271] seq2 - 35M 35 1533 1567
## width njunc
## <integer> <integer>
## [1] 36 0
## [2] 35 0
## [3] 35 0
## [4] 36 0
## [5] 35 0
## ... ... ...
## [3267] 35 0
## [3268] 35 0
## [3269] 35 0
## [3270] 35 0
## [3271] 35 0
## -------
## seqinfo: 2 sequences from an unspecified genome
## ----GappedAlignments-accessors------------------------------------------
table(strand(aln))
##
## + - *
## 1647 1624 0
range(width(aln))
## [1] 30 40
width(aln)[9786]
## [1] NA
head(sort(table(cigar(aln)), decreasing=TRUE))
##
## 35M 36M 40M 34M 33M 14M4I17M
## 2804 283 112 37 6 4
## ----GappedAlignments, to keep sequence----------------------------------------------
param <- ScanBamParam(what="seq")
aln <- readGAlignments(fl, param=param)
aln # You can now see the nucleotide sequence
## GAlignments object with 3271 alignments and 1 metadata column:
## seqnames strand cigar qwidth start end
## <Rle> <Rle> <character> <integer> <integer> <integer>
## [1] seq1 + 36M 36 1 36
## [2] seq1 + 35M 35 3 37
## [3] seq1 + 35M 35 5 39
## [4] seq1 + 36M 36 6 41
## [5] seq1 + 35M 35 9 43
## ... ... ... ... ... ... ...
## [3267] seq2 + 35M 35 1524 1558
## [3268] seq2 + 35M 35 1524 1558
## [3269] seq2 - 35M 35 1528 1562
## [3270] seq2 - 35M 35 1532 1566
## [3271] seq2 - 35M 35 1533 1567
## width njunc | seq
## <integer> <integer> | <DNAStringSet>
## [1] 36 0 | CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCG
## [2] 35 0 | CTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGT
## [3] 35 0 | AGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCC
## [4] 36 0 | GTGGCTCATTGTAATTTTTTGTTTTAACTCTTCTCT
## [5] 35 0 | GCTCATTGTAAATGTGTGGTTTAACTCGTCCATGG
## ... ... ... ... ...
## [3267] 35 0 | TTTCTTTTCTCCTTTTTTTTTTTTTTTTTCTACAT
## [3268] 35 0 | TTTCTTTTCACTTTTTTTTTTTTTTTTTTTTACTT
## [3269] 35 0 | TTTTTTCTTTTTTTTTTTTTTTTTTTTGCATGCCA
## [3270] 35 0 | TTTTTTTTTTTTTTTTTTTTTTTGCATGCCAGAAA
## [3271] 35 0 | TTTTTTTTTTTTTTTTTTTTTTTCATGCCAGAAAA
## -------
## seqinfo: 2 sequences from an unspecified genome
## define a gene
genes <- GRanges(seqnames=c("seq1", "seq2"),
ranges=IRanges(
start=c(100, 1000),
end = c(200, 1200)),
strand=c("+", "-"))
genes # how to define a gene
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] seq1 [ 100, 200] +
## [2] seq2 [1000, 1200] -
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
## ----summarizeOverlaps and calculate RPKM---------------------------------------------------
## library(BiocParallel)
countOverlaps(genes, aln) # number of reads aligned to genes
## [1] 51 164
width(genes) # length of transcripts
## [1] 101 201
length(aln) # total number of reads
## [1] 3271
rpkm <- countOverlaps(genes, aln)/((width(genes)/10^3)*(length(aln)/10^6))
# This example neglected the existence of introns.
library(GenomicFeatures);
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
# Download Transcript Information
## transcriptDB from UCSC, take a very long time
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene # this step takes some time.
txdb
## TxDb object:
## | Db type: TxDb
## | Supporting package: GenomicFeatures
## | Data source: UCSC
## | Genome: hg19
## | Organism: Homo sapiens
## | UCSC Table: knownGene
## | Resource URL: http://genome.ucsc.edu/
## | Type of Gene ID: Entrez Gene ID
## | Full dataset: yes
## | miRBase build ID: GRCh37
## | transcript_nrow: 82960
## | exon_nrow: 289969
## | cds_nrow: 237533
## | Db created by: GenomicFeatures package from Bioconductor
## | Creation time: 2014-09-26 11:16:12 -0700 (Fri, 26 Sep 2014)
## | GenomicFeatures version at creation time: 1.17.17
## | RSQLite version at creation time: 0.11.4
## | DBSCHEMAVERSION: 1.0
cds <- cdsBy(txdb)
cds
## GRangesList object of length 63691:
## $2
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chr1 [12190, 12227] + | 1 <NA> 1
## [2] chr1 [12595, 12721] + | 2 <NA> 2
## [3] chr1 [13403, 13639] + | 3 <NA> 3
##
## $4
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## [1] chr1 [69091, 70008] + | 4 <NA> 1
##
## $7
## GRanges object with 2 ranges and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## [1] chr1 [324343, 324345] + | 5 <NA> 2
## [2] chr1 [324439, 325605] + | 6 <NA> 3
##
## ...
## <63688 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
cds[1]
## GRangesList object of length 1:
## $2
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chr1 [12190, 12227] + | 1 <NA> 1
## [2] chr1 [12595, 12721] + | 2 <NA> 2
## [3] chr1 [13403, 13639] + | 3 <NA> 3
##
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
exons <- exonsBy(txdb)
exons
## GRangesList object of length 82960:
## $1
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chr1 [11874, 12227] + | 1 <NA> 1
## [2] chr1 [12613, 12721] + | 3 <NA> 2
## [3] chr1 [13221, 14409] + | 5 <NA> 3
##
## $2
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 [11874, 12227] + | 1 <NA> 1
## [2] chr1 [12595, 12721] + | 2 <NA> 2
## [3] chr1 [13403, 14409] + | 6 <NA> 3
##
## $3
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 [11874, 12227] + | 1 <NA> 1
## [2] chr1 [12646, 12697] + | 4 <NA> 2
## [3] chr1 [13221, 14409] + | 5 <NA> 3
##
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
introns <- intronsByTranscript(txdb)
introns
## GRangesList object of length 82960:
## $1
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 [12228, 12612] +
## [2] chr1 [12722, 13220] +
##
## $2
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## [1] chr1 [12228, 12594] +
## [2] chr1 [12722, 13402] +
##
## $3
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## [1] chr1 [12228, 12645] +
## [2] chr1 [12698, 13220] +
##
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
utr5 <- fiveUTRsByTranscript(txdb)
utr5
## GRangesList object of length 61495:
## $2
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chr1 [11874, 12189] + | 1 <NA> 1
##
## $7
## GRanges object with 2 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 [322037, 322228] + | 10 <NA> 1
## [2] chr1 [324288, 324342] + | 12 <NA> 2
##
## $8
## GRanges object with 2 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 [323892, 324060] + | 11 <NA> 1
## [2] chr1 [324288, 324342] + | 12 <NA> 2
##
## ...
## <61492 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
utr3 <- threeUTRsByTranscript(txdb)
utr3
## GRangesList object of length 60740:
## $2
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chr1 [13640, 14409] + | 6 <NA> 3
##
## $7
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 [325606, 326938] + | 14 <NA> 3
##
## $8
## GRanges object with 1 range and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 [325606, 328581] + | 15 <NA> 3
##
## ...
## <60737 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
## ----select--------------------------------------------------------------
library(org.Hs.eg.db) # library of gene annotations
## Loading required package: DBI
# Get the Unigene identifiers that are mapped to an entrez gene id
x <- org.Hs.egGENENAME
mapped_genes <- mappedkeys(x)
xx <- as.list(x[mapped_genes]) # Convert to a list
xx[1:10] # mapping between entrez gene ID and gene name
## $`1`
## [1] "alpha-1-B glycoprotein"
##
## $`10`
## [1] "N-acetyltransferase 2 (arylamine N-acetyltransferase)"
##
## $`100`
## [1] "adenosine deaminase"
##
## $`1000`
## [1] "cadherin 2, type 1, N-cadherin (neuronal)"
##
## $`10000`
## [1] "v-akt murine thymoma viral oncogene homolog 3"
##
## $`100008586`
## [1] "G antigen 12F"
##
## $`100008587`
## [1] "RNA, 5.8S ribosomal 5"
##
## $`100008588`
## [1] "RNA, 18S ribosomal 5"
##
## $`100008589`
## [1] "RNA, 28S ribosomal 5"
##
## $`100009601`
## [1] "transfer RNA-Tyr (GTA) 7-1"
xx[[names(cds[1])]] # get the gene name of the first CDS.
## [1] "alpha-2-macroglobulin"
# Get the Unigene identifiers that are mapped to an entrez gene id
x <- org.Hs.egSYMBOL
# Get the gene symbol that are mapped to an entrez gene identifiers
mapped_genes <- mappedkeys(x)
xx <- as.list(x[mapped_genes])
xx[1:10] # mapping between entrez gene ID and gene symbol
## $`1`
## [1] "A1BG"
##
## $`10`
## [1] "NAT2"
##
## $`100`
## [1] "ADA"
##
## $`1000`
## [1] "CDH2"
##
## $`10000`
## [1] "AKT3"
##
## $`100008586`
## [1] "GAGE12F"
##
## $`100008587`
## [1] "RNA5-8S5"
##
## $`100008588`
## [1] "RNA18S5"
##
## $`100008589`
## [1] "RNA28S5"
##
## $`100009601`
## [1] "TRY-GTA7-1"
xx[[names(cds[1])]] # get the gene symbol of the first CDS.
## [1] "A2M"