package installation

Install the bioconductor packages.

# install the required packages
source("http://bioconductor.org/biocLite.R")
biocLite(c("Biostrings", "ShortRead", "ggplot2","Rsamtools","org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg19.knownGene"))

Download and install another packages.

# install course package
download.file(url=
    "http://www.bioconductor.org/help/course-materials/2014/SeattleFeb2014/BiocIntro_0.0.4.tar.gz",
              destfile="bioc.tar.gz")
install.packages("bioc.tar.gz", repos = NULL, type = "source")

Run the following R commands and try your best to understand what you are doing.

Commands related to sequences.

## ----library-----------------------------------------
library(Biostrings)

## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: XVector

library(ShortRead)

## Loading required package: BiocParallel
## Loading required package: Rsamtools
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GenomicAlignments

## ----Intro---------------------------------------------------------------
b <- BString("I store any set of characters!" ) # Creates big String object.
b

##   30-letter "BString" instance
## seq: I store any set of characters!

d <- DNAString("GCATATATA") # Creates DNAString object.
d

##   9-letter "DNAString" instance
## seq: GCATATATA

r <- RNAString("GCAUAUAUA") # Creates RNAString object.
r

##   9-letter "RNAString" instance
## seq: GCAUAUAUA

r2 <- RNAString(d) # Converts d into RNAString object.
r2

##   9-letter "RNAString" instance
## seq: GCAUAUAUA

p <- AAString("HCWYHH") # Create AAString object.
p

##   6-letter "AAString" instance
## seq: HCWYHH

aa <- translate(d) # Converts d into AAString object.
aa

##   3-letter "AAString" instance
## seq: AYI

## ----fileaccess----------------------------------------------------------
fls <- list.files(system.file("extdata", package="BiocIntro"), 
                  pattern =".txt",full=TRUE)
fls

## [1] "C:/Users/Jia/Documents/R/win-library/3.1/BiocIntro/extdata/brca1_cds.txt"
## [2] "C:/Users/Jia/Documents/R/win-library/3.1/BiocIntro/extdata/brca2_cds.txt"

## ----data----------------------------------------------------------------
seq <- readFasta(fls)         # read in the FATSA file
seq

## class: ShortRead
## length: 2 reads; width: 5592 10257 cycles

writeFasta(seq, "eg.fasta")   # write FASTA file

##Lets create a DNAStringSet which is a container for storing a set of DNAString
dna <- sread(seq)
dna

##   A DNAStringSet instance of length 2
##     width seq
## [1]  5592 ATGGATTTATCTGCTCTTCGCGTTGAAGAAG...GATACCCCAGATCCCCCACAGCCACTACTGA
## [2] 10257 ATGCCTATTGGATCCAAAGAGAGGCCAACAT...GGACACAATTACAACTAAAAAATATATCTAA

#Approach-2
dna <- readDNAStringSet(fls)
dna

##   A DNAStringSet instance of length 2
##     width seq                                          names               
## [1]  5592 ATGGATTTATCTGCTCTTCGC...TCCCCCACAGCCACTACTGA lcl|NM_007294.3_c...
## [2] 10257 ATGCCTATTGGATCCAAAGAG...CAACTAAAAAATATATCTAA lcl|NM_000059.3_c...

# let us look at the first DNAString, brca1 stored at [1]
# This [[]] operation converts a DNAStringSet to DNAString
brca1 <- dna[[1]]
brca1

##   5592-letter "DNAString" instance
## seq: ATGGATTTATCTGCTCTTCGCGTTGAAGAAGTAC...CTGATACCCCAGATCCCCCACAGCCACTACTGA

brca2 <- dna[[2]]
brca2

##   10257-letter "DNAString" instance
## seq: ATGCCTATTGGATCCAAAGAGAGGCCAACATTTT...CAGGACACAATTACAACTAAAAAATATATCTAA

# making the output more understandable.
# A sequence in FASTA format is represented as a series of lines, each of which 
# usually do not exceed 80 characters.
options(showHeadLines=5)
successiveViews(brca1,  width=rep(50,length(dna[[1]])/50+1))

##   Views on a 5592-letter DNAString subject
## subject: ATGGATTTATCTGCTCTTCGCGTTGAAGAAGT...GATACCCCAGATCCCCCACAGCCACTACTGA
## views:
##       start  end width
##   [1]     1   50    50 [ATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGC]
##   [2]    51  100    50 [TATGCAGAAAATCTTAGAGTGTCCCATCTGTCTGGAGTTGATCAAGGAAC]
##   [3]   101  150    50 [CTGTCTCCACAAAGTGTGACCACATATTTTGCAAATTTTGCATGCTGAAA]
##   [4]   151  200    50 [CTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTTTATGTAAGAATGA]
##   [5]   201  250    50 [TATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTG]
##   ...   ...  ...   ... ...
## [108]  5351 5400    50 [TACAGCTGTGTGGTGCTTCTGTGGTGAAGGAGCTTTCATCATTCACCCTT]
## [109]  5401 5450    50 [GGCACAGGTGTCCACCCAATTGTGGTTGTGCAGCCAGATGCCTGGACAGA]
## [110]  5451 5500    50 [GGACAATGGCTTCCATGCAATTGGGCAGATGTGTGAGGCACCTGTGGTGA]
## [111]  5501 5550    50 [CCCGAGAGTGGGTGTTGGACAGTGTAGCACTCTACCAGTGCCAGGAGCTG]
## [112]  5551 5600    50 [GACACCTACCTGATACCCCAGATCCCCCACAGCCACTACTGA        ]

## ----operation-------------------------------------------------------------
brca1

##   5592-letter "DNAString" instance
## seq: ATGGATTTATCTGCTCTTCGCGTTGAAGAAGTAC...CTGATACCCCAGATCCCCCACAGCCACTACTGA

reverse(brca1)

##   5592-letter "DNAString" instance
## seq: AGTCATCACCGACACCCCCTAGACCCCATAGTCC...ATGAAGAAGTTGCGCTTCTCGTCTATTTAGGTA

complement(brca1)

##   5592-letter "DNAString" instance
## seq: TACCTAAATAGACGAGAAGCGCAACTTCTTCATG...GACTATGGGGTCTAGGGGGTGTCGGTGATGACT

reverseComplement(brca1)

##   5592-letter "DNAString" instance
## seq: TCAGTAGTGGCTGTGGGGGATCTGGGGTATCAGG...TACTTCTTCAACGCGAAGAGCAGATAAATCCAT

translate(brca1)

##   1864-letter "AAString" instance
## seq: MDLSALRVEEVQNVINAMQKILECPICLELIKEP...VVTREWVLDSVALYQCQELDTYLIPQIPHSHY*

## ----predef--------------------------------------------------------------
DNA_BASES

## [1] "A" "C" "G" "T"

DNA_ALPHABET

##  [1] "A" "C" "G" "T" "M" "R" "W" "S" "Y" "K" "V" "H" "D" "B" "N" "-" "+"
## [18] "."

IUPAC_CODE_MAP # IUPAC nucleotide ambiguity codes

##      A      C      G      T      M      R      W      S      Y      K 
##    "A"    "C"    "G"    "T"   "AC"   "AG"   "AT"   "CG"   "CT"   "GT" 
##      V      H      D      B      N 
##  "ACG"  "ACT"  "AGT"  "CGT" "ACGT"

## ----frequeny------------------------------------------------------------
# what are the unique letter ?
uniqueLetters(brca2)

## [1] "A" "C" "G" "T"

alphabetFrequency(brca2)

##    A    C    G    T    M    R    W    S    Y    K    V    H    D    B    N 
## 3769 1784 1882 2822    0    0    0    0    0    0    0    0    0    0    0 
##    -    +    . 
##    0    0    0

alphabetFrequency(brca2, baseOnly=TRUE)

##     A     C     G     T other 
##  3769  1784  1882  2822     0

dinucleotideFrequency(brca2)

##   AA   AC   AG   AT   CA   CC   CG   CT   GA   GC   GG   GT   TA   TC   TG 
## 1562  554  805  847  774  339   74  597  765  333  313  471  667  558  690 
##   TT 
##  907

trinucleotideFrequency(brca2)

## AAA AAC AAG AAT ACA ACC ACG ACT AGA AGC AGG AGT ATA ATC ATG ATT CAA CAC 
## 687 206 334 334 233 110  31 180 336 146 133 190 225 143 216 263 278 106 
## CAG CAT CCA CCC CCG CCT CGA CGC CGG CGT CTA CTC CTG CTT GAA GAC GAG GAT 
## 229 161 161  64  12 102  22  15  12  25 116 107 184 190 369 102 122 172 
## GCA GCC GCG GCT GGA GGC GGG GGT GTA GTC GTG GTT TAA TAC TAG TAT TCA TCC 
## 146  71  12 104 152  59  36  66 124  90 109 148 228 140 120 179 234  94 
## TCG TCT TGA TGC TGG TGT TTA TTC TTG TTT 
##  19 211 255 113 132 190 202 218 181 306

Commands related to fastq file.

## ----packages---------------------------------------------
library(ShortRead)
library(BiocIntro)

## No methods found in "ShortRead" for requests: quality

## ----fastq-format ----------------
sp <- SolexaPath(system.file('extdata', package='ShortRead'))
sp

## class: SolexaPath
## experimentPath: C:/Users/Jia/Documents/R/win-library/3.1/ShortRead/extdata
## dataPath: Data
## scanPath: NA
## imageAnalysisPath: C1-36Firecrest
## baseCallPath: Bustard
## analysisPath: GERALD

fl <- file.path(analysisPath(sp), "s_1_sequence.txt")
fl

## [1] "C:/Users/Jia/Documents/R/win-library/3.1/ShortRead/extdata/Data/C1-36Firecrest/Bustard/GERALD/s_1_sequence.txt"

sampler <- FastqSampler(fl, 1000000)
reads <- yield(sampler)
head( id(reads) ) # reads id

##   A BStringSet instance of length 6
##     width seq
## [1]    24 HWI-EAS88_1_1_1_1001_499
## [2]    23 HWI-EAS88_1_1_1_898_392
## [3]    23 HWI-EAS88_1_1_1_922_465
## [4]    23 HWI-EAS88_1_1_1_895_493
## [5]    23 HWI-EAS88_1_1_1_953_493
## [6]    23 HWI-EAS88_1_1_1_868_763

head(sread(reads) ) # reads nucleotide information

##   A DNAStringSet instance of length 6
##     width seq
## [1]    36 GGACTTTGTAGGATACCCTCGCTTTCCTTCTCCTGT
## [2]    36 GATTTCTTACCTATTAGTGGTTGAACAGCATCGGAC
## [3]    36 GCGGTGGTCTATAGTGTTATTAATATCAATTTGGGT
## [4]    36 GTTACCATGATGTTATTTCTTCATTTGGAGGTAAAA
## [5]    36 GTATGTTTCTCCTGCTTATCACCTTCTTGAAGGCTT
## [6]    36 GTTCTCTAAAAACCATTTTTCGTCCCCTTCGGGGCG

head(quality(reads)) # quality information

## class: SFastqQuality
## quality:
##   A BStringSet instance of length 6
##     width seq
## [1]    36 ]]]]]]]]]]]]Y]Y]]]]]]]]]]]]VCHVMPLAS
## [2]    36 ]]]]]]]]]]]]Y]]]]]]]]]YPV]T][PZPICCK
## [3]    36 ]]]]Y]]]]]V]T]]]]]T]]]]]V]TMJEUXEFLA
## [4]    36 ]]]]]]]]]]]]]]]]]]]]]]T]]]]RJRZTQLOA
## [5]    36 ]]]]]]]]]]]]]]]]]T]]]]]]]]]]MJUJVLSS
## [6]    36 ]]]]]]]]]]]Y]]T]]]O]]]]VO]W]VZMXVOLS

#------------matplot---------------
abc <- alphabetByCycle(sread(reads))
abc[1:4, 1:8]

##         cycle
## alphabet [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
##        A    0   61   70   73   74   67   81   67
##        C    0   42   42   33   36   63   36   52
##        G  256   51   37   53   51   52   62   41
##        T    0  102  107   97   95   74   77   96

matplot(t(abc[c("A","G","T","C"),]), type="l")

# A histogram of the GC content of individual reads is obtained with:
alf0 <- alphabetFrequency(sread(reads), as.prob=TRUE)
hist(rowSums(alf0[,c("G", "C")]),main = "Histogram of gc Content",
     xlab="individual reads" )

## ----reads quality-----------------------------------------------
fq <- FastqSampler(fl, 100000)
srq <- yield(fq)
srq

## class: ShortReadQ
## length: 256 reads; width: 36 cycles

## ----quality assessment on short reads-----------------------
qa <- qa(fl, type="fastq")
browseURL(report(qa))

Commands related to Ranges.

## ----packages ----------------------
library(ShortRead);
library(GenomicRanges);

## ---- define a function----------------------------------------------------
plotRanges <- function(x, xlim = x, main = deparse(substitute(x)),
                       col = "black", sep = 0.5, ...){    
  height <- 1
  if (is(xlim, "Ranges"))
    xlim <- c(min(start(xlim)), max(end(xlim)))
  bins <- disjointBins(IRanges(start(x), end(x) + 1))
  plot.new()
  par(mai=c(0.6, 0.2, 0.6, 0.2))
  plot.window(xlim, c(0, max(bins)*(height + sep)))
  ybottom <- bins * (sep + height) - height
  rect(start(x)-0.5, ybottom, end(x)+0.5, ybottom + height, col = col, ...)
  title(main, cex.main=2.8, font.main=1)
  axis(1)}

## ----GRanges-genes-------------------------------------------------------
genes <- GRanges(seqnames=c("chr3R", "chrX"),
                 ranges=IRanges(
                   start=c(19967117, 18962306),
                   end = c(19973212, 18962925)),
                 strand=c("+", "-"),
                 seqlengths=c(chr3R=27905053, chrX=22422827))
genes # how to define a gene

## GRanges object with 2 ranges and 0 metadata columns:
##       seqnames               ranges strand
##          <Rle>            <IRanges>  <Rle>
##   [1]    chr3R [19967117, 19973212]      +
##   [2]     chrX [18962306, 18962925]      -
##   -------
##   seqinfo: 2 sequences from an unspecified genome

## ----GRanges-basics------------------------------------------------------
genes[2]

## GRanges object with 1 range and 0 metadata columns:
##       seqnames               ranges strand
##          <Rle>            <IRanges>  <Rle>
##   [1]     chrX [18962306, 18962925]      -
##   -------
##   seqinfo: 2 sequences from an unspecified genome

strand(genes)

## factor-Rle of length 2 with 2 runs
##   Lengths: 1 1
##   Values : + -
## Levels(3): + - *

width(genes)

## [1] 6096  620

length(genes)

## [1] 2

names(genes) <- c("FBgn0039155", "FBgn0085359")
genes  # now with names

## GRanges object with 2 ranges and 0 metadata columns:
##               seqnames               ranges strand
##                  <Rle>            <IRanges>  <Rle>
##   FBgn0039155    chr3R [19967117, 19973212]      +
##   FBgn0085359     chrX [18962306, 18962925]      -
##   -------
##   seqinfo: 2 sequences from an unspecified genome

## ----Ranges -----------------------------------------------------------
ir1 <- IRanges(start=c(7, 9, 12, 14, 22:24),end=c(15, 11, 12, 18, 26, 27, 28))
ir1

## IRanges of length 7
##     start end width
## [1]     7  15     9
## [2]     9  11     3
## [3]    12  12     1
## [4]    14  18     5
## [5]    22  26     5
## [6]    23  27     5
## [7]    24  28     5

ir2 <- IRanges(start=c(6, 8),end=c(28, 12))
ir2

## IRanges of length 2
##     start end width
## [1]     6  28    23
## [2]     8  12     5

findOverlaps(ir1,ir2)  #

## Hits of length 10
## queryLength: 7
## subjectLength: 2
##    queryHits subjectHits 
##     <integer>   <integer> 
##  1          1           1 
##  2          1           2 
##  3          2           1 
##  4          2           2 
##  5          3           1 
##  6          3           2 
##  7          4           1 
##  8          5           1 
##  9          6           1 
##  10         7           1

countOverlaps(ir1,ir2)

## [1] 2 2 2 1 1 1 1

## ----shift, reduce, disjoin --------------------------
ir <- IRanges(start=c(7, 9, 12, 14, 22:24),end=c(15, 11, 12, 18, 26, 27, 28))
ir

## IRanges of length 7
##     start end width
## [1]     7  15     9
## [2]     9  11     3
## [3]    12  12     1
## [4]    14  18     5
## [5]    22  26     5
## [6]    23  27     5
## [7]    24  28     5

par(mfrow= c(2,2)) 
plotRanges(ir, xlim=c(5, 35), main="Original")
plotRanges(shift(ir, 5), xlim=c(5, 35), main="Shift")
plotRanges(reduce(ir), xlim=c(5, 35), main="Reduce")
plotRanges(disjoin(ir), xlim=c(5, 35), main="disjoin")

## ----coverage------------------------------------------------------------
cvg <- coverage(ir)
plot(as.integer(cvg), type="s", xlab="Coordinate", ylab="Depth of coverage")

## ----GRanges-mcols, metadata-------------------------------------------------------
mcols(genes) <- DataFrame(EntrezId=c("42865", "2768869"),
                          Symbol=c("kal-1", "CG34330"))
metadata(genes) <- list(CreatedBy="ChinaSIP", Date=date())
metadata(genes)

## $CreatedBy
## [1] "ChinaSIP"
## 
## $Date
## [1] "Thu May 14 16:43:12 2015"

Commands related to BAM.

library(GenomicAlignments)
library(GenomicFeatures)

## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'AnnotationDbi'
## 
## The following object is masked from 'package:GenomeInfoDb':
## 
##     species

## ----SAM-----------------------------------------------------------------
fl <- system.file("extdata", "ex1.sam", package="Rsamtools")
fl

## [1] "C:/Users/Jia/Documents/R/win-library/3.1/Rsamtools/extdata/ex1.sam"

strsplit(readLines(fl, 2), "\t")[[1]]

##  [1] "B7_591:4:96:693:509"                 
##  [2] "73"                                  
##  [3] "seq1"                                
##  [4] "1"                                   
##  [5] "99"                                  
##  [6] "36M"                                 
##  [7] "*"                                   
##  [8] "0"                                   
##  [9] "0"                                   
## [10] "CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCG"
## [11] "<<<<<<<<<<<<<<<;<<<<<<<<<5<<<<<;:<;7"
## [12] "MF:i:18"                             
## [13] "Aq:i:73"                             
## [14] "NM:i:0"                              
## [15] "UQ:i:0"                              
## [16] "H0:i:1"                              
## [17] "H1:i:0"

# more details about SAM format: http://samtools.github.io/hts-specs/SAMv1.pdf

fl <- system.file("extdata", "ex1.bam", package="Rsamtools")
fl

## [1] "C:/Users/Jia/Documents/R/win-library/3.1/Rsamtools/extdata/ex1.bam"

aln <- readGAlignments(fl)
length(aln)

## [1] 3271

aln

## GAlignments object with 3271 alignments and 0 metadata columns:
##          seqnames strand       cigar    qwidth     start       end
##             <Rle>  <Rle> <character> <integer> <integer> <integer>
##      [1]     seq1      +         36M        36         1        36
##      [2]     seq1      +         35M        35         3        37
##      [3]     seq1      +         35M        35         5        39
##      [4]     seq1      +         36M        36         6        41
##      [5]     seq1      +         35M        35         9        43
##      ...      ...    ...         ...       ...       ...       ...
##   [3267]     seq2      +         35M        35      1524      1558
##   [3268]     seq2      +         35M        35      1524      1558
##   [3269]     seq2      -         35M        35      1528      1562
##   [3270]     seq2      -         35M        35      1532      1566
##   [3271]     seq2      -         35M        35      1533      1567
##              width     njunc
##          <integer> <integer>
##      [1]        36         0
##      [2]        35         0
##      [3]        35         0
##      [4]        36         0
##      [5]        35         0
##      ...       ...       ...
##   [3267]        35         0
##   [3268]        35         0
##   [3269]        35         0
##   [3270]        35         0
##   [3271]        35         0
##   -------
##   seqinfo: 2 sequences from an unspecified genome

## ----GappedAlignments-accessors------------------------------------------
table(strand(aln))

## 
##    +    -    * 
## 1647 1624    0

range(width(aln))

## [1] 30 40

width(aln)[9786]

## [1] NA

head(sort(table(cigar(aln)), decreasing=TRUE))

## 
##      35M      36M      40M      34M      33M 14M4I17M 
##     2804      283      112       37        6        4

## ----GappedAlignments, to keep sequence----------------------------------------------
param <- ScanBamParam(what="seq")
aln <- readGAlignments(fl, param=param)
aln # You can now see the nucleotide sequence

## GAlignments object with 3271 alignments and 1 metadata column:
##          seqnames strand       cigar    qwidth     start       end
##             <Rle>  <Rle> <character> <integer> <integer> <integer>
##      [1]     seq1      +         36M        36         1        36
##      [2]     seq1      +         35M        35         3        37
##      [3]     seq1      +         35M        35         5        39
##      [4]     seq1      +         36M        36         6        41
##      [5]     seq1      +         35M        35         9        43
##      ...      ...    ...         ...       ...       ...       ...
##   [3267]     seq2      +         35M        35      1524      1558
##   [3268]     seq2      +         35M        35      1524      1558
##   [3269]     seq2      -         35M        35      1528      1562
##   [3270]     seq2      -         35M        35      1532      1566
##   [3271]     seq2      -         35M        35      1533      1567
##              width     njunc   |                                  seq
##          <integer> <integer>   |                       <DNAStringSet>
##      [1]        36         0   | CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCG
##      [2]        35         0   |  CTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGT
##      [3]        35         0   |  AGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCC
##      [4]        36         0   | GTGGCTCATTGTAATTTTTTGTTTTAACTCTTCTCT
##      [5]        35         0   |  GCTCATTGTAAATGTGTGGTTTAACTCGTCCATGG
##      ...       ...       ... ...                                  ...
##   [3267]        35         0   |  TTTCTTTTCTCCTTTTTTTTTTTTTTTTTCTACAT
##   [3268]        35         0   |  TTTCTTTTCACTTTTTTTTTTTTTTTTTTTTACTT
##   [3269]        35         0   |  TTTTTTCTTTTTTTTTTTTTTTTTTTTGCATGCCA
##   [3270]        35         0   |  TTTTTTTTTTTTTTTTTTTTTTTGCATGCCAGAAA
##   [3271]        35         0   |  TTTTTTTTTTTTTTTTTTTTTTTCATGCCAGAAAA
##   -------
##   seqinfo: 2 sequences from an unspecified genome

## define a gene
genes <- GRanges(seqnames=c("seq1", "seq2"),
                 ranges=IRanges(
                   start=c(100, 1000),
                   end = c(200, 1200)),
                 strand=c("+", "-"))
genes # how to define a gene

## GRanges object with 2 ranges and 0 metadata columns:
##       seqnames       ranges strand
##          <Rle>    <IRanges>  <Rle>
##   [1]     seq1 [ 100,  200]      +
##   [2]     seq2 [1000, 1200]      -
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths

## ----summarizeOverlaps and calculate RPKM---------------------------------------------------
## library(BiocParallel)
countOverlaps(genes, aln) # number of reads aligned to genes

## [1]  51 164

width(genes) # length of transcripts

## [1] 101 201

length(aln) # total number of reads

## [1] 3271

rpkm <- countOverlaps(genes, aln)/((width(genes)/10^3)*(length(aln)/10^6))
# This example neglected the existence of introns.

Commands related to Annotations.

library(GenomicFeatures);
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

# Download Transcript Information 
## transcriptDB from UCSC, take a very long time
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene # this step takes some time.
txdb

## TxDb object:
## | Db type: TxDb
## | Supporting package: GenomicFeatures
## | Data source: UCSC
## | Genome: hg19
## | Organism: Homo sapiens
## | UCSC Table: knownGene
## | Resource URL: http://genome.ucsc.edu/
## | Type of Gene ID: Entrez Gene ID
## | Full dataset: yes
## | miRBase build ID: GRCh37
## | transcript_nrow: 82960
## | exon_nrow: 289969
## | cds_nrow: 237533
## | Db created by: GenomicFeatures package from Bioconductor
## | Creation time: 2014-09-26 11:16:12 -0700 (Fri, 26 Sep 2014)
## | GenomicFeatures version at creation time: 1.17.17
## | RSQLite version at creation time: 0.11.4
## | DBSCHEMAVERSION: 1.0

cds <- cdsBy(txdb)
cds

## GRangesList object of length 63691:
## $2 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand |    cds_id    cds_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [12190, 12227]      + |         1        <NA>         1
##   [2]     chr1 [12595, 12721]      + |         2        <NA>         2
##   [3]     chr1 [13403, 13639]      + |         3        <NA>         3
## 
## $4 
## GRanges object with 1 range and 3 metadata columns:
##       seqnames         ranges strand | cds_id cds_name exon_rank
##   [1]     chr1 [69091, 70008]      + |      4     <NA>         1
## 
## $7 
## GRanges object with 2 ranges and 3 metadata columns:
##       seqnames           ranges strand | cds_id cds_name exon_rank
##   [1]     chr1 [324343, 324345]      + |      5     <NA>         2
##   [2]     chr1 [324439, 325605]      + |      6     <NA>         3
## 
## ...
## <63688 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

cds[1]

## GRangesList object of length 1:
## $2 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand |    cds_id    cds_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [12190, 12227]      + |         1        <NA>         1
##   [2]     chr1 [12595, 12721]      + |         2        <NA>         2
##   [3]     chr1 [13403, 13639]      + |         3        <NA>         3
## 
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

exons <- exonsBy(txdb)
exons

## GRangesList object of length 82960:
## $1 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand |   exon_id   exon_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [11874, 12227]      + |         1        <NA>         1
##   [2]     chr1 [12613, 12721]      + |         3        <NA>         2
##   [3]     chr1 [13221, 14409]      + |         5        <NA>         3
## 
## $2 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12595, 12721]      + |       2      <NA>         2
##   [3]     chr1 [13403, 14409]      + |       6      <NA>         3
## 
## $3 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12646, 12697]      + |       4      <NA>         2
##   [3]     chr1 [13221, 14409]      + |       5      <NA>         3
## 
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

introns <- intronsByTranscript(txdb)
introns

## GRangesList object of length 82960:
## $1 
## GRanges object with 2 ranges and 0 metadata columns:
##       seqnames         ranges strand
##          <Rle>      <IRanges>  <Rle>
##   [1]     chr1 [12228, 12612]      +
##   [2]     chr1 [12722, 13220]      +
## 
## $2 
## GRanges object with 2 ranges and 0 metadata columns:
##       seqnames         ranges strand
##   [1]     chr1 [12228, 12594]      +
##   [2]     chr1 [12722, 13402]      +
## 
## $3 
## GRanges object with 2 ranges and 0 metadata columns:
##       seqnames         ranges strand
##   [1]     chr1 [12228, 12645]      +
##   [2]     chr1 [12698, 13220]      +
## 
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

utr5 <- fiveUTRsByTranscript(txdb)
utr5

## GRangesList object of length 61495:
## $2 
## GRanges object with 1 range and 3 metadata columns:
##       seqnames         ranges strand |   exon_id   exon_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [11874, 12189]      + |         1        <NA>         1
## 
## $7 
## GRanges object with 2 ranges and 3 metadata columns:
##       seqnames           ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [322037, 322228]      + |      10      <NA>         1
##   [2]     chr1 [324288, 324342]      + |      12      <NA>         2
## 
## $8 
## GRanges object with 2 ranges and 3 metadata columns:
##       seqnames           ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [323892, 324060]      + |      11      <NA>         1
##   [2]     chr1 [324288, 324342]      + |      12      <NA>         2
## 
## ...
## <61492 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

utr3 <- threeUTRsByTranscript(txdb)
utr3

## GRangesList object of length 60740:
## $2 
## GRanges object with 1 range and 3 metadata columns:
##       seqnames         ranges strand |   exon_id   exon_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [13640, 14409]      + |         6        <NA>         3
## 
## $7 
## GRanges object with 1 range and 3 metadata columns:
##       seqnames           ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [325606, 326938]      + |      14      <NA>         3
## 
## $8 
## GRanges object with 1 range and 3 metadata columns:
##       seqnames           ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [325606, 328581]      + |      15      <NA>         3
## 
## ...
## <60737 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

## ----select--------------------------------------------------------------
library(org.Hs.eg.db) # library of gene annotations

## Loading required package: DBI

# Get the Unigene identifiers that are mapped to an entrez gene id
x <- org.Hs.egGENENAME
mapped_genes <- mappedkeys(x)
xx <- as.list(x[mapped_genes]) # Convert to a list
xx[1:10]  # mapping between entrez gene ID and gene name

## $`1`
## [1] "alpha-1-B glycoprotein"
## 
## $`10`
## [1] "N-acetyltransferase 2 (arylamine N-acetyltransferase)"
## 
## $`100`
## [1] "adenosine deaminase"
## 
## $`1000`
## [1] "cadherin 2, type 1, N-cadherin (neuronal)"
## 
## $`10000`
## [1] "v-akt murine thymoma viral oncogene homolog 3"
## 
## $`100008586`
## [1] "G antigen 12F"
## 
## $`100008587`
## [1] "RNA, 5.8S ribosomal 5"
## 
## $`100008588`
## [1] "RNA, 18S ribosomal 5"
## 
## $`100008589`
## [1] "RNA, 28S ribosomal 5"
## 
## $`100009601`
## [1] "transfer RNA-Tyr (GTA) 7-1"

xx[[names(cds[1])]] # get the gene name of the first CDS.

## [1] "alpha-2-macroglobulin"

# Get the Unigene identifiers that are mapped to an entrez gene id
x <- org.Hs.egSYMBOL
# Get the gene symbol that are mapped to an entrez gene identifiers
mapped_genes <- mappedkeys(x)
xx <- as.list(x[mapped_genes])
xx[1:10]  # mapping between entrez gene ID and gene symbol

## $`1`
## [1] "A1BG"
## 
## $`10`
## [1] "NAT2"
## 
## $`100`
## [1] "ADA"
## 
## $`1000`
## [1] "CDH2"
## 
## $`10000`
## [1] "AKT3"
## 
## $`100008586`
## [1] "GAGE12F"
## 
## $`100008587`
## [1] "RNA5-8S5"
## 
## $`100008588`
## [1] "RNA18S5"
## 
## $`100008589`
## [1] "RNA28S5"
## 
## $`100009601`
## [1] "TRY-GTA7-1"

xx[[names(cds[1])]] # get the gene symbol of the first CDS.

## [1] "A2M"