[2014 Epiworkshop] R basics and Sequencing data analysis
Part III: Sequencing data analysis in R
Lecturer: Sheng Li
Weill Cornell Medical College
1. GenomicRanges object
library(GenomicRanges)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist
##
## Loading required package: IRanges
## Loading required package: XVector
gr = GRanges(Rle("chr1"), IRanges(start = 15000, width = 100))
gr
## GRanges with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 [15000, 15099] *
## ---
## seqlengths:
## chr1
## NA
seqinfo <- Seqinfo(paste0("chr", 1:3), c(1000, 2000, 1500), NA, "mock1")
## Combining objects
gr2 <- GRanges(seqnames = Rle(c("chr1", "chr2", "chr3"), c(3, 3, 4)), IRanges(1:10,
width = 5), strand = "-", score = 101:110, GC = runif(10), seqinfo = seqinfo)
gr2
## GRanges with 10 ranges and 2 metadata columns:
## seqnames ranges strand | score GC
## <Rle> <IRanges> <Rle> | <integer> <numeric>
## [1] chr1 [ 1, 5] - | 101 0.864219795446843
## [2] chr1 [ 2, 6] - | 102 0.479244332294911
## [3] chr1 [ 3, 7] - | 103 0.120439695660025
## [4] chr2 [ 4, 8] - | 104 0.709010389633477
## [5] chr2 [ 5, 9] - | 105 0.617770149838179
## [6] chr2 [ 6, 10] - | 106 0.294313735794276
## [7] chr3 [ 7, 11] - | 107 0.853401493746787
## [8] chr3 [ 8, 12] - | 108 0.675788938999176
## [9] chr3 [ 9, 13] - | 109 0.274476919323206
## [10] chr3 [10, 14] - | 110 0.772934696869925
## ---
## seqlengths:
## chr1 chr2 chr3
## 1000 2000 1500
gr3 <- GRanges(seqnames = Rle(c("chr1", "chr2", "chr3"), c(3, 4, 3)), IRanges(101:110,
width = 10), strand = "-", score = 21:30, GC = runif(10), seqinfo = seqinfo)
gr3
## GRanges with 10 ranges and 2 metadata columns:
## seqnames ranges strand | score GC
## <Rle> <IRanges> <Rle> | <integer> <numeric>
## [1] chr1 [101, 110] - | 21 0.22265721927397
## [2] chr1 [102, 111] - | 22 0.782207254553214
## [3] chr1 [103, 112] - | 23 0.114360670791939
## [4] chr2 [104, 113] - | 24 0.225365708349273
## [5] chr2 [105, 114] - | 25 0.339775336440653
## [6] chr2 [106, 115] - | 26 0.97892294311896
## [7] chr2 [107, 116] - | 27 0.776719868881628
## [8] chr3 [108, 117] - | 28 0.453788619255647
## [9] chr3 [109, 118] - | 29 0.15555077791214
## [10] chr3 [110, 119] - | 30 0.244707313366234
## ---
## seqlengths:
## chr1 chr2 chr3
## 1000 2000 1500
some.gr <- c(gr2, gr3)
some.gr
## GRanges with 20 ranges and 2 metadata columns:
## seqnames ranges strand | score GC
## <Rle> <IRanges> <Rle> | <integer> <numeric>
## [1] chr1 [1, 5] - | 101 0.864219795446843
## [2] chr1 [2, 6] - | 102 0.479244332294911
## [3] chr1 [3, 7] - | 103 0.120439695660025
## [4] chr2 [4, 8] - | 104 0.709010389633477
## [5] chr2 [5, 9] - | 105 0.617770149838179
## ... ... ... ... ... ... ...
## [16] chr2 [106, 115] - | 26 0.97892294311896
## [17] chr2 [107, 116] - | 27 0.776719868881628
## [18] chr3 [108, 117] - | 28 0.453788619255647
## [19] chr3 [109, 118] - | 29 0.15555077791214
## [20] chr3 [110, 119] - | 30 0.244707313366234
## ---
## seqlengths:
## chr1 chr2 chr3
## 1000 2000 1500
# obtain statistics
seqnames(gr3)
## factor-Rle of length 10 with 3 runs
## Lengths: 3 4 3
## Values : chr1 chr2 chr3
## Levels(3): chr1 chr2 chr3
start(gr3)
## [1] 101 102 103 104 105 106 107 108 109 110
end(gr3)
## [1] 110 111 112 113 114 115 116 117 118 119
width(gr3)
## [1] 10 10 10 10 10 10 10 10 10 10
values(gr2)
## DataFrame with 10 rows and 2 columns
## score GC
## <integer> <numeric>
## 1 101 0.8642
## 2 102 0.4792
## 3 103 0.1204
## 4 104 0.7090
## 5 105 0.6178
## 6 106 0.2943
## 7 107 0.8534
## 8 108 0.6758
## 9 109 0.2745
## 10 110 0.7729
values(gr2)[, "GC"]
## [1] 0.8642 0.4792 0.1204 0.7090 0.6178 0.2943 0.8534 0.6758 0.2745 0.7729
# GenomicRanges manupulation
shift(gr3, 100)
## GRanges with 10 ranges and 2 metadata columns:
## seqnames ranges strand | score GC
## <Rle> <IRanges> <Rle> | <integer> <numeric>
## [1] chr1 [201, 210] - | 21 0.22265721927397
## [2] chr1 [202, 211] - | 22 0.782207254553214
## [3] chr1 [203, 212] - | 23 0.114360670791939
## [4] chr2 [204, 213] - | 24 0.225365708349273
## [5] chr2 [205, 214] - | 25 0.339775336440653
## [6] chr2 [206, 215] - | 26 0.97892294311896
## [7] chr2 [207, 216] - | 27 0.776719868881628
## [8] chr3 [208, 217] - | 28 0.453788619255647
## [9] chr3 [209, 218] - | 29 0.15555077791214
## [10] chr3 [210, 219] - | 30 0.244707313366234
## ---
## seqlengths:
## chr1 chr2 chr3
## 1000 2000 1500
flank(gr3, width = 10, start = TRUE)
## GRanges with 10 ranges and 2 metadata columns:
## seqnames ranges strand | score GC
## <Rle> <IRanges> <Rle> | <integer> <numeric>
## [1] chr1 [111, 120] - | 21 0.22265721927397
## [2] chr1 [112, 121] - | 22 0.782207254553214
## [3] chr1 [113, 122] - | 23 0.114360670791939
## [4] chr2 [114, 123] - | 24 0.225365708349273
## [5] chr2 [115, 124] - | 25 0.339775336440653
## [6] chr2 [116, 125] - | 26 0.97892294311896
## [7] chr2 [117, 126] - | 27 0.776719868881628
## [8] chr3 [118, 127] - | 28 0.453788619255647
## [9] chr3 [119, 128] - | 29 0.15555077791214
## [10] chr3 [120, 129] - | 30 0.244707313366234
## ---
## seqlengths:
## chr1 chr2 chr3
## 1000 2000 1500
promoters(gr3, upstream = 100, downstream = 100)
## GRanges with 10 ranges and 2 metadata columns:
## seqnames ranges strand | score GC
## <Rle> <IRanges> <Rle> | <integer> <numeric>
## [1] chr1 [11, 210] - | 21 0.22265721927397
## [2] chr1 [12, 211] - | 22 0.782207254553214
## [3] chr1 [13, 212] - | 23 0.114360670791939
## [4] chr2 [14, 213] - | 24 0.225365708349273
## [5] chr2 [15, 214] - | 25 0.339775336440653
## [6] chr2 [16, 215] - | 26 0.97892294311896
## [7] chr2 [17, 216] - | 27 0.776719868881628
## [8] chr3 [18, 217] - | 28 0.453788619255647
## [9] chr3 [19, 218] - | 29 0.15555077791214
## [10] chr3 [20, 219] - | 30 0.244707313366234
## ---
## seqlengths:
## chr1 chr2 chr3
## 1000 2000 1500
reduce(gr3)
## GRanges with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 [101, 112] -
## [2] chr2 [104, 116] -
## [3] chr3 [108, 119] -
## ---
## seqlengths:
## chr1 chr2 chr3
## 1000 2000 1500
2. Read in bam file
library(GenomicRanges)
library(Rsamtools)
## Loading required package: Biostrings
library(IRanges)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
## Loading required package: GenomicFeatures
## Warning: package 'GenomicFeatures' was built under R version 3.0.3
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
##
## Attaching package: 'Biobase'
##
## The following object is masked from 'package:ProjectTemplate':
##
## cache
library(org.Hs.eg.db)
## Loading required package: DBI
# file
bamFile = system.file("extdata", "ex1.bam", package = "Rsamtools")
bamFile
## [1] "/Library/Frameworks/R.framework/Versions/3.0/Resources/library/Rsamtools/extdata/ex1.bam"
# Parameters to read bam files
what = c("flag", "seq")
param = ScanBamParam(what = what)
# read bam file into GAlignments objects
aln = readGAlignments(bamFile, param = param)
aln
## GAlignments with 3271 alignments and 2 metadata columns:
## seqnames strand cigar qwidth start end
## <Rle> <Rle> <character> <integer> <integer> <integer>
## [1] seq1 + 36M 36 1 36
## [2] seq1 + 35M 35 3 37
## [3] seq1 + 35M 35 5 39
## [4] seq1 + 36M 36 6 41
## [5] seq1 + 35M 35 9 43
## ... ... ... ... ... ... ...
## [3267] seq2 + 35M 35 1524 1558
## [3268] seq2 + 35M 35 1524 1558
## [3269] seq2 - 35M 35 1528 1562
## [3270] seq2 - 35M 35 1532 1566
## [3271] seq2 - 35M 35 1533 1567
## width ngap | flag
## <integer> <integer> | <integer>
## [1] 36 0 | 73
## [2] 35 0 | 73
## [3] 35 0 | 137
## [4] 36 0 | 137
## [5] 35 0 | 137
## ... ... ... ... ...
## [3267] 35 0 | 137
## [3268] 35 0 | 73
## [3269] 35 0 | 83
## [3270] 35 0 | 147
## [3271] 35 0 | 83
## seq
## <DNAStringSet>
## [1] CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCG
## [2] CTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGT
## [3] AGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCC
## [4] GTGGCTCATTGTAATTTTTTGTTTTAACTCTTCTCT
## [5] GCTCATTGTAAATGTGTGGTTTAACTCGTCCATGG
## ... ...
## [3267] TTTCTTTTCTCCTTTTTTTTTTTTTTTTTCTACAT
## [3268] TTTCTTTTCACTTTTTTTTTTTTTTTTTTTTACTT
## [3269] TTTTTTCTTTTTTTTTTTTTTTTTTTTGCATGCCA
## [3270] TTTTTTTTTTTTTTTTTTTTTTTGCATGCCAGAAA
## [3271] TTTTTTTTTTTTTTTTTTTTTTTCATGCCAGAAAA
## ---
## seqlengths:
## seq1 seq2
## 1575 1584
# specify which range to read in Parameters to read bam files
which = RangesList(seq1 = IRanges(1000, 2000), seq2 = IRanges(c(100, 1000),
c(1000, 2000)))
what = c("flag", "seq")
param = ScanBamParam(which = which, what = what)
# read bam file into GAlignments objects
aln = readGAlignments(bamFile, param = param)
aln
## GAlignments with 2404 alignments and 2 metadata columns:
## seqnames strand cigar qwidth start end
## <Rle> <Rle> <character> <integer> <integer> <integer>
## [1] seq1 + 35M 35 970 1004
## [2] seq1 + 35M 35 971 1005
## [3] seq1 + 35M 35 972 1006
## [4] seq1 + 35M 35 973 1007
## [5] seq1 + 35M 35 974 1008
## ... ... ... ... ... ... ...
## [2400] seq2 + 35M 35 1524 1558
## [2401] seq2 + 35M 35 1524 1558
## [2402] seq2 - 35M 35 1528 1562
## [2403] seq2 - 35M 35 1532 1566
## [2404] seq2 - 35M 35 1533 1567
## width ngap | flag
## <integer> <integer> | <integer>
## [1] 35 0 | 99
## [2] 35 0 | 163
## [3] 35 0 | 99
## [4] 35 0 | 163
## [5] 35 0 | 99
## ... ... ... ... ...
## [2400] 35 0 | 137
## [2401] 35 0 | 73
## [2402] 35 0 | 83
## [2403] 35 0 | 147
## [2404] 35 0 | 83
## seq
## <DNAStringSet>
## [1] TATTAGGAAATGCTTTACTGTCATAACTATGAAGA
## [2] ATTAGGAAATGCTTTACTGTCATAACTATGAAGAG
## [3] TTAGGAAATGCTTTACTGTCATAACTATGAAGAGA
## [4] TAGGAAATGCTTTACTGTCATAACTATGAAGAGAC
## [5] AGGAAATGCTTTACTGTCATAACTATGAAGAGACT
## ... ...
## [2400] TTTCTTTTCTCCTTTTTTTTTTTTTTTTTCTACAT
## [2401] TTTCTTTTCACTTTTTTTTTTTTTTTTTTTTACTT
## [2402] TTTTTTCTTTTTTTTTTTTTTTTTTTTGCATGCCA
## [2403] TTTTTTTTTTTTTTTTTTTTTTTGCATGCCAGAAA
## [2404] TTTTTTTTTTTTTTTTTTTTTTTCATGCCAGAAAA
## ---
## seqlengths:
## seq1 seq2
## 1575 1584
# annotate
gnModel <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "gene")
geneid <- AnnotationDbi::select(org.Hs.eg.db, keys = names(gnModel), columns = "SYMBOL")
rownames(geneid) = geneid$ENTREZID
head(geneid)
## ENTREZID SYMBOL
## 1 1 A1BG
## 10 10 NAT2
## 100 100 ADA
## 1000 1000 CDH2
## 10000 10000 AKT3
## 100008586 100008586 GAGE12F
# read bam file
aln1 = readGAlignments("chr1.bam", param = ScanBamParam(what = "flag"))
strand(aln1) = "*" # for strand-blind sample prep protocol
hits = countOverlaps(aln1, gnModel) # calculate the number of hit from gnModel for each alignment
head(hits)
## [1] 0 0 0 0 1 1
counts = countOverlaps(gnModel, aln1[hits == 1]) # calculate the number of hit from the alignment that overlap with gnModel
names(counts) = geneid[names(gnModel), "SYMBOL"]
head(counts[counts > 10])
## ARHGAP30 RPL5 TOMM20
## 64 12 42
3. Interacting with a species genome, using BSgenome
library("BSgenome.Scerevisiae.UCSC.sacCer3")
## Loading required package: BSgenome
##
## Attaching package: 'BSgenome'
##
## The following object is masked from 'package:AnnotationDbi':
##
## species
Scerevisiae
## Yeast genome
## |
## | organism: Saccharomyces cerevisiae (Yeast)
## | provider: UCSC
## | provider version: sacCer3
## | release date: April 2011
## | release name: SGD April 2011 sequence
## |
## | sequences (see '?seqnames'):
## | chrI chrII chrIII chrIV chrV chrVI chrVII chrVIII
## | chrIX chrX chrXI chrXII chrXIII chrXIV chrXV chrXVI
## | chrM
## |
## | (use the '$' or '[[' operator to access a given sequence)
Scerevisiae$chrI
## 230218-letter "DNAString" instance
## seq: CCACACCACACCCACACACCCACACACCACACCA...GTGTGGGTGTGGTGTGGGTGTGGTGTGTGTGGG
length(Scerevisiae$chrI)
## [1] 230218
alphabetFrequency(Scerevisiae$chrI)
## A C G T M R W S Y K V H
## 69836 44641 45766 69975 0 0 0 0 0 0 0 0
## D B N - +
## 0 0 0 0 0
# pattern matching operations
p1 <- "TTTTAGA"
countPattern(p1, Scerevisiae$chrI)
## [1] 27
countPattern(p1, Scerevisiae$chrI, max.mismatch = 1)
## [1] 646
m1 <- matchPattern(p1, Scerevisiae$chrI, max.mismatch = 1)
m1[1:5]
## Views on a 230218-letter DNAString subject
## subject: CCACACCACACCCACACACCCACACACCACAC...GTGGGTGTGGTGTGGGTGTGGTGTGTGTGGG
## views:
## start end width
## [1] 878 884 7 [TTTCAGA]
## [2] 1426 1432 7 [TCTTAGA]
## [3] 2568 2574 7 [TTTTAGA]
## [4] 2758 2764 7 [TTTTAGC]
## [5] 3251 3257 7 [CTTTAGA]
mismatch(p1, m1[1:5])
## [[1]]
## [1] 4
##
## [[2]]
## [1] 2
##
## [[3]]
## integer(0)
##
## [[4]]
## [1] 7
##
## [[5]]
## [1] 1