library(GenomicRanges)

GRanges: Single Interval Range Features

gr = GRanges(seqnames = 
               Rle(c('chr1', 'chr2', 'chr1', 'chr3'), c(1,2,3,4)),
             ranges = 
               IRanges(1:10, end = 7:16, names = head(letters, 10)),
             strand = 
               Rle(strand(c('-', '+', '*', '+', '-')),
                   c(1,2,2,3,2)),
             score = 1:10,
             GC = seq(1,0, length=10))

seqlengths(gr) = c(249250621,243199373,198022430)
seqnames(gr)
## factor-Rle of length 10 with 4 runs
##   Lengths:    1    2    3    4
##   Values : chr1 chr2 chr1 chr3
## Levels(3): chr1 chr2 chr3
ranges(gr)
## IRanges of length 10
##      start end width names
## [1]      1   7     7     a
## [2]      2   8     7     b
## [3]      3   9     7     c
## [4]      4  10     7     d
## [5]      5  11     7     e
## [6]      6  12     7     f
## [7]      7  13     7     g
## [8]      8  14     7     h
## [9]      9  15     7     i
## [10]    10  16     7     j
names(gr)
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"
strand(gr)
## factor-Rle of length 10 with 5 runs
##   Lengths: 1 2 2 3 2
##   Values : - + * + -
## Levels(3): + - *
mcols(gr)$score
##  [1]  1  2  3  4  5  6  7  8  9 10
gr$score
##  [1]  1  2  3  4  5  6  7  8  9 10
seqlengths(gr)
##      chr1      chr2      chr3 
## 249250621 243199373 198022430
length(gr)
## [1] 10

Splitting and combining GRanges objects

sp = split(gr, rep(1:2, each = 5))
sp
## GRangesList object of length 2:
## $1 
## GRanges object with 5 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1   [1,  7]      - |         1                 1
##   b     chr2   [2,  8]      + |         2 0.888888888888889
##   c     chr2   [3,  9]      + |         3 0.777777777777778
##   d     chr1   [4, 10]      * |         4 0.666666666666667
##   e     chr1   [5, 11]      * |         5 0.555555555555556
## 
## $2 
## GRanges object with 5 ranges and 2 metadata columns:
##     seqnames   ranges strand | score                GC
##   f     chr1 [ 6, 12]      + |     6 0.444444444444444
##   g     chr3 [ 7, 13]      + |     7 0.333333333333333
##   h     chr3 [ 8, 14]      + |     8 0.222222222222222
##   i     chr3 [ 9, 15]      - |     9 0.111111111111111
##   j     chr3 [10, 16]      - |    10                 0
## 
## -------
## seqinfo: 3 sequences from an unspecified genome
#also check out my exploration time
split(gr, seqnames(gr))
## GRangesList object of length 3:
## $chr1 
## GRanges object with 4 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1   [1,  7]      - |         1                 1
##   d     chr1   [4, 10]      * |         4 0.666666666666667
##   e     chr1   [5, 11]      * |         5 0.555555555555556
##   f     chr1   [6, 12]      + |         6 0.444444444444444
## 
## $chr2 
## GRanges object with 2 ranges and 2 metadata columns:
##     seqnames ranges strand | score                GC
##   b     chr2 [2, 8]      + |     2 0.888888888888889
##   c     chr2 [3, 9]      + |     3 0.777777777777778
## 
## $chr3 
## GRanges object with 4 ranges and 2 metadata columns:
##     seqnames   ranges strand | score                GC
##   g     chr3 [ 7, 13]      + |     7 0.333333333333333
##   h     chr3 [ 8, 14]      + |     8 0.222222222222222
##   i     chr3 [ 9, 15]      - |     9 0.111111111111111
##   j     chr3 [10, 16]      - |    10                 0
## 
## -------
## seqinfo: 3 sequences from an unspecified genome
split(gr, strand(gr))
## GRangesList object of length 3:
## $+ 
## GRanges object with 5 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   b     chr2   [2,  8]      + |         2 0.888888888888889
##   c     chr2   [3,  9]      + |         3 0.777777777777778
##   f     chr1   [6, 12]      + |         6 0.444444444444444
##   g     chr3   [7, 13]      + |         7 0.333333333333333
##   h     chr3   [8, 14]      + |         8 0.222222222222222
## 
## $- 
## GRanges object with 3 ranges and 2 metadata columns:
##     seqnames   ranges strand | score                GC
##   a     chr1 [ 1,  7]      - |     1                 1
##   i     chr3 [ 9, 15]      - |     9 0.111111111111111
##   j     chr3 [10, 16]      - |    10                 0
## 
## $* 
## GRanges object with 2 ranges and 2 metadata columns:
##     seqnames  ranges strand | score                GC
##   d     chr1 [4, 10]      * |     4 0.666666666666667
##   e     chr1 [5, 11]      * |     5 0.555555555555556
## 
## -------
## seqinfo: 3 sequences from an unspecified genome
c(sp[[1]], sp[[2]])
## GRanges object with 10 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1  [ 1,  7]      - |         1                 1
##   b     chr2  [ 2,  8]      + |         2 0.888888888888889
##   c     chr2  [ 3,  9]      + |         3 0.777777777777778
##   d     chr1  [ 4, 10]      * |         4 0.666666666666667
##   e     chr1  [ 5, 11]      * |         5 0.555555555555556
##   f     chr1  [ 6, 12]      + |         6 0.444444444444444
##   g     chr3  [ 7, 13]      + |         7 0.333333333333333
##   h     chr3  [ 8, 14]      + |         8 0.222222222222222
##   i     chr3  [ 9, 15]      - |         9 0.111111111111111
##   j     chr3  [10, 16]      - |        10                 0
##   -------
##   seqinfo: 3 sequences from an unspecified genome

Subsetting GRanges objects

gr[2:3]
## GRanges object with 2 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   b     chr2    [2, 8]      + |         2 0.888888888888889
##   c     chr2    [3, 9]      + |         3 0.777777777777778
##   -------
##   seqinfo: 3 sequences from an unspecified genome
gr[2:3, 'GC']
## GRanges object with 2 ranges and 1 metadata column:
##     seqnames    ranges strand |                GC
##        <Rle> <IRanges>  <Rle> |         <numeric>
##   b     chr2    [2, 8]      + | 0.888888888888889
##   c     chr2    [3, 9]      + | 0.777777777777778
##   -------
##   seqinfo: 3 sequences from an unspecified genome
gr[2:3, 2]
## GRanges object with 2 ranges and 1 metadata column:
##     seqnames    ranges strand |                GC
##        <Rle> <IRanges>  <Rle> |         <numeric>
##   b     chr2    [2, 8]      + | 0.888888888888889
##   c     chr2    [3, 9]      + | 0.777777777777778
##   -------
##   seqinfo: 3 sequences from an unspecified genome
#Here is an example where the 2nd row of a GRanges object is replaced with the 1st row of gr

singles <- split(gr, names(gr))
grMod <- gr
grMod[2] <- singles[[1]]
head(grMod, n=3)
## GRanges object with 3 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1    [1, 7]      - |         1                 1
##   b     chr1    [1, 7]      - |         1                 1
##   c     chr2    [3, 9]      + |         3 0.777777777777778
##   -------
##   seqinfo: 3 sequences from an unspecified genome
#Here is a second examplefrom the 2nd row etc.
grMod[2,1] <- singles[[3]][,1]
head(grMod, n=3)
## GRanges object with 3 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1    [1, 7]      - |         1                 1
##   b     chr2    [3, 9]      + |         3                 1
##   c     chr2    [3, 9]      + |         3 0.777777777777778
##   -------
##   seqinfo: 3 sequences from an unspecified genome
rep(singles[[2]], times = 3)
## GRanges object with 3 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   b     chr2    [2, 8]      + |         2 0.888888888888889
##   b     chr2    [2, 8]      + |         2 0.888888888888889
##   b     chr2    [2, 8]      + |         2 0.888888888888889
##   -------
##   seqinfo: 3 sequences from an unspecified genome
rev(gr)
## GRanges object with 10 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   j     chr3  [10, 16]      - |        10                 0
##   i     chr3  [ 9, 15]      - |         9 0.111111111111111
##   h     chr3  [ 8, 14]      + |         8 0.222222222222222
##   g     chr3  [ 7, 13]      + |         7 0.333333333333333
##   f     chr1  [ 6, 12]      + |         6 0.444444444444444
##   e     chr1  [ 5, 11]      * |         5 0.555555555555556
##   d     chr1  [ 4, 10]      * |         4 0.666666666666667
##   c     chr2  [ 3,  9]      + |         3 0.777777777777778
##   b     chr2  [ 2,  8]      + |         2 0.888888888888889
##   a     chr1  [ 1,  7]      - |         1                 1
##   -------
##   seqinfo: 3 sequences from an unspecified genome
#using window
window(gr, start=2,end=4)
## GRanges object with 3 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   b     chr2   [2,  8]      + |         2 0.888888888888889
##   c     chr2   [3,  9]      + |         3 0.777777777777778
##   d     chr1   [4, 10]      * |         4 0.666666666666667
##   -------
##   seqinfo: 3 sequences from an unspecified genome
gr[2:4]
## GRanges object with 3 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   b     chr2   [2,  8]      + |         2 0.888888888888889
##   c     chr2   [3,  9]      + |         3 0.777777777777778
##   d     chr1   [4, 10]      * |         4 0.666666666666667
##   -------
##   seqinfo: 3 sequences from an unspecified genome
#using ranges
gr[IRanges(start=c(2,7), end=c(3,9))]
## GRanges object with 5 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   b     chr2   [2,  8]      + |         2 0.888888888888889
##   c     chr2   [3,  9]      + |         3 0.777777777777778
##   g     chr3   [7, 13]      + |         7 0.333333333333333
##   h     chr3   [8, 14]      + |         8 0.222222222222222
##   i     chr3   [9, 15]      - |         9 0.111111111111111
##   -------
##   seqinfo: 3 sequences from an unspecified genome
gr[c(2,3,7,8,9)]
## GRanges object with 5 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   b     chr2   [2,  8]      + |         2 0.888888888888889
##   c     chr2   [3,  9]      + |         3 0.777777777777778
##   g     chr3   [7, 13]      + |         7 0.333333333333333
##   h     chr3   [8, 14]      + |         8 0.222222222222222
##   i     chr3   [9, 15]      - |         9 0.111111111111111
##   -------
##   seqinfo: 3 sequences from an unspecified genome

Basic interval operations for GRanges objects

g <- gr[1:3]
g <- append(g, singles[[10]])
g
## GRanges object with 4 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1  [ 1,  7]      - |         1                 1
##   b     chr2  [ 2,  8]      + |         2 0.888888888888889
##   c     chr2  [ 3,  9]      + |         3 0.777777777777778
##   j     chr3  [10, 16]      - |        10                 0
##   -------
##   seqinfo: 3 sequences from an unspecified genome
start(g)
## [1]  1  2  3 10
end(g)
## [1]  7  8  9 16
width(g)
## [1] 7 7 7 7
range(g)
## GRanges object with 3 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1  [ 1,  7]      -
##   [2]     chr2  [ 2,  9]      +
##   [3]     chr3  [10, 16]      -
##   -------
##   seqinfo: 3 sequences from an unspecified genome
# It looks like range provides the range on each chromosome and strand
sort(gr)
## GRanges object with 10 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   f     chr1  [ 6, 12]      + |         6 0.444444444444444
##   a     chr1  [ 1,  7]      - |         1                 1
##   d     chr1  [ 4, 10]      * |         4 0.666666666666667
##   e     chr1  [ 5, 11]      * |         5 0.555555555555556
##   b     chr2  [ 2,  8]      + |         2 0.888888888888889
##   c     chr2  [ 3,  9]      + |         3 0.777777777777778
##   g     chr3  [ 7, 13]      + |         7 0.333333333333333
##   h     chr3  [ 8, 14]      + |         8 0.222222222222222
##   i     chr3  [ 9, 15]      - |         9 0.111111111111111
##   j     chr3  [10, 16]      - |        10                 0
##   -------
##   seqinfo: 3 sequences from an unspecified genome
range(gr)
## GRanges object with 6 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1   [6, 12]      +
##   [2]     chr1   [1,  7]      -
##   [3]     chr1   [4, 11]      *
##   [4]     chr2   [2,  9]      +
##   [5]     chr3   [7, 14]      +
##   [6]     chr3   [9, 16]      -
##   -------
##   seqinfo: 3 sequences from an unspecified genome
# flank gives you a width of 10, for new range beginning 1 coordinate after the end
# all this is backwards if start = FALSE
g
## GRanges object with 4 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1  [ 1,  7]      - |         1                 1
##   b     chr2  [ 2,  8]      + |         2 0.888888888888889
##   c     chr2  [ 3,  9]      + |         3 0.777777777777778
##   j     chr3  [10, 16]      - |        10                 0
##   -------
##   seqinfo: 3 sequences from an unspecified genome
flank(g,10)
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 2 out-of-bound ranges located on sequence
##   chr2. Note that only ranges located on a non-circular sequence
##   whose length is not NA can be considered out-of-bound (use
##   seqlengths() and isCircular() to get the lengths and circularity
##   flags of the underlying sequences). You can use trim() to trim
##   these ranges. See ?`trim,GenomicRanges-method` for more
##   information.
## GRanges object with 4 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1  [ 8, 17]      - |         1                 1
##   b     chr2  [-8,  1]      + |         2 0.888888888888889
##   c     chr2  [-7,  2]      + |         3 0.777777777777778
##   j     chr3  [17, 26]      - |        10                 0
##   -------
##   seqinfo: 3 sequences from an unspecified genome
flank(g,10, start=FALSE)
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 2 out-of-bound ranges located on sequences
##   chr1 and chr3. Note that only ranges located on a non-circular
##   sequence whose length is not NA can be considered out-of-bound
##   (use seqlengths() and isCircular() to get the lengths and
##   circularity flags of the underlying sequences). You can use trim()
##   to trim these ranges. See ?`trim,GenomicRanges-method` for more
##   information.
## GRanges object with 4 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1  [-9,  0]      - |         1                 1
##   b     chr2  [ 9, 18]      + |         2 0.888888888888889
##   c     chr2  [10, 19]      + |         3 0.777777777777778
##   j     chr3  [ 0,  9]      - |        10                 0
##   -------
##   seqinfo: 3 sequences from an unspecified genome
# shifting ranges
g
## GRanges object with 4 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1  [ 1,  7]      - |         1                 1
##   b     chr2  [ 2,  8]      + |         2 0.888888888888889
##   c     chr2  [ 3,  9]      + |         3 0.777777777777778
##   j     chr3  [10, 16]      - |        10                 0
##   -------
##   seqinfo: 3 sequences from an unspecified genome
shift(g, 5)
## GRanges object with 4 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1  [ 6, 12]      - |         1                 1
##   b     chr2  [ 7, 13]      + |         2 0.888888888888889
##   c     chr2  [ 8, 14]      + |         3 0.777777777777778
##   j     chr3  [15, 21]      - |        10                 0
##   -------
##   seqinfo: 3 sequences from an unspecified genome
# resizing: keeps the 5' fixed and change the width to 30.
# default, fix = 'start'.  so does start = 5'?
g
## GRanges object with 4 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1  [ 1,  7]      - |         1                 1
##   b     chr2  [ 2,  8]      + |         2 0.888888888888889
##   c     chr2  [ 3,  9]      + |         3 0.777777777777778
##   j     chr3  [10, 16]      - |        10                 0
##   -------
##   seqinfo: 3 sequences from an unspecified genome
resize(g, 30)
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 2 out-of-bound ranges located on sequences
##   chr1 and chr3. Note that only ranges located on a non-circular
##   sequence whose length is not NA can be considered out-of-bound
##   (use seqlengths() and isCircular() to get the lengths and
##   circularity flags of the underlying sequences). You can use trim()
##   to trim these ranges. See ?`trim,GenomicRanges-method` for more
##   information.
## GRanges object with 4 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1 [-22,  7]      - |         1                 1
##   b     chr2 [  2, 31]      + |         2 0.888888888888889
##   c     chr2 [  3, 32]      + |         3 0.777777777777778
##   j     chr3 [-13, 16]      - |        10                 0
##   -------
##   seqinfo: 3 sequences from an unspecified genome
# reduce, disjoin, and gaps
g
## GRanges object with 4 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1  [ 1,  7]      - |         1                 1
##   b     chr2  [ 2,  8]      + |         2 0.888888888888889
##   c     chr2  [ 3,  9]      + |         3 0.777777777777778
##   j     chr3  [10, 16]      - |        10                 0
##   -------
##   seqinfo: 3 sequences from an unspecified genome
reduce(g)
## GRanges object with 3 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1  [ 1,  7]      -
##   [2]     chr2  [ 2,  9]      +
##   [3]     chr3  [10, 16]      -
##   -------
##   seqinfo: 3 sequences from an unspecified genome
disjoin(g)
## GRanges object with 5 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1  [ 1,  7]      -
##   [2]     chr2  [ 2,  2]      +
##   [3]     chr2  [ 3,  8]      +
##   [4]     chr2  [ 9,  9]      +
##   [5]     chr3  [10, 16]      -
##   -------
##   seqinfo: 3 sequences from an unspecified genome
gaps(g)
## GRanges object with 11 ranges and 0 metadata columns:
##        seqnames          ranges strand
##           <Rle>       <IRanges>  <Rle>
##    [1]     chr1 [ 1, 249250621]      +
##    [2]     chr1 [ 8, 249250621]      -
##    [3]     chr1 [ 1, 249250621]      *
##    [4]     chr2 [ 1,         1]      +
##    [5]     chr2 [10, 243199373]      +
##    [6]     chr2 [ 1, 243199373]      -
##    [7]     chr2 [ 1, 243199373]      *
##    [8]     chr3 [ 1, 198022430]      +
##    [9]     chr3 [ 1,         9]      -
##   [10]     chr3 [17, 198022430]      -
##   [11]     chr3 [ 1, 198022430]      *
##   -------
##   seqinfo: 3 sequences from an unspecified genome

Interval set operations for GRanges objects

g2 <- head(gr,n=2)
g; g2
## GRanges object with 4 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1  [ 1,  7]      - |         1                 1
##   b     chr2  [ 2,  8]      + |         2 0.888888888888889
##   c     chr2  [ 3,  9]      + |         3 0.777777777777778
##   j     chr3  [10, 16]      - |        10                 0
##   -------
##   seqinfo: 3 sequences from an unspecified genome
## GRanges object with 2 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1    [1, 7]      - |         1                 1
##   b     chr2    [2, 8]      + |         2 0.888888888888889
##   -------
##   seqinfo: 3 sequences from an unspecified genome
union(g, g2)
## GRanges object with 3 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1  [ 1,  7]      -
##   [2]     chr2  [ 2,  9]      +
##   [3]     chr3  [10, 16]      -
##   -------
##   seqinfo: 3 sequences from an unspecified genome
#how is this union thing much different from below???
reduce(g)
## GRanges object with 3 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1  [ 1,  7]      -
##   [2]     chr2  [ 2,  9]      +
##   [3]     chr3  [10, 16]      -
##   -------
##   seqinfo: 3 sequences from an unspecified genome
reduce(c(g,g2))
## GRanges object with 3 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1  [ 1,  7]      -
##   [2]     chr2  [ 2,  9]      +
##   [3]     chr3  [10, 16]      -
##   -------
##   seqinfo: 3 sequences from an unspecified genome
#intersect and setdiff
intersect(g, g2)
## GRanges object with 2 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1    [1, 7]      -
##   [2]     chr2    [2, 8]      +
##   -------
##   seqinfo: 3 sequences from an unspecified genome
setdiff(g, g2)
## GRanges object with 2 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr2  [ 9,  9]      +
##   [2]     chr3  [10, 16]      -
##   -------
##   seqinfo: 3 sequences from an unspecified genome
#punion, pintersect, and psetdiff
g3 <- g[1:2]
ranges(g3[1]) <- IRanges(start=5, end=12)

g2; g3
## GRanges object with 2 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1    [1, 7]      - |         1                 1
##   b     chr2    [2, 8]      + |         2 0.888888888888889
##   -------
##   seqinfo: 3 sequences from an unspecified genome
## GRanges object with 2 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   a     chr1   [5, 12]      - |         1                 1
##   b     chr2   [2,  8]      + |         2 0.888888888888889
##   -------
##   seqinfo: 3 sequences from an unspecified genome
punion(g2, g3)
## GRanges object with 2 ranges and 0 metadata columns:
##     seqnames    ranges strand
##        <Rle> <IRanges>  <Rle>
##   a     chr1   [1, 12]      -
##   b     chr2   [2,  8]      +
##   -------
##   seqinfo: 3 sequences from an unspecified genome
pintersect(g2, g3)
## GRanges object with 2 ranges and 3 metadata columns:
##     seqnames    ranges strand |     score                GC       hit
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric> <logical>
##   a     chr1    [5, 7]      - |         1                 1         1
##   b     chr2    [2, 8]      + |         2 0.888888888888889         1
##   -------
##   seqinfo: 3 sequences from an unspecified genome
# remember a zero is represented with the start coordinated being 1 greater than the end coordinage
psetdiff(g2, g3) 
## GRanges object with 2 ranges and 0 metadata columns:
##     seqnames    ranges strand
##        <Rle> <IRanges>  <Rle>
##   a     chr1    [1, 4]      -
##   b     chr2    [2, 1]      +
##   -------
##   seqinfo: 3 sequences from an unspecified genome

GRangesList: Multiple Interval Range Features

This might be something useful when representing operons.

gr1 <- GRanges(seqnames = "chr2", ranges = IRanges(3, 6), strand = "+", score = 5L, GC = 0.45)

gr2 <- GRanges(seqnames = c("chr1", "chr1"), ranges = IRanges(c(7,13), width = 3), strand = c("+", "-"), score = 3:4, GC = c(0.3, 0.5))

grl <- GRangesList("txA" = gr1, "txB" = gr2)
grl
## GRangesList object of length 2:
## $txA 
## GRanges object with 1 range and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr2    [3, 6]      + |         5      0.45
## 
## $txB 
## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames   ranges strand | score  GC
##   [1]     chr1 [ 7,  9]      + |     3 0.3
##   [2]     chr1 [13, 15]      - |     4 0.5
## 
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths

Basic GRangesList accessors

seqnames(grl)
## RleList of length 2
## $txA
## factor-Rle of length 1 with 1 run
##   Lengths:    1
##   Values : chr2
## Levels(2): chr2 chr1
## 
## $txB
## factor-Rle of length 2 with 1 run
##   Lengths:    2
##   Values : chr1
## Levels(2): chr2 chr1
ranges(grl)
## IRangesList of length 2
## $txA
## IRanges of length 1
##     start end width
## [1]     3   6     4
## 
## $txB
## IRanges of length 2
##     start end width
## [1]     7   9     3
## [2]    13  15     3
length(grl)
## [1] 2
names(grl)
## [1] "txA" "txB"
seqlengths(grl)
## chr2 chr1 
##   NA   NA
elementLengths(grl)
## txA txB 
##   1   2
isEmpty(grl)
## [1] FALSE
mcols(grl) = c('Transcript A', 'Transcript B')
mcols(grl)
## DataFrame with 2 rows and 1 column
##          value
##    <character>
## 1 Transcript A
## 2 Transcript B

Combining GRangesList objects

ul = unlist(grl)
ul
## GRanges object with 3 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   txA     chr2  [ 3,  6]      + |         5      0.45
##   txB     chr1  [ 7,  9]      + |         3       0.3
##   txB     chr1  [13, 15]      - |         4       0.5
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths

Basic interval operations for GRangesList objects

start(grl)
## IntegerList of length 2
## [["txA"]] 3
## [["txB"]] 7 13
end(grl)
## IntegerList of length 2
## [["txA"]] 6
## [["txB"]] 9 15
width(grl)
## IntegerList of length 2
## [["txA"]] 4
## [["txB"]] 3 3
shift(grl, 30)
## GRangesList object of length 2:
## $txA 
## GRanges object with 1 range and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr2  [33, 36]      + |         5      0.45
## 
## $txB 
## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames   ranges strand | score  GC
##   [1]     chr1 [37, 39]      + |     3 0.3
##   [2]     chr1 [43, 45]      - |     4 0.5
## 
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
coverage(grl)
## RleList of length 2
## $chr2
## integer-Rle of length 6 with 2 runs
##   Lengths: 2 4
##   Values : 0 1
## 
## $chr1
## integer-Rle of length 15 with 4 runs
##   Lengths: 6 3 3 3
##   Values : 0 1 0 1

Subsetting GRangesList objects

grl[1]
## GRangesList object of length 1:
## $txA 
## GRanges object with 1 range and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr2    [3, 6]      + |         5      0.45
## 
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
grl[[1]]
## GRanges object with 1 range and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr2    [3, 6]      + |         5      0.45
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
grl['txA']
## GRangesList object of length 1:
## $txA 
## GRanges object with 1 range and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr2    [3, 6]      + |         5      0.45
## 
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
grl$txB
## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr1  [ 7,  9]      + |         3       0.3
##   [2]     chr1  [13, 15]      - |         4       0.5
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
grl[1, 'score'] # is a GRangesList object
## GRangesList object of length 1:
## $txA 
## GRanges object with 1 range and 1 metadata column:
##       seqnames    ranges strand |     score
##          <Rle> <IRanges>  <Rle> | <integer>
##   [1]     chr2    [3, 6]      + |         5
## 
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
grl[1][,'score'] # is a GRangesList object
## GRangesList object of length 1:
## $txA 
## GRanges object with 1 range and 1 metadata column:
##       seqnames    ranges strand |     score
##          <Rle> <IRanges>  <Rle> | <integer>
##   [1]     chr2    [3, 6]      + |         5
## 
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
grl[[1]][,'score'] # is a GRanges object
## GRanges object with 1 range and 1 metadata column:
##       seqnames    ranges strand |     score
##          <Rle> <IRanges>  <Rle> | <integer>
##   [1]     chr2    [3, 6]      + |         5
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
rep(grl[[1]], times = 3)
## GRanges object with 3 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr2    [3, 6]      + |         5      0.45
##   [2]     chr2    [3, 6]      + |         5      0.45
##   [3]     chr2    [3, 6]      + |         5      0.45
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
rev(grl)
## GRangesList object of length 2:
## $txB 
## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr1  [ 7,  9]      + |         3       0.3
##   [2]     chr1  [13, 15]      - |         4       0.5
## 
## $txA 
## GRanges object with 1 range and 2 metadata columns:
##       seqnames ranges strand | score   GC
##   [1]     chr2 [3, 6]      + |     5 0.45
## 
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
# how to retrieve a range of elements from a list
window(grl, start = 1, end = 1)
## GRangesList object of length 1:
## $txA 
## GRanges object with 1 range and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr2    [3, 6]      + |         5      0.45
## 
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
window(grl, start = 2, end = 2)
## GRangesList object of length 1:
## $txB 
## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr1  [ 7,  9]      + |         3       0.3
##   [2]     chr1  [13, 15]      - |         4       0.5
## 
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
window(grl, start = 1, end = 2)
## GRangesList object of length 2:
## $txA 
## GRanges object with 1 range and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr2    [3, 6]      + |         5      0.45
## 
## $txB 
## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames   ranges strand | score  GC
##   [1]     chr1 [ 7,  9]      + |     3 0.3
##   [2]     chr1 [13, 15]      - |     4 0.5
## 
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
# IRanges and grl, like window above
grl[IRanges(start = 2, end = 2)]
## GRangesList object of length 1:
## $txB 
## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr1  [ 7,  9]      + |         3       0.3
##   [2]     chr1  [13, 15]      - |         4       0.5
## 
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths

Looping over GRangesList objects

lapply(grl, length)
## $txA
## [1] 1
## 
## $txB
## [1] 2
sapply(grl, length)
## txA txB 
##   1   2
grl2 = shift(grl, 10)
names(grl2) = c('shiftTxA', 'shiftTxB')

# WHAT THE HELL IS THIS???
# mapply is a multivariate version of sapply. mapply applies FUN to the first elements of each ... argument, the second elements, the third elements, and so on. Arguments are recycled if necessary
mapply(c, grl, grl2)
## $txA
## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr2  [ 3,  6]      + |         5      0.45
##   [2]     chr2  [13, 16]      + |         5      0.45
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
## 
## $txB
## GRanges object with 4 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr1  [ 7,  9]      + |         3       0.3
##   [2]     chr1  [13, 15]      - |         4       0.5
##   [3]     chr1  [17, 19]      + |         3       0.3
##   [4]     chr1  [23, 25]      - |         4       0.5
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
Map(c, grl, grl2)
## $txA
## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr2  [ 3,  6]      + |         5      0.45
##   [2]     chr2  [13, 16]      + |         5      0.45
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
## 
## $txB
## GRanges object with 4 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr1  [ 7,  9]      + |         3       0.3
##   [2]     chr1  [13, 15]      - |         4       0.5
##   [3]     chr1  [17, 19]      + |         3       0.3
##   [4]     chr1  [23, 25]      - |         4       0.5
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
# This is reversing the individual ranges within each GRanges object in the list
# endoapply and mendoapply perform the endomorphic equivalents of lapply and mapply by returning objects of the same class as the inputs rather than a list.
endoapply(grl, rev)
## GRangesList object of length 2:
## $txA 
## GRanges object with 1 range and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr2    [3, 6]      + |         5      0.45
## 
## $txB 
## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames   ranges strand | score  GC
##   [1]     chr1 [13, 15]      - |     4 0.5
##   [2]     chr1 [ 7,  9]      + |     3 0.3
## 
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
# mendoapply(FUN, ..., MoreArgs=NULL)
mendoapply(c, grl, grl2)
## GRangesList object of length 2:
## $txA 
## GRanges object with 2 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr2  [ 3,  6]      + |         5      0.45
##   [2]     chr2  [13, 16]      + |         5      0.45
## 
## $txB 
## GRanges object with 4 ranges and 2 metadata columns:
##       seqnames   ranges strand | score  GC
##   [1]     chr1 [ 7,  9]      + |     3 0.3
##   [2]     chr1 [13, 15]      - |     4 0.5
##   [3]     chr1 [17, 19]      + |     3 0.3
##   [4]     chr1 [23, 25]      - |     4 0.5
## 
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
# Reduce() ... as different from reduce()
Reduce(c, grl)
## GRanges object with 3 ranges and 2 metadata columns:
##       seqnames    ranges strand |     score        GC
##          <Rle> <IRanges>  <Rle> | <integer> <numeric>
##   [1]     chr2  [ 3,  6]      + |         5      0.45
##   [2]     chr1  [ 7,  9]      + |         3       0.3
##   [3]     chr1  [13, 15]      - |         4       0.5
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths

Interval overlaps involving GRanges and GRangesList objects

# findOverlaps, which takes a query and a subject as inputs and returns a Hits object containing the index pairings for the overlapping elements.

mtch = findOverlaps(gr, grl)
as.matrix(mtch)
##      queryHits subjectHits
## [1,]         2           1
## [2,]         3           1
## [3,]         4           2
## [4,]         5           2
## [5,]         6           2
findOverlaps(gr, grl, select='first')
##  [1] NA  1  1  2  2  2 NA NA NA NA
findOverlaps(grl, gr, select='first')
## [1] 2 4
#  countOverlaps, which tabulates the number of overlaps for each element in the query.
countOverlaps(gr, grl)
## a b c d e f g h i j 
## 0 1 1 1 1 1 0 0 0 0
countOverlaps(grl, gr)
## txA txB 
##   2   3
#  subsetByOverlaps, which extracts the elements in the query that overlap at least one element in the subject.
subsetByOverlaps(gr, grl)
## GRanges object with 5 ranges and 2 metadata columns:
##     seqnames    ranges strand |     score                GC
##        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
##   b     chr2   [2,  8]      + |         2 0.888888888888889
##   c     chr2   [3,  9]      + |         3 0.777777777777778
##   d     chr1   [4, 10]      * |         4 0.666666666666667
##   e     chr1   [5, 11]      * |         5 0.555555555555556
##   f     chr1   [6, 12]      + |         6 0.444444444444444
##   -------
##   seqinfo: 3 sequences from an unspecified genome

Genomic Alignments

library(GenomicAlignments)

Load a ‘BAM’ file into a GAlignments object

aln1_file = system.file("extdata", "ex1.bam",package="Rsamtools")
aln1 <- readGAlignments(aln1_file)
aln1
## GAlignments object with 3271 alignments and 0 metadata columns:
##          seqnames strand       cigar    qwidth     start       end
##             <Rle>  <Rle> <character> <integer> <integer> <integer>
##      [1]     seq1      +         36M        36         1        36
##      [2]     seq1      +         35M        35         3        37
##      [3]     seq1      +         35M        35         5        39
##      [4]     seq1      +         36M        36         6        41
##      [5]     seq1      +         35M        35         9        43
##      ...      ...    ...         ...       ...       ...       ...
##   [3267]     seq2      +         35M        35      1524      1558
##   [3268]     seq2      +         35M        35      1524      1558
##   [3269]     seq2      -         35M        35      1528      1562
##   [3270]     seq2      -         35M        35      1532      1566
##   [3271]     seq2      -         35M        35      1533      1567
##              width     njunc
##          <integer> <integer>
##      [1]        36         0
##      [2]        35         0
##      [3]        35         0
##      [4]        36         0
##      [5]        35         0
##      ...       ...       ...
##   [3267]        35         0
##   [3268]        35         0
##   [3269]        35         0
##   [3270]        35         0
##   [3271]        35         0
##   -------
##   seqinfo: 2 sequences from an unspecified genome

Simple accessor methods

seqnames(aln1)
## factor-Rle of length 3271 with 2 runs
##   Lengths: 1482 1789
##   Values : seq1 seq2
## Levels(2): seq1 seq2
seqlevels(aln1)
## [1] "seq1" "seq2"
strand(aln1)
## factor-Rle of length 3271 with 1394 runs
##   Lengths:  8  1 63  2 12  1  2  2  6  1 ...  1  1  1  1  1 12  2  2  3  3
##   Values :  +  -  +  -  +  -  +  -  +  - ...  +  -  +  -  +  -  +  -  +  -
## Levels(3): + - *
# what in the hell is this cigar crap?
head(cigar(aln1))
## [1] "36M" "35M" "35M" "36M" "35M" "35M"
# what is the difference between these two widths?
head(qwidth(aln1))
## [1] 36 35 35 36 35 35
head(width(aln1))
## [1] 36 35 35 36 35 35
head(start(aln1))
## [1]  1  3  5  6  9 13
head(end(aln1))
## [1] 36 37 39 41 43 47
# what in the hell is this njunc crap?
head(njunc(aln1))
## [1] 0 0 0 0 0 0

More accessor methods