By Erica Acton
Load file, preview, and create index file.
bamDat <- readAligned("drosophilaMelanogasterSubset.bam", type = "BAM")
str(bamDat)
## Formal class 'AlignedRead' [package "ShortRead"] with 8 slots
## ..@ chromosome : Factor w/ 15 levels "chrYHet","chrM",..: 2 2 2 2 2 2 2 2 2 2 ...
## ..@ position : int [1:64206] 548 1497 1506 1528 1540 1552 1552 1555 1559 1566 ...
## ..@ strand : Factor w/ 3 levels "+","-","*": 2 1 1 1 1 1 1 1 2 2 ...
## ..@ alignQuality:Formal class 'NumericQuality' [package "ShortRead"] with 1 slots
## .. .. ..@ quality: int [1:64206] 132 132 127 130 130 122 132 132 132 132 ...
## ..@ alignData :Formal class 'AlignedDataFrame' [package "ShortRead"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 1 obs. of 1 variable:
## .. .. .. ..$ labelDescription: chr "Type of read; see ?scanBam"
## .. .. ..@ data :'data.frame': 64206 obs. of 1 variable:
## .. .. .. ..$ flag: int [1:64206] 16 0 0 0 0 0 0 0 16 16 ...
## .. .. ..@ dimLabels : chr [1:2] "readName" "alignColumn"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ quality :Formal class 'FastqQuality' [package "ShortRead"] with 1 slots
## .. .. ..@ quality:Formal class 'BStringSet' [package "Biostrings"] with 5 slots
## .. .. .. .. ..@ pool :Formal class 'SharedRaw_Pool' [package "XVector"] with 2 slots
## .. .. .. .. .. .. ..@ xp_list :List of 1
## .. .. .. .. .. .. .. ..$ :<externalptr>
## .. .. .. .. .. .. ..@ .link_to_cached_object_list:List of 1
## .. .. .. .. .. .. .. ..$ :<environment: 0x596e6f0>
## .. .. .. .. ..@ ranges :Formal class 'GroupedIRanges' [package "XVector"] with 7 slots
## .. .. .. .. .. .. ..@ group : int [1:64206] 1 1 1 1 1 1 1 1 1 1 ...
## .. .. .. .. .. .. ..@ start : int [1:64206] 1 37 73 109 145 181 217 253 289 325 ...
## .. .. .. .. .. .. ..@ width : int [1:64206] 36 36 36 36 36 36 36 36 36 36 ...
## .. .. .. .. .. .. ..@ NAMES : NULL
## .. .. .. .. .. .. ..@ elementType : chr "integer"
## .. .. .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. .. .. ..@ metadata : list()
## .. .. .. .. ..@ elementType : chr "BString"
## .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. ..@ metadata : list()
## ..@ sread :Formal class 'DNAStringSet' [package "Biostrings"] with 5 slots
## .. .. ..@ pool :Formal class 'SharedRaw_Pool' [package "XVector"] with 2 slots
## .. .. .. .. ..@ xp_list :List of 1
## .. .. .. .. .. ..$ :<externalptr>
## .. .. .. .. ..@ .link_to_cached_object_list:List of 1
## .. .. .. .. .. ..$ :<environment: 0x596e6f0>
## .. .. ..@ ranges :Formal class 'GroupedIRanges' [package "XVector"] with 7 slots
## .. .. .. .. ..@ group : int [1:64206] 1 1 1 1 1 1 1 1 1 1 ...
## .. .. .. .. ..@ start : int [1:64206] 1 37 73 109 145 181 217 253 289 325 ...
## .. .. .. .. ..@ width : int [1:64206] 36 36 36 36 36 36 36 36 36 36 ...
## .. .. .. .. ..@ NAMES : NULL
## .. .. .. .. ..@ elementType : chr "integer"
## .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. ..@ metadata : list()
## .. .. ..@ elementType : chr "DNAString"
## .. .. ..@ elementMetadata: NULL
## .. .. ..@ metadata : list()
## ..@ id :Formal class 'BStringSet' [package "Biostrings"] with 5 slots
## .. .. ..@ pool :Formal class 'SharedRaw_Pool' [package "XVector"] with 2 slots
## .. .. .. .. ..@ xp_list :List of 1
## .. .. .. .. .. ..$ :<externalptr>
## .. .. .. .. ..@ .link_to_cached_object_list:List of 1
## .. .. .. .. .. ..$ :<environment: 0x596e6f0>
## .. .. ..@ ranges :Formal class 'GroupedIRanges' [package "XVector"] with 7 slots
## .. .. .. .. ..@ group : int [1:64206] 1 1 1 1 1 1 1 1 1 1 ...
## .. .. .. .. ..@ start : int [1:64206] 1 29 57 85 114 144 173 202 230 260 ...
## .. .. .. .. ..@ width : int [1:64206] 28 28 28 29 30 29 29 28 30 30 ...
## .. .. .. .. ..@ NAMES : NULL
## .. .. .. .. ..@ elementType : chr "integer"
## .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. ..@ metadata : list()
## .. .. ..@ elementType : chr "BString"
## .. .. ..@ elementMetadata: NULL
## .. .. ..@ metadata : list()
indexFile <- indexBam("drosophilaMelanogasterSubset.bam")
Filter reads for ambiguous bases (N) with an allowable limit of 2.
Create a second filter to only allow reads aligned to the reference genome.
Apply filters.
nFilt <- nFilter(2)
chrFilt <- chromosomeFilter(regex = "chr")
filt <- compose(nFilt, chrFilt)
bamDatFiltered <- bamDat[filt(bamDat)]
Examine BAM data.
str(bamDatFiltered)
## Formal class 'AlignedRead' [package "ShortRead"] with 8 slots
## ..@ chromosome : Factor w/ 7 levels "chrM","chr2L",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..@ position : int [1:56883] 548 1497 1506 1528 1540 1552 1552 1555 1559 1566 ...
## ..@ strand : Factor w/ 3 levels "+","-","*": 2 1 1 1 1 1 1 1 2 2 ...
## ..@ alignQuality:Formal class 'NumericQuality' [package "ShortRead"] with 1 slots
## .. .. ..@ quality: int [1:56883] 132 132 127 130 130 122 132 132 132 132 ...
## ..@ alignData :Formal class 'AlignedDataFrame' [package "ShortRead"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 1 obs. of 1 variable:
## .. .. .. ..$ labelDescription: chr "Type of read; see ?scanBam"
## .. .. ..@ data :'data.frame': 56883 obs. of 1 variable:
## .. .. .. ..$ flag: int [1:56883] 16 0 0 0 0 0 0 0 16 16 ...
## .. .. ..@ dimLabels : chr [1:2] "readName" "alignColumn"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ quality :Formal class 'FastqQuality' [package "ShortRead"] with 1 slots
## .. .. ..@ quality:Formal class 'BStringSet' [package "Biostrings"] with 5 slots
## .. .. .. .. ..@ pool :Formal class 'SharedRaw_Pool' [package "XVector"] with 2 slots
## .. .. .. .. .. .. ..@ xp_list :List of 1
## .. .. .. .. .. .. .. ..$ :<externalptr>
## .. .. .. .. .. .. ..@ .link_to_cached_object_list:List of 1
## .. .. .. .. .. .. .. ..$ :<environment: 0x596e6f0>
## .. .. .. .. ..@ ranges :Formal class 'GroupedIRanges' [package "XVector"] with 7 slots
## .. .. .. .. .. .. ..@ group : int [1:56883] 1 1 1 1 1 1 1 1 1 1 ...
## .. .. .. .. .. .. ..@ start : int [1:56883] 1 37 73 109 145 181 217 253 289 325 ...
## .. .. .. .. .. .. ..@ width : int [1:56883] 36 36 36 36 36 36 36 36 36 36 ...
## .. .. .. .. .. .. ..@ NAMES : NULL
## .. .. .. .. .. .. ..@ elementType : chr "integer"
## .. .. .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. .. .. ..@ metadata : list()
## .. .. .. .. ..@ elementType : chr "BString"
## .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. ..@ metadata : list()
## ..@ sread :Formal class 'DNAStringSet' [package "Biostrings"] with 5 slots
## .. .. ..@ pool :Formal class 'SharedRaw_Pool' [package "XVector"] with 2 slots
## .. .. .. .. ..@ xp_list :List of 1
## .. .. .. .. .. ..$ :<externalptr>
## .. .. .. .. ..@ .link_to_cached_object_list:List of 1
## .. .. .. .. .. ..$ :<environment: 0x596e6f0>
## .. .. ..@ ranges :Formal class 'GroupedIRanges' [package "XVector"] with 7 slots
## .. .. .. .. ..@ group : int [1:56883] 1 1 1 1 1 1 1 1 1 1 ...
## .. .. .. .. ..@ start : int [1:56883] 1 37 73 109 145 181 217 253 289 325 ...
## .. .. .. .. ..@ width : int [1:56883] 36 36 36 36 36 36 36 36 36 36 ...
## .. .. .. .. ..@ NAMES : NULL
## .. .. .. .. ..@ elementType : chr "integer"
## .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. ..@ metadata : list()
## .. .. ..@ elementType : chr "DNAString"
## .. .. ..@ elementMetadata: NULL
## .. .. ..@ metadata : list()
## ..@ id :Formal class 'BStringSet' [package "Biostrings"] with 5 slots
## .. .. ..@ pool :Formal class 'SharedRaw_Pool' [package "XVector"] with 2 slots
## .. .. .. .. ..@ xp_list :List of 1
## .. .. .. .. .. ..$ :<externalptr>
## .. .. .. .. ..@ .link_to_cached_object_list:List of 1
## .. .. .. .. .. ..$ :<environment: 0x596e6f0>
## .. .. ..@ ranges :Formal class 'GroupedIRanges' [package "XVector"] with 7 slots
## .. .. .. .. ..@ group : int [1:56883] 1 1 1 1 1 1 1 1 1 1 ...
## .. .. .. .. ..@ start : int [1:56883] 1 29 57 85 114 144 173 202 230 260 ...
## .. .. .. .. ..@ width : int [1:56883] 28 28 28 29 30 29 29 28 30 30 ...
## .. .. .. .. ..@ NAMES : NULL
## .. .. .. .. ..@ elementType : chr "integer"
## .. .. .. .. ..@ elementMetadata: NULL
## .. .. .. .. ..@ metadata : list()
## .. .. ..@ elementType : chr "BString"
## .. .. ..@ elementMetadata: NULL
## .. .. ..@ metadata : list()
levels(chromosome(bamDatFiltered))
## [1] "chrM" "chr2L" "chrX" "chr3L" "chr4" "chr2R" "chr3R"
id(bamDatFiltered[1:10])
## A BStringSet instance of length 10
## width seq
## [1] 28 HWI-EAS225_90320:3:1:141:680
## [2] 28 HWI-EAS225_90320:3:1:660:332
## [3] 28 HWI-EAS225_90320:3:1:164:226
## [4] 29 HWI-EAS225_90320:3:1:1088:176
## [5] 30 HWI-EAS225_90320:3:1:1038:1484
## [6] 29 HWI-EAS225_90320:3:1:850:1742
## [7] 29 HWI-EAS225_90320:3:1:1319:586
## [8] 28 HWI-EAS225_90320:3:1:103:631
## [9] 30 HWI-EAS225_90320:3:1:1353:1498
## [10] 30 HWI-EAS225_90320:3:1:1092:1016
sread(bamDatFiltered[1:10])
## A DNAStringSet instance of length 10
## width seq
## [1] 36 GGAAATCAAAAATGGAAAGGAGCGGCTCCACTTTTT
## [2] 36 AAATCATAAAGATATTGGAACTTTATATTTTATTTT
## [3] 36 AGATATTGGAACTTTATATTTTATTTTTGGAGCTTG
## [4] 36 ATTTTTGGAGCTTGAGCTGGAATAGTTGGAACATCT
## [5] 36 TGAGCTGGAATAGTTGGAACATCTTTAAGAATTTTA
## [6] 36 GTTGGAACATCTTTAAGAATTTTAATTAGAGCTGAA
## [7] 36 GTTGGAACATCTTTAAGAATTTTAATTCGAGCTGAA
## [8] 36 GGAACATCTTTAAGAATTTTAATTCGAGCTGAATTA
## [9] 36 GTCCTAATTCAGCTCGAATTAAAATTCTTAAAGATG
## [10] 36 CCAGGATGTCCTAATTCAGCTCGAATTAAAATTCTT
quality(bamDatFiltered[1:10])
## class: FastqQuality
## quality:
## A BStringSet instance of length 10
## width seq
## [1] 36 BACBCCABBBBBA>8@B@@==>;5-9A<;=7A@@B@
## [2] 36 BCBBABBA@@B;B>AB@@<>:AAA9?>?A@A<?A@@
## [3] 36 @?8AB>A?=)A=@*8>6/@3>A)/@4>?BA'(-1B=
## [4] 36 BBCACCA@-4ABC62?*;A?BBA?B@.8B9?33;+=
## [5] 36 ?5@A4::@@55;;89<'6?A8@A=4@=>54>76);A
## [6] 36 A8=B;462>;7BCBAA>1;=</?BA94%<:?(7@9=
## [7] 36 BBB>ABB@@BBBBCBCC@7ABBBAABB@B?AAA@=@
## [8] 36 =B=ACCBBC8ACCCBBBCCCBB=CAB9=BBBB@2?:
## [9] 36 @7ABBBBABBB?BAAB=@CBB;7ABAABBA?@;2=A
## [10] 36 BB>B?:ABABBBBABCB@@@BB@:@BA;>@;@B?AB
position(bamDatFiltered[1:10])
## [1] 548 1497 1506 1528 1540 1552 1552 1555 1559 1566
strand(bamDatFiltered[1:10])
## [1] - + + + + + + + - -
## Levels: + - *
Retrieve chromosome length information.
(chrSizes <- seqlengths(Dmelanogaster))
## chr2L chr2R chr3L chr3R chr4 chrX chrU
## 23011544 21146708 24543557 27905053 1351857 22422827 10049037
## chrM chr2LHet chr2RHet chr3LHet chr3RHet chrXHet chrYHet
## 19517 368872 3288761 2555491 2517507 204112 347038
## chrUextra
## 29004656
Get annotation information for Drosophila melanogaster from Ensembl.
ensembl <- useMart("ensembl", dataset = "dmelanogaster_gene_ensembl")
annotation.fields <- c("ensembl_gene_id", "strand", "chromosome_name", "start_position",
"end_position")
Restrict to chromosome 2L.
gene.annotation <- getBM(annotation.fields, mart = ensembl, filters = "chromosome_name",
values = c("2L"))
str(gene.annotation)
## 'data.frame': 2986 obs. of 5 variables:
## $ ensembl_gene_id: chr "FBgn0031208" "FBgn0002121" "FBgn0031209" "FBgn0263584" ...
## $ strand : int 1 -1 -1 1 -1 1 1 1 -1 1 ...
## $ chromosome_name: chr "2L" "2L" "2L" "2L" ...
## $ start_position : int 7529 9839 21823 21952 25402 66584 71757 76348 82421 94739 ...
## $ end_position : int 9484 21376 25155 24237 65404 71390 76211 77783 87387 102086 ...
levels(as.factor(gene.annotation$chromosome))
## [1] "2L"
# Re-label
gene.annotation$chromosome <- paste("chr", gene.annotation$chromosome_name,
sep = "")
levels(as.factor(gene.annotation$chromosome))
## [1] "chr2L"
Store gene annotation information.
gene.range <- RangedData(IRanges(start = gene.annotation$start_position, end = gene.annotation$end_position),
space = gene.annotation$chromosome, strand = gene.annotation$strand, gene = gene.annotation$ensembl_gene_id,
universe = "Dm3")
show(gene.range)
## RangedData with 2986 rows and 2 value columns across 1 space
## space ranges | strand gene
## <factor> <IRanges> | <integer> <character>
## 1 chr2L [ 7529, 9484] | 1 FBgn0031208
## 2 chr2L [ 9839, 21376] | -1 FBgn0002121
## 3 chr2L [21823, 25155] | -1 FBgn0031209
## 4 chr2L [21952, 24237] | 1 FBgn0263584
## 5 chr2L [25402, 65404] | -1 FBgn0051973
## 6 chr2L [66584, 71390] | 1 FBgn0067779
## 7 chr2L [71757, 76211] | 1 FBgn0031213
## 8 chr2L [76348, 77783] | 1 FBgn0031214
## 9 chr2L [82421, 87387] | -1 FBgn0002931
## ... ... ... ... ... ...
## 2978 chr2L [22690251, 22691008] | -1 FBgn0058439
## 2979 chr2L [22735486, 22736297] | -1 FBgn0262947
## 2980 chr2L [22736952, 22747273] | 1 FBgn0041004
## 2981 chr2L [22811944, 22834955] | 1 FBgn0002566
## 2982 chr2L [22841770, 22843208] | -1 FBgn0058005
## 2983 chr2L [22874534, 22885080] | 1 FBgn0000384
## 2984 chr2L [22892306, 22918647] | -1 FBgn0250907
## 2985 chr2L [22959606, 22961179] | -1 FBgn0086683
## 2986 chr2L [22961737, 22963456] | 1 FBgn0262887
Calculate coverage.
(cover <- coverage(bamDatFiltered, width = chrSizes))
## RleList of length 7
## $chrM
## integer-Rle of length 19517 with 953 runs
## Lengths: 547 36 913 9 22 5 ... 3 5 13 258 36 5793
## Values : 0 1 0 1 2 3 ... 4 3 2 0 1 0
##
## $chr2L
## integer-Rle of length 23011544 with 17850 runs
## Lengths: 6777 36 2316 36 1621 ... 107 36 499 36 50474
## Values : 0 1 0 1 0 ... 0 1 0 1 0
##
## $chrX
## integer-Rle of length 22422827 with 16522 runs
## Lengths: 18996 36 12225 36 ... 36 130 36 6180
## Values : 0 1 0 1 ... 1 0 1 0
##
## $chr3L
## integer-Rle of length 24543557 with 17396 runs
## Lengths: 135455 36 6783 23 ... 36 82251 36 12469
## Values : 0 1 0 1 ... 1 0 1 0
##
## $chr4
## integer-Rle of length 1351857 with 1255 runs
## Lengths: 59510 36 2019 36 ... 36 267 36 118808
## Values : 0 1 0 1 ... 1 0 1 0
##
## ...
## <2 more elements>
Aggregate the coverage for each gene.
gene.coverage <- aggregate(cover[match(names(gene.range), names(cover))], ranges(gene.range),
sum)
Find the number of reads per gene. Build a data frame to hold the data.
(gene.coverage <- ceiling(gene.coverage/unique(width(bamDat))))
## NumericList of length 1
## [["chr2L"]] 1 47 0 0 1 6 0 0 8 11 1 1 58 ... 17 16 1 0 0 0 15 4 0 1 0 6 0
length(gene.coverage[["chr2L"]])
## [1] 2986
length(ranges(gene.range)$chr2L)
## [1] 2986
countTable <- data.frame(chromosome = gene.range$space, gene_start = start(gene.range$ranges),
gene_end = end(gene.range$ranges), strand = gene.range$strand, gene = gene.range$gene,
count = as.vector(gene.coverage[["chr2L"]]))
dim(countTable)
## [1] 2986 6
head(countTable)
## chromosome gene_start gene_end strand gene count
## 1 chr2L 7529 9484 1 FBgn0031208 1
## 2 chr2L 9839 21376 -1 FBgn0002121 47
## 3 chr2L 21823 25155 -1 FBgn0031209 0
## 4 chr2L 21952 24237 1 FBgn0263584 0
## 5 chr2L 25402 65404 -1 FBgn0051973 1
## 6 chr2L 66584 71390 1 FBgn0067779 6
Normalize data according to read counts (RPKM - number of Reads Per Kilobase of gene (feature) per Million mapped reads).
countTable <- data.frame(chromosome = gene.range$space, gene_start = start(gene.range$ranges),
gene_end = end(gene.range$ranges), strand = gene.range$strand, gene = gene.range$gene,
count = as.vector(gene.coverage[["chr2L"]]), RPKM = (as.vector(gene.coverage[["chr2L"]])/(end(gene.range$ranges) -
start(gene.range$ranges))) * (1e+09/length(bamDat)))
head(countTable)
## chromosome gene_start gene_end strand gene count RPKM
## 1 chr2L 7529 9484 1 FBgn0031208 1 7.9667
## 2 chr2L 9839 21376 -1 FBgn0002121 47 63.4497
## 3 chr2L 21823 25155 -1 FBgn0031209 0 0.0000
## 4 chr2L 21952 24237 1 FBgn0263584 0 0.0000
## 5 chr2L 25402 65404 -1 FBgn0051973 1 0.3894
## 6 chr2L 66584 71390 1 FBgn0067779 6 19.4443