“Bioconductor provides tools for the analysis and comprehension of high-throughput genomic data. Bioconductor uses the R statistical programming language, and is open source and open development.”
library("dplyr")
library("ggplot2")
To install core packages, type the following in an R command window.
#leave as eval = FALSE when knitting
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install()
There are about 20000 packages in the Bioconductor universe. For now, let us install a couple of the more popular packages with the following code. To install a BioConductor package, simply type in the name of the package in quotes inside of BiocManager::install()
#leave as eval = FALSE when knitting
BiocManager::install("GenomicFeatures")
BiocManager::install("Biostrings")
BiocManager::install("Homo.sapiens")
BiocManager::install("Rqc") #quality control
BiocManager::install("ShortRead") #to read fasta or fastq files
We can start with the Homo.sapiens package of human genomes.
suppressPackageStartupMessages(library("Homo.sapiens"))
methods(class = class(Homo.sapiens))
## [1] asBED asGFF cds
## [4] cdsBy coerce<- columns
## [7] dbconn dbfile disjointExons
## [10] distance exons exonsBy
## [13] extractUpstreamSeqs fiveUTRsByTranscript genes
## [16] getTxDbIfAvailable intronsByTranscript isActiveSeq
## [19] isActiveSeq<- keys keytypes
## [22] mapIds mapToTranscripts metadata
## [25] microRNAs promoters resources
## [28] select selectByRanges selectRangesById
## [31] seqinfo show taxonomyId
## [34] threeUTRsByTranscript transcripts transcriptsBy
## [37] tRNAs TxDb TxDb<-
## see '?methods' for accessing help and source code
seqinfo(Homo.sapiens)
## Seqinfo object with 93 sequences (1 circular) from hg19 genome:
## seqnames seqlengths isCircular genome
## chr1 249250621 <NA> hg19
## chr2 243199373 <NA> hg19
## chr3 198022430 <NA> hg19
## chr4 191154276 <NA> hg19
## chr5 180915260 <NA> hg19
## ... ... ... ...
## chrUn_gl000245 36651 <NA> hg19
## chrUn_gl000246 38154 <NA> hg19
## chrUn_gl000247 36422 <NA> hg19
## chrUn_gl000248 39786 <NA> hg19
## chrUn_gl000249 38502 <NA> hg19
genes(Homo.sapiens)
## GRanges object with 23056 ranges and 1 metadata column:
## seqnames ranges strand | GENEID
## <Rle> <IRanges> <Rle> | <FactorList>
## 1 chr19 58858172-58874214 - | 1
## 10 chr8 18248755-18258723 + | 10
## 100 chr20 43248163-43280376 - | 100
## 1000 chr18 25530930-25757445 - | 1000
## 10000 chr1 243651535-244006886 - | 10000
## ... ... ... ... . ...
## 9991 chr9 114979995-115095944 - | 9991
## 9992 chr21 35736323-35743440 + | 9992
## 9993 chr22 19023795-19109967 - | 9993
## 9994 chr6 90539619-90584155 + | 9994
## 9997 chr22 50961997-50964905 - | 9997
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
metadata(Homo.sapiens)
## Only unique values are returned by this metadata method. For individual metadata values that may share a key, such as the Db type, be sure to call metadata on the individual objects.
## name
## 3 Data source
## 4 Genome
## 5 Organism
## 6 Taxonomy ID
## 7 UCSC Table
## 8 Resource URL
## 9 Type of Gene ID
## 10 Full dataset
## 11 miRBase build ID
## 12 transcript_nrow
## 13 exon_nrow
## 14 cds_nrow
## 15 Db created by
## 16 Creation time
## 17 GenomicFeatures version at creation time
## 18 RSQLite version at creation time
## 24 ORGANISM
## 25 SPECIES
## 26 EGSOURCEDATE
## 27 EGSOURCENAME
## 28 EGSOURCEURL
## 29 CENTRALID
## 30 TAXID
## 37 KEGGSOURCENAME
## 38 KEGGSOURCEURL
## 39 KEGGSOURCEDATE
## 40 GPSOURCENAME
## 41 GPSOURCEURL
## 42 GPSOURCEDATE
## 43 ENSOURCEDATE
## 44 ENSOURCENAME
## 45 ENSOURCEURL
## 46 UPSOURCENAME
## 47 UPSOURCEURL
## 48 UPSOURCEDATE
## 53 package
## value
## 3 UCSC
## 4 hg19
## 5 Homo sapiens
## 6 9606
## 7 knownGene
## 8 http://genome.ucsc.edu/
## 9 Entrez Gene ID
## 10 yes
## 11 GRCh37
## 12 82960
## 13 289969
## 14 237533
## 15 GenomicFeatures package from Bioconductor
## 16 2015-10-07 18:11:28 +0000 (Wed, 07 Oct 2015)
## 17 1.21.30
## 18 1.0.0
## 24 Homo sapiens
## 25 Human
## 26 2019-Jul10
## 27 Entrez Gene
## 28 ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## 29 EG
## 30 9606
## 37 KEGG GENOME
## 38 ftp://ftp.genome.jp/pub/kegg/genomes
## 39 2011-Mar15
## 40 UCSC Genome Bioinformatics (Homo sapiens)
## 41
## 42 2019-Sep3
## 43 2019-Jun24
## 44 Ensembl
## 45 ftp://ftp.ensembl.org/pub/current_fasta
## 46 Uniprot
## 47 http://www.UniProt.org/
## 48 Mon Oct 21 14:24:25 2019
## 53 AnnotationDbi
hg <- genes(Homo.sapiens)
seqnames(hg)
## factor-Rle of length 23056 with 17793 runs
## Lengths: 1 1 1 1 1 ... 1 1 1 1 1
## Values : chr19 chr8 chr20 chr18 chr1 ... chr9 chr21 chr22 chr6 chr22
## Levels(93): chr1 chr2 chr3 ... chrUn_gl000247 chrUn_gl000248 chrUn_gl000249
# information by observation
runs <- seqnames(hg)
runValue(runs)[1:15]
## [1] chr19 chr8 chr20 chr18 chr1 chrX chr3 chr14 chr15 chr11 chr15 chr11
## [13] chr15 chr22 chr10
## 93 Levels: chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 ... chrUn_gl000249
runLength(runs)[1:15]
## [1] 1 1 1 1 1 1 1 1 1 1 47 1 8 1 1
# sorted by chromosone
hg[order(runs, start(hg))]
## GRanges object with 23056 ranges and 1 metadata column:
## seqnames ranges strand | GENEID
## <Rle> <IRanges> <Rle> | <FactorList>
## 100287102 chr1 11874-14409 + | 100287102
## 653635 chr1 14362-29961 - | 653635
## 79501 chr1 69091-70008 + | 79501
## 729737 chr1 134773-140566 - | 729737
## 100288069 chr1 700245-714068 - | 100288069
## ... ... ... ... . ...
## 283788 chrUn_gl000219 53809-99642 - | 283788
## 100507412 chrUn_gl000220 97129-126696 + | 100507412
## 728410 chrUn_gl000228 79448-113887 + | 728410
## 100653046 chrUn_gl000228 86060-90745 + | 100653046
## 100288687 chrUn_gl000228 112605-124171 + | 100288687
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
#sort(hg)
# distribution over chomosomes
table(seqnames(hg))
##
## chr1 chr2 chr3
## 2326 1464 1271
## chr4 chr5 chr6
## 873 1022 1000
## chr7 chr8 chr9
## 1108 818 945
## chr10 chr11 chr12
## 903 1439 1173
## chr13 chr14 chr15
## 449 779 791
## chr16 chr17 chr18
## 938 1337 341
## chr19 chr20 chr21
## 1607 647 296
## chr22 chrX chrY
## 535 918 53
## chrM chr1_gl000191_random chr1_gl000192_random
## 0 0 0
## chr4_ctg9_hap1 chr4_gl000193_random chr4_gl000194_random
## 0 1 1
## chr6_apd_hap1 chr6_cox_hap2 chr6_dbb_hap3
## 0 1 0
## chr6_mann_hap4 chr6_mcf_hap5 chr6_qbl_hap6
## 0 1 0
## chr6_ssto_hap7 chr7_gl000195_random chr8_gl000196_random
## 1 1 0
## chr8_gl000197_random chr9_gl000198_random chr9_gl000199_random
## 0 0 0
## chr9_gl000200_random chr9_gl000201_random chr11_gl000202_random
## 0 0 0
## chr17_ctg5_hap1 chr17_gl000203_random chr17_gl000204_random
## 0 0 0
## chr17_gl000205_random chr17_gl000206_random chr18_gl000207_random
## 1 0 0
## chr19_gl000208_random chr19_gl000209_random chr21_gl000210_random
## 0 6 0
## chrUn_gl000211 chrUn_gl000212 chrUn_gl000213
## 1 1 1
## chrUn_gl000214 chrUn_gl000215 chrUn_gl000216
## 0 0 0
## chrUn_gl000217 chrUn_gl000218 chrUn_gl000219
## 0 2 1
## chrUn_gl000220 chrUn_gl000221 chrUn_gl000222
## 1 0 0
## chrUn_gl000223 chrUn_gl000224 chrUn_gl000225
## 0 0 0
## chrUn_gl000226 chrUn_gl000227 chrUn_gl000228
## 0 0 3
## chrUn_gl000229 chrUn_gl000230 chrUn_gl000231
## 0 0 0
## chrUn_gl000232 chrUn_gl000233 chrUn_gl000234
## 0 0 0
## chrUn_gl000235 chrUn_gl000236 chrUn_gl000237
## 0 0 0
## chrUn_gl000238 chrUn_gl000239 chrUn_gl000240
## 0 0 0
## chrUn_gl000241 chrUn_gl000242 chrUn_gl000243
## 0 0 0
## chrUn_gl000244 chrUn_gl000245 chrUn_gl000246
## 0 0 0
## chrUn_gl000247 chrUn_gl000248 chrUn_gl000249
## 0 0 0
# store as a data.frame
hg_df <- data.frame(table(seqnames(hg)))
colnames(hg_df) <- c("chromosone", "count")
# graph
hg_df %>%
ggplot(aes(x = chromosone, y = count)) +
geom_bar(stat = "identity")
# 'coarse' to drop levels in the factor levels
hg_simple <- keepStandardChromosomes(hg, pruning.mode = "coarse")
hg_simple_df <- data.frame(table(seqnames(hg_simple)))
colnames(hg_simple_df) <- c("chromosone", "count")
# better graph
hg_simple_df %>%
ggplot(aes(x = chromosone, y = count, fill = chromosone)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(title = "Human Genome",
subtitle = "Distribution over Chromosomes",
caption = "Source: Bioconductor") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
# "M chromosone"??
# https://en.wikipedia.org/wiki/Human_artificial_chromosome
The following instructions were extracted from the vignette for Genomic Features
browseVignettes(package = "GenomicFeatures")Load the GenomicFeatures package. It is suggested that you load the packages with the following code.
#set as eval = TRUE when knitting
suppressPackageStartupMessages(library('GenomicFeatures'))
“The GenomicFeatures package retrieves and manages transcript-related features fromthe UCSC Genome Bioinformatics and BioMart data resources. The package isuseful for ChIP-chip, ChIP-seq, and RNA-seq analyses.”
"The GenomicFeatures package uses TxDb objects to store transcript metadata. This class maps the 5’ and 3’ untranslated regions (UTRs), protein coding sequences(CDSs) and exons for a set of mRNA transcripts to their associated genome. TxDb objects have numerous accessors functions to allow such features to be retrieved individually or grouped together in a way that reflects the underlying biology.
All TxDb objects are backed by a SQLite database that manages genomic locations and the relationships between pre-processed mRNA transcripts, exons, protein coding sequences, and their related gene identifiers."
#set as eval = TRUE when knitting
samplefile <- system.file("extdata",
"hg19_knownGene_sample.sqlite",
package="GenomicFeatures")
txdb <- loadDb(samplefile)
txdb
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: no
## # miRBase build ID: NA
## # transcript_nrow: 178
## # exon_nrow: 620
## # cds_nrow: 523
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2014-10-08 10:31:15 -0700 (Wed, 08 Oct 2014)
## # GenomicFeatures version at creation time: 1.17.21
## # RSQLite version at creation time: 0.11.4
## # DBSCHEMAVERSION: 1.0
BSgenome.Hsapiens.UCSC.hg19TxDb.Hsapiens.UCSC.hg19.knownGene#set as eval = TRUE when knitting
library(BSgenome.Hsapiens.UCSC.hg19)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
tx_seqs1 <- extractTranscriptSeqs(Hsapiens,
TxDb.Hsapiens.UCSC.hg19.knownGene,
use.names=TRUE)
suppressWarnings(translate(tx_seqs1))
## A AAStringSet instance of length 82960
## width seq names
## [1] 550 LAVSLFFDLFFLFMCICCLLA...TPRRLHPAQLEILY*KHTVGF uc001aaa.3
## [2] 496 LAVSLFFDLFFLFMCICCLLA...PETFASCTARDPLLKAHCWFL uc010nxq.1
## [3] 531 LAVSLFFDLFFLFMCICCLLA...TPRRLHPAQLEILY*KHTVGF uc010nxr.1
## [4] 306 MVTEFIFLGLSDSQELQTFLF...DMKTAIRQLRKWDAHSSVKF* uc001aal.1
## [5] 10 YRPSS*LTMA uc001aaq.2
## ... ... ...
## [82956] 405 ASLGSLVSPAELLCSRLRGPG...FCLLLRRASMATDHCNLRLLR uc011mgu.1
## [82957] 245 LHF*SSPPWCCSGSHTRPEPA...TSL*KWENGFLGLK*PLY*MQ uc011mgv.2
## [82958] 10 W*ISAKVAKE uc011mgw.1
## [82959] 10 MQRWLVAL*A uc022brq.1
## [82960] 10 W*ISAKVAKE uc022brr.1
The following instructions were extracted from the vignette for Biostrings
browseVignettes(package = "Biostrings")hgu95av2probehgu95av2cdfLoad the Biostrings package. It is suggested that you load the packages with the following code.
#set as eval = TRUE when knitting
suppressPackageStartupMessages(library('Biostrings'))
suppressPackageStartupMessages(library('hgu95av2probe'))
suppressPackageStartupMessages(library('hgu95av2cdf'))
#set as eval = TRUE when knitting
this_DNAString_set <- DNAStringSet(hgu95av2probe)
this_DNAString_set_bc <- alphabetFrequency(this_DNAString_set, baseOnly = TRUE)
nrow(this_DNAString_set_bc)
## [1] 201800
head(this_DNAString_set_bc)
## A C G T other
## [1,] 1 10 6 8 0
## [2,] 5 5 5 10 0
## [3,] 6 4 3 12 0
## [4,] 4 7 8 6 0
## [5,] 4 5 8 8 0
## [6,] 2 7 6 10 0
alphabetFrequency(this_DNAString_set, baseOnly=TRUE, collapse=TRUE)
## A C G T other
## 1250858 1235532 1186629 1371981 0
GC content of a DNA sequence and graph the resulting amounts as a histogram. Run the code and write a couple of sentences about the results.#set as eval = TRUE when knitting
this_DNAString_df <- data.frame(this_DNAString_set_bc)
# alas "T" is treated as "TRUE" in R, so let's use the full names of the nucleotides
colnames(this_DNAString_df) <- c("Adenine", "Cytosine", "Guanine", "Thymine", "other")
this_DNAString_df %>%
dplyr::select(c("Adenine", "Cytosine", "Guanine", "Thymine")) %>%
mutate(GC_content = 100*(Cytosine + Guanine) /
(Adenine + Cytosine + Guanine + Thymine)) %>%
ggplot(aes(x = GC_content, fill = ..x..)) +
geom_histogram(binwidth = 5) +
labs(title = "Affymetrix Human Genome U95 Set annotation data",
subtitle = "GC Content of hgu95av2probe",
caption = "Source: Bioconductor",
x = "GC content (percentage)") +
scale_fill_gradient(low = "yellow", high= "blue") +
theme_minimal() +
theme(legend.position = "none")
mystery_gene <- readDNAStringSet("mystery_gene.fasta")
methods(class = "DNAStringSet")
## [1] !
## [2] !=
## [3] $
## [4] $<-
## [5] %in%
## [6] [
## [7] [[
## [8] [[<-
## [9] [<-
## [10] <
## [11] <=
## [12] ==
## [13] >
## [14] >=
## [15] aggregate
## [16] alphabetFrequency
## [17] anyDuplicated
## [18] anyNA
## [19] append
## [20] as.character
## [21] as.complex
## [22] as.data.frame
## [23] as.env
## [24] as.factor
## [25] as.integer
## [26] as.list
## [27] as.logical
## [28] as.matrix
## [29] as.numeric
## [30] as.raw
## [31] as.vector
## [32] bindROWS
## [33] by
## [34] c
## [35] cbind
## [36] chartr
## [37] coerce
## [38] compact
## [39] compareStrings
## [40] complement
## [41] consensusMatrix
## [42] consensusString
## [43] countOverlaps
## [44] countPattern
## [45] countPDict
## [46] dinucleotideFrequencyTest
## [47] do.call
## [48] droplevels
## [49] duplicated
## [50] elementMetadata
## [51] elementMetadata<-
## [52] elementNROWS
## [53] elementType
## [54] eval
## [55] expand
## [56] expand.grid
## [57] export
## [58] extractAt
## [59] extractROWS
## [60] FactorToClass
## [61] Filter
## [62] findOverlaps
## [63] getListElement
## [64] getSeq
## [65] hasOnlyBaseLetters
## [66] head
## [67] ifelse2
## [68] intersect
## [69] is.na
## [70] is.unsorted
## [71] isEmpty
## [72] isMatchingEndingAt
## [73] isMatchingStartingAt
## [74] lapply
## [75] length
## [76] lengths
## [77] letterFrequency
## [78] make_XStringSet_from_strings
## [79] match
## [80] matchPattern
## [81] matchPDict
## [82] mcols
## [83] mcols<-
## [84] merge
## [85] mergeROWS
## [86] metadata
## [87] metadata<-
## [88] mstack
## [89] names
## [90] names<-
## [91] nchar
## [92] neditEndingAt
## [93] neditStartingAt
## [94] normalizeSingleBracketReplacementValue
## [95] NROW
## [96] nucleotideFrequencyAt
## [97] oligonucleotideFrequency
## [98] order
## [99] overlapsAny
## [100] PairwiseAlignments
## [101] PairwiseAlignmentsSingleSubject
## [102] parallelSlotNames
## [103] parallelVectorNames
## [104] pcompare
## [105] pcompareRecursively
## [106] PDict
## [107] PWM
## [108] rank
## [109] rbind
## [110] Reduce
## [111] relist
## [112] relistToClass
## [113] rename
## [114] rep
## [115] rep.int
## [116] replaceAt
## [117] replaceLetterAt
## [118] replaceROWS
## [119] rev
## [120] revElements
## [121] reverse
## [122] reverseComplement
## [123] ROWNAMES
## [124] sapply
## [125] selfmatch
## [126] seqinfo
## [127] seqinfo<-
## [128] seqlevelsInUse
## [129] seqtype
## [130] seqtype<-
## [131] setdiff
## [132] setequal
## [133] setListElement
## [134] shiftApply
## [135] show
## [136] showAsCell
## [137] sort
## [138] split
## [139] split<-
## [140] stack
## [141] stringDist
## [142] strsplit
## [143] subseq
## [144] subseq<-
## [145] subset
## [146] subsetByOverlaps
## [147] summary
## [148] table
## [149] tail
## [150] tapply
## [151] threebands
## [152] toString
## [153] transform
## [154] translate
## [155] trimLRPatterns
## [156] twoWayAlphabetFrequency
## [157] union
## [158] unique
## [159] uniqueLetters
## [160] unlist
## [161] unsplit
## [162] unstrsplit
## [163] updateObject
## [164] values
## [165] values<-
## [166] vcountPattern
## [167] vcountPDict
## [168] vmatchPattern
## [169] vwhichPDict
## [170] which.isMatchingEndingAt
## [171] which.isMatchingStartingAt
## [172] whichPDict
## [173] width
## [174] window
## [175] window<-
## [176] windows
## [177] with
## [178] within
## [179] xtabs
## [180] xtfrm
## [181] xvcopy
## [182] zipdown
## see '?methods' for accessing help and source code
lengths(mystery_gene)
## [1] 29903
alphabetFrequency(mystery_gene, baseOnly=TRUE, collapse=TRUE)
## A C G T other
## 8954 5492 5863 9594 0
names(mystery_gene)