rm(list = ls())
#BiocManager::install("Biostrings")
###1 Working with sequences - a first start
sequences <- c( "AATTC", "ATCG","TTCCAAGG")
sequences; length(sequences); nchar(sequences); sequences[c(1, 3)]; sample(sequences)
## [1] "AATTC" "ATCG" "TTCCAAGG"
## [1] 3
## [1] 5 4 8
## [1] "AATTC" "TTCCAAGG"
## [1] "AATTC" "TTCCAAGG" "ATCG"
###2 Working with sequences - using Biostrings
library("Biostrings")
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
dna <- DNAStringSet(sequences)
dna; dna[[2]]; class(dna); length(dna); nchar(dna); dna[c(1, 3)]; sample(dna); rev(dna); sort(dna)
## DNAStringSet object of length 3:
## width seq
## [1] 5 AATTC
## [2] 4 ATCG
## [3] 8 TTCCAAGG
## 4-letter DNAString object
## seq: ATCG
## [1] "DNAStringSet"
## attr(,"package")
## [1] "Biostrings"
## [1] 3
## [1] 5 4 8
## DNAStringSet object of length 2:
## width seq
## [1] 5 AATTC
## [2] 8 TTCCAAGG
## DNAStringSet object of length 3:
## width seq
## [1] 8 TTCCAAGG
## [2] 4 ATCG
## [3] 5 AATTC
## DNAStringSet object of length 3:
## width seq
## [1] 8 TTCCAAGG
## [2] 4 ATCG
## [3] 5 AATTC
## DNAStringSet object of length 3:
## width seq
## [1] 5 AATTC
## [2] 4 ATCG
## [3] 8 TTCCAAGG
#Note that rev on a DNAStringSet just reverse the order of the elements,
#whereas rev on a DNAString actually reverse the string.
dna[[1]]; rev(dna[[1]])
## 5-letter DNAString object
## seq: AATTC
## 5-letter DNAString object
## seq: CTTAA
#DNAStringSet("weare")
#Error in
DNAString("ATACG"); DNAStringSet("ATACA")
## 5-letter DNAString object
## seq: ATACG
## DNAStringSet object of length 1:
## width seq
## [1] 5 ATACA
####################################
names(dna) <- paste0("seq", 1:3); dna
## DNAStringSet object of length 3:
## width seq names
## [1] 5 AATTC seq1
## [2] 4 ATCG seq2
## [3] 8 TTCCAAGG seq3
#translate: translate the DNA or RNA sequence into amino acids.
dna2 <- dna
dna2; translate(dna2)
## DNAStringSet object of length 3:
## width seq names
## [1] 5 AATTC seq1
## [2] 4 ATCG seq2
## [3] 8 TTCCAAGG seq3
## Warning in .Call2("DNAStringSet_translate", x, skip_code,
## dna_codes[codon_alphabet], : in 'x[[1]]': last 2 bases were ignored
## Warning in .Call2("DNAStringSet_translate", x, skip_code,
## dna_codes[codon_alphabet], : in 'x[[2]]': last base was ignored
## Warning in .Call2("DNAStringSet_translate", x, skip_code,
## dna_codes[codon_alphabet], : in 'x[[3]]': last 2 bases were ignored
## AAStringSet object of length 3:
## width seq names
## [1] 1 N seq1
## [2] 1 I seq2
## [3] 2 FQ seq3
#complement, reverseComplement: (reverse) complement the sequence.
#reverse: reverse the sequence.
dna2; reverseComplement(dna2)
## DNAStringSet object of length 3:
## width seq names
## [1] 5 AATTC seq1
## [2] 4 ATCG seq2
## [3] 8 TTCCAAGG seq3
## DNAStringSet object of length 3:
## width seq names
## [1] 5 GAATT seq1
## [2] 4 CGAT seq2
## [3] 8 CCTTGGAA seq3
dna2; complement(dna2)
## DNAStringSet object of length 3:
## width seq names
## [1] 5 AATTC seq1
## [2] 4 ATCG seq2
## [3] 8 TTCCAAGG seq3
## DNAStringSet object of length 3:
## width seq names
## [1] 5 TTAAG seq1
## [2] 4 TAGC seq2
## [3] 8 AAGGTTCC seq3
dna2; reverse(dna2)
## DNAStringSet object of length 3:
## width seq names
## [1] 5 AATTC seq1
## [2] 4 ATCG seq2
## [3] 8 TTCCAAGG seq3
## DNAStringSet object of length 3:
## width seq names
## [1] 5 CTTAA seq1
## [2] 4 GCTA seq2
## [3] 8 GGAACCTT seq3
###########5 Subsetting sequences
subseq(dna2, start = 1, end = 3)
## DNAStringSet object of length 3:
## width seq names
## [1] 3 AAT seq1
## [2] 3 ATC seq2
## [3] 3 TTC seq3
subseq(dna2, start = 1, width = 3)
## DNAStringSet object of length 3:
## width seq names
## [1] 3 AAT seq1
## [2] 3 ATC seq2
## [3] 3 TTC seq3
subseq(dna2[[1]], start = 1, width = 3)
## 3-letter DNAString object
## seq: AAT
subseq(dna2[[1]], start = 1, end = 3)
## 3-letter DNAString object
## seq: AAT
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.4
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::collapse() masks Biostrings::collapse(), IRanges::collapse()
## x dplyr::combine() masks BiocGenerics::combine()
## x purrr::compact() masks XVector::compact()
## x dplyr::desc() masks IRanges::desc()
## x tidyr::expand() masks S4Vectors::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks S4Vectors::first()
## x dplyr::lag() masks stats::lag()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce() masks IRanges::reduce()
## x dplyr::rename() masks S4Vectors::rename()
## x dplyr::slice() masks XVector::slice(), IRanges::slice()
dna2[rep(1, 3)] %>% # DNAStringSet with first sequence 10 times
subseq(start = 1, end = 1:3) # subsequences of 1, 2, ... 10 nucleotides
## DNAStringSet object of length 3:
## width seq names
## [1] 1 A seq1
## [2] 2 AA seq1
## [3] 3 AAT seq1
####################Counting letters
#alphabetFrequency, letterFrequency: Compute the frequency of all characters (alphabetFrequency)
#or only specific letters (letterFrequency).
dna2; dna2@pool; dna2@ranges; dna2@elementType; dna2@elementMetadata; dna2@metadata
## DNAStringSet object of length 3:
## width seq names
## [1] 5 AATTC seq1
## [2] 4 ATCG seq2
## [3] 8 TTCCAAGG seq3
## SharedRaw_Pool of length 1
## 1: SharedRaw of length 17 (data starting at address 000000002C0ECE28)
## group start end width names
## 1 1 1 5 5 seq1
## 2 1 6 9 4 seq2
## 3 1 10 17 8 seq3
## [1] "DNAString"
## NULL
## list()
alphabetFrequency(dna2); letterFrequency(dna2, "GC"); consensusMatrix(dna2, as.prob = TRUE)
## A C G T M R W S Y K V H D B N - + .
## [1,] 2 1 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [3,] 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## G|C
## [1,] 1
## [2,] 2
## [3,] 4
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## A 0.6666667 0.3333333 0.0000000 0.0000000 0.5 1 0 0
## C 0.0000000 0.0000000 0.6666667 0.3333333 0.5 0 0 0
## G 0.0000000 0.0000000 0.0000000 0.3333333 0.0 0 1 1
## T 0.3333333 0.6666667 0.3333333 0.3333333 0.0 0 0 0
## M 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
## R 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
## W 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
## S 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
## Y 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
## K 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
## V 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
## H 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
## D 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
## B 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
## N 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
## - 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
## + 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
## . 0.0000000 0.0000000 0.0000000 0.0000000 0.0 0 0 0
consensusMatrix(dna2)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## A 2 1 0 0 1 1 0 0
## C 0 0 2 1 1 0 0 0
## G 0 0 0 1 0 0 1 1
## T 1 2 1 1 0 0 0 0
## M 0 0 0 0 0 0 0 0
## R 0 0 0 0 0 0 0 0
## W 0 0 0 0 0 0 0 0
## S 0 0 0 0 0 0 0 0
## Y 0 0 0 0 0 0 0 0
## K 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0
## H 0 0 0 0 0 0 0 0
## D 0 0 0 0 0 0 0 0
## B 0 0 0 0 0 0 0 0
## N 0 0 0 0 0 0 0 0
## - 0 0 0 0 0 0 0 0
## + 0 0 0 0 0 0 0 0
## . 0 0 0 0 0 0 0 0
##############################
gc_dna <- letterFrequency(dna2, letters = "GC", as.prob = TRUE)
gc_dna; mean(gc_dna);sd(gc_dna);range(gc_dna);max(gc_dna);gc_dna[which(gc_dna == max(gc_dna))]
## G|C
## [1,] 0.2
## [2,] 0.5
## [3,] 0.5
## [1] 0.4
## [1] 0.1732051
## [1] 0.2 0.5
## [1] 0.5
## [1] 0.5 0.5
hist(gc_dna); plot(density(gc_dna))


###3 Reading DNA sequence data from a file
fa_file <-
system.file(package="Biostrings", "extdata", "dm3_upstream2000.fa.gz")
fa_file; readLines(fa_file, 5); tail(readLines(fa_file, 44), 5)
## [1] "C:/Users/liyix/Documents/R/win-library/4.0/Biostrings/extdata/dm3_upstream2000.fa.gz"
## [1] ">NM_078863_up_2000_chr2L_16764737_f chr2L:16764737-16766736"
## [2] "gttggtggcccaccagtgccaaaatacacaagaagaagaaacagcatctt"
## [3] "gacactaaaatgcaaaaattgctttgcgtcaatgactcaaaacgaaaatg"
## [4] "tttttgtgcttttcgaacaaaaaattgggagcagaatattggattcgctt"
## [5] "ttttcgcatcgaatatcttaagggagcaagggaagaaccatctagaataa"
## [1] "cacgcacaccgatcgtcgaatcgaaaagctttcggggtcttacgggatcc"
## [2] "atgggtatcaagttgccccgtataaaaggcaagtttaccggttgcacggt"
## [3] ">NM_001201794_up_2000_chr2L_8382455_f chr2L:8382455-8384454"
## [4] "ttatttatgtaggcgcccgttcccgcagccaaagcactcagaattccggg"
## [5] "cgtgtagcgcaacgaccatctacaaggcaatattttgatcgcttgttagg"
dna_1 <- readDNAStringSet(fa_file); dna_1
## DNAStringSet object of length 26454:
## width seq names
## [1] 2000 GTTGGTGGCCCACCAGTGCCA...CAAGTTTACCGGTTGCACGGT NM_078863_up_2000...
## [2] 2000 TTATTTATGTAGGCGCCCGTT...ATACGGAAAGTCATCCTCGAT NM_001201794_up_2...
## [3] 2000 TTATTTATGTAGGCGCCCGTT...ATACGGAAAGTCATCCTCGAT NM_001201795_up_2...
## [4] 2000 TTATTTATGTAGGCGCCCGTT...ATACGGAAAGTCATCCTCGAT NM_001201796_up_2...
## [5] 2000 TTATTTATGTAGGCGCCCGTT...ATACGGAAAGTCATCCTCGAT NM_001201797_up_2...
## ... ... ...
## [26450] 2000 ATTTACAAGACTAATAAAGAT...AAAATTAAATTTCAATAAAAC NM_001111010_up_2...
## [26451] 2000 GATATACGAAGGACGACCTGC...TTGTTTGAGTTGTTATATATT NM_001015258_up_2...
## [26452] 2000 GATATACGAAGGACGACCTGC...TTGTTTGAGTTGTTATATATT NM_001110997_up_2...
## [26453] 2000 GATATACGAAGGACGACCTGC...TTGTTTGAGTTGTTATATATT NM_001276245_up_2...
## [26454] 2000 CGTATGTATTAGTTAACTCTG...TCAAAGTGTAAGAACAAATTG NM_001015497_up_2...
#################################
#The full set of string classes are
#DNAString[Set]: DNA sequences.
#RNAString[Set]: RNA sequences.
#AAString[Set]: Amino Acids sequences (protein).
#BString[Set]: “Big” sequences, using any kind of letter.
#XString[Set] is basically any of the above classes.
#translate: translate the DNA or RNA sequence into amino acids.
#ref https://uclouvain-cbio.github.io/WSBIM1322/sec-biostrings.html
#ref https://kasperdanielhansen.github.io/genbioconductor/html/Biostrings.html