rm(list = ls())
#BiocManager::install("Biostrings")
###1 Working with sequences - a first start
sequences <- c( "AATTC", "ATCG","TTCCAAGG")
sequences; length(sequences); nchar(sequences); sequences[c(1, 3)]; sample(sequences)
## [1] "AATTC"    "ATCG"     "TTCCAAGG"
## [1] 3
## [1] 5 4 8
## [1] "AATTC"    "TTCCAAGG"
## [1] "AATTC"    "TTCCAAGG" "ATCG"
###2 Working with sequences - using Biostrings
library("Biostrings")
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
dna <- DNAStringSet(sequences)
dna; dna[[2]]; class(dna); length(dna); nchar(dna); dna[c(1, 3)]; sample(dna); rev(dna); sort(dna)
## DNAStringSet object of length 3:
##     width seq
## [1]     5 AATTC
## [2]     4 ATCG
## [3]     8 TTCCAAGG
## 4-letter DNAString object
## seq: ATCG
## [1] "DNAStringSet"
## attr(,"package")
## [1] "Biostrings"
## [1] 3
## [1] 5 4 8
## DNAStringSet object of length 2:
##     width seq
## [1]     5 AATTC
## [2]     8 TTCCAAGG
## DNAStringSet object of length 3:
##     width seq
## [1]     8 TTCCAAGG
## [2]     4 ATCG
## [3]     5 AATTC
## DNAStringSet object of length 3:
##     width seq
## [1]     8 TTCCAAGG
## [2]     4 ATCG
## [3]     5 AATTC
## DNAStringSet object of length 3:
##     width seq
## [1]     5 AATTC
## [2]     4 ATCG
## [3]     8 TTCCAAGG
#Note that rev on a DNAStringSet just reverse the order of the elements, 
#whereas rev on a DNAString actually reverse the string.
dna[[1]]; rev(dna[[1]])
## 5-letter DNAString object
## seq: AATTC
## 5-letter DNAString object
## seq: CTTAA
#DNAStringSet("weare")
#Error in
DNAString("ATACG"); DNAStringSet("ATACA")
## 5-letter DNAString object
## seq: ATACG
## DNAStringSet object of length 1:
##     width seq
## [1]     5 ATACA
####################################
names(dna) <- paste0("seq", 1:3); dna
## DNAStringSet object of length 3:
##     width seq                                               names               
## [1]     5 AATTC                                             seq1
## [2]     4 ATCG                                              seq2
## [3]     8 TTCCAAGG                                          seq3
#translate: translate the DNA or RNA sequence into amino acids.
dna2 <- dna
dna2; translate(dna2)
## DNAStringSet object of length 3:
##     width seq                                               names               
## [1]     5 AATTC                                             seq1
## [2]     4 ATCG                                              seq2
## [3]     8 TTCCAAGG                                          seq3
## Warning in .Call2("DNAStringSet_translate", x, skip_code,
## dna_codes[codon_alphabet], : in 'x[[1]]': last 2 bases were ignored
## Warning in .Call2("DNAStringSet_translate", x, skip_code,
## dna_codes[codon_alphabet], : in 'x[[2]]': last base was ignored
## Warning in .Call2("DNAStringSet_translate", x, skip_code,
## dna_codes[codon_alphabet], : in 'x[[3]]': last 2 bases were ignored
## AAStringSet object of length 3:
##     width seq                                               names               
## [1]     1 N                                                 seq1
## [2]     1 I                                                 seq2
## [3]     2 FQ                                                seq3
#complement, reverseComplement: (reverse) complement the sequence.
#reverse: reverse the sequence.
dna2; reverseComplement(dna2)
## DNAStringSet object of length 3:
##     width seq                                               names               
## [1]     5 AATTC                                             seq1
## [2]     4 ATCG                                              seq2
## [3]     8 TTCCAAGG                                          seq3
## DNAStringSet object of length 3:
##     width seq                                               names               
## [1]     5 GAATT                                             seq1
## [2]     4 CGAT                                              seq2
## [3]     8 CCTTGGAA                                          seq3
dna2; complement(dna2)
## DNAStringSet object of length 3:
##     width seq                                               names               
## [1]     5 AATTC                                             seq1
## [2]     4 ATCG                                              seq2
## [3]     8 TTCCAAGG                                          seq3
## DNAStringSet object of length 3:
##     width seq                                               names               
## [1]     5 TTAAG                                             seq1
## [2]     4 TAGC                                              seq2
## [3]     8 AAGGTTCC                                          seq3
dna2; reverse(dna2)
## DNAStringSet object of length 3:
##     width seq                                               names               
## [1]     5 AATTC                                             seq1
## [2]     4 ATCG                                              seq2
## [3]     8 TTCCAAGG                                          seq3
## DNAStringSet object of length 3:
##     width seq                                               names               
## [1]     5 CTTAA                                             seq1
## [2]     4 GCTA                                              seq2
## [3]     8 GGAACCTT                                          seq3
###########5 Subsetting sequences
subseq(dna2, start = 1, end = 3)
## DNAStringSet object of length 3:
##     width seq                                               names               
## [1]     3 AAT                                               seq1
## [2]     3 ATC                                               seq2
## [3]     3 TTC                                               seq3
subseq(dna2, start = 1, width = 3)
## DNAStringSet object of length 3:
##     width seq                                               names               
## [1]     3 AAT                                               seq1
## [2]     3 ATC                                               seq2
## [3]     3 TTC                                               seq3
subseq(dna2[[1]], start = 1, width = 3)
## 3-letter DNAString object
## seq: AAT
subseq(dna2[[1]], start = 1, end = 3)
## 3-letter DNAString object
## seq: AAT
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.4
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::collapse()   masks Biostrings::collapse(), IRanges::collapse()
## x dplyr::combine()    masks BiocGenerics::combine()
## x purrr::compact()    masks XVector::compact()
## x dplyr::desc()       masks IRanges::desc()
## x tidyr::expand()     masks S4Vectors::expand()
## x dplyr::filter()     masks stats::filter()
## x dplyr::first()      masks S4Vectors::first()
## x dplyr::lag()        masks stats::lag()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce()     masks IRanges::reduce()
## x dplyr::rename()     masks S4Vectors::rename()
## x dplyr::slice()      masks XVector::slice(), IRanges::slice()
dna2[rep(1, 3)] %>%               # DNAStringSet with first sequence 10 times
  subseq(start = 1, end = 1:3) # subsequences of 1, 2, ... 10 nucleotides
## DNAStringSet object of length 3:
##     width seq                                               names               
## [1]     1 A                                                 seq1
## [2]     2 AA                                                seq1
## [3]     3 AAT                                               seq1
####################Counting letters
#alphabetFrequency, letterFrequency: Compute the frequency of all characters (alphabetFrequency) 
#or only specific letters (letterFrequency).
dna2; dna2@pool; dna2@ranges; dna2@elementType; dna2@elementMetadata; dna2@metadata
## DNAStringSet object of length 3:
##     width seq                                               names               
## [1]     5 AATTC                                             seq1
## [2]     4 ATCG                                              seq2
## [3]     8 TTCCAAGG                                          seq3
## SharedRaw_Pool of length 1
## 1: SharedRaw of length 17 (data starting at address 000000002C0ECE28)
##   group start end width names
## 1     1     1   5     5  seq1
## 2     1     6   9     4  seq2
## 3     1    10  17     8  seq3
## [1] "DNAString"
## NULL
## list()
alphabetFrequency(dna2); letterFrequency(dna2, "GC"); consensusMatrix(dna2, as.prob = TRUE)
##      A C G T M R W S Y K V H D B N - + .
## [1,] 2 1 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [3,] 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##      G|C
## [1,]   1
## [2,]   2
## [3,]   4
##        [,1]      [,2]      [,3]      [,4] [,5] [,6] [,7] [,8]
## A 0.6666667 0.3333333 0.0000000 0.0000000  0.5    1    0    0
## C 0.0000000 0.0000000 0.6666667 0.3333333  0.5    0    0    0
## G 0.0000000 0.0000000 0.0000000 0.3333333  0.0    0    1    1
## T 0.3333333 0.6666667 0.3333333 0.3333333  0.0    0    0    0
## M 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
## R 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
## W 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
## S 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
## Y 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
## K 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
## V 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
## H 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
## D 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
## B 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
## N 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
## - 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
## + 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
## . 0.0000000 0.0000000 0.0000000 0.0000000  0.0    0    0    0
consensusMatrix(dna2)
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## A    2    1    0    0    1    1    0    0
## C    0    0    2    1    1    0    0    0
## G    0    0    0    1    0    0    1    1
## T    1    2    1    1    0    0    0    0
## M    0    0    0    0    0    0    0    0
## R    0    0    0    0    0    0    0    0
## W    0    0    0    0    0    0    0    0
## S    0    0    0    0    0    0    0    0
## Y    0    0    0    0    0    0    0    0
## K    0    0    0    0    0    0    0    0
## V    0    0    0    0    0    0    0    0
## H    0    0    0    0    0    0    0    0
## D    0    0    0    0    0    0    0    0
## B    0    0    0    0    0    0    0    0
## N    0    0    0    0    0    0    0    0
## -    0    0    0    0    0    0    0    0
## +    0    0    0    0    0    0    0    0
## .    0    0    0    0    0    0    0    0
##############################
gc_dna <-  letterFrequency(dna2, letters = "GC", as.prob = TRUE)
gc_dna; mean(gc_dna);sd(gc_dna);range(gc_dna);max(gc_dna);gc_dna[which(gc_dna == max(gc_dna))]
##      G|C
## [1,] 0.2
## [2,] 0.5
## [3,] 0.5
## [1] 0.4
## [1] 0.1732051
## [1] 0.2 0.5
## [1] 0.5
## [1] 0.5 0.5
hist(gc_dna); plot(density(gc_dna))

###3 Reading DNA sequence data from a file
fa_file <-
  system.file(package="Biostrings", "extdata", "dm3_upstream2000.fa.gz")
fa_file; readLines(fa_file, 5); tail(readLines(fa_file, 44), 5)
## [1] "C:/Users/liyix/Documents/R/win-library/4.0/Biostrings/extdata/dm3_upstream2000.fa.gz"
## [1] ">NM_078863_up_2000_chr2L_16764737_f chr2L:16764737-16766736"
## [2] "gttggtggcccaccagtgccaaaatacacaagaagaagaaacagcatctt"         
## [3] "gacactaaaatgcaaaaattgctttgcgtcaatgactcaaaacgaaaatg"         
## [4] "tttttgtgcttttcgaacaaaaaattgggagcagaatattggattcgctt"         
## [5] "ttttcgcatcgaatatcttaagggagcaagggaagaaccatctagaataa"
## [1] "cacgcacaccgatcgtcgaatcgaaaagctttcggggtcttacgggatcc"         
## [2] "atgggtatcaagttgccccgtataaaaggcaagtttaccggttgcacggt"         
## [3] ">NM_001201794_up_2000_chr2L_8382455_f chr2L:8382455-8384454"
## [4] "ttatttatgtaggcgcccgttcccgcagccaaagcactcagaattccggg"         
## [5] "cgtgtagcgcaacgaccatctacaaggcaatattttgatcgcttgttagg"
dna_1 <- readDNAStringSet(fa_file); dna_1
## DNAStringSet object of length 26454:
##         width seq                                           names               
##     [1]  2000 GTTGGTGGCCCACCAGTGCCA...CAAGTTTACCGGTTGCACGGT NM_078863_up_2000...
##     [2]  2000 TTATTTATGTAGGCGCCCGTT...ATACGGAAAGTCATCCTCGAT NM_001201794_up_2...
##     [3]  2000 TTATTTATGTAGGCGCCCGTT...ATACGGAAAGTCATCCTCGAT NM_001201795_up_2...
##     [4]  2000 TTATTTATGTAGGCGCCCGTT...ATACGGAAAGTCATCCTCGAT NM_001201796_up_2...
##     [5]  2000 TTATTTATGTAGGCGCCCGTT...ATACGGAAAGTCATCCTCGAT NM_001201797_up_2...
##     ...   ... ...
## [26450]  2000 ATTTACAAGACTAATAAAGAT...AAAATTAAATTTCAATAAAAC NM_001111010_up_2...
## [26451]  2000 GATATACGAAGGACGACCTGC...TTGTTTGAGTTGTTATATATT NM_001015258_up_2...
## [26452]  2000 GATATACGAAGGACGACCTGC...TTGTTTGAGTTGTTATATATT NM_001110997_up_2...
## [26453]  2000 GATATACGAAGGACGACCTGC...TTGTTTGAGTTGTTATATATT NM_001276245_up_2...
## [26454]  2000 CGTATGTATTAGTTAACTCTG...TCAAAGTGTAAGAACAAATTG NM_001015497_up_2...
#################################
#The full set of string classes are
#DNAString[Set]: DNA sequences.
#RNAString[Set]: RNA sequences.
#AAString[Set]: Amino Acids sequences (protein).
#BString[Set]: “Big” sequences, using any kind of letter.
#XString[Set] is basically any of the above classes.
#translate: translate the DNA or RNA sequence into amino acids.

#ref https://uclouvain-cbio.github.io/WSBIM1322/sec-biostrings.html
#ref https://kasperdanielhansen.github.io/genbioconductor/html/Biostrings.html