#install.packages("seqinr")
library(seqinr)
## Warning: package 'seqinr' was built under R version 4.2.3
choosebank("genbank",timeout=20)
#1a) Genbank info for PSEN1
PSEN1 <- query(listname = "PSEN1", query="SP=homo sapiens AND K=PSEN1")
psen1_seqs <- getSequence(PSEN1)
psen1_names <- getName(PSEN1)
length(psen1_names)
## [1] 14
psen1_names
## [1] "AB159776" "AF416717.PSEN1" "BC011729.PSEN1" "KT120062.PSEN1"
## [5] "KT120063.PSEN1" "KT120064.PSEN1" "KT120065.PSEN1" "KT120066.PSEN1"
## [9] "KT120067.PSEN1" "KT599439.PSEN1" "KT599440.PSEN1" "KU178285"
## [13] "KU178286" "KU178287.PSEN1"
psen1_length<-rep(0,length(psen1_seqs))
for(i in 1:length(psen1_seqs)){
psen1_length[i]<-length(psen1_seqs[[i]])
}
cbind(psen1_names,psen1_length)
## psen1_names psen1_length
## [1,] "AB159776" "142"
## [2,] "AF416717.PSEN1" "1137"
## [3,] "BC011729.PSEN1" "1392"
## [4,] "KT120062.PSEN1" "1404"
## [5,] "KT120063.PSEN1" "1404"
## [6,] "KT120064.PSEN1" "1404"
## [7,] "KT120065.PSEN1" "1404"
## [8,] "KT120066.PSEN1" "1404"
## [9,] "KT120067.PSEN1" "1404"
## [10,] "KT599439.PSEN1" "1404"
## [11,] "KT599440.PSEN1" "1404"
## [12,] "KU178285" "1389"
## [13,] "KU178286" "1401"
## [14,] "KU178287.PSEN1" "108"
There are 14 sequences for the PSEN1 gene. The lengths of each sequence are listed above.
#1b) FASTA file entry for the first sequence.
write.fasta(psen1_seqs[1], psen1_names[1],
file.out = "C:/Users/beste/OneDrive/Documents/Grad School/STAT 646/PSEN1.fasta")
#read the created file
psen1_gene <- read.fasta(file = "C:/Users/beste/OneDrive/Documents/Grad School/STAT 646/PSEN1.fasta",
seqtype = "DNA")
getName(psen1_gene)
## [1] "AB159776"
getSequence(psen1_gene)
## [[1]]
## [1] "a" "a" "t" "c" "t" "a" "t" "a" "c" "c" "c" "c" "a" "t" "t" "c" "a" "c"
## [19] "a" "g" "a" "a" "g" "a" "t" "a" "c" "c" "g" "a" "g" "a" "c" "t" "g" "t"
## [37] "g" "g" "g" "c" "c" "a" "g" "a" "g" "a" "g" "c" "c" "c" "t" "g" "c" "a"
## [55] "c" "t" "c" "a" "a" "t" "t" "c" "t" "g" "a" "a" "t" "g" "c" "t" "g" "c"
## [73] "c" "a" "t" "c" "a" "t" "g" "a" "t" "c" "a" "g" "t" "g" "t" "c" "a" "t"
## [91] "t" "g" "t" "t" "g" "t" "c" "a" "t" "g" "a" "c" "t" "a" "t" "c" "c" "t"
## [109] "c" "c" "t" "g" "g" "t" "g" "g" "t" "t" "c" "t" "g" "a" "a" "t" "a" "a"
## [127] "a" "t" "a" "c" "a" "g" "g" "t" "g" "c" "t" "a" "t" "a" "a" "g"
table(getSequence(psen1_gene))
##
## a c g t
## 39 34 30 39
#1c) Cs and Gs in the 3rd sequence
psen1_names[3]
## [1] "BC011729.PSEN1"
table(getSequence(psen1_seqs[[3]]))
##
## a c g t
## 359 309 333 391
table(getSequence(psen1_seqs[[3]]))/psen1_length[3]
##
## a c g t
## 0.2579023 0.2219828 0.2392241 0.2808908
closebank()
There are 309 Cs and 333 Gs in the third sequence and 1392 total bases. This means that about 22% of the bases are Cs and about 24% of the bases are Gs, so about 46% of the bases are Cs or Gs.
#2) Global alignment for GTAG and GAG
#Confirming with R:
#BiocManager::install("Biostrings")
library(Biostrings)
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
seq_1 <- "GTAG"
seq_2 <- "GAG"
myScoringMat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -1, baseOnly = TRUE)
myScoringMat
## A C G T
## A 1 -1 -1 -1
## C -1 1 -1 -1
## G -1 -1 1 -1
## T -1 -1 -1 1
gapOpen <- -2
gapExtend <- -2
myAlignment <- pairwiseAlignment(seq_1, seq_2, substitutionMatrix = myScoringMat,
gapOpening = gapOpen, gapExtension = gapExtend, type = "global", scoreOnly = FALSE)
myAlignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: GTAG
## subject: G-AG
## score: -1
#3) Optimal alignment for amino acids
data(PAM30)
PAM30[1:10,1:10]
## A R N D C Q E G H I
## A 6 -7 -4 -3 -6 -4 -2 -2 -7 -5
## R -7 8 -6 -10 -8 -2 -9 -9 -2 -5
## N -4 -6 8 2 -11 -3 -2 -3 0 -5
## D -3 -10 2 8 -14 -2 2 -3 -4 -7
## C -6 -8 -11 -14 10 -14 -14 -9 -7 -6
## Q -4 -2 -3 -2 -14 8 1 -7 1 -8
## E -2 -9 -2 2 -14 1 8 -4 -5 -5
## G -2 -9 -3 -3 -9 -7 -4 6 -9 -11
## H -7 -2 0 -4 -7 1 -5 -9 9 -9
## I -5 -5 -5 -7 -6 -8 -5 -11 -9 8
myScoringMat <- "PAM30"
seq_1 <- "ASEDLTI"
seq_2 <- "AEEDFGI"
gapOpen <- 0
gapExtend <- -2
#3a) Global alignment
myAlignment <- pairwiseAlignment(seq_2, seq_1, substitutionMatrix = myScoringMat,
gapOpening = gapOpen, gapExtension = gapExtend, type = "global", scoreOnly = FALSE)
myAlignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: AEEDF-GI
## subject: ASEDLT-I
## score: 19
#3a) Local alignment
myAlignment <- pairwiseAlignment(seq_2, seq_1, substitutionMatrix = myScoringMat,
gapOpening = gapOpen, gapExtension = gapExtend, type = "local", scoreOnly = FALSE)
myAlignment
## Local PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [1] AEEDF-GI
## subject: [1] ASEDLT-I
## score: 19
For these particular sequences and this particular scoring matrix, the global and local alignments are the same.
#3b) Verify the alignment score
The global and local alignment that was produced has the following score
breakdown:
match(AA) +6
mismatch(ES) -4
match(EE) +8
match(DD) +8
mismatch(FL) -3
gap(T) -2
gap(G) -2
match(II) +8
Altogether this sums to an overall score of 19, which matches the
overall score produced by the R code.
#4a) Homologous
definition
We say that proteins are homologous if they have a common evolutionary ancestor and therefore perform a similar function for different organisms.
#4b) Comparing human & mouse
library(seqinr)
choosebank("genbank",timeout=20)
#human
H1F0_H <- query(listname = "H1F0", query="SP=homo sapiens AND K=H1F0")
h1f0_h_seqs <- getSequence(H1F0_H)
h1f0_h_names <- getName(H1F0_H)
length(h1f0_h_names)
## [1] 4
H1F0_H$req
## [[1]]
## name length frame ncbigc
## "BC000145.H1F0" "585" "0" "1"
##
## [[2]]
## name length frame ncbigc
## "BC029046.H1F0" "585" "0" "1"
##
## [[3]]
## name length frame ncbigc
## "CR456502.H1F0" "585" "0" "1"
##
## [[4]]
## name length frame ncbigc
## "CR542220" "585" "0" "1"
#mouse
H1F0_M <- query(listname = "H1F0", query="SP=mus musculus AND K=H1F0")
h1f0_m_seqs <- getSequence(H1F0_M)
h1f0_m_names <- getName(H1F0_M)
length(h1f0_m_names)
## [1] 3
H1F0_M$req
## [[1]]
## name length frame ncbigc
## "BC003830.H1F0" "585" "0" "1"
##
## [[2]]
## name length frame ncbigc
## "BC011493.H1F0" "585" "0" "1"
##
## [[3]]
## name length frame ncbigc
## "BC110361.H1F0" "585" "0" "1"
seqH<-toupper(paste0(h1f0_h_seqs[[1]],collapse = ""))
seqH
## [1] "ATGACCGAGAATTCCACGTCCGCCCCTGCGGCCAAGCCCAAGCGGGCCAAGGCCTCCAAGAAGTCCACAGACCACCCCAAGTATTCAGACATGATCGTGGCTGCCATCCAGGCCGAGAAGAACCGCGCTGGCTCCTCGCGCCAGTCCATTCAGAAGTATATCAAGAGCCACTACAAGGTGGGTGAGAACGCTGACTCGCAGATCAAGTTGTCCATCAAGCGCCTGGTCACCACCGGTGTCCTCAAGCAGACCAAAGGGGTGGGGGCCTCGGGGTCCTTCCGGCTAGCCAAGAGCGACGAACCCAAGAAGTCAGTGGCCTTCAAGAAGACCAAGAAGGAAATCAAGAAGGTAGCCACGCCAAAGAAGGCATCCAAGCCCAAGAAGGCTGCCTCCAAAGCCCCAACCAAGAAACCCAAAGCCACCCCGGTCAAGAAGGCCAAGAAGAAGCTGGCTGCCACGCCCAAGAAAGCCAAAAAACCCAAGACTGTCAAAGCCAAGCCGGTCAAGGCATCCAAGCCCAAAAAGGCCAAACCAGTGAAACCCAAAGCAAAGTCCAGTGCCAAGAGGGCCGGCAAGAAGAAGTGA"
seqM<-toupper(paste0(h1f0_m_seqs[[1]],collapse = ""))
seqM
## [1] "ATGACCGAGAACTCCACCTCCGCCCCGGCGGCGAAGCCCAAACGGGCCAAGGCTTCCAAGAAGTCCACGGACCACCCCAAGTATTCAGACATGATCGTGGCTGCTATCCAGGCAGAGAAGAACCGTGCCGGCTCCTCGCGCCAGTCCATCCAAAAGTATATCAAGAGCCACTACAAGGTGGGTGAGAACGCCGACTCCCAGATCAAGTTGTCCATCAAGCGCCTAGTGACCACCGGTGTTCTCAAGCAAACCAAAGGGGTGGGCGCCTCGGGGTCCTTCAGGCTGGCCAAGGGCGATGAGCCCAAAAGGTCGGTGGCTTTCAAGAAGACCAAGAAGGAAGTCAAGAAAGTGGCCACTCCAAAGAAGGCAGCCAAGCCCAAGAAGGCTGCCTCCAAAGCCCCAAGCAAGAAACCCAAAGCCACCCCTGTCAAGAAGGCCAAGAAGAAGCCGGCTGCCACGCCCAAGAAAGCCAAAAAGCCCAAGGTTGTCAAAGTCAAACCAGTCAAGGCCTCCAAACCCAAGAAGGCCAAAACCGTGAAGCCCAAAGCCAAGTCGAGTGCCAAGAGGGCCAGCAAGAAGAAGTGA"
gapOpen <- 0
gapExtend <- -2
myScoringMat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -1, baseOnly = TRUE)
myAlignment <- pairwiseAlignment(seqH, seqM, substitutionMatrix = myScoringMat,
gapOpening = gapOpen, gapExtension = gapExtend, type = "global", scoreOnly = FALSE)
myAlignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: ATGACCGAGAATTCCACGTCCGCCCCTGCGGCCA...GTCCAGTGCCAAGAGGGCCGGCAAGAAGAAGTGA
## subject: ATGACCGAGAACTCCACCTCCGCCCCGGCGGCGA...GTCGAGTGCCAAGAGGGCCAGCAAGAAGAAGTGA
## score: 481
#4bi) Comparing bases for human & mouse
tab<-table(h1f0_h_seqs[[1]],h1f0_m_seqs[[1]])
tab
##
## a c g t
## a 173 3 12 0
## c 3 167 4 8
## g 10 3 135 2
## t 0 5 2 58
sum(diag(tab))/sum(tab)
## [1] 0.9111111
1-sum(diag(tab))/sum(tab)
## [1] 0.08888889
Looking at the global alignment, it seems that the optimal alignment of the human and mouse H1F0 genes does not involve any gaps or shifts in alignment. This produces a score of 481. Looking at the proportion of matches in the two sequences, there are only about 9% of bases that do not match and 91% that do so it makes sense that the optimal alignment does not involve any rearranging.
#4bii) Dot plot
dotPlot(h1f0_h_seqs[[1]], h1f0_m_seqs[[1]], col=c("white", "red"),
xlab = "Human", ylab = "Mouse")
The dot plot shows a fairly high level of alignment, as there is a noticeable diagonal line showing on the dot plot. This corresponds with the 91% matching bases that were found in part (bi) above.
#4c) Multiple alignment
H1F0_C <- query(listname = "H1F0", query="SP=bos taurus AND K=H1F0")
h1f0_c_seqs <- getSequence(H1F0_C)
h1f0_c_names <- getName(H1F0_C)
length(h1f0_c_names)
## [1] 1
H1F0_C$req
## [[1]]
## name length frame ncbigc
## "BC122613.H1F0" "585" "0" "1"
seqC<-toupper(paste0(h1f0_c_seqs[[1]],collapse = ""))
seqC
## [1] "ATGACCGAGAACTCCACGTCCACCCCTGCGGCCAAGCCCAAGCGGGCCAAGGCCTCCAAGAAGTCCACAGACCACCCCAAGTATTCAGACATGATCGTGGCCGCCATCCAAGCAGAGAAGAACCGCGCTGGTTCCTCGCGCCAGTCCATCCAGAAGTACATCAAGAGCCACTACAAGGTGGGTGAGAACGCGGACTCCCAGATCAAGTTGTCCATCAAGCGCTTGGTCACCACTGGAGTCCTCAAACAGACCAAAGGGGTGGGTGCCTCGGGGTCTTTCCGCCTGGCCAAGAGCGACGAGCCCAAAAGGTCCGTGGCCTTCAAGAAGACGAAGAAGGAAGTCAAGAAGGTGGCCACGCCAAAGAAGGCAGCCAAGCCCAAGAAGGCTGCCTCCAAAGCCCCAAGCAAGAAGCCCAAAGCCACCCCAGTCAAGAAGGCCAAGAAGAAGCCGGCTGCCACGCCCAAGAAAACCAAAAAACCTAAGACTGTCAAAGCCAAGCCAGTCAAGGCATCCAAGCCTAAGAAGACCAAGCCAGTGAAGCCCAAAGCCAAGTCCAGTGCCAAGAGGACAGGCAAGAAGAAGTGA"
require(Biostrings)
h1f0_all <- DNAStringSet(c(seqH,seqM,seqC))
names(h1f0_all) <- c("HOMO SAPIENS", "MUS MUSCULUS","BOS TAURUS")
## Use of multiple alignment function 'muscle'.
#BiocManager::install("muscle")
library(muscle)
h1f0_align <- muscle(h1f0_all)
##
## MUSCLE v3.8.31 by Robert C. Edgar
##
## http://www.drive5.com/muscle
## This software is donated to the public domain.
## Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.
##
## filea6c79d96ae2 3 seqs, max length 585, avg length 585
## 1 MB(0%)00:00:00 Iter 1 16.67% K-mer dist pass 1
4050 MB(100%)00:00:00 Iter 1 100.00% K-mer dist pass 1
## 4050 MB(100%)00:00:00 Iter 1 16.67% K-mer dist pass 2
4050 MB(100%)00:00:00 Iter 1 100.00% K-mer dist pass 2
## 4050 MB(100%)00:00:00 Iter 1 50.00% Align node
4050 MB(100%)00:00:00 Iter 1 100.00% Align node
4050 MB(100%)00:00:00 Iter 1 100.00% Align node
## 4050 MB(100%)00:00:00 Iter 1 33.33% Root alignment
4050 MB(100%)00:00:00 Iter 1 66.67% Root alignment
4050 MB(100%)00:00:00 Iter 1 100.00% Root alignment
4050 MB(100%)00:00:00 Iter 1 100.00% Root alignment
## 4050 MB(100%)00:00:00 Iter 2 100.00% Root alignment
## 4050 MB(100%)00:00:00 Iter 3 66.67% Refine biparts
4050 MB(100%)00:00:00 Iter 3 100.00% Refine biparts
4050 MB(100%)00:00:00 Iter 3 133.33% Refine biparts
4050 MB(100%)00:00:00 Iter 3 100.00% Refine biparts
## Warning in file.remove(tempIn, tempOut): cannot remove file
## 'C:\Users\beste\AppData\Local\Temp\Rtmp2zeVpr\filea6c6f7e261f.afa', reason
## 'Permission denied'
detail(h1f0_align)
#HOMO SAPIENS ATGACCGAGA ATTCCACGTC CGCCCCTGCG GCCAAGCCCA AGCGGGCCAA
#MUS MUSCULUS ATGACCGAGA ACTCCACCTC CGCCCCGGCG GCGAAGCCCA AACGGGCCAA
#BOS TAURUS ATGACCGAGA ACTCCACGTC CACCCCTGCG GCCAAGCCCA AGCGGGCCAA
closebank()