#install.packages("seqinr")
library(seqinr)
## Warning: package 'seqinr' was built under R version 4.2.3
choosebank("genbank",timeout=20)

#1a) Genbank info for PSEN1
PSEN1 <- query(listname = "PSEN1", query="SP=homo sapiens AND K=PSEN1")
psen1_seqs <- getSequence(PSEN1)
psen1_names <- getName(PSEN1)
length(psen1_names)
## [1] 14
psen1_names
##  [1] "AB159776"       "AF416717.PSEN1" "BC011729.PSEN1" "KT120062.PSEN1"
##  [5] "KT120063.PSEN1" "KT120064.PSEN1" "KT120065.PSEN1" "KT120066.PSEN1"
##  [9] "KT120067.PSEN1" "KT599439.PSEN1" "KT599440.PSEN1" "KU178285"      
## [13] "KU178286"       "KU178287.PSEN1"
psen1_length<-rep(0,length(psen1_seqs))
for(i in 1:length(psen1_seqs)){
  psen1_length[i]<-length(psen1_seqs[[i]])
}

cbind(psen1_names,psen1_length)
##       psen1_names      psen1_length
##  [1,] "AB159776"       "142"       
##  [2,] "AF416717.PSEN1" "1137"      
##  [3,] "BC011729.PSEN1" "1392"      
##  [4,] "KT120062.PSEN1" "1404"      
##  [5,] "KT120063.PSEN1" "1404"      
##  [6,] "KT120064.PSEN1" "1404"      
##  [7,] "KT120065.PSEN1" "1404"      
##  [8,] "KT120066.PSEN1" "1404"      
##  [9,] "KT120067.PSEN1" "1404"      
## [10,] "KT599439.PSEN1" "1404"      
## [11,] "KT599440.PSEN1" "1404"      
## [12,] "KU178285"       "1389"      
## [13,] "KU178286"       "1401"      
## [14,] "KU178287.PSEN1" "108"

There are 14 sequences for the PSEN1 gene. The lengths of each sequence are listed above.

#1b) FASTA file entry for the first sequence.

write.fasta(psen1_seqs[1], psen1_names[1], 
            file.out = "C:/Users/beste/OneDrive/Documents/Grad School/STAT 646/PSEN1.fasta")

#read the created file
psen1_gene <- read.fasta(file = "C:/Users/beste/OneDrive/Documents/Grad School/STAT 646/PSEN1.fasta", 
                       seqtype = "DNA")
getName(psen1_gene)
## [1] "AB159776"
getSequence(psen1_gene)
## [[1]]
##   [1] "a" "a" "t" "c" "t" "a" "t" "a" "c" "c" "c" "c" "a" "t" "t" "c" "a" "c"
##  [19] "a" "g" "a" "a" "g" "a" "t" "a" "c" "c" "g" "a" "g" "a" "c" "t" "g" "t"
##  [37] "g" "g" "g" "c" "c" "a" "g" "a" "g" "a" "g" "c" "c" "c" "t" "g" "c" "a"
##  [55] "c" "t" "c" "a" "a" "t" "t" "c" "t" "g" "a" "a" "t" "g" "c" "t" "g" "c"
##  [73] "c" "a" "t" "c" "a" "t" "g" "a" "t" "c" "a" "g" "t" "g" "t" "c" "a" "t"
##  [91] "t" "g" "t" "t" "g" "t" "c" "a" "t" "g" "a" "c" "t" "a" "t" "c" "c" "t"
## [109] "c" "c" "t" "g" "g" "t" "g" "g" "t" "t" "c" "t" "g" "a" "a" "t" "a" "a"
## [127] "a" "t" "a" "c" "a" "g" "g" "t" "g" "c" "t" "a" "t" "a" "a" "g"
table(getSequence(psen1_gene))
## 
##  a  c  g  t 
## 39 34 30 39
#1c) Cs and Gs in the 3rd sequence
psen1_names[3]
## [1] "BC011729.PSEN1"
table(getSequence(psen1_seqs[[3]]))
## 
##   a   c   g   t 
## 359 309 333 391
table(getSequence(psen1_seqs[[3]]))/psen1_length[3]
## 
##         a         c         g         t 
## 0.2579023 0.2219828 0.2392241 0.2808908
closebank()

There are 309 Cs and 333 Gs in the third sequence and 1392 total bases. This means that about 22% of the bases are Cs and about 24% of the bases are Gs, so about 46% of the bases are Cs or Gs.

#2) Global alignment for GTAG and GAG

#Confirming with R:

#BiocManager::install("Biostrings")
library(Biostrings)
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
seq_1 <- "GTAG"
seq_2 <- "GAG"

myScoringMat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -1, baseOnly = TRUE)
myScoringMat
##    A  C  G  T
## A  1 -1 -1 -1
## C -1  1 -1 -1
## G -1 -1  1 -1
## T -1 -1 -1  1
gapOpen <- -2
gapExtend <- -2

myAlignment <- pairwiseAlignment(seq_1, seq_2, substitutionMatrix = myScoringMat, 
                                 gapOpening = gapOpen, gapExtension = gapExtend, type = "global", scoreOnly = FALSE)
myAlignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: GTAG
## subject: G-AG
## score: -1

#3) Optimal alignment for amino acids

data(PAM30)
PAM30[1:10,1:10]
##    A   R   N   D   C   Q   E   G  H   I
## A  6  -7  -4  -3  -6  -4  -2  -2 -7  -5
## R -7   8  -6 -10  -8  -2  -9  -9 -2  -5
## N -4  -6   8   2 -11  -3  -2  -3  0  -5
## D -3 -10   2   8 -14  -2   2  -3 -4  -7
## C -6  -8 -11 -14  10 -14 -14  -9 -7  -6
## Q -4  -2  -3  -2 -14   8   1  -7  1  -8
## E -2  -9  -2   2 -14   1   8  -4 -5  -5
## G -2  -9  -3  -3  -9  -7  -4   6 -9 -11
## H -7  -2   0  -4  -7   1  -5  -9  9  -9
## I -5  -5  -5  -7  -6  -8  -5 -11 -9   8
myScoringMat <- "PAM30"

seq_1 <- "ASEDLTI"
seq_2 <- "AEEDFGI"

gapOpen <- 0
gapExtend <- -2

#3a) Global alignment
myAlignment <- pairwiseAlignment(seq_2, seq_1, substitutionMatrix = myScoringMat, 
                                 gapOpening = gapOpen, gapExtension = gapExtend, type = "global", scoreOnly = FALSE)
myAlignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: AEEDF-GI
## subject: ASEDLT-I
## score: 19
#3a) Local alignment
myAlignment <- pairwiseAlignment(seq_2, seq_1, substitutionMatrix = myScoringMat, 
                                 gapOpening = gapOpen, gapExtension = gapExtend, type = "local", scoreOnly = FALSE)
myAlignment
## Local PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [1] AEEDF-GI
## subject: [1] ASEDLT-I
## score: 19

For these particular sequences and this particular scoring matrix, the global and local alignments are the same.

#3b) Verify the alignment score
The global and local alignment that was produced has the following score breakdown:

match(AA) +6
mismatch(ES) -4
match(EE) +8
match(DD) +8
mismatch(FL) -3
gap(T) -2
gap(G
) -2
match(II) +8

Altogether this sums to an overall score of 19, which matches the overall score produced by the R code.
#4a) Homologous definition

We say that proteins are homologous if they have a common evolutionary ancestor and therefore perform a similar function for different organisms.


#4b) Comparing human & mouse

library(seqinr)
choosebank("genbank",timeout=20)

#human
H1F0_H <- query(listname = "H1F0", query="SP=homo sapiens AND K=H1F0")
h1f0_h_seqs <- getSequence(H1F0_H)
h1f0_h_names <- getName(H1F0_H)
length(h1f0_h_names)
## [1] 4
H1F0_H$req
## [[1]]
##            name          length           frame          ncbigc 
## "BC000145.H1F0"           "585"             "0"             "1" 
## 
## [[2]]
##            name          length           frame          ncbigc 
## "BC029046.H1F0"           "585"             "0"             "1" 
## 
## [[3]]
##            name          length           frame          ncbigc 
## "CR456502.H1F0"           "585"             "0"             "1" 
## 
## [[4]]
##       name     length      frame     ncbigc 
## "CR542220"      "585"        "0"        "1"
#mouse
H1F0_M <- query(listname = "H1F0", query="SP=mus musculus AND K=H1F0")
h1f0_m_seqs <- getSequence(H1F0_M)
h1f0_m_names <- getName(H1F0_M)
length(h1f0_m_names)
## [1] 3
H1F0_M$req
## [[1]]
##            name          length           frame          ncbigc 
## "BC003830.H1F0"           "585"             "0"             "1" 
## 
## [[2]]
##            name          length           frame          ncbigc 
## "BC011493.H1F0"           "585"             "0"             "1" 
## 
## [[3]]
##            name          length           frame          ncbigc 
## "BC110361.H1F0"           "585"             "0"             "1"
seqH<-toupper(paste0(h1f0_h_seqs[[1]],collapse = ""))
seqH
## [1] "ATGACCGAGAATTCCACGTCCGCCCCTGCGGCCAAGCCCAAGCGGGCCAAGGCCTCCAAGAAGTCCACAGACCACCCCAAGTATTCAGACATGATCGTGGCTGCCATCCAGGCCGAGAAGAACCGCGCTGGCTCCTCGCGCCAGTCCATTCAGAAGTATATCAAGAGCCACTACAAGGTGGGTGAGAACGCTGACTCGCAGATCAAGTTGTCCATCAAGCGCCTGGTCACCACCGGTGTCCTCAAGCAGACCAAAGGGGTGGGGGCCTCGGGGTCCTTCCGGCTAGCCAAGAGCGACGAACCCAAGAAGTCAGTGGCCTTCAAGAAGACCAAGAAGGAAATCAAGAAGGTAGCCACGCCAAAGAAGGCATCCAAGCCCAAGAAGGCTGCCTCCAAAGCCCCAACCAAGAAACCCAAAGCCACCCCGGTCAAGAAGGCCAAGAAGAAGCTGGCTGCCACGCCCAAGAAAGCCAAAAAACCCAAGACTGTCAAAGCCAAGCCGGTCAAGGCATCCAAGCCCAAAAAGGCCAAACCAGTGAAACCCAAAGCAAAGTCCAGTGCCAAGAGGGCCGGCAAGAAGAAGTGA"
seqM<-toupper(paste0(h1f0_m_seqs[[1]],collapse = ""))
seqM
## [1] "ATGACCGAGAACTCCACCTCCGCCCCGGCGGCGAAGCCCAAACGGGCCAAGGCTTCCAAGAAGTCCACGGACCACCCCAAGTATTCAGACATGATCGTGGCTGCTATCCAGGCAGAGAAGAACCGTGCCGGCTCCTCGCGCCAGTCCATCCAAAAGTATATCAAGAGCCACTACAAGGTGGGTGAGAACGCCGACTCCCAGATCAAGTTGTCCATCAAGCGCCTAGTGACCACCGGTGTTCTCAAGCAAACCAAAGGGGTGGGCGCCTCGGGGTCCTTCAGGCTGGCCAAGGGCGATGAGCCCAAAAGGTCGGTGGCTTTCAAGAAGACCAAGAAGGAAGTCAAGAAAGTGGCCACTCCAAAGAAGGCAGCCAAGCCCAAGAAGGCTGCCTCCAAAGCCCCAAGCAAGAAACCCAAAGCCACCCCTGTCAAGAAGGCCAAGAAGAAGCCGGCTGCCACGCCCAAGAAAGCCAAAAAGCCCAAGGTTGTCAAAGTCAAACCAGTCAAGGCCTCCAAACCCAAGAAGGCCAAAACCGTGAAGCCCAAAGCCAAGTCGAGTGCCAAGAGGGCCAGCAAGAAGAAGTGA"
gapOpen <- 0
gapExtend <- -2

myScoringMat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -1, baseOnly = TRUE)
myAlignment <- pairwiseAlignment(seqH, seqM, substitutionMatrix = myScoringMat, 
                                 gapOpening = gapOpen, gapExtension = gapExtend, type = "global", scoreOnly = FALSE)
myAlignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: ATGACCGAGAATTCCACGTCCGCCCCTGCGGCCA...GTCCAGTGCCAAGAGGGCCGGCAAGAAGAAGTGA
## subject: ATGACCGAGAACTCCACCTCCGCCCCGGCGGCGA...GTCGAGTGCCAAGAGGGCCAGCAAGAAGAAGTGA
## score: 481
#4bi) Comparing bases for human & mouse
tab<-table(h1f0_h_seqs[[1]],h1f0_m_seqs[[1]])
tab
##    
##       a   c   g   t
##   a 173   3  12   0
##   c   3 167   4   8
##   g  10   3 135   2
##   t   0   5   2  58
sum(diag(tab))/sum(tab)
## [1] 0.9111111
1-sum(diag(tab))/sum(tab)
## [1] 0.08888889

Looking at the global alignment, it seems that the optimal alignment of the human and mouse H1F0 genes does not involve any gaps or shifts in alignment. This produces a score of 481. Looking at the proportion of matches in the two sequences, there are only about 9% of bases that do not match and 91% that do so it makes sense that the optimal alignment does not involve any rearranging.

#4bii) Dot plot

dotPlot(h1f0_h_seqs[[1]], h1f0_m_seqs[[1]], col=c("white", "red"), 
        xlab = "Human", ylab = "Mouse")

The dot plot shows a fairly high level of alignment, as there is a noticeable diagonal line showing on the dot plot. This corresponds with the 91% matching bases that were found in part (bi) above.

#4c) Multiple alignment

H1F0_C <- query(listname = "H1F0", query="SP=bos taurus AND K=H1F0")
h1f0_c_seqs <- getSequence(H1F0_C)
h1f0_c_names <- getName(H1F0_C)
length(h1f0_c_names)
## [1] 1
H1F0_C$req
## [[1]]
##            name          length           frame          ncbigc 
## "BC122613.H1F0"           "585"             "0"             "1"
seqC<-toupper(paste0(h1f0_c_seqs[[1]],collapse = ""))
seqC
## [1] "ATGACCGAGAACTCCACGTCCACCCCTGCGGCCAAGCCCAAGCGGGCCAAGGCCTCCAAGAAGTCCACAGACCACCCCAAGTATTCAGACATGATCGTGGCCGCCATCCAAGCAGAGAAGAACCGCGCTGGTTCCTCGCGCCAGTCCATCCAGAAGTACATCAAGAGCCACTACAAGGTGGGTGAGAACGCGGACTCCCAGATCAAGTTGTCCATCAAGCGCTTGGTCACCACTGGAGTCCTCAAACAGACCAAAGGGGTGGGTGCCTCGGGGTCTTTCCGCCTGGCCAAGAGCGACGAGCCCAAAAGGTCCGTGGCCTTCAAGAAGACGAAGAAGGAAGTCAAGAAGGTGGCCACGCCAAAGAAGGCAGCCAAGCCCAAGAAGGCTGCCTCCAAAGCCCCAAGCAAGAAGCCCAAAGCCACCCCAGTCAAGAAGGCCAAGAAGAAGCCGGCTGCCACGCCCAAGAAAACCAAAAAACCTAAGACTGTCAAAGCCAAGCCAGTCAAGGCATCCAAGCCTAAGAAGACCAAGCCAGTGAAGCCCAAAGCCAAGTCCAGTGCCAAGAGGACAGGCAAGAAGAAGTGA"
require(Biostrings)
h1f0_all <- DNAStringSet(c(seqH,seqM,seqC))
names(h1f0_all) <- c("HOMO SAPIENS", "MUS MUSCULUS","BOS TAURUS")

## Use of multiple alignment function 'muscle'.
#BiocManager::install("muscle")

library(muscle)

h1f0_align <- muscle(h1f0_all)
## 
## MUSCLE v3.8.31 by Robert C. Edgar
## 
## http://www.drive5.com/muscle
## This software is donated to the public domain.
## Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.
## 
## filea6c79d96ae2 3 seqs, max length 585, avg  length 585
## 1 MB(0%)00:00:00                Iter   1   16.67%  K-mer dist pass 1
4050 MB(100%)00:00:00                Iter   1  100.00%  K-mer dist pass 1
## 4050 MB(100%)00:00:00                Iter   1   16.67%  K-mer dist pass 2
4050 MB(100%)00:00:00                Iter   1  100.00%  K-mer dist pass 2
## 4050 MB(100%)00:00:00                Iter   1   50.00%  Align node       
4050 MB(100%)00:00:00                Iter   1  100.00%  Align node
4050 MB(100%)00:00:00                Iter   1  100.00%  Align node
## 4050 MB(100%)00:00:00                Iter   1   33.33%  Root alignment
4050 MB(100%)00:00:00                Iter   1   66.67%  Root alignment
4050 MB(100%)00:00:00                Iter   1  100.00%  Root alignment
4050 MB(100%)00:00:00                Iter   1  100.00%  Root alignment
## 4050 MB(100%)00:00:00                Iter   2  100.00%  Root alignment
## 4050 MB(100%)00:00:00                Iter   3   66.67%  Refine biparts
4050 MB(100%)00:00:00                Iter   3  100.00%  Refine biparts
4050 MB(100%)00:00:00                Iter   3  133.33%  Refine biparts
4050 MB(100%)00:00:00                Iter   3  100.00%  Refine biparts
## Warning in file.remove(tempIn, tempOut): cannot remove file
## 'C:\Users\beste\AppData\Local\Temp\Rtmp2zeVpr\filea6c6f7e261f.afa', reason
## 'Permission denied'
detail(h1f0_align)

#HOMO SAPIENS   ATGACCGAGA ATTCCACGTC CGCCCCTGCG GCCAAGCCCA AGCGGGCCAA
#MUS MUSCULUS   ATGACCGAGA ACTCCACCTC CGCCCCGGCG GCGAAGCCCA AACGGGCCAA
#BOS TAURUS     ATGACCGAGA ACTCCACGTC CACCCCTGCG GCCAAGCCCA AGCGGGCCAA 

closebank()