#1 BASH part of the project
#First, I changed my directory to make sure I am pulling from and saving to the right folder.
#phmmer Human_PUSL1.fasta uniprot_sprot.fasta
#For Q5 of the midterm, I used HMMER in R and BLAST to find the best matches to my assigned gene.
#With R, I got a bacterial match with an E-value of 3.6e-24. With BLAST, my top match was a human one with an E-value of 0.
#With command-line, my top match is a human one with an E-value of 8.5e-203 and my second match is a mouse one with an E-value of 4.1e-153.
#Quite the contrast because, with BLAST, my mouse results didn't even show up in the first 100.
#Please see accompanying figure 1.
#phmmer Mouse_PUSL1.fasta uniprot_sprot.fasta
#Unfortunately, on the midterm, I did not properly complete the HMMER and BLAST steps for the mouse gene so lost marks as a result.
#To complete the comparison between command-line and BLAST, I conducted a BLAST search for it for the final.
#I have found that the top, not only 2, but 5 results are mouse hits with E-values of 0.
#With command-line, however, the top hit is a mouse one with an E-value of 2.9e-195 and the second hit is a human one with an E-value of 4.4e-153.
#Please see accompanying figure 2.
#For Q7 of the midterm, I used R to build the MSA and profile HMM for the given fly protein. Here, I used command-line.
#The portions done in clustalw are included as screenshots. Please refer to accompanying figures 3-12.
#hmmbuild fly_protein.hmm fly_protein_new
#hmmsearch fly_protein.hmm uniprot_sprot.fasta
#The most significant human hit is ZMAT3_HUMAN (Q9HA38, Zinc finger matrin-type protein 3 O) with an E-value of 4.6e-12.
#Please see accompanying figure 13.
#Adding all libraries for the R section of the project.
library("HMM")
library(seqinr)
library("ape")
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unsplit, which,
## which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
library(phangorn)
library("bio3d")
##
## Attaching package: 'bio3d'
## The following object is masked from 'package:Biostrings':
##
## mask
## The following object is masked from 'package:IRanges':
##
## trim
## The following object is masked from 'package:ape':
##
## consensus
## The following objects are masked from 'package:seqinr':
##
## consensus, read.fasta, write.fasta
library("msa")
#2 - My partner for this question is Zaina Atieh.
#My New Junction Detector
TheUnitedStates = c("Exon", "toA","toG1","toG2", "toT","Intron")
StarsAndStripes = c("A","C","G","T")
transProbs = matrix(c('E_toE'=0.9,'E_toA'=0.1, 'E_toG1'= 0, 'E_toG2'= 0, 'E_toT'= 0,'E_toI'=0, 'A_toE'=0,'A_toA'=0, 'A_toG1'=1.0, 'A_toG2'=0, 'A_toT'=0, 'A_toI'=0, 'G1_toE'=0,'G1_toA'=0,'G1_toG1'=0,'G1_toG2'=1.0,'G1_toT'=0,'G1_toI'=0,'G2_toE'=0,'G2_toA'=0,'G2_toG1'=0,'G2_toG2'=0,'G2_toT'=1.0,'G2_toI'=0,'T_toE'=0,'T_toA'=0,'T_toG1'=0,'T_toG2'=0,'T_toT'=0,'T_toI'=1.0, 'I_toE'=0, 'I_toA'=0,'I_toG1'=0,'I_toG2'=0,'I_toT'=0, 'I_toI'=1.0), c(length(TheUnitedStates), length(TheUnitedStates)), byrow = TRUE)
transProbs
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.9 0.1 0 0 0 0
## [2,] 0.0 0.0 1 0 0 0
## [3,] 0.0 0.0 0 1 0 0
## [4,] 0.0 0.0 0 0 1 0
## [5,] 0.0 0.0 0 0 0 1
## [6,] 0.0 0.0 0 0 0 1
emissionProbs = matrix(c('A'=0.20,'C'=0.30,'G'=0.30,'T'=0.20, 'A'=0.997,'C'=0.001,'G'=0.001,'T'=0.001,'A'=0.001,'C'=0.001,'G'=0.997,'T'=0.001,'A'=0.001,'C'=0.001,'G'=0.997,'T'=0.001,'A'=0.001,'C'=0.001,'G'=0.001,'T'=0.997,'A'=0.15,'C'=0.35,'G'=0.35,'T'=0.15), c(length(TheUnitedStates), length(StarsAndStripes)), byrow = TRUE)
emissionProbs
## [,1] [,2] [,3] [,4]
## [1,] 0.200 0.300 0.300 0.200
## [2,] 0.997 0.001 0.001 0.001
## [3,] 0.001 0.001 0.997 0.001
## [4,] 0.001 0.001 0.997 0.001
## [5,] 0.001 0.001 0.001 0.997
## [6,] 0.150 0.350 0.350 0.150
hmm <- initHMM(TheUnitedStates, StarsAndStripes, startProbs = c(1,0,0,0,0,0), transProbs = transProbs, emissionProbs = emissionProbs)
#Class Junction Detector
ParkerStates = c("Exon", "5site", "Intron")
ParkerSymbols = c("A","C","G","T")
ParkertransProbs = matrix(c('EE'=0.9,'E5'=0.1,'EI'=0, '5E'=0, '55'=0, '5I'=1.0, 'IE'=0, 'I5'=0, 'II'=1.0), c(length(ParkerStates), length(ParkerStates)), byrow = TRUE)
ParkertransProbs
## [,1] [,2] [,3]
## [1,] 0.9 0.1 0
## [2,] 0.0 0.0 1
## [3,] 0.0 0.0 1
ParkeremissionProbs = matrix(c('A'=0.25,'C'=0.25,'G'=0.25,'T'=0.25, 'A'=0.05,'C'=0.0,'G'=0.95,'T'=0.0, 'A'=0.4,'C'=0.1,'G'=0.1,'T'=0.4), c(length(ParkerStates), length(ParkerSymbols)), byrow = TRUE)
ParkeremissionProbs
## [,1] [,2] [,3] [,4]
## [1,] 0.25 0.25 0.25 0.25
## [2,] 0.05 0.00 0.95 0.00
## [3,] 0.40 0.10 0.10 0.40
Parkerhmm <- initHMM(ParkerStates, ParkerSymbols, startProbs = c(1,0,0), transProbs = ParkertransProbs, emissionProbs = ParkeremissionProbs)
#PUSL1 Human
BaldEagle <- s2c('AGCCCCAGCCCACGGCTTATTCCTCAAGTCAGTGCTGTACGGGAACCTCGgtaagaaaaacaggcacgagaagctcctgtcatgtgcccagtgactactg')
BaldEagle <- toupper(BaldEagle)
vitBaldEagle <- viterbi(hmm, BaldEagle)
vitBaldEagle
## [1] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [8] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [15] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [22] "Exon" "Exon" "Exon" "Exon" "toA" "toG1" "toG2"
## [29] "toT" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [36] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [43] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [50] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [57] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [64] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [71] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [78] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [85] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [92] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [99] "Intron" "Intron"
ParkervitBaldEagle <- viterbi(Parkerhmm, BaldEagle)
ParkervitBaldEagle
## [1] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [11] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [21] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [31] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [41] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [51] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [61] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [71] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [81] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [91] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
#chr2:85,539,313-85,539,468
NorthAmericanBison <- s2c('ACGAGGCGTTCATCGAGGAGGGCACATTCCTTTTCACCTCAGAGTCGGTCGGGGAAGGCCACCCAGGTGAGGGGACGGCCTGAAGCGAAGCGTGGGGCGGGGCAGAAGGCAGCGCCAAGGTCCGGCTGGCTGCGGCCGGCCGGTGGTGGGGCCCGC')
vitNorthAmericanBison <- viterbi(hmm, NorthAmericanBison)
vitNorthAmericanBison
## [1] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [8] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [15] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [22] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [29] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [36] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [43] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [50] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [57] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [64] "Exon" "toA" "toG1" "toG2" "toT" "Intron" "Intron"
## [71] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [78] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [85] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [92] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [99] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [106] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [113] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [120] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [127] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [134] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [141] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [148] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [155] "Intron" "Intron"
ParkervitNorthAmericanBison <- viterbi(Parkerhmm, NorthAmericanBison)
ParkervitNorthAmericanBison
## [1] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [11] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [21] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [31] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [41] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [51] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [61] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [71] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [81] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [91] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [101] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [111] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [121] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [131] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [141] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [151] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
#As expected, the improved junction detector did return a splice site while the previous junction detector we used in class did not.
#This occurred in both the sequence for the PUSL1 Human gene assigned to me as well as the sequence at chr2:85,539,313-85,539,468.
#In the latter, the improved detector picked up on the exact splice site found in the UCSC genome browser.
#However, in the former, with my own gene, the detector picked up something about 25 bases upstream of the actual splice site seen in the browser.
#3
ChristmasTree <- read.tree("toy_tree.tre")
SantasToys = readDNAMultipleAlignment("toy_MSA.fasta", format="FASTA")
SantasToys <- as.phyDat(SantasToys)
plot(ChristmasTree, type="unrooted")

fit = pml(ChristmasTree, data=SantasToys, model="Jukes-Cantor")
fit
##
## loglikelihood: -4.778555
##
## unconstrained loglikelihood: 0
##
## Rate matrix:
## a c g t
## a 0 1 1 1
## c 1 0 1 1
## g 1 1 0 1
## t 1 1 1 0
##
## Base frequencies:
## 0.25 0.25 0.25 0.25
logLik(fit)
## 'log Lik.' -4.778555 (df=5)
Reindeer<- function(a){
Rudolph<-(1+3*exp(-4*a))/4
Vixen<-(1-exp(-4*a))/4
Prancer<-(7*Vixen^5+2*Rudolph^3*Vixen^2+Rudolph^4*Vixen+4*Rudolph^2*Vixen^3+2*Rudolph*Vixen^4)/4
Blitzen<-log(Prancer)
return(Blitzen)
}
Reindeer(0.09)
## [1] -4.78537
Reindeer(0.103)
## [1] -4.778555
Reindeer(0.12)
## [1] -4.787469
#Of the three values, 0.103 shows the maximum likelihood which is -4.778555.
#The log likelihood is also computed using phangorn above the manual calculations. The result of this is also -4.778555.
#4
#Extract the multiple sequence seed alignment for the most significant protein domain from PFAM.
seedalign <- pfam("PseudoU_synth_1", alignment = "seed")
PseudoU <- as.phyDat(seedalign$ali, type="AA")
#phangorn part
#neighbor-joining
dm <- dist.ml(PseudoU)
treeNJ <- NJ(dm)
plot(treeNJ, "unrooted", main="NJ")

#maximum parsimony
treePars <- optim.parsimony(treeNJ, PseudoU,method = "sankoff")
## Final p-score 11170 after 89 nni operations
plot(treePars, "unrooted", main="parsimony")

parsimony(treeNJ, PseudoU,method = "sankoff")
## [1] 11337
#The parsimony score is 11337.
#maximum likelihood
fitpars <- pml(treeNJ, data=PseudoU)
## negative edges length changed to 0!
fitpars
##
## loglikelihood: -59152.72
##
## unconstrained loglikelihood: -1249.257
## Rate matrix:
logLik(fitpars)
## 'log Lik.' -59152.72 (df=619)
#The log likelihood is -59152.72.
plot(fitpars$tree)

#5
SARS <- read.phyDat("sars.fasta", type="DNA", format = "fasta")
SARSdm <- dist.ml(SARS)
treeNJ <- NJ(SARSdm)
treeNJroot <- root(treeNJ, outgroup = "Himalayan palm civet sars cov, complete genome", resolve.root=TRUE)
plot(treeNJroot, type="phylogram")

#Based on this, we can deduce that the city of origin is Guangzhou in Guangdong.
#Depending on the definition of the next major site, Zhongshan is the next major city but still in Guangdong and Hong Kong is the next major region after Guangdong as a whole.