Final-Project-R-Code_Nupur-Joshi.R

#1 BASH part of the project

#First, I changed my directory to make sure I am pulling from and saving to the right folder.

#phmmer Human_PUSL1.fasta uniprot_sprot.fasta

#For Q5 of the midterm, I used HMMER in R and BLAST to find the best matches to my assigned gene. 
#With R, I got a bacterial match with an E-value of 3.6e-24. With BLAST, my top match was a human one with an E-value of 0.
#With command-line, my top match is a human one with an E-value of 8.5e-203 and my second match is a mouse one with an E-value of 4.1e-153.
#Quite the contrast because, with BLAST, my mouse results didn't even show up in the first 100.
#Please see accompanying figure 1.

#phmmer Mouse_PUSL1.fasta uniprot_sprot.fasta

#Unfortunately, on the midterm, I did not properly complete the HMMER and BLAST steps for the mouse gene so lost marks as a result.
#To complete the comparison between command-line and BLAST, I conducted a BLAST search for it for the final. 
#I have found that the top, not only 2, but 5 results are mouse hits with E-values of 0. 
#With command-line, however, the top hit is a mouse one with an E-value of 2.9e-195 and the second hit is a human one with an E-value of 4.4e-153.
#Please see accompanying figure 2.

#For Q7 of the midterm, I used R to build the MSA and profile HMM for the given fly protein. Here, I used command-line.
#The portions done in clustalw are included as screenshots. Please refer to accompanying figures 3-12.

#hmmbuild fly_protein.hmm fly_protein_new
#hmmsearch fly_protein.hmm uniprot_sprot.fasta

#The most significant human hit is ZMAT3_HUMAN (Q9HA38, Zinc finger matrin-type protein 3 O) with an E-value of 4.6e-12.
#Please see accompanying figure 13.

#Adding all libraries for the R section of the project.
library("HMM")
library(seqinr)
library("ape")

## 
## Attaching package: 'ape'

## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus

library(Biostrings)

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unsplit, which,
##     which.max, which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## Loading required package: XVector

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:ape':
## 
##     complement

## The following object is masked from 'package:seqinr':
## 
##     translate

## The following object is masked from 'package:base':
## 
##     strsplit

library(phangorn)
library("bio3d")

## 
## Attaching package: 'bio3d'

## The following object is masked from 'package:Biostrings':
## 
##     mask

## The following object is masked from 'package:IRanges':
## 
##     trim

## The following object is masked from 'package:ape':
## 
##     consensus

## The following objects are masked from 'package:seqinr':
## 
##     consensus, read.fasta, write.fasta

library("msa")

#2 - My partner for this question is Zaina Atieh.
#My New Junction Detector
TheUnitedStates = c("Exon", "toA","toG1","toG2", "toT","Intron")

StarsAndStripes = c("A","C","G","T")

transProbs = matrix(c('E_toE'=0.9,'E_toA'=0.1, 'E_toG1'= 0, 'E_toG2'= 0, 'E_toT'= 0,'E_toI'=0, 'A_toE'=0,'A_toA'=0, 'A_toG1'=1.0, 'A_toG2'=0, 'A_toT'=0, 'A_toI'=0, 'G1_toE'=0,'G1_toA'=0,'G1_toG1'=0,'G1_toG2'=1.0,'G1_toT'=0,'G1_toI'=0,'G2_toE'=0,'G2_toA'=0,'G2_toG1'=0,'G2_toG2'=0,'G2_toT'=1.0,'G2_toI'=0,'T_toE'=0,'T_toA'=0,'T_toG1'=0,'T_toG2'=0,'T_toT'=0,'T_toI'=1.0, 'I_toE'=0, 'I_toA'=0,'I_toG1'=0,'I_toG2'=0,'I_toT'=0, 'I_toI'=1.0), c(length(TheUnitedStates), length(TheUnitedStates)), byrow = TRUE)
transProbs

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]  0.9  0.1    0    0    0    0
## [2,]  0.0  0.0    1    0    0    0
## [3,]  0.0  0.0    0    1    0    0
## [4,]  0.0  0.0    0    0    1    0
## [5,]  0.0  0.0    0    0    0    1
## [6,]  0.0  0.0    0    0    0    1

emissionProbs = matrix(c('A'=0.20,'C'=0.30,'G'=0.30,'T'=0.20, 'A'=0.997,'C'=0.001,'G'=0.001,'T'=0.001,'A'=0.001,'C'=0.001,'G'=0.997,'T'=0.001,'A'=0.001,'C'=0.001,'G'=0.997,'T'=0.001,'A'=0.001,'C'=0.001,'G'=0.001,'T'=0.997,'A'=0.15,'C'=0.35,'G'=0.35,'T'=0.15), c(length(TheUnitedStates), length(StarsAndStripes)), byrow = TRUE)
emissionProbs

##       [,1]  [,2]  [,3]  [,4]
## [1,] 0.200 0.300 0.300 0.200
## [2,] 0.997 0.001 0.001 0.001
## [3,] 0.001 0.001 0.997 0.001
## [4,] 0.001 0.001 0.997 0.001
## [5,] 0.001 0.001 0.001 0.997
## [6,] 0.150 0.350 0.350 0.150

hmm <- initHMM(TheUnitedStates, StarsAndStripes, startProbs = c(1,0,0,0,0,0), transProbs = transProbs, emissionProbs = emissionProbs)

#Class Junction Detector
ParkerStates = c("Exon", "5site", "Intron")

ParkerSymbols = c("A","C","G","T")

ParkertransProbs = matrix(c('EE'=0.9,'E5'=0.1,'EI'=0, '5E'=0, '55'=0, '5I'=1.0, 'IE'=0, 'I5'=0, 'II'=1.0), c(length(ParkerStates), length(ParkerStates)), byrow = TRUE)
ParkertransProbs

##      [,1] [,2] [,3]
## [1,]  0.9  0.1    0
## [2,]  0.0  0.0    1
## [3,]  0.0  0.0    1

ParkeremissionProbs = matrix(c('A'=0.25,'C'=0.25,'G'=0.25,'T'=0.25, 'A'=0.05,'C'=0.0,'G'=0.95,'T'=0.0, 'A'=0.4,'C'=0.1,'G'=0.1,'T'=0.4), c(length(ParkerStates), length(ParkerSymbols)), byrow = TRUE)
ParkeremissionProbs

##      [,1] [,2] [,3] [,4]
## [1,] 0.25 0.25 0.25 0.25
## [2,] 0.05 0.00 0.95 0.00
## [3,] 0.40 0.10 0.10 0.40

Parkerhmm <- initHMM(ParkerStates, ParkerSymbols, startProbs = c(1,0,0), transProbs = ParkertransProbs, emissionProbs = ParkeremissionProbs)

#PUSL1 Human
BaldEagle <- s2c('AGCCCCAGCCCACGGCTTATTCCTCAAGTCAGTGCTGTACGGGAACCTCGgtaagaaaaacaggcacgagaagctcctgtcatgtgcccagtgactactg')
BaldEagle <- toupper(BaldEagle)


vitBaldEagle <- viterbi(hmm, BaldEagle)
vitBaldEagle

##   [1] "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"  
##   [8] "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"  
##  [15] "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"  
##  [22] "Exon"   "Exon"   "Exon"   "Exon"   "toA"    "toG1"   "toG2"  
##  [29] "toT"    "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [36] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [43] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [50] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [57] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [64] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [71] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [78] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [85] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [92] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [99] "Intron" "Intron"

ParkervitBaldEagle <- viterbi(Parkerhmm, BaldEagle)
ParkervitBaldEagle

##   [1] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [11] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [21] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [31] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [41] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [51] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [61] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [71] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [81] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [91] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"

#chr2:85,539,313-85,539,468
NorthAmericanBison <- s2c('ACGAGGCGTTCATCGAGGAGGGCACATTCCTTTTCACCTCAGAGTCGGTCGGGGAAGGCCACCCAGGTGAGGGGACGGCCTGAAGCGAAGCGTGGGGCGGGGCAGAAGGCAGCGCCAAGGTCCGGCTGGCTGCGGCCGGCCGGTGGTGGGGCCCGC')

vitNorthAmericanBison <- viterbi(hmm, NorthAmericanBison)
vitNorthAmericanBison

##   [1] "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"  
##   [8] "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"  
##  [15] "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"  
##  [22] "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"  
##  [29] "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"  
##  [36] "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"  
##  [43] "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"  
##  [50] "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"  
##  [57] "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"   "Exon"  
##  [64] "Exon"   "toA"    "toG1"   "toG2"   "toT"    "Intron" "Intron"
##  [71] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [78] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [85] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [92] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
##  [99] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [106] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [113] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [120] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [127] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [134] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [141] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [148] "Intron" "Intron" "Intron" "Intron" "Intron" "Intron" "Intron"
## [155] "Intron" "Intron"

ParkervitNorthAmericanBison <- viterbi(Parkerhmm, NorthAmericanBison)
ParkervitNorthAmericanBison

##   [1] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [11] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [21] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [31] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [41] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [51] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [61] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [71] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [81] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
##  [91] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [101] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [111] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [121] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [131] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [141] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"
## [151] "Exon" "Exon" "Exon" "Exon" "Exon" "Exon"

#As expected, the improved junction detector did return a splice site while the previous junction detector we used in class did not. 
#This occurred in both the sequence for the PUSL1 Human gene assigned to me as well as the sequence at chr2:85,539,313-85,539,468. 
#In the latter, the improved detector picked up on the exact splice site found in the UCSC genome browser. 
#However, in the former, with my own gene, the detector picked up something about 25 bases upstream of the actual splice site seen in the browser.

#3
ChristmasTree <- read.tree("toy_tree.tre")
SantasToys = readDNAMultipleAlignment("toy_MSA.fasta", format="FASTA")
SantasToys <- as.phyDat(SantasToys)
plot(ChristmasTree, type="unrooted")

fit = pml(ChristmasTree, data=SantasToys, model="Jukes-Cantor")
fit

## 
##  loglikelihood: -4.778555 
## 
## unconstrained loglikelihood: 0 
## 
## Rate matrix:
##   a c g t
## a 0 1 1 1
## c 1 0 1 1
## g 1 1 0 1
## t 1 1 1 0
## 
## Base frequencies:  
## 0.25 0.25 0.25 0.25

logLik(fit)

## 'log Lik.' -4.778555 (df=5)

Reindeer<- function(a){
  Rudolph<-(1+3*exp(-4*a))/4
  Vixen<-(1-exp(-4*a))/4
  Prancer<-(7*Vixen^5+2*Rudolph^3*Vixen^2+Rudolph^4*Vixen+4*Rudolph^2*Vixen^3+2*Rudolph*Vixen^4)/4
  Blitzen<-log(Prancer)
  return(Blitzen)
}

Reindeer(0.09)

## [1] -4.78537

Reindeer(0.103)

## [1] -4.778555

Reindeer(0.12)

## [1] -4.787469

#Of the three values, 0.103 shows the maximum likelihood which is -4.778555.
#The log likelihood is also computed using phangorn above the manual calculations. The result of this is also -4.778555.

#4

#Extract the multiple sequence seed alignment for the most significant protein domain from PFAM.
seedalign <- pfam("PseudoU_synth_1", alignment = "seed") 
PseudoU <- as.phyDat(seedalign$ali, type="AA")


#phangorn part
#neighbor-joining
dm <- dist.ml(PseudoU)
treeNJ <- NJ(dm)
plot(treeNJ, "unrooted", main="NJ")

#maximum parsimony
treePars <- optim.parsimony(treeNJ, PseudoU,method = "sankoff")

## Final p-score 11170 after  89 nni operations

plot(treePars, "unrooted", main="parsimony")

parsimony(treeNJ, PseudoU,method = "sankoff")

## [1] 11337

#The parsimony score is 11337.

#maximum likelihood
fitpars <- pml(treeNJ, data=PseudoU)

## negative edges length changed to 0!

fitpars

## 
##  loglikelihood: -59152.72 
## 
## unconstrained loglikelihood: -1249.257 
## Rate matrix:

logLik(fitpars)

## 'log Lik.' -59152.72 (df=619)

#The log likelihood is -59152.72.

plot(fitpars$tree)

#5

SARS <- read.phyDat("sars.fasta", type="DNA", format = "fasta")

SARSdm <- dist.ml(SARS)
treeNJ <- NJ(SARSdm)
treeNJroot <- root(treeNJ, outgroup = "Himalayan palm civet sars cov, complete genome", resolve.root=TRUE)
plot(treeNJroot, type="phylogram")

#Based on this, we can deduce that the city of origin is Guangzhou in Guangdong.
#Depending on the definition of the next major site, Zhongshan is the next major city but still in Guangdong and Hong Kong is the next major region after Guangdong as a whole.

Final-Project-R-Code_Nupur-Joshi.R

nupur

2019-12-18