library(ggplot2)
library(ggpubr)
library(scatterplot3d)
library(pander)
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
library(dendextend)
## Registered S3 method overwritten by 'dendextend':
## method from
## rev.hclust vegan
##
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:permute':
##
## shuffle
## The following object is masked from 'package:ggpubr':
##
## rotate
## The following object is masked from 'package:stats':
##
## cutree
library(lsa)
## Loading required package: SnowballC
aa.1 <-c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L","M","N",
"P","Q","R","S","T","V","W","Y")
bulk <-c(11.5, 13.46, 11.68, 13.57, 19.8, 3.4, 13.69, 21.4, 15.71, 21.4, 16.25,
12.28,17.43, 14.45, 14.28, 9.47, 15.77, 21.57, 21.67, 18.03)
isoelec <-c(6,5.07,2.77,3.22,5.48,5.97,7.59,6.02,9.74,5.98, 5.74, 5.41, 6.3,
5.65, 10.76, 5.68, 6.16, 5.96, 5.89, 5.66)
# make dataframe
aa.full <- NA
aa.3 <- NA
habk.df <- data.frame(aa.full = aa.full,
aa.3 = aa.3,
aa.1 = aa.1,
Bulk = bulk,
pI = isoelec)
row.names(habk.df) <- aa.1
Display the finished dataframe with pander::pander()
# show table with pander
pander(habk.df)
| aa.full | aa.3 | aa.1 | Bulk | pI | |
|---|---|---|---|---|---|
| A | NA | NA | A | 11.5 | 6 |
| C | NA | NA | C | 13.46 | 5.07 |
| D | NA | NA | D | 11.68 | 2.77 |
| E | NA | NA | E | 13.57 | 3.22 |
| F | NA | NA | F | 19.8 | 5.48 |
| G | NA | NA | G | 3.4 | 5.97 |
| H | NA | NA | H | 13.69 | 7.59 |
| I | NA | NA | I | 21.4 | 6.02 |
| K | NA | NA | K | 15.71 | 9.74 |
| L | NA | NA | L | 21.4 | 5.98 |
| M | NA | NA | M | 16.25 | 5.74 |
| N | NA | NA | N | 12.28 | 5.41 |
| P | NA | NA | P | 17.43 | 6.3 |
| Q | NA | NA | Q | 14.45 | 5.65 |
| R | NA | NA | R | 14.28 | 10.76 |
| S | NA | NA | S | 9.47 | 5.68 |
| T | NA | NA | T | 15.77 | 6.16 |
| V | NA | NA | V | 21.57 | 5.96 |
| W | NA | NA | W | 21.67 | 5.89 |
| Y | NA | NA | Y | 18.03 | 5.66 |
# make plot
ggscatter(x = "Bulk",
y = "pI",
data = habk.df,
size = 1,
add = "reg.line",
label = aa.1,
cor.coef = TRUE,
conf.int = TRUE,
ellipse = TRUE,
ylab = "pI",
xlab = "Molecular Weight",
title = "Prediction of pI from MW")
## `geom_smooth()` using formula 'y ~ x'
Look up the molecular mass of the non-standard amino acids Selenocysteine and Pyrrolysine from https://en.wikipedia.org/wiki/Amino_acid
Place this information in objects called selenocysteine.MW and pyrrolysine.MW
# molecular weights
selenocysteine.MW <- 168.064
pyrrolysine.MW <- 255.313
Selenocysteine and Pyrrolysine are amino acids that can be added to proteins by re-coding stop codons. (On an exam you should recognize these as amino acids that occur in proteins via this mechanism and be able to answer simple multiple choice questions about them.)
For more information see:
Scitable: https://www.nature.com/scitable/topicpage/an-evolutionary-perspective-on-amino-acids-14568445/
Wikipedia: https://en.wikipedia.org/wiki/Selenocysteine https://en.wikipedia.org/wiki/Pyrrolysine
Build a regression model using lm() to determine the y = m*x + b equation for the line in your scatterplot.
A simple regression model has a slope (m) and an intercept and can be set up as an equation
y = m*x + b
In this situation the model would be
pI = m*MW.da + b
“b” is called the “intercept” in R output.
When you build a regression model you can access output that looks like this: (Intercept) MW.da 2.30536091 1.01131509
Which is interpreted as
(Intercept) MW.da b m
If the numbers above were accurate (they aren’t), then our equation would be pI = 1.01131509*MW.da + 2.30536091
First, build the regression model (aka linear model)
# Build model
RModel <- lm(pI ~ bulk, data = habk.df)
RModel
##
## Call:
## lm(formula = pI ~ bulk, data = habk.df)
##
## Coefficients:
## (Intercept) bulk
## 5.57308 0.03125
Next get the coefficients for the model (the m of m*x, and the b).
#get coefficients / parameters for equation
lm(formula = pI ~ bulk, data = habk.df)
##
## Call:
## lm(formula = pI ~ bulk, data = habk.df)
##
## Coefficients:
## (Intercept) bulk
## 5.57308 0.03125
b <- RModel$coefficients[1]
m <- RModel$coefficients[2]
Use the equation y = m*x + b to estimate the pI of selenocystein an pyrrolysine.
# Estimate pI for Se and Py
pI.Sel <- (m * selenocysteine.MW)+b
pI.Sel
## bulk
## 10.82537
pI.Pyr <- (m * pyrrolysine.MW) + b
pI.Pyr
## bulk
## 13.55205
Make a table of your results, including the following columns
# make table
amino.acid <- c('Selenocysteine','Pyrrolysine')
three.letter <- c('Sec','Pyl')
one.letter <- c('U','O')
molecular.weight <-c(selenocysteine.MW,pyrrolysine.MW)
pI.estimate <- c(pI.Sel,pI.Pyr)
UOSummaryInfo <-data.frame(amino.acid,three.letter,one.letter, molecular.weight,pI.estimate)
Display the dataframe with the function pander() from the pander package.
# make table
pander(UOSummaryInfo)
| amino.acid | three.letter | one.letter | molecular.weight | pI.estimate |
|---|---|---|---|---|
| Selenocysteine | Sec | U | 168.1 | 10.83 |
| Pyrrolysine | Pyl | O | 255.3 | 13.55 |
Using R, determine the correlation coefficient for molecular mass versus pI. Is this a very strong correlation coefficient?
# correlation coefficient
MolecularWeight<- c(89,121,133,146,165,75,155,131,146,131,149,132,115,147,174,105,119,117,204,181)
pI<- c(6,5.07,2.77,3.22,5.48,5.97,7.59,6.02,9.74,5.98,5.74,5.41,6.3,5.65,10.76,5.68,6.16,5.96,
5.89,5.66)
cor(pI, MolecularWeight)
## [1] 0.1974383
Build a dataframe with all 8 variables used in Higgs and Attwood Chapter 2. Add molecular weight to this (which wasn’t included in their original table).
Determine the variable which has the highest correlation with pI.
First, make a datafrome with volume, bulk etc, along with molecular weight.
# data
Volume <- c(67,86,91,109,135,48,118,124,135,124,124,
96,90,114,148,73,93,105,163,141)
Bulk <- c(11.5,13.46,11.68,13.57,19.8,3.4,13.69,21.4,
15.71,21.4,16.25,12.28,17.43,14.45,14.28,
9.47,15.77,21.57,21.67,18.03)
Polarity <-c(0,1.48,49.7,49.9,0.35,0,51.6,0.13,49.5,0.13,1.43,
3.38,1.58,3.53,52,1.67,1.66,0.13,2.1,1.61)
pho1 <-c(1.8,2.5,-3.5,-3.5,2.8,-0.4,-3.2,4.5,
-3.9,3.8,1.9,-3.5,-1.6,-3.5,-4.5,-0.8,
-0.7,4.2,-0.9,-1.3)
pho2 <-c(1.6,2,-9.2,-8.2,3.7,1,-3,3.1,-8.8,2.8,
3.4,-4.8,-0.2,-4.1,-12.3,0.6,1.2,2.6,1.9,-0.7)
IE <-c(6,5.07,2.77,3.22,5.48,5.97,7.59,6.02,9.74,5.98, 5.74, 5.41, 6.3,
5.65, 10.76, 5.68, 6.16, 5.96, 5.89, 5.66)
SA<-c(113,140,151,183,218,85,194,182,211,180,204,
158,143,189,241,122,146,160,259,229)
FA <- c(0.74,0.91,0.62,0.62,0.88,0.72,0.78,0.88,0.52,
0.85,0.85,0.63,0.64,0.62,0.64,0.66,0.7,0.86,0.85,0.76)
MolecularWeight<- c(89,121,133,146,165,75,155,131,146,131,149,132,115,147,174,105,119,117,204,181)
pI<- c(6,5.07,2.77,3.22,5.48,5.97,7.59,6.02,9.74,5.98,5.74,5.41,6.3,5.65,10.76,5.68,6.16,5.96,
5.89,5.66)
# made dataframe
FinalDF <- data.frame(MolecularWeight,Volume,Bulk, Polarity, pI, pho1, pho2,SA,FA)
FinalDF
## MolecularWeight Volume Bulk Polarity pI pho1 pho2 SA FA
## 1 89 67 11.50 0.00 6.00 1.8 1.6 113 0.74
## 2 121 86 13.46 1.48 5.07 2.5 2.0 140 0.91
## 3 133 91 11.68 49.70 2.77 -3.5 -9.2 151 0.62
## 4 146 109 13.57 49.90 3.22 -3.5 -8.2 183 0.62
## 5 165 135 19.80 0.35 5.48 2.8 3.7 218 0.88
## 6 75 48 3.40 0.00 5.97 -0.4 1.0 85 0.72
## 7 155 118 13.69 51.60 7.59 -3.2 -3.0 194 0.78
## 8 131 124 21.40 0.13 6.02 4.5 3.1 182 0.88
## 9 146 135 15.71 49.50 9.74 -3.9 -8.8 211 0.52
## 10 131 124 21.40 0.13 5.98 3.8 2.8 180 0.85
## 11 149 124 16.25 1.43 5.74 1.9 3.4 204 0.85
## 12 132 96 12.28 3.38 5.41 -3.5 -4.8 158 0.63
## 13 115 90 17.43 1.58 6.30 -1.6 -0.2 143 0.64
## 14 147 114 14.45 3.53 5.65 -3.5 -4.1 189 0.62
## 15 174 148 14.28 52.00 10.76 -4.5 -12.3 241 0.64
## 16 105 73 9.47 1.67 5.68 -0.8 0.6 122 0.66
## 17 119 93 15.77 1.66 6.16 -0.7 1.2 146 0.70
## 18 117 105 21.57 0.13 5.96 4.2 2.6 160 0.86
## 19 204 163 21.67 2.10 5.89 -0.9 1.9 259 0.85
## 20 181 141 18.03 1.61 5.66 -1.3 -0.7 229 0.76
Then make a correlation matrix from the data. Save it to an object called corr.mat
# make correlation matrix
corr.mat <- cor(FinalDF[,])
Round off the correlation matrix to 1 decimal [lace]
# round off
corr.mat <- round(corr.mat, 1)
Display the matrix with pander::pander()
# display with pander()
pander::pander(corr.mat)
| MolecularWeight | Volume | Bulk | Polarity | pI | pho1 | |
|---|---|---|---|---|---|---|
| MolecularWeight | 1 | 0.9 | 0.6 | 0.3 | 0.2 | -0.3 |
| Volume | 0.9 | 1 | 0.7 | 0.2 | 0.4 | -0.1 |
| Bulk | 0.6 | 0.7 | 1 | -0.2 | 0.1 | 0.4 |
| Polarity | 0.3 | 0.2 | -0.2 | 1 | 0.3 | -0.7 |
| pI | 0.2 | 0.4 | 0.1 | 0.3 | 1 | -0.2 |
| pho1 | -0.3 | -0.1 | 0.4 | -0.7 | -0.2 | 1 |
| pho2 | -0.2 | -0.2 | 0.3 | -0.9 | -0.3 | 0.8 |
| SA | 1 | 1 | 0.6 | 0.3 | 0.3 | -0.2 |
| FA | 0.1 | 0.2 | 0.5 | -0.5 | -0.2 | 0.8 |
| pho2 | SA | FA | |
|---|---|---|---|
| MolecularWeight | -0.2 | 1 | 0.1 |
| Volume | -0.2 | 1 | 0.2 |
| Bulk | 0.3 | 0.6 | 0.5 |
| Polarity | -0.9 | 0.3 | -0.5 |
| pI | -0.3 | 0.3 | -0.2 |
| pho1 | 0.8 | -0.2 | 0.8 |
| pho2 | 1 | -0.2 | 0.8 |
| SA | -0.2 | 1 | 0.1 |
| FA | 0.8 | 0.1 | 1 |
Which variable has the strongest (most positive OR most negative) correlation with pI? (if possible write this with code, otherwise just state what it is).
# find maximum
corr.mat[5,]
## MolecularWeight Volume Bulk Polarity pI
## 0.2 0.4 0.1 0.3 1.0
## pho1 pho2 SA FA
## -0.2 -0.3 0.3 -0.2