Writing: on aa chemistry. abstract
load all packges, gggplot2, ggpubr, vegan, scatterplot3d library()
library(ggplot2)#creating plots
## Warning: package 'ggplot2' was built under R version 4.0.3
library(ggpubr) #wrapper for ggplot2(to make ggplot2 easy)
## Warning: package 'ggpubr' was built under R version 4.0.3
library(vegan) #for doing and plotting PCA
## Warning: package 'vegan' was built under R version 4.0.3
## Loading required package: permute
## Warning: package 'permute' was built under R version 4.0.3
## Loading required package: lattice
## This is vegan 2.5-6
library(scatterplot3d)#creating 3D plots
## Warning: package 'scatterplot3d' was built under R version 4.0.3
Make the vectors.
# 1 letter code
aa <-c('A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y')
## molecular weight in dalton
MW.da <-c(89,121,133,146,165,75,155,131,146,131,149,132,115,147,174,105,119,117,204,181)
## vol from van der Waals radii
vol <-c(67,86,91,109,135,48,118,124,135,124,124,96,90,114,148,73,93,105,163,141)
## bulk – a measure of the shape of the side chain
bulk <-c(11.5,13.46,11.68,13.57,19.8,3.4,13.69,21.4,15.71,21.4,16.25,12.28,17.43,
14.45,14.28,9.47,15.77,21.57,21.67,18.03)
## pol – a measure of the electric field strength around the molecule
pol <-c(0,1.48,49.7,49.9,0.35,0,51.6,0.13,49.5,0.13,1.43,3.38,1.58,3.53,52,1.67,1.66,0.13,2.1,1.61)
## isoelec point
isoelec <-c(6,5.07,2.77,3.22,5.48,5.97,7.59,6.02,9.74,5.98,5.74,5.41,6.3,5.65,10.76,5.68,6.16,5.96,5.89,5.66)
## 1st Hydrophobicity scale
H2Opho.34 <-c(1.8,2.5,-3.5,-3.5,2.8,-0.4,-3.2,4.5,-3.9,3.8,1.9,-3.5,-1.6,-3.5,-4.5,-0.8,-0.7,4.2,-0.9,-1.3)
## 2nd Hydrophobicity scale
H2Opho.35 <-c(1.6,2,-9.2,-8.2,3.7,1,-3,3.1,-8.8,2.8,3.4,-4.8,-0.2,-4.1,-12.3,0.6,1.2,2.6,1.9,-0.7)
## Surface area accessible to water in an unfolded peptide
saaH2O <-c(113,140,151,183,218,85,194,182,211,180,204,158,143,189,241,122,146,160,259,229)
## Fraction of accessible area lost when a protein folds
faal.fold <-c(0.74,0.91,0.62,0.62,0.88,0.72,0.78,0.88,0.52,0.85,0.85,0.63,0.64,0.62,0.64,0.66,
0.7,0.86,0.85,0.76)
# Polar requirement
polar.req <-c(7,4.8,13,12.5,5,7.9,8.4,4.9,10.1,4.9,5.3,10,6.6,8.6,9.1,7.5,6.6,5.6,5.2,5.4)
## relative frequency of occurance
## "The frequencies column shows the mean
### percentage of each amino acid in the protein sequences
### of modern organisms"
freq <-c(7.8,1.1,5.19,6.72,4.39,6.77,2.03,6.95,6.32,10.15,2.28,4.37,4.26,3.45,5.23,6.46,5.12,
7.01,1.09,3.3)
# charges
## un = Un-charged
## neg = negative
## pos = positive
charge<-c('un','un','neg','neg','un','un','pos','un','pos','un','un','un','un','un','pos','un'
,'un','un','un','un')
## hydropathy
hydropathy<-c('hydrophobic','hydrophobic','hydrophilic','hydrophilic','hydrophobic','neutral',
'neutral','hydrophobic','hydrophilic','hydrophobic','hydrophobic','hydrophilic',
'neutral','hydrophilic','hydrophilic','neutral','neutral','hydrophobic','hydrophobic','neutral')
## vol
vol.cat<-c('verysmall','small','small','medium','verylarge','verysmall','medium','large','large',
'large','large','small','small','medium','large','verysmall','small','medium','verylarge',
'verylarge')
## pol
pol.cat<-c('nonpolar','nonpolar','polar','polar','nonpolar','nonpolar','polar','nonpolar',
'polar','nonpolar','nonpolar','polar','nonpolar','polar','polar','polar','polar','nonpolar',
'nonpolar','polar')
## chemical
chemical<-c('aliphatic','sulfur','acidic','acidic','aromatic','aliphatic','basic','aliphatic','basic',
'aliphatic','sulfur','amide','aliphatic','amide','basic','hydroxyl','hydroxyl','aliphatic',
'aromatic','aromatic')
Build the dataframe
aa_dat <- data.frame(aa, MW.da,vol, bulk, pol, isoelec, H2Opho.34, H2Opho.35, saaH2O,
faal.fold, polar.req, freq, charge, hydropathy, vol.cat, pol.cat, chemical)
Creating a correlation matrix to examine variance between the variables.
TODO: Examine the code below. What is it called? Replace the XXXXXXXX above with the name of this thing
cor_ <- round(cor(aa_dat[,-c(1,13:17)]),2)
diag(cor_) <- NA
cor_[upper.tri(cor_)] <- NA
cor_
## MW.da vol bulk pol isoelec H2Opho.34 H2Opho.35 saaH2O faal.fold
## MW.da NA NA NA NA NA NA NA NA NA
## vol 0.93 NA NA NA NA NA NA NA NA
## bulk 0.55 0.73 NA NA NA NA NA NA NA
## pol 0.29 0.24 -0.20 NA NA NA NA NA NA
## isoelec 0.20 0.36 0.08 0.27 NA NA NA NA NA
## H2Opho.34 -0.27 -0.08 0.44 -0.67 -0.20 NA NA NA NA
## H2Opho.35 -0.25 -0.16 0.32 -0.85 -0.26 0.85 NA NA NA
## saaH2O 0.97 0.99 0.64 0.29 0.35 -0.18 -0.23 NA NA
## faal.fold 0.11 0.18 0.49 -0.53 -0.18 0.84 0.79 0.12 NA
## polar.req -0.05 -0.19 -0.53 0.76 -0.11 -0.79 -0.87 -0.11 -0.81
## freq -0.52 -0.30 -0.04 -0.01 0.02 0.26 -0.02 -0.38 -0.18
## polar.req freq
## MW.da NA NA
## vol NA NA
## bulk NA NA
## pol NA NA
## isoelec NA NA
## H2Opho.34 NA NA
## H2Opho.35 NA NA
## saaH2O NA NA
## faal.fold NA NA
## polar.req NA NA
## freq 0.14 NA
Describe what the code below is doing and summarzie the next 2 TODOs in 2-3 sentences. The code is examining the correlation between variable and determining the most positive and negative correlation and skipping over any NA values. The saaH2O has the most positive correlation while polar.req has the most negative correlation in the data set.
which(cor_ == max(cor_, na.rm = T), arr.ind = T)
## row col
## saaH2O 8 2
max(cor_, na.rm = T)
## [1] 0.99
min(cor_, na.rm = T)
## [1] -0.87
which(cor_ == min(cor_, na.rm = T), arr.ind = T)
## row col
## polar.req 10 7
Make a scatterplot matrix of all of the data. Included the following variables: 1.MW.da 2. vol 3. pol 4. H2Opho.34 5. polar.req PLUS 2 others (6 total) NOTE: Depending on how you do this you may have to make sure the first column doesn’t go into the plotting command
Write 1-2 sentences describing what this below code does in your scatter plot matrix below This an R function that can found in the help file. This function creates a pair correlation matrix to visualize correlation in large datasets.
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor,...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * r)
}
WRITE 2-3 sentences describing the relationships between the 6 variables you plotted; which have strong correlations which ones weak, which ones have non-linear relationships.
The most positive correlations are between MW.da and saaH2O, vol and saaH2O, and Mw.da and vol which all have correlations above .90.These are also strong correlations as indicated by the number and the size of the number in the plot. Weaker correlations can be seen between variables like pol and faal.fold and H2Opho.34 and pol. The weakest correlations can be seen between saaH2O and faal.fold, polar.req and saaH2O, Mw.da and polar.req to name a few. These show non-linear relationships.
plot(aa_dat[,c("MW.da", "vol", "pol", "H2Opho.34", "polar.req", "saaH2O", "faal.fold")],
panel = panel.smooth,
upper.panel = panel.cor ,
main = "Example")
This plot is comparing two properties of aminoacids, the isoelectic point and the volumes to visualize these properties.
#plot() does plotting
plot(vol ~ isoelec, data = aa_dat,
xlab = "pI ", #label for x axis
ylab = "Volume Å3" , #label for y axis
main = "Comparing the Isoelectric point(pI) and Volumes of Amino Acids",
col = 0)
#text - adding labels
text(vol ~ isoelec,
labels = aa,
data = aa_dat,
col = 1:20)
ggscatter(y = "vol",
x = "isoelec",
size = "polar.req",
color = "freq",
data = aa_dat,
xlab = "pI",
ylab = "Volume Å3")
###Figure 3: Highly Correlated data
CODE: 1. Identify 2 variables that are highly positively correlated and plot them in GGPUBR 2. Add a regression line 3. Add a data ellipse 4. Add a correlation coefficient 5. Label the axes with the full name of the variables
COMMENTS: 1. Add a comment indicating which line of the ggpubr is adding the line 2. Add a comment indicating which line of the ggpubr is adding the ellipse 3. Add a comment indicating which line of the ggpubr is adding the correlation coefficient
WRITING: 1. Describe the general mathematical form of the regression line. The line has the general form of y = m * x + b which is written in stats as y = B0 + B1*x. The distance from this line to each data point is the residual. 2. Describe the general meaning of the data ellipse. A data ellipse shows the variance of the data.
ggscatter(y = "vol",
x = "saaH2O",
add = "reg.line", #adding line of best fit
ellipse = TRUE, #adding data ellipse
cor.coef = TRUE, #adding correlation coefficient
data = aa_dat,
xlab = "Surface area accessible to water in an unfolded peptide",
ylab = "Volume Å3")
## `geom_smooth()` using formula 'y ~ x'
CODE: 1. Identfy 2 variables that have a non-linear relationships between them (scatterplot has a curved shape) 2. Use ggpubr to plot a scatterplot with a loess smoother. 3. Also vary the size and color of the data by 2 other NUMERIC variables 4. Label the axes with the full name of the variables.
ggscatter(y = "H2Opho.34",
x = "saaH2O",
size = "isoelec",
color = "pol",
add = "loess",
data = aa_dat,
xlab = "Surface area accessible to water in an unfolded peptide",
ylab = "1st Hydrophobicity scale")
## `geom_smooth()` using formula 'y ~ x'
### Figure 5: Non-linear relationship on LOG scale CODE: 1. Log-transform both of your variables you used in Figre 4 to make the plot again 2. A regression line instead of a loess smoother 3. Also vary the size and color of the data by the 2 other NUMBERIC variables
WRITING: 1. Describe what the log transformation does in 1 to 2 sentences. Taking the log of data is used to re-scale it or make the relationships linear.
aa_dat$saaH2O_log <- log(aa_dat$saaH2O)
aa_dat$H2Opho.34_log <- log(aa_dat$H2Opho.34)
## Warning in log(aa_dat$H2Opho.34): NaNs produced
ggscatter(y = "H2Opho.34_log",
x = "saaH2O_log",
add = "reg.line",
size = "bulk",
color = "faal.fold",
data = aa_dat,
xlab = "Surface area accessible to water in an unfolded peptide",
ylab = "1st Hydrophobicity scale")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 13 rows containing non-finite values (stat_smooth).
## Warning: Removed 13 rows containing missing values (geom_point).
### Figure 5: 3D Scatterplot 1. CODE: Make a 3D scatterplot of 3 variables that have a relatively strong correlation 2. CODE: use highlight.3d = TRUE to color code 3. CODE: Use type = “h” 4. WRITING: Summarize the relationships and correlations between the variables that you chose in 2-4 sentences. The residual lengths of the data points rise together as you move up in the graph indicating a positive correlation
scatterplot3d(x = MW.da,
y = vol,
z = saaH2O,
highlight.3d = TRUE,
type="h")
## Principal component analysis 1.Write 3-4 sentences describing the basic use and interpretation of PCA as we have used it in this class. Refer to Higss and Attwood and provide a useful quote that explains PCA.Cite the quote as (Higgs and Attwood 2005, pg XX).
PCA or principal component analysis is a “way of visualizing the important features of multidimensional data sets, such as tables of amino acid properties.PCA chooses coordinates that are linear combinations of the original variables in such a way that the maximum variability between the data points is explained by the first few coordinates.” (Higgs and Attwood 2005, pg 34). It provides a way to take information from multiple variables and showcase them in a 2D graph. Data points are scaled, shifted, and rotated such that all points are spread out, making the data easy to visualize.
2.Write 3-4 sentences describing the 2 basic ways we made PCA plots in this class. In class, we made PCA plots using base R functions. We used the prcomp() function to run PCA and biplot to plot the data. We also ran PCA using the vegan packaged. Here, the rda() function was using to run PCA and biplot to plot that data.
#subset dataframe by naming columns
aa_dat2 <- aa_dat[,c("vol", "bulk", "pol","isoelec","H2Opho.34","H2Opho.35","saaH2O","faal.fold","polar.req")]
Write 1-2 sentences describing the appraoch you use to make this new dataframe. To make the dataframe created a new subset called aa_dat2 and named the columns I wanted in this dataframe.
Run a PCA and create a PCA plot using base R command Plot the output
pca.out <- prcomp(aa_dat2, scale = TRUE)
biplot(pca.out)
### Figure 8: Principal component analysis- vegan
rda.out <- vegan::rda(aa_dat2,scale = TRUE)
rda_scores <- scores(rda.out) #4
biplot(rda.out, display = "sites", col = 0)
orditorp(rda.out, display = "sites", labels = aa_dat$aa, cex = 1.2)
ordihull(rda.out,
group = aa_dat$hydropathy,
col = 1:7,
lty = 1:7,
lwd = c(3,6))
## Cluster Analysis
Write 2-3 sentences describing the purpose of cluster analysis. The purpose of cluster analysis is to place objects in groups or clusters such that the objects in a given cluster are similar to each other. This can be done by organizing data by distance or hierarchy.
Write 3-5 sentences describing how UPGMA works. UPGMA is a type of hierarchical cluster analysis that uses a distance matrix. To make this distance matrix, all sequences are compared through pairwise alignment. The closest distances are grouped together to form a cluster.
row.names(aa_dat2) <- paste(aa_dat$aa, sep = ".")
dist_euc <- dist(aa_dat2,
method = "euclidean")
clust_euc <- hclust(dist_euc, method = "average") #method = "average sets it to use UPGMA
dendro_euc <- as.dendrogram(clust_euc)
plot(dendro_euc,
horiz = T,
nodePar = list(pch = c(1,NA),
cex = 0.5,
lab.cex = 1.2))