Introduction

We find the correlation between each pair of variables and plot them both in 2D diagrams and 3D scatterplot. Then we analysis the data through PCA and cluster analysis. PCA are helpful to simplify high-dimensional data such as gene code in biology. It allows gene code to reduce the dimension and retain the pattern as the same time. It is helpful for biologists to deal with large amount of data in gene coding and trend finding ## Preliminaries ## Packages

#install.packages("ggplot2")
#install.packages("ggpubr")
#install.packages("scatterplot3d")
#install.packages("vegan")

ggpubr is a package that creates “wrappers” for much of ggplot2’s core functionality. ggpubr makes ggplot2 easier to use.

library(ggplot2) #make ggplot
library(ggpubr) #make ggplot2 easier
library(vegan) #plotting PCA
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-6
library(scatterplot3d) #make 3d scatterplot

Build the dataframe

# 1 letter code
aa <-c('A','C','D','E','F','G','H','I','K','L','M','N',
'P','Q','R','S','T','V','W','Y')
## molecular weight in dalton
MW.da <-c(89,121,133,146,165,75,155,131,146,131,149,132,115,147,174,105,119,117,204,181)
## vol from van der Waals radii
vol <-c(67,86,91,109,135,48,118,124,135,124,124,96,90,
114,148,73,93,105,163,141)
## bulk – a measure of the shape of the side chain
bulk <-c(11.5,13.46,11.68,13.57,19.8,3.4,13.69,21.4,
15.71,21.4,16.25,12.28,17.43,
14.45,14.28,9.47,15.77,21.57,21.67,18.03)
## pol – a measure of the electric field strength around the molecule
pol <-c(0,1.48,49.7,49.9,0.35,0,51.6,0.13,49.5,0.13,
1.43,3.38,1.58,3.53,52,1.67,1.66,0.13,2.1,1.61)
## isoelec point
isoelec <-c(6,5.07,2.77,3.22,5.48,5.97,7.59,6.02,9.74,5.98,
5.74,5.41,6.3,5.65,10.76,5.68,6.16,5.96,5.89,5.66)
## 1st Hydrophobicity scale
H2Opho.34 <-c(1.8,2.5,-3.5,-3.5,2.8,-0.4,-3.2,4.5,-3.9,3.8,1.9,
-3.5,-1.6,-3.5,-4.5,-0.8,-0.7,4.2,-0.9,-1.3)
## 2nd Hydrophobicity scale
H2Opho.35 <-c(1.6,2,-9.2,-8.2,3.7,1,-3,3.1,-8.8,2.8,3.4,-4.8,
-0.2,-4.1,-12.3,0.6,1.2,2.6,1.9,-0.7)
## Surface area accessible to water in an unfolded peptide
saaH2O <-c(113,140,151,183,218,85,194,182,211,180,204,158,
143,189,241,122,146,160,259,229)
## Fraction of accessible area lost when a protein folds
faal.fold <-c(0.74,0.91,0.62,0.62,0.88,0.72,0.78,0.88,0.52,
0.85,0.85,0.63,0.64,0.62,0.64,0.66,0.7,0.86,0.85,0.76)
# Polar requirement
polar.req <-c(7,4.8,13,12.5,5,7.9,8.4,4.9,10.1,4.9,5.3,10,
6.6,8.6,9.1,7.5,6.6,5.6,5.2,5.4)
## relative frequency of occurance
## "The frequencies column shows the mean
### percentage of each amino acid in the protein sequences
### of modern organisms"
freq <-c(7.8,1.1,5.19,6.72,4.39,6.77,2.03,6.95,6.32,
10.15,2.28,4.37,4.26,3.45,5.23,6.46,5.12,7.01,1.09,3.3)
##
# charges
## un = Un-charged
## neg = negative
## pos = positive
charge<-c('un','un','neg','neg','un','un','pos','un','pos',
'un','un','un','un','un','pos','un','un','un','un','un')
## hydropathy
hydropathy<-c('hydrophobic','hydrophobic','hydrophilic','
hydrophilic','hydrophobic','neutral','neutral',
'hydrophobic','hydrophilic','hydrophobic','hydrophobic',
'hydrophilic','neutral',
'hydrophilic','hydrophilic','neutral','neutral',
'hydrophobic','hydrophobic','neutral')
## vol
vol.cat<-c('verysmall','small','small','medium',
'verylarge','verysmall','medium','large','large',
'large','large','small','small','medium','large',
'verysmall',
'small','medium','verylarge','verylarge')
## pol
pol.cat<-c('nonpolar','nonpolar','polar','polar',
'nonpolar','nonpolar','polar','nonpolar',
'polar','nonpolar','nonpolar','polar','nonpolar','polar',
'polar','polar','polar','nonpolar','nonpolar','polar')
## chemical
chemical<-c('aliphatic','sulfur','acidic','acidic','aromatic',
'aliphatic','basic','aliphatic','basic','aliphatic','sulfur',
'amide','aliphatic','amide','basic','hydroxyl','hydroxyl',
'aliphatic','aromatic','aromatic')

build dataframe

aa_dat <- data.frame(aa,MW.da,vol,bulk,
pol,isoelec,H2Opho.34,H2Opho.35,
saaH2O,faal.fold, polar.req,freq,charge,hydropathy,vol.cat,pol.cat,chemical)

Raw data exploration

Correlation The code calculate the correlation between each two variables and assign “NA” to the upper triangle.

cor_ <- round(cor(aa_dat[,-c(1,13:17)]),2)
diag(cor_) <- NA
cor_[upper.tri(cor_)] <- NA
cor_
##           MW.da   vol  bulk   pol isoelec H2Opho.34 H2Opho.35 saaH2O faal.fold
## MW.da        NA    NA    NA    NA      NA        NA        NA     NA        NA
## vol        0.93    NA    NA    NA      NA        NA        NA     NA        NA
## bulk       0.55  0.73    NA    NA      NA        NA        NA     NA        NA
## pol        0.29  0.24 -0.20    NA      NA        NA        NA     NA        NA
## isoelec    0.20  0.36  0.08  0.27      NA        NA        NA     NA        NA
## H2Opho.34 -0.27 -0.08  0.44 -0.67   -0.20        NA        NA     NA        NA
## H2Opho.35 -0.25 -0.16  0.32 -0.85   -0.26      0.85        NA     NA        NA
## saaH2O     0.97  0.99  0.64  0.29    0.35     -0.18     -0.23     NA        NA
## faal.fold  0.11  0.18  0.49 -0.53   -0.18      0.84      0.79   0.12        NA
## polar.req -0.05 -0.19 -0.53  0.76   -0.11     -0.79     -0.87  -0.11     -0.81
## freq      -0.52 -0.30 -0.04 -0.01    0.02      0.26     -0.02  -0.38     -0.18
##           polar.req freq
## MW.da            NA   NA
## vol              NA   NA
## bulk             NA   NA
## pol              NA   NA
## isoelec          NA   NA
## H2Opho.34        NA   NA
## H2Opho.35        NA   NA
## saaH2O           NA   NA
## faal.fold        NA   NA
## polar.req        NA   NA
## freq           0.14   NA

The code is to find which variable has most positive correlation. SaaH2O has the most positive correlation and polar.req has most negative correlation

which(cor_ == max(cor_, na.rm = T), arr.ind = T)
##        row col
## saaH2O   8   2

Plot 0: Scatterplot matrix

panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor,...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * r)
}

saaH2O and vol, MW.da and vol, MW.da and saaH2O, faal.fold and H20pho.34, faal.hold and polar.req, polar.req and H2Opho.34, polar.req and pol have strong correlations. faal.fold and saaH2O, faal.fold and vol and MW.da, saa.H2O and polar.req and H2Opho.34, polar.req and MW.da, and H2Opho.34 and vol have non-linear relationship. rest have a weak correlation. .

plot(aa_dat[c("MW.da", "vol","pol","H2Opho.34","polar.req","saaH2O","faal.fold")],
     panel = panel.smooth,
     upper.panel = panel.cor,
     main = "Example")

Plot 1: Replication Higgs and Attwood Figure 2.8

The plot shows the relationship that amino acid volume against PI

plot(vol ~ isoelec, data = aa_dat,
     xlab = "pI",
     ylab = "Volume",
     main = "Amino acid volume against pI",
     col = 0)         #make a plot using variables volume and isoelec. Name axises and title
text(vol ~ isoelec,
     labels = aa,
     data = aa_dat,
     col = 1:20)      #label each point with letter in vector aa

Figure 2: HA Figure 2.8 with ggpubr

The code makes a plot by using ggpubr

ggscatter(y = "vol",
x = "isoelec",
size = "vol",
color = "vol",
data = aa_dat,
xlab = "pI",
ylab = "Amino acid Volume")

Figure 3: Highly correlated data

The code makes a plot with a strong positive correlated relationship between vol and saaH2O Regression line is a line that would best fit the points on the plot and it has a general form of y = m*x + b. Data ellipse means showing the normal probability of the points on the graph

ggscatter(y = "vol",
x = "saaH2O",
size = "vol",
add = "reg.line", # add line of best fit
ellipse = TRUE, # data ellipse
cor.coef = TRUE, # add correlation coef
# color = "freq",
data = aa_dat,
xlab = "saaH2O",
ylab = "Amino acid volume")
## `geom_smooth()` using formula 'y ~ x'

Figure 4: Non-linear relationship

The code makes a plot with non-linear relationship between variable MW.da and polar.req

ggscatter(y = "MW.da",
x = "polar.req",
size = "vol",
add  = "loess", # add smoother
color = "pol",
data = aa_dat,
xlab = "polar.req",
ylab = "MW.da")
## `geom_smooth()` using formula 'y ~ x'

Figure 5: Non-linear relationship on LOG scale

Log transformation is to re-scale the variables or make relationships linear. The code would make a plot based on log-log scale.

aa_dat$MW.da_log <- log(aa_dat$MW.da)
aa_dat$polar.req_log <- log(aa_dat$polar.req)
ggscatter(y = "MW.da_log",
x = "polar.req_log",
size = "vol",
add = "reg.line",
color = "pol",
data = aa_dat,
xlab = "polar.req",
ylab = "MW.da")
## `geom_smooth()` using formula 'y ~ x'

Figure 6: 3D Scatterplot

The 3d scatterplot has a postive correlation

scatterplot3d(x=aa_dat$MW.da,
              y=aa_dat$vol,
              z=aa_dat$bulk,
              highlight.3d = TRUE,
              type="h")

Principal component analysis

PCA is used to transform 3d scatterplot into 2d in the class. We use package vegan to do PCA, which has a lot of useful feature. PCA “is a way of combining the information from all eight properties into a two-dimensional graph” (Higgs and Attwood 2005, pg 26).

Subsetting dataframe

Call the variables’s name in the datafram to get the new subsetting dataframe

aa_dat2 <- aa_dat[,c("vol","bulk",
"pol","isoelec","H2Opho.34","H2Opho.35",
"saaH2O","faal.fold")]

Figure 7: Principal components analysis - base R

pca.out <- prcomp(aa_dat2, scale = TRUE)

plot output

biplot(pca.out)

Figure 8: Principal components analysis - vegan

rda.out <- vegan::rda(aa_dat2,
scale = TRUE)
rda_scores <- scores(rda.out)
biplot(rda.out, display = "sites", col = 0,)
orditorp(rda.out, display = "sites", labels = aa_dat$aa, cex = 1.2)
ordihull(rda.out,
group = aa_dat$hydropathy,
col = 1:7,
lty = 1:7,
lwd = c(3,6))

Cluster analysis

Distance matrix is the distance between a pair of variables. It is symmetric . Euclidean distance is the distance between two clusters in n-dimensional feature space

dist_euc <- dist(aa_dat2,
                 method = "euclidean")
clust_euc <- hclust(dist_euc,
                    method = "average") #use UPGMA

The cluster diagram produced by Higgs and Atwood (2005) in Plate 2.2 is two-dimensional. It has one more cluster diagram at the top of the graph, which could show the relationship between them. The cluster diagram I produced only shows the one aspect of the graph

dendro_euc <- as.dendrogram(clust_euc)
plot(dendro_euc,horiz = T,nodePar = list(pch = c(1,NA), 
                                          cex = 0.5, 
                                          lab.cex = 1.2))