Setup

Install any needed packages.

#install.packages("lsa")
#install.packages("mdendro")
#install.packages("dendextend")

Load all packages

# packages we've used a lot
library(ggplot2)
library(ggpubr)

# packages we've only recently started using
library(scatterplot3d)
library(pander) 
library(vegan)

## Loading required package: permute

## Loading required package: lattice

## This is vegan 2.5-7

# NEW packages- make sure you installed them!
library(dendextend)

## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan

## 
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:permute':
## 
##     shuffle

## The following object is masked from 'package:ggpubr':
## 
##     rotate

## The following object is masked from 'package:stats':
## 
##     cutree

library(lsa)

## Loading required package: SnowballC

PART 1: Data preparation

NOTE: These vectors may be useful for exploring the functionality of R functions but are not used in the analysis itself or added to the dataframe.

# data exploration vectors
MW.da     <- c(89, 121, 133, 146, 165, 75, 155, 131, 146, 131,
               149, 132, 115, 147, 174, 105, 119, 117, 204, 181)
polar.req    <-c(7, 4.8, 13, 12.5, 5, 7.9, 8.4, 4.9, 10.1, 4.9,
                 5.3, 10, 6.6, 8.6,9.1,7.5,6.6,5.6,5.2,5.4)
freq        <-c(7.8, 1.1, 5.19 ,6.72, 4.39, 6.77, 2.03, 6.95,
                6.32, 10.15, 2.28, 4.37, 4.26, 3.45, 5.23,
                6.46, 5.12, 7.01, 1.09, 3.3)

The following vectors are the columns that make up Table 2.2 of Higgs and Attwood 2005, Chapter 2.

# main data vectors
aa.1 <-c('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L','M','N',
                'P','Q','R','S','T','V','W','Y')
vol <- c(67, 86, 91, 109, 135, 48, 118, 124, 135, 124, 124, 96, 90, 114,
            148, 73, 93, 105, 163, 141)
bulk <-c(11.5, 13.46, 11.68, 13.57, 19.8, 3.4, 13.69, 21.4, 15.71, 21.4, 16.25,
         12.28,17.43, 14.45, 14.28, 9.47, 15.77, 21.57, 21.67, 18.03)
pol  <-c(0,1.48,49.7,49.9,0.35,0,51.6,0.13,49.5,0.13,
              1.43,3.38,1.58,3.53,52,1.67,1.66,0.13,2.1,1.61)
isoelec <-c(6,5.07,2.77,3.22,5.48,5.97,7.59,6.02,9.74,5.98, 5.74, 5.41, 6.3, 
            5.65, 10.76, 5.68, 6.16, 5.96, 5.89, 5.66)
H2Opho.34 <-c(1.8,2.5,-3.5,-3.5,2.8,-0.4,-3.2,4.5, -3.9, 3.8, 1.9, -3.5, -1.6, 
              -3.5, -4.5, -0.8, -0.7, 4.2, -0.9, -1.3)
H2Opho.35 <-c(1.6,2,-9.2,-8.2,3.7,1,-3,3.1,-8.8,2.8,3.4,-4.8, -0.2,-4.1,-12.3,
              0.6,1.2,2.6,1.9,-0.7)
saaH2O <-c(113, 140,151,183,218,85,194,182,211,180, 204, 158, 143, 189, 
                 241, 122, 146,160,259,229)
faal.fold <-c(0.74,0.91,0.62,0.62,0.88,0.72,0.78,0.88,0.52, 0.85, 0.85, 0.63,
                 0.64,0.62,0.64,0.66,0.7, 0.86, 0.85, 0.76)

TODO: Create a vector of the full names and the 3-letter codes of the amino acids, then add it to the data.frame.

# build dataframe

aa.full <- c("Alanine", "Cysteine", "Aspartic Acid", "Glutamic Acid", "Phenylanlanine", "Glycine", "Histidine", "Isolucine", "Lysine", "Leucine", "Methionine",
             "Aspagine","Proline", "Glutamine", "Arginine", "Serine", "Threonine", "Valine", "Tryptophan", "Tyrosine")

aa.3 <- c("Ala", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", "Lys", "Leu", "Met", "Asn",
          "Pro", "Gln", "Arg", "Ser", "Thr", "Val", "Trp", "Tyr")

## create dataframe
habk.df <- data.frame(aa.full = aa.full,
                      aa.3 = aa.3,
                      aa.1 = aa.1,
                      Vol = vol, Bulk = bulk, Polarity = pol,
                      pI = isoelec, Hyd.1 = H2Opho.34,
                      Hyd.2 = H2Opho.35, Surface_area = saaH2O,
                      Fract_area = faal.fold)

# name rows - this will help later
# when we do PCA
row.names(habk.df) <- aa.1

# display with pander
pander(habk.df)

Table continues below
	aa.full	aa.3	aa.1	Vol	Bulk	Polarity	pI	Hyd.1
A	Alanine	Ala	A	67	11.5	0	6	1.8
C	Cysteine	Cys	C	86	13.46	1.48	5.07	2.5
D	Aspartic Acid	Asp	D	91	11.68	49.7	2.77	-3.5
E	Glutamic Acid	Glu	E	109	13.57	49.9	3.22	-3.5
F	Phenylanlanine	Phe	F	135	19.8	0.35	5.48	2.8
G	Glycine	Gly	G	48	3.4	0	5.97	-0.4
H	Histidine	His	H	118	13.69	51.6	7.59	-3.2
I	Isolucine	Ile	I	124	21.4	0.13	6.02	4.5
K	Lysine	Lys	K	135	15.71	49.5	9.74	-3.9
L	Leucine	Leu	L	124	21.4	0.13	5.98	3.8
M	Methionine	Met	M	124	16.25	1.43	5.74	1.9
N	Aspagine	Asn	N	96	12.28	3.38	5.41	-3.5
P	Proline	Pro	P	90	17.43	1.58	6.3	-1.6
Q	Glutamine	Gln	Q	114	14.45	3.53	5.65	-3.5
R	Arginine	Arg	R	148	14.28	52	10.76	-4.5
S	Serine	Ser	S	73	9.47	1.67	5.68	-0.8
T	Threonine	Thr	T	93	15.77	1.66	6.16	-0.7
V	Valine	Val	V	105	21.57	0.13	5.96	4.2
W	Tryptophan	Trp	W	163	21.67	2.1	5.89	-0.9
Y	Tyrosine	Tyr	Y	141	18.03	1.61	5.66	-1.3

	Hyd.2	Surface_area	Fract_area
A	1.6	113	0.74
C	2	140	0.91
D	-9.2	151	0.62
E	-8.2	183	0.62
F	3.7	218	0.88
G	1	85	0.72
H	-3	194	0.78
I	3.1	182	0.88
K	-8.8	211	0.52
L	2.8	180	0.85
M	3.4	204	0.85
N	-4.8	158	0.63
P	-0.2	143	0.64
Q	-4.1	189	0.62
R	-12.3	241	0.64
S	0.6	122	0.66
T	1.2	146	0.7
V	2.6	160	0.86
W	1.9	259	0.85
Y	-0.7	229	0.76

PART 2: DATA DICTIONARY

A data dictionary is a dataframe that contains extra information about the data being used. For example, what the column names represent, what the variable/vector name that corresponds to each data is, and the references. It is best practice to include a data table that creates a connection between what is in written as code and what that means: for example variable names. Variable names should be short enough to be repeated easily without error, but descriptive enough to explain what is being stored. “Vol” as a column for “Volume” makes sense, but “pI” for “ph isoelectric point” isn’t intuitive.

# data dictionary columns
column.names <- c("Vol", "Bulk", "Polarity","pI", "Hyd.1", "Hyd.2", 
                  "Surface_area","Fract_area")

name.full <- c("Volume", "Bulkiness", "Polarity", 
               "pH at Isoelectric point",  "Hydrophobicity scale-1",
               "Hydrophobicity scale-2","SA accessible to H2O","Fractional area")

vector.names <- c("vol", "bulk", "pol","isoelec", "H2Opho.34",
                  "H2Opho.35", "saaH2O","faal.fold")

reference <- c("Creighton 1993", "Zimmerman et al. 1968", "Zimmerman et al. 1968", 
               "Zimmerman et al. 1968", "Kyte & Doolittle 1982", "Engelman et al. 1986",
               "Miller et al. 1987"," Rose et al. 1985")

# TODO: Make a dataframe to hold DATA DICTIONARY information
# make dataframe
data.dictionary <- data.frame(Column = column.names,
                              Full = name.full,
                              Vector = vector.names,
                              Refs = reference)

# Display data dictionary
pander(data.dictionary)

Column	Full	Vector	Refs
Vol	Volume	vol	Creighton 1993
Bulk	Bulkiness	bulk	Zimmerman et al. 1968
Polarity	Polarity	pol	Zimmerman et al. 1968
pI	pH at Isoelectric point	isoelec	Zimmerman et al. 1968
Hyd.1	Hydrophobicity scale-1	H2Opho.34	Kyte & Doolittle 1982
Hyd.2	Hydrophobicity scale-2	H2Opho.35	Engelman et al. 1986
Surface_area	SA accessible to H2O	saaH2O	Miller et al. 1987
Fract_area	Fractional area	faal.fold	Rose et al. 1985

PART 3: Remake Higgs and Attwood Figure 2.8: 2-dimensional Data visualization

# Make scatterplot 
ggscatter(x = "pI",
          y = "Vol",  
          data = habk.df,  
          color = "Polarity", 
          size = "Bulk",     
          label = habk.df$aa.1,     
          ylab = "Volume (Å cubed)", 
          xlab = "pI", 
          title = "Plot of amino acid volume against pI - 
                  \ntwo properties thought to be important in protein folding.",
          ylim = c(0, 180),
          xlim = c(1,12)
          )

PART 4: Data centering and scaling

Data centering is moving the data to have a mean of 0. data point - mean Data scaling is shifting the distribution of the data. ratio of data to standard deviation

Demonstate centering / scaling on a single vector

# Subset data 
## get info
dim(habk.df)

## [1] 20 11

ncol(habk.df)

## [1] 11

# easiest 
habk.df.numeric <- habk.df[ , c(-1,-2,-3)]

# equivalent
habk.df.numeric <- habk.df[ , c(4,5,6,7,8,9,10,11)]
habk.df.numeric <- habk.df[ , c(4:11)]

Set up data:

# Select a single column to explore how to do centering / scaling
X <- habk.df.numeric$Vol

Calculate the mean manually:

sum(X)/length(X)

## [1] 109.2

Challenge: what is this calculating?

X <- habk.df.numeric$Vol
sqrt(sum((X-mean(X))^2)/length(X))

## [1] 28.31007

# data centering:
X <- habk.df.numeric$Vol
u <- mean(X)
u-X

##  [1]  42.2  23.2  18.2   0.2 -25.8  61.2  -8.8 -14.8 -25.8 -14.8 -14.8  13.2
## [13]  19.2  -4.8 -38.8  36.2  16.2   4.2 -53.8 -31.8

# data scaling:
X <- habk.df.numeric$Vol
s <- sd(X)
X/s

##  [1] 2.306724 2.960870 3.133014 3.752730 4.647877 1.652579 4.062589 4.269161
##  [9] 4.647877 4.269161 4.269161 3.305157 3.098585 3.924874 5.095451 2.513297
## [17] 3.201871 3.615016 5.611881 4.854450

# scale and center data with built in methods:
X <- habk.df.numeric$Vol
u <- mean(X)
s <- sd(X)
z <- (u-X)/s

# Compare the data via histogram as original, centered, scaled, and both.
par(mfrow = c(2,2), mar = c(3,1,3,1))

hist(X)
abline(v = mean(X), col = 2)

hist(X/s)
abline(v = mean(X/s), col = 3)

hist(u-X)
abline(v = 0, col = 4)

hist(z)
abline(v = 0, col = 5)

par(mfrow = c(1,1), mar = c(4,4,4,4))

# scale() is a generic function that centers and or scales data
scale(X, center = T, scale = F) # center only

##        [,1]
##  [1,] -42.2
##  [2,] -23.2
##  [3,] -18.2
##  [4,]  -0.2
##  [5,]  25.8
##  [6,] -61.2
##  [7,]   8.8
##  [8,]  14.8
##  [9,]  25.8
## [10,]  14.8
## [11,]  14.8
## [12,] -13.2
## [13,] -19.2
## [14,]   4.8
## [15,]  38.8
## [16,] -36.2
## [17,] -16.2
## [18,]  -4.2
## [19,]  53.8
## [20,]  31.8
## attr(,"scaled:center")
## [1] 109.2

scale(X, center = F, scale = T) # scale only

##            [,1]
##  [1,] 0.5788805
##  [2,] 0.7430407
##  [3,] 0.7862407
##  [4,] 0.9417609
##  [5,] 1.1664011
##  [6,] 0.4147204
##  [7,] 1.0195209
##  [8,] 1.0713610
##  [9,] 1.1664011
## [10,] 1.0713610
## [11,] 1.0713610
## [12,] 0.8294408
## [13,] 0.7776007
## [14,] 0.9849609
## [15,] 1.2787212
## [16,] 0.6307206
## [17,] 0.8035207
## [18,] 0.9072008
## [19,] 1.4083213
## [20,] 1.2182411
## attr(,"scaled:scale")
## [1] 115.7406

scale(X, center = T, scale = T) # both

##               [,1]
##  [1,] -1.452891984
##  [2,] -0.798746304
##  [3,] -0.626602704
##  [4,] -0.006885744
##  [5,]  0.888260976
##  [6,] -2.107037664
##  [7,]  0.302972736
##  [8,]  0.509545056
##  [9,]  0.888260976
## [10,]  0.509545056
## [11,]  0.509545056
## [12,] -0.454459104
## [13,] -0.661031424
## [14,]  0.165257856
## [15,]  1.335834336
## [16,] -1.246319664
## [17,] -0.557745264
## [18,] -0.144600624
## [19,]  1.852265136
## [20,]  1.094833296
## attr(,"scaled:center")
## [1] 109.2
## attr(,"scaled:scale")
## [1] 29.04552

Scale an entire matrix

Total mean of the volume row:

sum(habk.df.numeric$Vol)/length(habk.df.numeric$Vol)

## [1] 109.2

Center a single column

# make a copy of the data
habk.df.centered <- habk.df.numeric


# check original mean
mean(habk.df.numeric$Vol)

## [1] 109.2

habk.df.centered$Vol <- habk.df.centered$Vol - mean(habk.df.centered$Vol)
mean(habk.df.centered$Vol)

## [1] -2.842518e-15

# Bulk
habk.df.centered$Bulk <- habk.df.centered$Bulk - mean(habk.df.centered$Bulk)
mean(habk.df.numeric$Bulk)

## [1] 15.3405

# fractional area
habk.df.centered$Fract_area <- habk.df.centered$Fract_area - mean(habk.df.centered$Fract_area)
mean(habk.df.centered$Fract_area)

## [1] -4.44057e-17

xxxx

# What does this for() loop do?

for(i in 1:ncol(habk.df.centered)){
  mean.i <- mean(habk.df.centered[,i])
  habk.df.centered[,i] <- mean.i-habk.df.centered[,i]
}

summary(habk.df.centered)

##       Vol              Bulk            Polarity             pI         
##  Min.   :-53.80   Min.   :-6.3295   Min.   :-38.406   Min.   :-4.7075  
##  1st Qu.:-17.55   1st Qu.:-3.1320   1st Qu.: -1.429   1st Qu.:-0.0025  
##  Median : -2.30   Median : 0.2605   Median : 11.959   Median : 0.1275  
##  Mean   :  0.00   Mean   : 0.0000   Mean   :  0.000   Mean   : 0.0000  
##  3rd Qu.: 18.45   3rd Qu.: 2.1755   3rd Qu.: 13.299   3rd Qu.: 0.4450  
##  Max.   : 61.20   Max.   :11.9405   Max.   : 13.594   Max.   : 3.2825  
##      Hyd.1           Hyd.2         Surface_area      Fract_area     
##  Min.   :-4.99   Min.   :-5.070   Min.   :-83.60   Min.   :-0.1735  
##  1st Qu.:-2.54   1st Qu.:-3.520   1st Qu.:-30.35   1st Qu.:-0.1135  
##  Median : 0.36   Median :-2.170   Median : -5.60   Median : 0.0065  
##  Mean   : 0.00   Mean   : 0.000   Mean   :  0.00   Mean   : 0.0000  
##  3rd Qu.: 3.01   3rd Qu.: 2.905   3rd Qu.: 30.15   3rd Qu.: 0.0990  
##  Max.   : 4.01   Max.   :10.930   Max.   : 90.40   Max.   : 0.2165

# 
for(i in 1:ncol(habk.df.centered)){
  habk.df.centered[,i] <- mean(habk.df.centered[,i])-habk.df.centered[,i]
}

summary(habk.df.centered)

##       Vol              Bulk             Polarity             pI         
##  Min.   :-61.20   Min.   :-11.9405   Min.   :-13.594   Min.   :-3.2825  
##  1st Qu.:-18.45   1st Qu.: -2.1755   1st Qu.:-13.299   1st Qu.:-0.4450  
##  Median :  2.30   Median : -0.2605   Median :-11.959   Median :-0.1275  
##  Mean   :  0.00   Mean   :  0.0000   Mean   :  0.000   Mean   : 0.0000  
##  3rd Qu.: 17.55   3rd Qu.:  3.1320   3rd Qu.:  1.429   3rd Qu.: 0.0025  
##  Max.   : 53.80   Max.   :  6.3295   Max.   : 38.406   Max.   : 4.7075  
##      Hyd.1           Hyd.2          Surface_area      Fract_area     
##  Min.   :-4.01   Min.   :-10.930   Min.   :-90.40   Min.   :-0.2165  
##  1st Qu.:-3.01   1st Qu.: -2.905   1st Qu.:-30.15   1st Qu.:-0.0990  
##  Median :-0.36   Median :  2.170   Median :  5.60   Median :-0.0065  
##  Mean   : 0.00   Mean   :  0.000   Mean   :  0.00   Mean   : 0.0000  
##  3rd Qu.: 2.54   3rd Qu.:  3.520   3rd Qu.: 30.35   3rd Qu.: 0.1135  
##  Max.   : 4.99   Max.   :  5.070   Max.   : 83.60   Max.   : 0.1735

Check …

# do a logical check to make sure the standard deviation of centered and numeric data is the same
sd(habk.df.centered$Vol)

## [1] 29.04552

sd(habk.df.numeric$Vol)

## [1] 29.04552

sd(habk.df.centered$Vol) == sd(habk.df.numeric$Vol)

## [1] TRUE

# logical check
cpisd <- sd(habk.df.centered$pI)
npisd <- sd(habk.df.numeric$pI)

cpisd == npisd

## [1] TRUE

# Standardize the data

# create new dataframe
habk.df.standard <- habk.df.centered

habk.df.standard$Vol <-  habk.df.standard$Vol/sd(habk.df.standard$Vol)

sd(habk.df.standard$Vol)

## [1] 1

TODO: modify another column as done in the previous chunk

# Standardize another column

habk.df.standard$pI <- habk.df.standard$pI/sd(habk.df.standard$pI)

sd(habk.df.standard$pI)

## [1] 1

# for loop for standardizing

for(i in 1:ncol(habk.df.standard)){
  sd.i <- sd(habk.df.standard[,i])
  habk.df.standard[,i] <- habk.df.standard[,i]/sd.i
}

summary(habk.df.standard)

##       Vol                Bulk             Polarity              pI           
##  Min.   :-2.10704   Min.   :-2.56792   Min.   :-0.62033   Min.   :-1.858005  
##  1st Qu.:-0.63521   1st Qu.:-0.46786   1st Qu.:-0.60687   1st Qu.:-0.251885  
##  Median : 0.07919   Median :-0.05602   Median :-0.54572   Median :-0.072169  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.60422   3rd Qu.: 0.67357   3rd Qu.: 0.06519   3rd Qu.: 0.001415  
##  Max.   : 1.85227   Max.   : 1.36122   Max.   : 1.75257   Max.   : 2.664603  
##      Hyd.1             Hyd.2          Surface_area       Fract_area      
##  Min.   :-1.3425   Min.   :-2.2338   Min.   :-2.0230   Min.   :-1.87746  
##  1st Qu.:-1.0077   1st Qu.:-0.5937   1st Qu.:-0.6747   1st Qu.:-0.85852  
##  Median :-0.1205   Median : 0.4435   Median : 0.1253   Median :-0.05637  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.8504   3rd Qu.: 0.7194   3rd Qu.: 0.6792   3rd Qu.: 0.98426  
##  Max.   : 1.6706   Max.   : 1.0362   Max.   : 1.8709   Max.   : 1.50457

TODO: Shorten code in the for() loop so that the object sd.i doesn’t need to be created

# for() loop without sd.i line
for(i in 1:ncol(habk.df.standard)){
  habk.df.standard[,i] <- habk.df.standard[,i]/sd(habk.df.standard[,i])
}

summary(habk.df.standard)

##       Vol                Bulk             Polarity              pI           
##  Min.   :-2.10704   Min.   :-2.56792   Min.   :-0.62033   Min.   :-1.858005  
##  1st Qu.:-0.63521   1st Qu.:-0.46786   1st Qu.:-0.60687   1st Qu.:-0.251885  
##  Median : 0.07919   Median :-0.05602   Median :-0.54572   Median :-0.072169  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.60422   3rd Qu.: 0.67357   3rd Qu.: 0.06519   3rd Qu.: 0.001415  
##  Max.   : 1.85227   Max.   : 1.36122   Max.   : 1.75257   Max.   : 2.664603  
##      Hyd.1             Hyd.2          Surface_area       Fract_area      
##  Min.   :-1.3425   Min.   :-2.2338   Min.   :-2.0230   Min.   :-1.87746  
##  1st Qu.:-1.0077   1st Qu.:-0.5937   1st Qu.:-0.6747   1st Qu.:-0.85852  
##  Median :-0.1205   Median : 0.4435   Median : 0.1253   Median :-0.05637  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.8504   3rd Qu.: 0.7194   3rd Qu.: 0.6792   3rd Qu.: 0.98426  
##  Max.   : 1.6706   Max.   : 1.0362   Max.   : 1.8709   Max.   : 1.50457

TODO: Write a for() loop that does BOTH tasks in a single loop.

# center
# scale by sd
for(i in 1:ncol(habk.df.standard)){
  habk.df.standard[,i] <- habk.df.standard[,i]/sd(habk.df.standard[,i])
  habk.df.standard[,i] <- mean(habk.df.standard[,i])-habk.df.standard[,i]
}

summary(habk.df.standard)

##       Vol                Bulk             Polarity              pI           
##  Min.   :-1.85227   Min.   :-1.36122   Min.   :-1.75257   Min.   :-2.664603  
##  1st Qu.:-0.60422   1st Qu.:-0.67357   1st Qu.:-0.06519   1st Qu.:-0.001415  
##  Median :-0.07919   Median : 0.05602   Median : 0.54572   Median : 0.072169  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.63521   3rd Qu.: 0.46786   3rd Qu.: 0.60687   3rd Qu.: 0.251885  
##  Max.   : 2.10704   Max.   : 2.56792   Max.   : 0.62033   Max.   : 1.858005  
##      Hyd.1             Hyd.2          Surface_area       Fract_area      
##  Min.   :-1.6706   Min.   :-1.0362   Min.   :-1.8709   Min.   :-1.50457  
##  1st Qu.:-0.8504   1st Qu.:-0.7194   1st Qu.:-0.6792   1st Qu.:-0.98426  
##  Median : 0.1205   Median :-0.4435   Median :-0.1253   Median : 0.05637  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 1.0077   3rd Qu.: 0.5937   3rd Qu.: 0.6747   3rd Qu.: 0.85852  
##  Max.   : 1.3425   Max.   : 2.2338   Max.   : 2.0230   Max.   : 1.87746

Write a for() loop that….

# scale data via scale() function

habk.df.standard <- habk.df.numeric
for(i in 1:ncol(habk.df.standard)){
  habk.df.standard[,i] <- scale(habk.df.standard[,i])
}

summary(habk.df.standard)

##         Vol.V1              Bulk.V1            Polarity.V1     
##  Min.   :-2.1070377   Min.   :-2.5679182   Min.   :-0.6203318  
##  1st Qu.:-0.6352099   1st Qu.:-0.4678620   1st Qu.:-0.6068701  
##  Median : 0.0791861   Median :-0.0560230   Median :-0.5457222  
##  Mean   : 0.0000000   Mean   : 0.0000000   Mean   : 0.0000000  
##  3rd Qu.: 0.6042240   3rd Qu.: 0.6735664   3rd Qu.: 0.0651864  
##  Max.   : 1.8522651   Max.   : 1.3612193   Max.   : 1.7525720  
##         pI.V1               Hyd.1.V1             Hyd.2.V1      
##  Min.   :-1.8580053   Min.   :-1.3424968   Min.   :-2.2338170  
##  1st Qu.:-0.2518850   1st Qu.:-1.0077096   1st Qu.:-0.5937089  
##  Median :-0.0721693   Median :-0.1205234   Median : 0.4434934  
##  Mean   : 0.0000000   Mean   : 0.0000000   Mean   : 0.0000000  
##  3rd Qu.: 0.0014151   3rd Qu.: 0.8503596   3rd Qu.: 0.7193994  
##  Max.   : 2.6646032   Max.   : 1.6705883   Max.   : 1.0361804  
##    Surface_area.V1       Fract_area.V1    
##  Min.   :-2.0230352   Min.   :-1.8774603  
##  1st Qu.:-0.6747180   1st Qu.:-0.8585153  
##  Median : 0.1253208   Median :-0.0563672  
##  Mean   : 0.0000000   Mean   : 0.0000000  
##  3rd Qu.: 0.6791938   3rd Qu.: 0.9842575  
##  Max.   : 1.8708600   Max.   : 1.5045698

CHALLENGE (will NOT be on exam). How do these lines of code work:

# CHALLENGE: What does this do?

X.bar <- apply(habk.df.numeric, 2, mean)
summary(habk.df.numeric)

##       Vol              Bulk          Polarity            pI        
##  Min.   : 48.00   Min.   : 3.40   Min.   : 0.000   Min.   : 2.770  
##  1st Qu.: 90.75   1st Qu.:13.16   1st Qu.: 0.295   1st Qu.: 5.607  
##  Median :111.50   Median :15.08   Median : 1.635   Median : 5.925  
##  Mean   :109.20   Mean   :15.34   Mean   :13.594   Mean   : 6.053  
##  3rd Qu.:126.75   3rd Qu.:18.47   3rd Qu.:15.023   3rd Qu.: 6.055  
##  Max.   :163.00   Max.   :21.67   Max.   :52.000   Max.   :10.760  
##      Hyd.1           Hyd.2          Surface_area     Fract_area    
##  Min.   :-4.50   Min.   :-12.300   Min.   : 85.0   Min.   :0.5200  
##  1st Qu.:-3.50   1st Qu.: -4.275   1st Qu.:145.2   1st Qu.:0.6375  
##  Median :-0.85   Median :  0.800   Median :181.0   Median :0.7300  
##  Mean   :-0.49   Mean   : -1.370   Mean   :175.4   Mean   :0.7365  
##  3rd Qu.: 2.05   3rd Qu.:  2.150   3rd Qu.:205.8   3rd Qu.:0.8500  
##  Max.   : 4.50   Max.   :  3.700   Max.   :259.0   Max.   :0.9100

habk.df.standard <- apply(habk.df.numeric, 2, scale)
habk.df.standard <- data.frame(habk.df.standard)
summary(habk.df.standard)

##       Vol                Bulk             Polarity              pI           
##  Min.   :-2.10704   Min.   :-2.56792   Min.   :-0.62033   Min.   :-1.858005  
##  1st Qu.:-0.63521   1st Qu.:-0.46786   1st Qu.:-0.60687   1st Qu.:-0.251885  
##  Median : 0.07919   Median :-0.05602   Median :-0.54572   Median :-0.072169  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.60422   3rd Qu.: 0.67357   3rd Qu.: 0.06519   3rd Qu.: 0.001415  
##  Max.   : 1.85227   Max.   : 1.36122   Max.   : 1.75257   Max.   : 2.664603  
##      Hyd.1             Hyd.2          Surface_area       Fract_area      
##  Min.   :-1.3425   Min.   :-2.2338   Min.   :-2.0230   Min.   :-1.87746  
##  1st Qu.:-1.0077   1st Qu.:-0.5937   1st Qu.:-0.6747   1st Qu.:-0.85852  
##  Median :-0.1205   Median : 0.4435   Median : 0.1253   Median :-0.05637  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.8504   3rd Qu.: 0.7194   3rd Qu.: 0.6792   3rd Qu.: 0.98426  
##  Max.   : 1.6706   Max.   : 1.0362   Max.   : 1.8709   Max.   : 1.50457

PART 5: 3D visualization of centering and scaling

Prepare data:
scatterplot3d does NOT have a data = argument. Make seperate vectors to work with to make code cleanr. ….

# raw vectors
Xraw <- habk.df.numeric$Bulk
Yraw <- habk.df.numeric$Polarity
Zraw <- habk.df.numeric$Surface_area

# center
Xcent <- Xraw - mean(Xraw)
Ycent <- Yraw - mean(Yraw)
Zcent <- Zraw - mean(Zraw)

# scale, without centering
Xscale0 <- Xraw/sd(Xraw)
Yscale0 <- Yraw/sd(Yraw)
Zscale0 <- Zraw/sd(Zraw)

# scale AND center
Xscale <- (Xraw - mean(Xraw))/sd(Xraw)
Yscale <- (Yraw - mean(Yraw))/sd(Yraw)
Zscale <- (Zraw - mean(Zraw))/sd(Zraw)

TODO: create ALL of the scaled vectors created above using the scale() function.

# vectors using R function()

# center
xcenter <- scale(Xraw, center = TRUE, scale = FALSE)
ycenter <- scale(Yraw, center = TRUE, scale = FALSE)
zcenter <- scale(Zraw, center = TRUE, scale = FALSE)

# scale without centering
xscaleo <- scale(Xraw, center = FALSE, scale = TRUE)
yscaleo <- scale(Yraw, center = FALSE, scale = TRUE)
zscaleo <- scale(Zraw, center = FALSE, scale = TRUE)

# scale and center
xscale <- scale(Xraw, center = TRUE, scale = TRUE)
yscale <- scale(Yraw, center = TRUE, scale = TRUE)
zscale <- scale(Zraw, center = TRUE, scale = TRUE)

3-D scatter plot of….

# scatterplot with raw data

#  main plot with scatterplot3d() - know how this works
plot1 <- scatterplot3d(x = Xraw, 
                       y = Yraw, 
                       z = Zraw,
              color = 2, 
              pch = 16, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275),
              main = "Raw data")


# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
      p2 <- plot1$xyz.convert(0,-40,0)
      p3 <- plot1$xyz.convert(0,60,0)
      segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
      p2 <- plot1$xyz.convert(-20,0,0)
      p3 <- plot1$xyz.convert(30,0,0)
      segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

3-D scatter plot of….

# scatterplot of centered data

#  main plot - know how this works
plot2 <- scatterplot3d(x = Xcent, y = Ycent, z = Zcent,
              color = 2, 
              pch = 16, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275), main = "Centered")

# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
p2 <- plot2$xyz.convert(0,-40,0)
p3 <- plot2$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot2$xyz.convert(-20,0,0)
p3 <- plot2$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

3-D scatter plot of….

# scatterplot of scaled data without centering

#  main plot - know how this works
plot3 <- scatterplot3d(x = Xscale0, y = Yscale0, z = Zscale0,
              color = 3, 
              pch = 12, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275), 
              main = "Scaled (NOT centered)")

p2 <- plot3$xyz.convert(0,-40,0)
p3 <- plot3$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot3$xyz.convert(-20,0,0)
p3 <- plot3$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

3-D scatter plot of….

#  main plot - know how this works
plot4 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
              color = 3, 
              pch = 16, type = "h",
              xlim = c(-25,25),
              ylim = c(-50,50),
              zlim = c(-100,275),
              main = "Centered & Scaled")

p2 <- plot4$xyz.convert(0,-40,0)
p3 <- plot4$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot4$xyz.convert(-20,0,0)
p3 <- plot4$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

3-D scatter plot of….

#  main plot - know how this works
plot5 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
              color = 3, 
              pch = 16, type = "h",
              #xlim = c(-25,25),
              #ylim = c(-50,50),
              #zlim = c(-100,275),
              main = "Centered & Scaled")


p2 <- plot5$xyz.convert(0,-40,0)
p3 <- plot5$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot5$xyz.convert(-20,0,0)
p3 <- plot5$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

# plot all scatterplots as one image
# which of the transformations is NOT being shown
par(mfrow = c(2,2), mar = c(1,1,1,1))
plot1 <- scatterplot3d(x = Xraw, 
                       y = Yraw, 
                       z = Zraw,
              color = 2, 
              pch = 16, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275),
              main = "")


# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
      p2 <- plot1$xyz.convert(0,-40,0)
      p3 <- plot1$xyz.convert(0,60,0)
      segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
      p2 <- plot1$xyz.convert(-20,0,0)
      p3 <- plot1$xyz.convert(30,0,0)
      segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
      
      
#  main plot - know how this works
plot2 <- scatterplot3d(x = Xcent, y = Ycent, z = Zcent,
              color = 2, 
              pch = 16, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275), main = "")

# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
p2 <- plot2$xyz.convert(0,-40,0)
p3 <- plot2$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot2$xyz.convert(-20,0,0)
p3 <- plot2$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)


plot4 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
              color = 3, 
              pch = 16, type = "h",
              xlim = c(-25,25),
              ylim = c(-50,50),
              zlim = c(-100,275),
              main = "")

p2 <- plot4$xyz.convert(0,-40,0)
p3 <- plot4$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot4$xyz.convert(-20,0,0)
p3 <- plot4$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)


#  main plot - know how this works
plot5 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
              color = 3, 
              pch = 16, type = "h",
              #xlim = c(-25,25),
              #ylim = c(-50,50),
              #zlim = c(-100,275),
              main = "")


p2 <- plot5$xyz.convert(0,-40,0)
p3 <- plot5$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot5$xyz.convert(-20,0,0)
p3 <- plot5$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

Replication: Multivariate analysis of amino acid chemical properties

Nathan Brouwer

11/29/2021

Setup

PART 1: Data preparation

PART 2: DATA DICTIONARY

PART 3: Remake Higgs and Attwood Figure 2.8: 2-dimensional Data visualization

PART 4: Data centering and scaling

Demonstate centering / scaling on a single vector

Scale an entire matrix

Center a single column

xxxx

PART 5: 3D visualization of centering and scaling