Setup

Install any needed packages.

#install.packages("lsa")
#install.packages("mdendro")
#install.packages("dendextend")

Load all packages

# packages we've used a lot
library(ggplot2)
library(ggpubr)

# packages we've only recently started using
library(scatterplot3d)
library(pander) 
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
# NEW packages- make sure you installed them!
library(dendextend)
## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan
## 
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:permute':
## 
##     shuffle
## The following object is masked from 'package:ggpubr':
## 
##     rotate
## The following object is masked from 'package:stats':
## 
##     cutree
library(lsa)
## Loading required package: SnowballC

PART 1: Data preparation

NOTE: This vectors may be useful for exploring the functionality during lecture of R functions but are not used in the analysis itself or added to the dataframe.

# data exploration vectors
MW.da     <- c(89, 121, 133, 146, 165, 75, 155, 131, 146, 131,
               149, 132, 115, 147, 174, 105, 119, 117, 204, 181)
polar.req    <-c(7, 4.8, 13, 12.5, 5, 7.9, 8.4, 4.9, 10.1, 4.9,
                 5.3, 10, 6.6, 8.6,9.1,7.5,6.6,5.6,5.2,5.4)
freq        <-c(7.8, 1.1, 5.19 ,6.72, 4.39, 6.77, 2.03, 6.95,
                6.32, 10.15, 2.28, 4.37, 4.26, 3.45, 5.23,
                6.46, 5.12, 7.01, 1.09, 3.3)

The following vectors are the columns that make up Table 2.2 of Higgs and Attwood 2005, Chapter 2.

# main data vectors
aa.1        <-c('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L','M','N',
                'P','Q','R','S','T','V','W','Y')
vol    <- c(67, 86, 91, 109, 135, 48, 118, 124, 135, 124, 124, 96, 90, 114,
            148, 73, 93, 105, 163, 141)
bulk <-c(11.5, 13.46, 11.68, 13.57, 19.8, 3.4, 13.69, 21.4, 15.71, 21.4, 16.25,
         12.28,17.43, 14.45, 14.28, 9.47, 15.77, 21.57, 21.67, 18.03)
pol  <-c(0,1.48,49.7,49.9,0.35,0,51.6,0.13,49.5,0.13,
              1.43,3.38,1.58,3.53,52,1.67,1.66,0.13,2.1,1.61)
isoelec <-c(6,5.07,2.77,3.22,5.48,5.97,7.59,6.02,9.74,5.98, 5.74, 5.41, 6.3, 
            5.65, 10.76, 5.68, 6.16, 5.96, 5.89, 5.66)
H2Opho.34 <-c(1.8,2.5,-3.5,-3.5,2.8,-0.4,-3.2,4.5, -3.9, 3.8, 1.9, -3.5, -1.6, 
              -3.5, -4.5, -0.8, -0.7, 4.2, -0.9, -1.3)
H2Opho.35 <-c(1.6,2,-9.2,-8.2,3.7,1,-3,3.1,-8.8,2.8,3.4,-4.8, -0.2,-4.1,-12.3,
              0.6,1.2,2.6,1.9,-0.7)
saaH2O       <-c(113, 140,151,183,218,85,194,182,211,180, 204, 158, 143, 189, 
                 241, 122, 146,160,259,229)
faal.fold    <-c(0.74,0.91,0.62,0.62,0.88,0.72,0.78,0.88,0.52, 0.85, 0.85, 0.63,
                 0.64,0.62,0.64,0.66,0.7, 0.86, 0.85, 0.76)

TODO: Create a vector of the full names and the 3-letter codes of the amino acids, then add it to the data.frame.

# build dataframe

## create a vector with the FULL names of all the amino acids
aa.full <- c("alanine", "cysteine", "aspartic acid", "glutamic acid", "phenylalanine", "glycine", "histidine", "isoleucine", "lysine", "leucine", "methionine", "asparagine", "proline", "glutamine", "arginine", "serine", "threonine", "valine", "tryptophan", "tyrosine")

## create a vector with the 3-letter codes for amino acids
aa.3 <- c("ala", "cys", "asp", "glu", "phe", "gly", "his", "ile", "lys", "leu", "met", "asn", "pro", "gln", "asn", "ser", "thr", "val", "trp", "tyr")

## create dataframe
habk.df <- data.frame(aa.full = aa.full,
                      aa.3 = aa.3,
                      aa.1 = aa.1,
                      Vol = vol, Bulk = bulk, Polarity = pol,
                      pI = isoelec, Hyd.1 = H2Opho.34,
                      Hyd.2 = H2Opho.35, Surface_area = saaH2O,
                      Fract_area = faal.fold)
# name rows - this will help later
# when we do PCA
row.names(habk.df) <- aa.1
# display with pander
pander(habk.df)
Table continues below
  aa.full aa.3 aa.1 Vol Bulk Polarity pI Hyd.1
A alanine ala A 67 11.5 0 6 1.8
C cysteine cys C 86 13.46 1.48 5.07 2.5
D aspartic acid asp D 91 11.68 49.7 2.77 -3.5
E glutamic acid glu E 109 13.57 49.9 3.22 -3.5
F phenylalanine phe F 135 19.8 0.35 5.48 2.8
G glycine gly G 48 3.4 0 5.97 -0.4
H histidine his H 118 13.69 51.6 7.59 -3.2
I isoleucine ile I 124 21.4 0.13 6.02 4.5
K lysine lys K 135 15.71 49.5 9.74 -3.9
L leucine leu L 124 21.4 0.13 5.98 3.8
M methionine met M 124 16.25 1.43 5.74 1.9
N asparagine asn N 96 12.28 3.38 5.41 -3.5
P proline pro P 90 17.43 1.58 6.3 -1.6
Q glutamine gln Q 114 14.45 3.53 5.65 -3.5
R arginine asn R 148 14.28 52 10.76 -4.5
S serine ser S 73 9.47 1.67 5.68 -0.8
T threonine thr T 93 15.77 1.66 6.16 -0.7
V valine val V 105 21.57 0.13 5.96 4.2
W tryptophan trp W 163 21.67 2.1 5.89 -0.9
Y tyrosine tyr Y 141 18.03 1.61 5.66 -1.3
  Hyd.2 Surface_area Fract_area
A 1.6 113 0.74
C 2 140 0.91
D -9.2 151 0.62
E -8.2 183 0.62
F 3.7 218 0.88
G 1 85 0.72
H -3 194 0.78
I 3.1 182 0.88
K -8.8 211 0.52
L 2.8 180 0.85
M 3.4 204 0.85
N -4.8 158 0.63
P -0.2 143 0.64
Q -4.1 189 0.62
R -12.3 241 0.64
S 0.6 122 0.66
T 1.2 146 0.7
V 2.6 160 0.86
W 1.9 259 0.85
Y -0.7 229 0.76

PART 2: DATA DICTIONARY

A data dictionary is a guide to what the code means, what the columns are, what the variables are, etcetera. Metadata is data about data. A good column name is one that is short and concise, while a bad column name is one that is long.

https://library.ucmerced.edu/data-dictionaries

# (For exam you need to be able to...
# describe why a data dictionary is needed, 
# what makes good column names, 
# what makes bad column names, etc.)


# data dictionary columns
column.names <- c("Vol", "Bulk", "Polarity","pI", "Hyd.1", "Hyd.2", 
                  "Surface_area","Fract_area")

name.full <- c("Volume", "Bulkiness", "Polarity", 
               "pH at Isoelectric point",  "Hydrophobicity scale-1",
               "Hydrophobicity scale-2","SA accessible to H2O","Fractional area")
vector.names <- c("vol", "bulk", "pol","isoelec", "H2Opho.34",
                  "H2Opho.35", "saaH2O","faal.fold")
reference <- c("Creighton 1993", "Zimmerman et al. 1968", "Zimmerman et al. 1968", 
               "Zimmerman et al. 1968", "Kyte & Doolittle 1982", "Engelman et al. 1986",
               "Miller et al. 1987"," Rose et al. 1985")

Make a dataframe to hold the DATA DICTIONARY information

# TODO: Make a dataframe to hold DATA DICTIONARY information
# make dataframe
data.dictionary <- data.frame( ColumnNames = column.names,
                              FullName = name.full,
                              VectorNames = vector.names,
                              References  = reference)
# Display data dictionary
pander(data.dictionary)
ColumnNames FullName VectorNames References
Vol Volume vol Creighton 1993
Bulk Bulkiness bulk Zimmerman et al. 1968
Polarity Polarity pol Zimmerman et al. 1968
pI pH at Isoelectric point isoelec Zimmerman et al. 1968
Hyd.1 Hydrophobicity scale-1 H2Opho.34 Kyte & Doolittle 1982
Hyd.2 Hydrophobicity scale-2 H2Opho.35 Engelman et al. 1986
Surface_area SA accessible to H2O saaH2O Miller et al. 1987
Fract_area Fractional area faal.fold Rose et al. 1985

PART 3: Remake Higgs and Attwood Figure 2.8: 2-dimensional Data visualization

TODO: Using ggpubr, remake AND update/augment Figure 2.8 of Higgs and Attwood 2005, Chapter 2, page 26.

  1. Plot the appropriate variables from the original figure against each other, on the appropriate axes
  2. Add the caption from the original paper to the “title =” section of ggscatter
  3. Include newline character in the title to split it accross two lines
  4. Add a 3rd variable via color-coding using color = “…”
  5. Add a 4th variable via size-coding using size = “…”
  6. Label the axes with xlab and ylab
  7. Set ylim to c(0, 180) and xlim to c(1,12)
# Make scatterplot with appropriate ggpubr function
ggscatter(x = "pI",        
          y = "Vol",       
          data = habk.df,  
          color = "Polarity", 
          size = "Hyd.1",     
          label = "aa.1",     
          ylab = "Volume (Angstorms)",
          xlab = "pI (pH at isoelectric point)", 
          title = "Plot of Amino Acid Volume agaist pI, two properies \n thought to important to protein folding",
          ylim = c(0, 180),
          xlim = c(1,12))

PART 4: Data centering and scaling

For the exam you need to be able to understand what data centering and scaling is, how do do them in R with and without specialized functions, how to do them to an individual vector, how to do them to a column in a dataframe, how to do them using a for loop.

Data centering is centering around the mean (mean goes to 0) Data scaling is dividing by the SD

Demonstate centering / scaling on a single vector

# For the exam you need to be able to understand what data centering and scaling is, 
# how do do them in R with and without specialized functions, how to do them to an
#individual vector, how to do them to a column in a dataframe, 
# how to do them using a for loop.


# Subset data 
## get info
dim(habk.df)
## [1] 20 11
ncol(habk.df)
## [1] 11
# easiest 
habk.df.numeric <- habk.df[ , c(-1,-2,-3)]

# equivalent
habk.df.numeric <- habk.df[ , c(4,5,6,7,8,9,10,11)]
habk.df.numeric <- habk.df[ , c(4:11)]

Set up data:

# Select a single column to explore how to do centering / scaling
X <- habk.df.numeric$Vol

This is calculating the mean

sum(X)/length(X)
## [1] 109.2

This is calculating the standard deviation

X <- habk.df.numeric$Vol
sqrt(sum((X-mean(X))^2)/length(X))
## [1] 28.31007
# This is centering the data
X <- habk.df.numeric$Vol
u <- mean(X)
u-X
##  [1]  42.2  23.2  18.2   0.2 -25.8  61.2  -8.8 -14.8 -25.8 -14.8 -14.8  13.2
## [13]  19.2  -4.8 -38.8  36.2  16.2   4.2 -53.8 -31.8
# This is scaling the data without centering
X <- habk.df.numeric$Vol
s <- sd(X)
X/s
##  [1] 2.306724 2.960870 3.133014 3.752730 4.647877 1.652579 4.062589 4.269161
##  [9] 4.647877 4.269161 4.269161 3.305157 3.098585 3.924874 5.095451 2.513297
## [17] 3.201871 3.615016 5.611881 4.854450
# This is scaling the data with centering 
X <- habk.df.numeric$Vol
u <- mean(X)
s <- sd(X)
z <- (u-X)/s
# This is creating histograms with different variations of scaling and centering
par(mfrow = c(2,2), mar = c(3,1,3,1))

hist(X)
abline(v = mean(X), col = 2)

hist(X/s)
abline(v = mean(X/s), col = 3)

hist(u-X)
abline(v = 0, col = 4)

hist(z)
abline(v = 0, col = 5)

par(mfrow = c(1,1), mar = c(4,4,4,4))
# This uses the scale function to scale and center  
# Each of these lines tests different variations of scaling and centering
scale(X, center = T, scale = F)
##        [,1]
##  [1,] -42.2
##  [2,] -23.2
##  [3,] -18.2
##  [4,]  -0.2
##  [5,]  25.8
##  [6,] -61.2
##  [7,]   8.8
##  [8,]  14.8
##  [9,]  25.8
## [10,]  14.8
## [11,]  14.8
## [12,] -13.2
## [13,] -19.2
## [14,]   4.8
## [15,]  38.8
## [16,] -36.2
## [17,] -16.2
## [18,]  -4.2
## [19,]  53.8
## [20,]  31.8
## attr(,"scaled:center")
## [1] 109.2
scale(X, center = F, scale = T)
##            [,1]
##  [1,] 0.5788805
##  [2,] 0.7430407
##  [3,] 0.7862407
##  [4,] 0.9417609
##  [5,] 1.1664011
##  [6,] 0.4147204
##  [7,] 1.0195209
##  [8,] 1.0713610
##  [9,] 1.1664011
## [10,] 1.0713610
## [11,] 1.0713610
## [12,] 0.8294408
## [13,] 0.7776007
## [14,] 0.9849609
## [15,] 1.2787212
## [16,] 0.6307206
## [17,] 0.8035207
## [18,] 0.9072008
## [19,] 1.4083213
## [20,] 1.2182411
## attr(,"scaled:scale")
## [1] 115.7406
scale(X, center = T, scale = T)
##               [,1]
##  [1,] -1.452891984
##  [2,] -0.798746304
##  [3,] -0.626602704
##  [4,] -0.006885744
##  [5,]  0.888260976
##  [6,] -2.107037664
##  [7,]  0.302972736
##  [8,]  0.509545056
##  [9,]  0.888260976
## [10,]  0.509545056
## [11,]  0.509545056
## [12,] -0.454459104
## [13,] -0.661031424
## [14,]  0.165257856
## [15,]  1.335834336
## [16,] -1.246319664
## [17,] -0.557745264
## [18,] -0.144600624
## [19,]  1.852265136
## [20,]  1.094833296
## attr(,"scaled:center")
## [1] 109.2
## attr(,"scaled:scale")
## [1] 29.04552

Scale an entire matrix

This is calculating the mean

sum(habk.df.numeric$Vol)/length(habk.df.numeric$Vol)
## [1] 109.2

Center a single column

# make a copy of the data
habk.df.centered <- habk.df.numeric


# check original mean
mean(habk.df.numeric$Vol)
## [1] 109.2
habk.df.centered$Vol <- habk.df.centered$Vol - mean(habk.df.centered$Vol)
mean(habk.df.centered$Vol)
## [1] -2.842518e-15
# Bulk
habk.df.centered$Bulk <- habk.df.centered$Bulk - mean(habk.df.centered$Bulk)
mean(habk.df.numeric$Bulk)
## [1] 15.3405

TODO: Center another column using the entire dataframe

# make a copy of the data
habk.df.centered <- habk.df.numeric


# check original mean
mean(habk.df.numeric$Polarity)
## [1] 13.594
habk.df.centered$Polarity <- habk.df.centered$Polarity - mean(habk.df.centered$Polarity)
mean(habk.df.centered$Polarity)
## [1] 3.56052e-16
# pI
habk.df.centered$pI <- habk.df.centered$pI - mean(habk.df.centered$pI)
mean(habk.df.numeric$pI)
## [1] 6.0525

xxxx

# This for loop creates a summary of all of the data

for(i in 1:ncol(habk.df.centered)){
  mean.i <- mean(habk.df.centered[,i])
  habk.df.centered[,i] <- mean.i-habk.df.centered[,i]
}

summary(habk.df.centered)
##       Vol              Bulk            Polarity             pI         
##  Min.   :-53.80   Min.   :-6.3295   Min.   :-38.406   Min.   :-4.7075  
##  1st Qu.:-17.55   1st Qu.:-3.1320   1st Qu.: -1.429   1st Qu.:-0.0025  
##  Median : -2.30   Median : 0.2605   Median : 11.959   Median : 0.1275  
##  Mean   :  0.00   Mean   : 0.0000   Mean   :  0.000   Mean   : 0.0000  
##  3rd Qu.: 18.45   3rd Qu.: 2.1755   3rd Qu.: 13.299   3rd Qu.: 0.4450  
##  Max.   : 61.20   Max.   :11.9405   Max.   : 13.594   Max.   : 3.2825  
##      Hyd.1           Hyd.2         Surface_area      Fract_area     
##  Min.   :-4.99   Min.   :-5.070   Min.   :-83.60   Min.   :-0.1735  
##  1st Qu.:-2.54   1st Qu.:-3.520   1st Qu.:-30.35   1st Qu.:-0.1135  
##  Median : 0.36   Median :-2.170   Median : -5.60   Median : 0.0065  
##  Mean   : 0.00   Mean   : 0.000   Mean   :  0.00   Mean   : 0.0000  
##  3rd Qu.: 3.01   3rd Qu.: 2.905   3rd Qu.: 30.15   3rd Qu.: 0.0990  
##  Max.   : 4.01   Max.   :10.930   Max.   : 90.40   Max.   : 0.2165

Rewrite the for() loop above so that the variable mean.i doesn’t have to be created and only a single line of code is executed within the for() loop.

#for(i in 1:ncol(habk.df.centered)){
#  habk.df.centered[,i] <- (mean(habk.df.centered[,i])-habk.df.centered[,i]
#}

#summary(habk.df.centered)

Check …

# This chunk is just checking to make sure that thd SD is the same between the centered data and the non-centered data
sd(habk.df.centered$Vol) 
## [1] 29.04552
sd(habk.df.numeric$Vol)
## [1] 29.04552
sd(habk.df.centered$Vol) == sd(habk.df.numeric$Vol)
## [1] TRUE

TODO: Write logical checks for the standard deviation (sd) of another variable

# logical check

sd(habk.df.centered$Polarity) 
## [1] 21.91408
sd(habk.df.numeric$Polarity)
## [1] 21.91408
sd(habk.df.centered$Polarity) == sd(habk.df.numeric$Polarity)
## [1] TRUE
# This chunk is creating a new dataframe with the centered data and then obtaining its standard deviation

# create new dataframe
habk.df.standard <- habk.df.centered

habk.df.standard$Vol <-  habk.df.standard$Vol/sd(habk.df.standard$Vol)

sd(habk.df.standard$Vol)
## [1] 1

TODO: modify another column a done in the previous chunk

# Standardize another column

habk.df.standard <- habk.df.centered

habk.df.standard$Polarity <-  habk.df.standard$Polarity/sd(habk.df.standard$Polarity)

sd(habk.df.standard$Polarity)
## [1] 1
# This chunk creates a summary of the standard data

for(i in 1:ncol(habk.df.standard)){
  sd.i <- sd(habk.df.standard[,i])
  habk.df.standard[,i] <- habk.df.standard[,i]/sd.i
}

summary(habk.df.standard)
##       Vol                Bulk             Polarity              pI           
##  Min.   :-1.85227   Min.   :-1.36122   Min.   :-1.75257   Min.   :-2.664603  
##  1st Qu.:-0.60422   1st Qu.:-0.67357   1st Qu.:-0.06519   1st Qu.:-0.001415  
##  Median :-0.07919   Median : 0.05602   Median : 0.54572   Median : 0.072169  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.63521   3rd Qu.: 0.46786   3rd Qu.: 0.60687   3rd Qu.: 0.251885  
##  Max.   : 2.10704   Max.   : 2.56792   Max.   : 0.62033   Max.   : 1.858005  
##      Hyd.1             Hyd.2          Surface_area       Fract_area      
##  Min.   :-1.6706   Min.   :-1.0362   Min.   :-1.8709   Min.   :-1.50457  
##  1st Qu.:-0.8504   1st Qu.:-0.7194   1st Qu.:-0.6792   1st Qu.:-0.98426  
##  Median : 0.1205   Median :-0.4435   Median :-0.1253   Median : 0.05637  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 1.0077   3rd Qu.: 0.5937   3rd Qu.: 0.6747   3rd Qu.: 0.85852  
##  Max.   : 1.3425   Max.   : 2.2338   Max.   : 2.0230   Max.   : 1.87746

Shorten code in the for() loop so that the object sd.i doesn’t need to be created

# for() loop without sd.i line

for(i in 1:ncol(habk.df.standard)){
  habk.df.standard[,i] <- habk.df.standard[,i]/sd(habk.df.standard[,i])
}

summary(habk.df.standard)
##       Vol                Bulk             Polarity              pI           
##  Min.   :-1.85227   Min.   :-1.36122   Min.   :-1.75257   Min.   :-2.664603  
##  1st Qu.:-0.60422   1st Qu.:-0.67357   1st Qu.:-0.06519   1st Qu.:-0.001415  
##  Median :-0.07919   Median : 0.05602   Median : 0.54572   Median : 0.072169  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.63521   3rd Qu.: 0.46786   3rd Qu.: 0.60687   3rd Qu.: 0.251885  
##  Max.   : 2.10704   Max.   : 2.56792   Max.   : 0.62033   Max.   : 1.858005  
##      Hyd.1             Hyd.2          Surface_area       Fract_area      
##  Min.   :-1.6706   Min.   :-1.0362   Min.   :-1.8709   Min.   :-1.50457  
##  1st Qu.:-0.8504   1st Qu.:-0.7194   1st Qu.:-0.6792   1st Qu.:-0.98426  
##  Median : 0.1205   Median :-0.4435   Median :-0.1253   Median : 0.05637  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 1.0077   3rd Qu.: 0.5937   3rd Qu.: 0.6747   3rd Qu.: 0.85852  
##  Max.   : 1.3425   Max.   : 2.2338   Max.   : 2.0230   Max.   : 1.87746

Write a for() loop that does BOTH tasks in a single loop.

# center
# scale by sd

#for(i in 1:ncol(habk.df.centered)){
#  habk.df.centered[,i] <- (mean(habk.df.centered[,i])-habk.df.centered[,i]
#  habk.df.standard[,i] <- habk.df.standard[,i]/sd(habk.df.standard[,i])        
#}

#summary(habk.df.centered)
#summary(habk.df.standard)

Write a for() loop that….

# This chunk scales numeric data using a for loop

habk.df.standard <- habk.df.numeric
for(i in 1:ncol(habk.df.standard)){
  habk.df.standard[,i] <- scale(habk.df.standard[,i])
}

summary(habk.df.standard)
##         Vol.V1              Bulk.V1            Polarity.V1     
##  Min.   :-2.1070377   Min.   :-2.5679182   Min.   :-0.6203318  
##  1st Qu.:-0.6352099   1st Qu.:-0.4678620   1st Qu.:-0.6068701  
##  Median : 0.0791861   Median :-0.0560230   Median :-0.5457222  
##  Mean   : 0.0000000   Mean   : 0.0000000   Mean   : 0.0000000  
##  3rd Qu.: 0.6042240   3rd Qu.: 0.6735664   3rd Qu.: 0.0651864  
##  Max.   : 1.8522651   Max.   : 1.3612193   Max.   : 1.7525720  
##         pI.V1               Hyd.1.V1             Hyd.2.V1      
##  Min.   :-1.8580053   Min.   :-1.3424968   Min.   :-2.2338170  
##  1st Qu.:-0.2518850   1st Qu.:-1.0077096   1st Qu.:-0.5937089  
##  Median :-0.0721693   Median :-0.1205234   Median : 0.4434934  
##  Mean   : 0.0000000   Mean   : 0.0000000   Mean   : 0.0000000  
##  3rd Qu.: 0.0014151   3rd Qu.: 0.8503596   3rd Qu.: 0.7193994  
##  Max.   : 2.6646032   Max.   : 1.6705883   Max.   : 1.0361804  
##    Surface_area.V1       Fract_area.V1    
##  Min.   :-2.0230352   Min.   :-1.8774603  
##  1st Qu.:-0.6747180   1st Qu.:-0.8585153  
##  Median : 0.1253208   Median :-0.0563672  
##  Mean   : 0.0000000   Mean   : 0.0000000  
##  3rd Qu.: 0.6791938   3rd Qu.: 0.9842575  
##  Max.   : 1.8708600   Max.   : 1.5045698

CHALLENGE (will NOT be on exam). How do these lines of code work:

# This chunk applies the mean function to the numeric data and scales the standardized data

X.bar <- apply(habk.df.numeric, 2, mean)
summary(habk.df.numeric)
##       Vol              Bulk          Polarity            pI        
##  Min.   : 48.00   Min.   : 3.40   Min.   : 0.000   Min.   : 2.770  
##  1st Qu.: 90.75   1st Qu.:13.16   1st Qu.: 0.295   1st Qu.: 5.607  
##  Median :111.50   Median :15.08   Median : 1.635   Median : 5.925  
##  Mean   :109.20   Mean   :15.34   Mean   :13.594   Mean   : 6.053  
##  3rd Qu.:126.75   3rd Qu.:18.47   3rd Qu.:15.023   3rd Qu.: 6.055  
##  Max.   :163.00   Max.   :21.67   Max.   :52.000   Max.   :10.760  
##      Hyd.1           Hyd.2          Surface_area     Fract_area    
##  Min.   :-4.50   Min.   :-12.300   Min.   : 85.0   Min.   :0.5200  
##  1st Qu.:-3.50   1st Qu.: -4.275   1st Qu.:145.2   1st Qu.:0.6375  
##  Median :-0.85   Median :  0.800   Median :181.0   Median :0.7300  
##  Mean   :-0.49   Mean   : -1.370   Mean   :175.4   Mean   :0.7365  
##  3rd Qu.: 2.05   3rd Qu.:  2.150   3rd Qu.:205.8   3rd Qu.:0.8500  
##  Max.   : 4.50   Max.   :  3.700   Max.   :259.0   Max.   :0.9100
habk.df.standard <- apply(habk.df.numeric, 2, scale)
habk.df.standard <- data.frame(habk.df.standard)
summary(habk.df.standard)
##       Vol                Bulk             Polarity              pI           
##  Min.   :-2.10704   Min.   :-2.56792   Min.   :-0.62033   Min.   :-1.858005  
##  1st Qu.:-0.63521   1st Qu.:-0.46786   1st Qu.:-0.60687   1st Qu.:-0.251885  
##  Median : 0.07919   Median :-0.05602   Median :-0.54572   Median :-0.072169  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.60422   3rd Qu.: 0.67357   3rd Qu.: 0.06519   3rd Qu.: 0.001415  
##  Max.   : 1.85227   Max.   : 1.36122   Max.   : 1.75257   Max.   : 2.664603  
##      Hyd.1             Hyd.2          Surface_area       Fract_area      
##  Min.   :-1.3425   Min.   :-2.2338   Min.   :-2.0230   Min.   :-1.87746  
##  1st Qu.:-1.0077   1st Qu.:-0.5937   1st Qu.:-0.6747   1st Qu.:-0.85852  
##  Median :-0.1205   Median : 0.4435   Median : 0.1253   Median :-0.05637  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.8504   3rd Qu.: 0.7194   3rd Qu.: 0.6792   3rd Qu.: 0.98426  
##  Max.   : 1.6706   Max.   : 1.0362   Max.   : 1.8709   Max.   : 1.50457

PART 5: 3D visualization of centering and scaling

Prepare data:
scatterplot3d does NOT have a data = argument. Make seperate vectors to work with to make code cleaner. ….

# raw vectors
Xraw <- habk.df.numeric$Bulk
Yraw <- habk.df.numeric$Polarity
Zraw <- habk.df.numeric$Surface_area

# center
Xcent <- Xraw - mean(Xraw)
Ycent <- Yraw - mean(Yraw)
Zcent <- Zraw - mean(Zraw)

# scale, without centering
Xscale0 <- Xraw/sd(Xraw)
Yscale0 <- Yraw/sd(Yraw)
Zscale0 <- Zraw/sd(Zraw)

# scale AND center
Xscale <- (Xraw - mean(Xraw))/sd(Xraw)
Yscale <- (Yraw - mean(Yraw))/sd(Yraw)
Zscale <- (Zraw - mean(Zraw))/sd(Zraw)

Create ALL of the scaled vectors created above using the scale() function.

# vectors using R function()

scale(Xraw, center = T, scale = F)
##           [,1]
##  [1,]  -3.8405
##  [2,]  -1.8805
##  [3,]  -3.6605
##  [4,]  -1.7705
##  [5,]   4.4595
##  [6,] -11.9405
##  [7,]  -1.6505
##  [8,]   6.0595
##  [9,]   0.3695
## [10,]   6.0595
## [11,]   0.9095
## [12,]  -3.0605
## [13,]   2.0895
## [14,]  -0.8905
## [15,]  -1.0605
## [16,]  -5.8705
## [17,]   0.4295
## [18,]   6.2295
## [19,]   6.3295
## [20,]   2.6895
## attr(,"scaled:center")
## [1] 15.3405
scale(Yraw, center = T, scale = F)
##          [,1]
##  [1,] -13.594
##  [2,] -12.114
##  [3,]  36.106
##  [4,]  36.306
##  [5,] -13.244
##  [6,] -13.594
##  [7,]  38.006
##  [8,] -13.464
##  [9,]  35.906
## [10,] -13.464
## [11,] -12.164
## [12,] -10.214
## [13,] -12.014
## [14,] -10.064
## [15,]  38.406
## [16,] -11.924
## [17,] -11.934
## [18,] -13.464
## [19,] -11.494
## [20,] -11.984
## attr(,"scaled:center")
## [1] 13.594
scale(Zraw, center = T, scale = F)
##        [,1]
##  [1,] -62.4
##  [2,] -35.4
##  [3,] -24.4
##  [4,]   7.6
##  [5,]  42.6
##  [6,] -90.4
##  [7,]  18.6
##  [8,]   6.6
##  [9,]  35.6
## [10,]   4.6
## [11,]  28.6
## [12,] -17.4
## [13,] -32.4
## [14,]  13.6
## [15,]  65.6
## [16,] -53.4
## [17,] -29.4
## [18,] -15.4
## [19,]  83.6
## [20,]  53.6
## attr(,"scaled:center")
## [1] 175.4
scale(Xraw, center = F, scale = T)
##            [,1]
##  [1,] 0.7007271
##  [2,] 0.8201554
##  [3,] 0.7116950
##  [4,] 0.8268580
##  [5,] 1.2064693
##  [6,] 0.2071715
##  [7,] 0.8341699
##  [8,] 1.3039618
##  [9,] 0.9572542
## [10,] 1.3039618
## [11,] 0.9901579
## [12,] 0.7482547
## [13,] 1.0620586
## [14,] 0.8804789
## [15,] 0.8701203
## [16,] 0.5770336
## [17,] 0.9609101
## [18,] 1.3143203
## [19,] 1.3204136
## [20,] 1.0986183
## attr(,"scaled:scale")
## [1] 16.41152
scale(Yraw, center = F, scale = T)
##             [,1]
##  [1,] 0.00000000
##  [2,] 0.05697579
##  [3,] 1.91330848
##  [4,] 1.92100791
##  [5,] 0.01347400
##  [6,] 0.00000000
##  [7,] 1.98645307
##  [8,] 0.00500463
##  [9,] 1.90560905
## [10,] 0.00500463
## [11,] 0.05505093
## [12,] 0.13012038
## [13,] 0.06082550
## [14,] 0.13589495
## [15,] 2.00185193
## [16,] 0.06429024
## [17,] 0.06390527
## [18,] 0.00500463
## [19,] 0.08084402
## [20,] 0.06198042
## attr(,"scaled:scale")
## [1] 25.97595
scale(Zraw, center = F, scale = T)
##            [,1]
##  [1,] 0.6094221
##  [2,] 0.7550362
##  [3,] 0.8143605
##  [4,] 0.9869402
##  [5,] 1.1756993
##  [6,] 0.4584148
##  [7,] 1.0462645
##  [8,] 0.9815471
##  [9,] 1.1379474
## [10,] 0.9707609
## [11,] 1.1001956
## [12,] 0.8521123
## [13,] 0.7712156
## [14,] 1.0192989
## [15,] 1.2997409
## [16,] 0.6579601
## [17,] 0.7873949
## [18,] 0.8628985
## [19,] 1.3968170
## [20,] 1.2350235
## attr(,"scaled:scale")
## [1] 185.4216
scale(Xraw, center = T, scale = T)
##              [,1]
##  [1,] -0.82593610
##  [2,] -0.40441943
##  [3,] -0.78722538
##  [4,] -0.38076289
##  [5,]  0.95905794
##  [6,] -2.56791824
##  [7,] -0.35495574
##  [8,]  1.30315318
##  [9,]  0.07946449
## [10,]  1.30315318
## [11,]  0.19559664
## [12,] -0.65818967
## [13,]  0.44936687
## [14,] -0.19151051
## [15,] -0.22807062
## [16,] -1.26250693
## [17,]  0.09236807
## [18,]  1.33971330
## [19,]  1.36121925
## [20,]  0.57840259
## attr(,"scaled:center")
## [1] 15.3405
## attr(,"scaled:scale")
## [1] 4.649875
scale(Yraw, center = T, scale = T)
##             [,1]
##  [1,] -0.6203318
##  [2,] -0.5527953
##  [3,]  1.6476166
##  [4,]  1.6567432
##  [5,] -0.6043603
##  [6,] -0.6203318
##  [7,]  1.7343189
##  [8,] -0.6143995
##  [9,]  1.6384901
## [10,] -0.6143995
## [11,] -0.5550770
## [12,] -0.4660931
## [13,] -0.5482320
## [14,] -0.4592481
## [15,]  1.7525720
## [16,] -0.5441251
## [17,] -0.5445814
## [18,] -0.6143995
## [19,] -0.5245030
## [20,] -0.5468631
## attr(,"scaled:center")
## [1] 13.594
## attr(,"scaled:scale")
## [1] 21.91408
scale(Zraw, center = T, scale = T)
##             [,1]
##  [1,] -1.3964314
##  [2,] -0.7922063
##  [3,] -0.5460405
##  [4,]  0.1700782
##  [5,]  0.9533330
##  [6,] -2.0230352
##  [7,]  0.4162440
##  [8,]  0.1476995
##  [9,]  0.7966820
## [10,]  0.1029421
## [11,]  0.6400310
## [12,] -0.3893895
## [13,] -0.7250701
## [14,]  0.3043504
## [15,]  1.4680432
## [16,] -1.1950230
## [17,] -0.6579340
## [18,] -0.3446321
## [19,]  1.8708600
## [20,]  1.1994987
## attr(,"scaled:center")
## [1] 175.4
## attr(,"scaled:scale")
## [1] 44.68533

3-D scatter plot of raw data

# This chunk creates a 3D plot of the raw data

#  main plot with scatterplot3d() - know how this works
plot1 <- scatterplot3d(x = Xraw, 
                       y = Yraw, 
                       z = Zraw,
              color = 2, 
              pch = 16, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275),
              main = "Raw data")


# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
      p2 <- plot1$xyz.convert(0,-40,0)
      p3 <- plot1$xyz.convert(0,60,0)
      segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
      p2 <- plot1$xyz.convert(-20,0,0)
      p3 <- plot1$xyz.convert(30,0,0)
      segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

3-D scatter plot of centered data

# This chunk creates a 3D plot of the centered data

#  main plot - know how this works
plot2 <- scatterplot3d(x = Xcent, y = Ycent, z = Zcent,
              color = 2, 
              pch = 16, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275), main = "Centered")

# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
p2 <- plot2$xyz.convert(0,-40,0)
p3 <- plot2$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot2$xyz.convert(-20,0,0)
p3 <- plot2$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

3-D scatter plot of scaled (NOT centered) data

# This chunk creates a 3D plot of the scaled (NOT centered) data

#  main plot - know how this works
plot3 <- scatterplot3d(x = Xscale0, y = Yscale0, z = Zscale0,
              color = 3, 
              pch = 12, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275), 
              main = "Scaled (NOT centered)")

p2 <- plot3$xyz.convert(0,-40,0)
p3 <- plot3$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot3$xyz.convert(-20,0,0)
p3 <- plot3$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

3-D scatter plot of scaled and centered data

#  main plot - know how this works
plot4 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
              color = 3, 
              pch = 16, type = "h",
              xlim = c(-25,25),
              ylim = c(-50,50),
              zlim = c(-100,275),
              main = "Centered & Scaled")

p2 <- plot4$xyz.convert(0,-40,0)
p3 <- plot4$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot4$xyz.convert(-20,0,0)
p3 <- plot4$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

3-D scatter plot of scaled and centered data

#  main plot - know how this works
plot5 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
              color = 3, 
              pch = 16, type = "h",
              #xlim = c(-25,25),
              #ylim = c(-50,50),
              #zlim = c(-100,275),
              main = "Centered & Scaled")


p2 <- plot5$xyz.convert(0,-40,0)
p3 <- plot5$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot5$xyz.convert(-20,0,0)
p3 <- plot5$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

# This chunk displays four of the plots at once
# The 3-D scatter plot of the scaled (NOT centered) data is not shown here
par(mfrow = c(2,2), mar = c(1,1,1,1))
plot1 <- scatterplot3d(x = Xraw, 
                       y = Yraw, 
                       z = Zraw,
              color = 2, 
              pch = 16, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275),
              main = "")


# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
      p2 <- plot1$xyz.convert(0,-40,0)
      p3 <- plot1$xyz.convert(0,60,0)
      segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
      p2 <- plot1$xyz.convert(-20,0,0)
      p3 <- plot1$xyz.convert(30,0,0)
      segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
      
      
#  main plot - know how this works
plot2 <- scatterplot3d(x = Xcent, y = Ycent, z = Zcent,
              color = 2, 
              pch = 16, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275), main = "")

# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
p2 <- plot2$xyz.convert(0,-40,0)
p3 <- plot2$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot2$xyz.convert(-20,0,0)
p3 <- plot2$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)


plot4 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
              color = 3, 
              pch = 16, type = "h",
              xlim = c(-25,25),
              ylim = c(-50,50),
              zlim = c(-100,275),
              main = "")

p2 <- plot4$xyz.convert(0,-40,0)
p3 <- plot4$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot4$xyz.convert(-20,0,0)
p3 <- plot4$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)


#  main plot - know how this works
plot5 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
              color = 3, 
              pch = 16, type = "h",
              #xlim = c(-25,25),
              #ylim = c(-50,50),
              #zlim = c(-100,275),
              main = "")


p2 <- plot5$xyz.convert(0,-40,0)
p3 <- plot5$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot5$xyz.convert(-20,0,0)
p3 <- plot5$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)