Setup

Install any needed packages.

#install.packages("lsa")
#install.packages("mdendro")
#install.packages("dendextend")

Load all packages

# packages we've used a lot
library(ggplot2)
library(ggpubr)

# packages we've only recently started using
library(scatterplot3d)
library(pander) 
library(vegan)

## Loading required package: permute

## Loading required package: lattice

## This is vegan 2.5-7

# NEW packages- make sure you installed them!
library(dendextend)

## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan

## 
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:permute':
## 
##     shuffle

## The following object is masked from 'package:ggpubr':
## 
##     rotate

## The following object is masked from 'package:stats':
## 
##     cutree

library(lsa)

## Loading required package: SnowballC

PART 1: Data preparation

NOTE: This vectors may be useful for exploring the functionality during lecture of R functions but are not used in the analysis itself or added to the dataframe.

# data exploration vectors
MW.da     <- c(89, 121, 133, 146, 165, 75, 155, 131, 146, 131,
               149, 132, 115, 147, 174, 105, 119, 117, 204, 181)
polar.req    <-c(7, 4.8, 13, 12.5, 5, 7.9, 8.4, 4.9, 10.1, 4.9,
                 5.3, 10, 6.6, 8.6,9.1,7.5,6.6,5.6,5.2,5.4)
freq        <-c(7.8, 1.1, 5.19 ,6.72, 4.39, 6.77, 2.03, 6.95,
                6.32, 10.15, 2.28, 4.37, 4.26, 3.45, 5.23,
                6.46, 5.12, 7.01, 1.09, 3.3)

The following vectors are the columns that make up Table 2.2 of Higgs and Attwood 2005, Chapter 2.

# main data vectors
aa.1        <-c('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L','M','N',
                'P','Q','R','S','T','V','W','Y')
vol    <- c(67, 86, 91, 109, 135, 48, 118, 124, 135, 124, 124, 96, 90, 114,
            148, 73, 93, 105, 163, 141)
bulk <-c(11.5, 13.46, 11.68, 13.57, 19.8, 3.4, 13.69, 21.4, 15.71, 21.4, 16.25,
         12.28,17.43, 14.45, 14.28, 9.47, 15.77, 21.57, 21.67, 18.03)
pol  <-c(0,1.48,49.7,49.9,0.35,0,51.6,0.13,49.5,0.13,
              1.43,3.38,1.58,3.53,52,1.67,1.66,0.13,2.1,1.61)
isoelec <-c(6,5.07,2.77,3.22,5.48,5.97,7.59,6.02,9.74,5.98, 5.74, 5.41, 6.3, 
            5.65, 10.76, 5.68, 6.16, 5.96, 5.89, 5.66)
H2Opho.34 <-c(1.8,2.5,-3.5,-3.5,2.8,-0.4,-3.2,4.5, -3.9, 3.8, 1.9, -3.5, -1.6, 
              -3.5, -4.5, -0.8, -0.7, 4.2, -0.9, -1.3)
H2Opho.35 <-c(1.6,2,-9.2,-8.2,3.7,1,-3,3.1,-8.8,2.8,3.4,-4.8, -0.2,-4.1,-12.3,
              0.6,1.2,2.6,1.9,-0.7)
saaH2O       <-c(113, 140,151,183,218,85,194,182,211,180, 204, 158, 143, 189, 
                 241, 122, 146,160,259,229)
faal.fold    <-c(0.74,0.91,0.62,0.62,0.88,0.72,0.78,0.88,0.52, 0.85, 0.85, 0.63,
                 0.64,0.62,0.64,0.66,0.7, 0.86, 0.85, 0.76)

TODO: Create a vector of the full names and the 3-letter codes of the amino acids, then add it to the data.frame.

# build dataframe

## create a vector with the FULL names of all the amino acids
aa.full <- c("alanine", "cysteine", "aspartic acid", "glutamic acid", "phenylalanine", "glycine", "histidine", "isoleucine", "lysine", "leucine", "methionine", "asparagine", "proline", "glutamine", "arginine", "serine", "threonine", "valine", "tryptophan", "tyrosine")

## create a vector with the 3-letter codes for amino acids
aa.3 <- c("ala", "cys", "asp", "glu", "phe", "gly", "his", "ile", "lys", "leu", "met", "asn", "pro", "gln", "asn", "ser", "thr", "val", "trp", "tyr")

## create dataframe
habk.df <- data.frame(aa.full = aa.full,
                      aa.3 = aa.3,
                      aa.1 = aa.1,
                      Vol = vol, Bulk = bulk, Polarity = pol,
                      pI = isoelec, Hyd.1 = H2Opho.34,
                      Hyd.2 = H2Opho.35, Surface_area = saaH2O,
                      Fract_area = faal.fold)

# name rows - this will help later
# when we do PCA
row.names(habk.df) <- aa.1

# display with pander
pander(habk.df)

Table continues below
	aa.full	aa.3	aa.1	Vol	Bulk	Polarity	pI	Hyd.1
A	alanine	ala	A	67	11.5	0	6	1.8
C	cysteine	cys	C	86	13.46	1.48	5.07	2.5
D	aspartic acid	asp	D	91	11.68	49.7	2.77	-3.5
E	glutamic acid	glu	E	109	13.57	49.9	3.22	-3.5
F	phenylalanine	phe	F	135	19.8	0.35	5.48	2.8
G	glycine	gly	G	48	3.4	0	5.97	-0.4
H	histidine	his	H	118	13.69	51.6	7.59	-3.2
I	isoleucine	ile	I	124	21.4	0.13	6.02	4.5
K	lysine	lys	K	135	15.71	49.5	9.74	-3.9
L	leucine	leu	L	124	21.4	0.13	5.98	3.8
M	methionine	met	M	124	16.25	1.43	5.74	1.9
N	asparagine	asn	N	96	12.28	3.38	5.41	-3.5
P	proline	pro	P	90	17.43	1.58	6.3	-1.6
Q	glutamine	gln	Q	114	14.45	3.53	5.65	-3.5
R	arginine	asn	R	148	14.28	52	10.76	-4.5
S	serine	ser	S	73	9.47	1.67	5.68	-0.8
T	threonine	thr	T	93	15.77	1.66	6.16	-0.7
V	valine	val	V	105	21.57	0.13	5.96	4.2
W	tryptophan	trp	W	163	21.67	2.1	5.89	-0.9
Y	tyrosine	tyr	Y	141	18.03	1.61	5.66	-1.3

	Hyd.2	Surface_area	Fract_area
A	1.6	113	0.74
C	2	140	0.91
D	-9.2	151	0.62
E	-8.2	183	0.62
F	3.7	218	0.88
G	1	85	0.72
H	-3	194	0.78
I	3.1	182	0.88
K	-8.8	211	0.52
L	2.8	180	0.85
M	3.4	204	0.85
N	-4.8	158	0.63
P	-0.2	143	0.64
Q	-4.1	189	0.62
R	-12.3	241	0.64
S	0.6	122	0.66
T	1.2	146	0.7
V	2.6	160	0.86
W	1.9	259	0.85
Y	-0.7	229	0.76

PART 2: DATA DICTIONARY

A data dictionary is a guide to what the code means, what the columns are, what the variables are, etcetera. Metadata is data about data. A good column name is one that is short and concise, while a bad column name is one that is long.

https://library.ucmerced.edu/data-dictionaries

# (For exam you need to be able to...
# describe why a data dictionary is needed, 
# what makes good column names, 
# what makes bad column names, etc.)


# data dictionary columns
column.names <- c("Vol", "Bulk", "Polarity","pI", "Hyd.1", "Hyd.2", 
                  "Surface_area","Fract_area")

name.full <- c("Volume", "Bulkiness", "Polarity", 
               "pH at Isoelectric point",  "Hydrophobicity scale-1",
               "Hydrophobicity scale-2","SA accessible to H2O","Fractional area")
vector.names <- c("vol", "bulk", "pol","isoelec", "H2Opho.34",
                  "H2Opho.35", "saaH2O","faal.fold")
reference <- c("Creighton 1993", "Zimmerman et al. 1968", "Zimmerman et al. 1968", 
               "Zimmerman et al. 1968", "Kyte & Doolittle 1982", "Engelman et al. 1986",
               "Miller et al. 1987"," Rose et al. 1985")

Make a dataframe to hold the DATA DICTIONARY information

# TODO: Make a dataframe to hold DATA DICTIONARY information
# make dataframe
data.dictionary <- data.frame( ColumnNames = column.names,
                              FullName = name.full,
                              VectorNames = vector.names,
                              References  = reference)

# Display data dictionary
pander(data.dictionary)

ColumnNames	FullName	VectorNames	References
Vol	Volume	vol	Creighton 1993
Bulk	Bulkiness	bulk	Zimmerman et al. 1968
Polarity	Polarity	pol	Zimmerman et al. 1968
pI	pH at Isoelectric point	isoelec	Zimmerman et al. 1968
Hyd.1	Hydrophobicity scale-1	H2Opho.34	Kyte & Doolittle 1982
Hyd.2	Hydrophobicity scale-2	H2Opho.35	Engelman et al. 1986
Surface_area	SA accessible to H2O	saaH2O	Miller et al. 1987
Fract_area	Fractional area	faal.fold	Rose et al. 1985

PART 3: Remake Higgs and Attwood Figure 2.8: 2-dimensional Data visualization

TODO: Using ggpubr, remake AND update/augment Figure 2.8 of Higgs and Attwood 2005, Chapter 2, page 26.

Plot the appropriate variables from the original figure against each other, on the appropriate axes
Add the caption from the original paper to the “title =” section of ggscatter
Include newline character in the title to split it accross two lines
Add a 3rd variable via color-coding using color = “…”
Add a 4th variable via size-coding using size = “…”
Label the axes with xlab and ylab
Set ylim to c(0, 180) and xlim to c(1,12)

# Make scatterplot with appropriate ggpubr function
ggscatter(x = "pI",        
          y = "Vol",       
          data = habk.df,  
          color = "Polarity", 
          size = "Hyd.1",     
          label = "aa.1",     
          ylab = "Volume (Angstorms)",
          xlab = "pI (pH at isoelectric point)", 
          title = "Plot of Amino Acid Volume agaist pI, two properies \n thought to important to protein folding",
          ylim = c(0, 180),
          xlim = c(1,12))

PART 4: Data centering and scaling

For the exam you need to be able to understand what data centering and scaling is, how do do them in R with and without specialized functions, how to do them to an individual vector, how to do them to a column in a dataframe, how to do them using a for loop.

Data centering is centering around the mean (mean goes to 0) Data scaling is dividing by the SD

Demonstate centering / scaling on a single vector

# For the exam you need to be able to understand what data centering and scaling is, 
# how do do them in R with and without specialized functions, how to do them to an
#individual vector, how to do them to a column in a dataframe, 
# how to do them using a for loop.


# Subset data 
## get info
dim(habk.df)

## [1] 20 11

ncol(habk.df)

## [1] 11

# easiest 
habk.df.numeric <- habk.df[ , c(-1,-2,-3)]

# equivalent
habk.df.numeric <- habk.df[ , c(4,5,6,7,8,9,10,11)]
habk.df.numeric <- habk.df[ , c(4:11)]

Set up data:

# Select a single column to explore how to do centering / scaling
X <- habk.df.numeric$Vol

This is calculating the mean

sum(X)/length(X)

## [1] 109.2

This is calculating the standard deviation

X <- habk.df.numeric$Vol
sqrt(sum((X-mean(X))^2)/length(X))

## [1] 28.31007

# This is centering the data
X <- habk.df.numeric$Vol
u <- mean(X)
u-X

##  [1]  42.2  23.2  18.2   0.2 -25.8  61.2  -8.8 -14.8 -25.8 -14.8 -14.8  13.2
## [13]  19.2  -4.8 -38.8  36.2  16.2   4.2 -53.8 -31.8

# This is scaling the data without centering
X <- habk.df.numeric$Vol
s <- sd(X)
X/s

##  [1] 2.306724 2.960870 3.133014 3.752730 4.647877 1.652579 4.062589 4.269161
##  [9] 4.647877 4.269161 4.269161 3.305157 3.098585 3.924874 5.095451 2.513297
## [17] 3.201871 3.615016 5.611881 4.854450

# This is scaling the data with centering 
X <- habk.df.numeric$Vol
u <- mean(X)
s <- sd(X)
z <- (u-X)/s

# This is creating histograms with different variations of scaling and centering
par(mfrow = c(2,2), mar = c(3,1,3,1))

hist(X)
abline(v = mean(X), col = 2)

hist(X/s)
abline(v = mean(X/s), col = 3)

hist(u-X)
abline(v = 0, col = 4)

hist(z)
abline(v = 0, col = 5)

par(mfrow = c(1,1), mar = c(4,4,4,4))

# This uses the scale function to scale and center  
# Each of these lines tests different variations of scaling and centering
scale(X, center = T, scale = F)

##        [,1]
##  [1,] -42.2
##  [2,] -23.2
##  [3,] -18.2
##  [4,]  -0.2
##  [5,]  25.8
##  [6,] -61.2
##  [7,]   8.8
##  [8,]  14.8
##  [9,]  25.8
## [10,]  14.8
## [11,]  14.8
## [12,] -13.2
## [13,] -19.2
## [14,]   4.8
## [15,]  38.8
## [16,] -36.2
## [17,] -16.2
## [18,]  -4.2
## [19,]  53.8
## [20,]  31.8
## attr(,"scaled:center")
## [1] 109.2

scale(X, center = F, scale = T)

##            [,1]
##  [1,] 0.5788805
##  [2,] 0.7430407
##  [3,] 0.7862407
##  [4,] 0.9417609
##  [5,] 1.1664011
##  [6,] 0.4147204
##  [7,] 1.0195209
##  [8,] 1.0713610
##  [9,] 1.1664011
## [10,] 1.0713610
## [11,] 1.0713610
## [12,] 0.8294408
## [13,] 0.7776007
## [14,] 0.9849609
## [15,] 1.2787212
## [16,] 0.6307206
## [17,] 0.8035207
## [18,] 0.9072008
## [19,] 1.4083213
## [20,] 1.2182411
## attr(,"scaled:scale")
## [1] 115.7406

scale(X, center = T, scale = T)

##               [,1]
##  [1,] -1.452891984
##  [2,] -0.798746304
##  [3,] -0.626602704
##  [4,] -0.006885744
##  [5,]  0.888260976
##  [6,] -2.107037664
##  [7,]  0.302972736
##  [8,]  0.509545056
##  [9,]  0.888260976
## [10,]  0.509545056
## [11,]  0.509545056
## [12,] -0.454459104
## [13,] -0.661031424
## [14,]  0.165257856
## [15,]  1.335834336
## [16,] -1.246319664
## [17,] -0.557745264
## [18,] -0.144600624
## [19,]  1.852265136
## [20,]  1.094833296
## attr(,"scaled:center")
## [1] 109.2
## attr(,"scaled:scale")
## [1] 29.04552

Scale an entire matrix

This is calculating the mean

sum(habk.df.numeric$Vol)/length(habk.df.numeric$Vol)

## [1] 109.2

Center a single column

# make a copy of the data
habk.df.centered <- habk.df.numeric


# check original mean
mean(habk.df.numeric$Vol)

## [1] 109.2

habk.df.centered$Vol <- habk.df.centered$Vol - mean(habk.df.centered$Vol)
mean(habk.df.centered$Vol)

## [1] -2.842518e-15

# Bulk
habk.df.centered$Bulk <- habk.df.centered$Bulk - mean(habk.df.centered$Bulk)
mean(habk.df.numeric$Bulk)

## [1] 15.3405

TODO: Center another column using the entire dataframe

# make a copy of the data
habk.df.centered <- habk.df.numeric


# check original mean
mean(habk.df.numeric$Polarity)

## [1] 13.594

habk.df.centered$Polarity <- habk.df.centered$Polarity - mean(habk.df.centered$Polarity)
mean(habk.df.centered$Polarity)

## [1] 3.56052e-16

# pI
habk.df.centered$pI <- habk.df.centered$pI - mean(habk.df.centered$pI)
mean(habk.df.numeric$pI)

## [1] 6.0525

xxxx

# This for loop creates a summary of all of the data

for(i in 1:ncol(habk.df.centered)){
  mean.i <- mean(habk.df.centered[,i])
  habk.df.centered[,i] <- mean.i-habk.df.centered[,i]
}

summary(habk.df.centered)

##       Vol              Bulk            Polarity             pI         
##  Min.   :-53.80   Min.   :-6.3295   Min.   :-38.406   Min.   :-4.7075  
##  1st Qu.:-17.55   1st Qu.:-3.1320   1st Qu.: -1.429   1st Qu.:-0.0025  
##  Median : -2.30   Median : 0.2605   Median : 11.959   Median : 0.1275  
##  Mean   :  0.00   Mean   : 0.0000   Mean   :  0.000   Mean   : 0.0000  
##  3rd Qu.: 18.45   3rd Qu.: 2.1755   3rd Qu.: 13.299   3rd Qu.: 0.4450  
##  Max.   : 61.20   Max.   :11.9405   Max.   : 13.594   Max.   : 3.2825  
##      Hyd.1           Hyd.2         Surface_area      Fract_area     
##  Min.   :-4.99   Min.   :-5.070   Min.   :-83.60   Min.   :-0.1735  
##  1st Qu.:-2.54   1st Qu.:-3.520   1st Qu.:-30.35   1st Qu.:-0.1135  
##  Median : 0.36   Median :-2.170   Median : -5.60   Median : 0.0065  
##  Mean   : 0.00   Mean   : 0.000   Mean   :  0.00   Mean   : 0.0000  
##  3rd Qu.: 3.01   3rd Qu.: 2.905   3rd Qu.: 30.15   3rd Qu.: 0.0990  
##  Max.   : 4.01   Max.   :10.930   Max.   : 90.40   Max.   : 0.2165

Rewrite the for() loop above so that the variable mean.i doesn’t have to be created and only a single line of code is executed within the for() loop.

#for(i in 1:ncol(habk.df.centered)){
#  habk.df.centered[,i] <- (mean(habk.df.centered[,i])-habk.df.centered[,i]
#}

#summary(habk.df.centered)

Check …

# This chunk is just checking to make sure that thd SD is the same between the centered data and the non-centered data
sd(habk.df.centered$Vol)

## [1] 29.04552

sd(habk.df.numeric$Vol)

## [1] 29.04552

sd(habk.df.centered$Vol) == sd(habk.df.numeric$Vol)

## [1] TRUE

TODO: Write logical checks for the standard deviation (sd) of another variable

# logical check

sd(habk.df.centered$Polarity)

## [1] 21.91408

sd(habk.df.numeric$Polarity)

## [1] 21.91408

sd(habk.df.centered$Polarity) == sd(habk.df.numeric$Polarity)

## [1] TRUE

# This chunk is creating a new dataframe with the centered data and then obtaining its standard deviation

# create new dataframe
habk.df.standard <- habk.df.centered

habk.df.standard$Vol <-  habk.df.standard$Vol/sd(habk.df.standard$Vol)

sd(habk.df.standard$Vol)

## [1] 1

TODO: modify another column a done in the previous chunk

# Standardize another column

habk.df.standard <- habk.df.centered

habk.df.standard$Polarity <-  habk.df.standard$Polarity/sd(habk.df.standard$Polarity)

sd(habk.df.standard$Polarity)

## [1] 1

# This chunk creates a summary of the standard data

for(i in 1:ncol(habk.df.standard)){
  sd.i <- sd(habk.df.standard[,i])
  habk.df.standard[,i] <- habk.df.standard[,i]/sd.i
}

summary(habk.df.standard)

##       Vol                Bulk             Polarity              pI           
##  Min.   :-1.85227   Min.   :-1.36122   Min.   :-1.75257   Min.   :-2.664603  
##  1st Qu.:-0.60422   1st Qu.:-0.67357   1st Qu.:-0.06519   1st Qu.:-0.001415  
##  Median :-0.07919   Median : 0.05602   Median : 0.54572   Median : 0.072169  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.63521   3rd Qu.: 0.46786   3rd Qu.: 0.60687   3rd Qu.: 0.251885  
##  Max.   : 2.10704   Max.   : 2.56792   Max.   : 0.62033   Max.   : 1.858005  
##      Hyd.1             Hyd.2          Surface_area       Fract_area      
##  Min.   :-1.6706   Min.   :-1.0362   Min.   :-1.8709   Min.   :-1.50457  
##  1st Qu.:-0.8504   1st Qu.:-0.7194   1st Qu.:-0.6792   1st Qu.:-0.98426  
##  Median : 0.1205   Median :-0.4435   Median :-0.1253   Median : 0.05637  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 1.0077   3rd Qu.: 0.5937   3rd Qu.: 0.6747   3rd Qu.: 0.85852  
##  Max.   : 1.3425   Max.   : 2.2338   Max.   : 2.0230   Max.   : 1.87746

Shorten code in the for() loop so that the object sd.i doesn’t need to be created

# for() loop without sd.i line

for(i in 1:ncol(habk.df.standard)){
  habk.df.standard[,i] <- habk.df.standard[,i]/sd(habk.df.standard[,i])
}

summary(habk.df.standard)

##       Vol                Bulk             Polarity              pI           
##  Min.   :-1.85227   Min.   :-1.36122   Min.   :-1.75257   Min.   :-2.664603  
##  1st Qu.:-0.60422   1st Qu.:-0.67357   1st Qu.:-0.06519   1st Qu.:-0.001415  
##  Median :-0.07919   Median : 0.05602   Median : 0.54572   Median : 0.072169  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.63521   3rd Qu.: 0.46786   3rd Qu.: 0.60687   3rd Qu.: 0.251885  
##  Max.   : 2.10704   Max.   : 2.56792   Max.   : 0.62033   Max.   : 1.858005  
##      Hyd.1             Hyd.2          Surface_area       Fract_area      
##  Min.   :-1.6706   Min.   :-1.0362   Min.   :-1.8709   Min.   :-1.50457  
##  1st Qu.:-0.8504   1st Qu.:-0.7194   1st Qu.:-0.6792   1st Qu.:-0.98426  
##  Median : 0.1205   Median :-0.4435   Median :-0.1253   Median : 0.05637  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 1.0077   3rd Qu.: 0.5937   3rd Qu.: 0.6747   3rd Qu.: 0.85852  
##  Max.   : 1.3425   Max.   : 2.2338   Max.   : 2.0230   Max.   : 1.87746

Write a for() loop that does BOTH tasks in a single loop.

# center
# scale by sd

#for(i in 1:ncol(habk.df.centered)){
#  habk.df.centered[,i] <- (mean(habk.df.centered[,i])-habk.df.centered[,i]
#  habk.df.standard[,i] <- habk.df.standard[,i]/sd(habk.df.standard[,i])        
#}

#summary(habk.df.centered)
#summary(habk.df.standard)

Write a for() loop that….

# This chunk scales numeric data using a for loop

habk.df.standard <- habk.df.numeric
for(i in 1:ncol(habk.df.standard)){
  habk.df.standard[,i] <- scale(habk.df.standard[,i])
}

summary(habk.df.standard)

##         Vol.V1              Bulk.V1            Polarity.V1     
##  Min.   :-2.1070377   Min.   :-2.5679182   Min.   :-0.6203318  
##  1st Qu.:-0.6352099   1st Qu.:-0.4678620   1st Qu.:-0.6068701  
##  Median : 0.0791861   Median :-0.0560230   Median :-0.5457222  
##  Mean   : 0.0000000   Mean   : 0.0000000   Mean   : 0.0000000  
##  3rd Qu.: 0.6042240   3rd Qu.: 0.6735664   3rd Qu.: 0.0651864  
##  Max.   : 1.8522651   Max.   : 1.3612193   Max.   : 1.7525720  
##         pI.V1               Hyd.1.V1             Hyd.2.V1      
##  Min.   :-1.8580053   Min.   :-1.3424968   Min.   :-2.2338170  
##  1st Qu.:-0.2518850   1st Qu.:-1.0077096   1st Qu.:-0.5937089  
##  Median :-0.0721693   Median :-0.1205234   Median : 0.4434934  
##  Mean   : 0.0000000   Mean   : 0.0000000   Mean   : 0.0000000  
##  3rd Qu.: 0.0014151   3rd Qu.: 0.8503596   3rd Qu.: 0.7193994  
##  Max.   : 2.6646032   Max.   : 1.6705883   Max.   : 1.0361804  
##    Surface_area.V1       Fract_area.V1    
##  Min.   :-2.0230352   Min.   :-1.8774603  
##  1st Qu.:-0.6747180   1st Qu.:-0.8585153  
##  Median : 0.1253208   Median :-0.0563672  
##  Mean   : 0.0000000   Mean   : 0.0000000  
##  3rd Qu.: 0.6791938   3rd Qu.: 0.9842575  
##  Max.   : 1.8708600   Max.   : 1.5045698

CHALLENGE (will NOT be on exam). How do these lines of code work:

# This chunk applies the mean function to the numeric data and scales the standardized data

X.bar <- apply(habk.df.numeric, 2, mean)
summary(habk.df.numeric)

##       Vol              Bulk          Polarity            pI        
##  Min.   : 48.00   Min.   : 3.40   Min.   : 0.000   Min.   : 2.770  
##  1st Qu.: 90.75   1st Qu.:13.16   1st Qu.: 0.295   1st Qu.: 5.607  
##  Median :111.50   Median :15.08   Median : 1.635   Median : 5.925  
##  Mean   :109.20   Mean   :15.34   Mean   :13.594   Mean   : 6.053  
##  3rd Qu.:126.75   3rd Qu.:18.47   3rd Qu.:15.023   3rd Qu.: 6.055  
##  Max.   :163.00   Max.   :21.67   Max.   :52.000   Max.   :10.760  
##      Hyd.1           Hyd.2          Surface_area     Fract_area    
##  Min.   :-4.50   Min.   :-12.300   Min.   : 85.0   Min.   :0.5200  
##  1st Qu.:-3.50   1st Qu.: -4.275   1st Qu.:145.2   1st Qu.:0.6375  
##  Median :-0.85   Median :  0.800   Median :181.0   Median :0.7300  
##  Mean   :-0.49   Mean   : -1.370   Mean   :175.4   Mean   :0.7365  
##  3rd Qu.: 2.05   3rd Qu.:  2.150   3rd Qu.:205.8   3rd Qu.:0.8500  
##  Max.   : 4.50   Max.   :  3.700   Max.   :259.0   Max.   :0.9100

habk.df.standard <- apply(habk.df.numeric, 2, scale)
habk.df.standard <- data.frame(habk.df.standard)
summary(habk.df.standard)

##       Vol                Bulk             Polarity              pI           
##  Min.   :-2.10704   Min.   :-2.56792   Min.   :-0.62033   Min.   :-1.858005  
##  1st Qu.:-0.63521   1st Qu.:-0.46786   1st Qu.:-0.60687   1st Qu.:-0.251885  
##  Median : 0.07919   Median :-0.05602   Median :-0.54572   Median :-0.072169  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.60422   3rd Qu.: 0.67357   3rd Qu.: 0.06519   3rd Qu.: 0.001415  
##  Max.   : 1.85227   Max.   : 1.36122   Max.   : 1.75257   Max.   : 2.664603  
##      Hyd.1             Hyd.2          Surface_area       Fract_area      
##  Min.   :-1.3425   Min.   :-2.2338   Min.   :-2.0230   Min.   :-1.87746  
##  1st Qu.:-1.0077   1st Qu.:-0.5937   1st Qu.:-0.6747   1st Qu.:-0.85852  
##  Median :-0.1205   Median : 0.4435   Median : 0.1253   Median :-0.05637  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.8504   3rd Qu.: 0.7194   3rd Qu.: 0.6792   3rd Qu.: 0.98426  
##  Max.   : 1.6706   Max.   : 1.0362   Max.   : 1.8709   Max.   : 1.50457

PART 5: 3D visualization of centering and scaling

Prepare data:
scatterplot3d does NOT have a data = argument. Make seperate vectors to work with to make code cleaner. ….

# raw vectors
Xraw <- habk.df.numeric$Bulk
Yraw <- habk.df.numeric$Polarity
Zraw <- habk.df.numeric$Surface_area

# center
Xcent <- Xraw - mean(Xraw)
Ycent <- Yraw - mean(Yraw)
Zcent <- Zraw - mean(Zraw)

# scale, without centering
Xscale0 <- Xraw/sd(Xraw)
Yscale0 <- Yraw/sd(Yraw)
Zscale0 <- Zraw/sd(Zraw)

# scale AND center
Xscale <- (Xraw - mean(Xraw))/sd(Xraw)
Yscale <- (Yraw - mean(Yraw))/sd(Yraw)
Zscale <- (Zraw - mean(Zraw))/sd(Zraw)

Create ALL of the scaled vectors created above using the scale() function.

# vectors using R function()

scale(Xraw, center = T, scale = F)

##           [,1]
##  [1,]  -3.8405
##  [2,]  -1.8805
##  [3,]  -3.6605
##  [4,]  -1.7705
##  [5,]   4.4595
##  [6,] -11.9405
##  [7,]  -1.6505
##  [8,]   6.0595
##  [9,]   0.3695
## [10,]   6.0595
## [11,]   0.9095
## [12,]  -3.0605
## [13,]   2.0895
## [14,]  -0.8905
## [15,]  -1.0605
## [16,]  -5.8705
## [17,]   0.4295
## [18,]   6.2295
## [19,]   6.3295
## [20,]   2.6895
## attr(,"scaled:center")
## [1] 15.3405

scale(Yraw, center = T, scale = F)

##          [,1]
##  [1,] -13.594
##  [2,] -12.114
##  [3,]  36.106
##  [4,]  36.306
##  [5,] -13.244
##  [6,] -13.594
##  [7,]  38.006
##  [8,] -13.464
##  [9,]  35.906
## [10,] -13.464
## [11,] -12.164
## [12,] -10.214
## [13,] -12.014
## [14,] -10.064
## [15,]  38.406
## [16,] -11.924
## [17,] -11.934
## [18,] -13.464
## [19,] -11.494
## [20,] -11.984
## attr(,"scaled:center")
## [1] 13.594

scale(Zraw, center = T, scale = F)

##        [,1]
##  [1,] -62.4
##  [2,] -35.4
##  [3,] -24.4
##  [4,]   7.6
##  [5,]  42.6
##  [6,] -90.4
##  [7,]  18.6
##  [8,]   6.6
##  [9,]  35.6
## [10,]   4.6
## [11,]  28.6
## [12,] -17.4
## [13,] -32.4
## [14,]  13.6
## [15,]  65.6
## [16,] -53.4
## [17,] -29.4
## [18,] -15.4
## [19,]  83.6
## [20,]  53.6
## attr(,"scaled:center")
## [1] 175.4

scale(Xraw, center = F, scale = T)

##            [,1]
##  [1,] 0.7007271
##  [2,] 0.8201554
##  [3,] 0.7116950
##  [4,] 0.8268580
##  [5,] 1.2064693
##  [6,] 0.2071715
##  [7,] 0.8341699
##  [8,] 1.3039618
##  [9,] 0.9572542
## [10,] 1.3039618
## [11,] 0.9901579
## [12,] 0.7482547
## [13,] 1.0620586
## [14,] 0.8804789
## [15,] 0.8701203
## [16,] 0.5770336
## [17,] 0.9609101
## [18,] 1.3143203
## [19,] 1.3204136
## [20,] 1.0986183
## attr(,"scaled:scale")
## [1] 16.41152

scale(Yraw, center = F, scale = T)

##             [,1]
##  [1,] 0.00000000
##  [2,] 0.05697579
##  [3,] 1.91330848
##  [4,] 1.92100791
##  [5,] 0.01347400
##  [6,] 0.00000000
##  [7,] 1.98645307
##  [8,] 0.00500463
##  [9,] 1.90560905
## [10,] 0.00500463
## [11,] 0.05505093
## [12,] 0.13012038
## [13,] 0.06082550
## [14,] 0.13589495
## [15,] 2.00185193
## [16,] 0.06429024
## [17,] 0.06390527
## [18,] 0.00500463
## [19,] 0.08084402
## [20,] 0.06198042
## attr(,"scaled:scale")
## [1] 25.97595

scale(Zraw, center = F, scale = T)

##            [,1]
##  [1,] 0.6094221
##  [2,] 0.7550362
##  [3,] 0.8143605
##  [4,] 0.9869402
##  [5,] 1.1756993
##  [6,] 0.4584148
##  [7,] 1.0462645
##  [8,] 0.9815471
##  [9,] 1.1379474
## [10,] 0.9707609
## [11,] 1.1001956
## [12,] 0.8521123
## [13,] 0.7712156
## [14,] 1.0192989
## [15,] 1.2997409
## [16,] 0.6579601
## [17,] 0.7873949
## [18,] 0.8628985
## [19,] 1.3968170
## [20,] 1.2350235
## attr(,"scaled:scale")
## [1] 185.4216

scale(Xraw, center = T, scale = T)

##              [,1]
##  [1,] -0.82593610
##  [2,] -0.40441943
##  [3,] -0.78722538
##  [4,] -0.38076289
##  [5,]  0.95905794
##  [6,] -2.56791824
##  [7,] -0.35495574
##  [8,]  1.30315318
##  [9,]  0.07946449
## [10,]  1.30315318
## [11,]  0.19559664
## [12,] -0.65818967
## [13,]  0.44936687
## [14,] -0.19151051
## [15,] -0.22807062
## [16,] -1.26250693
## [17,]  0.09236807
## [18,]  1.33971330
## [19,]  1.36121925
## [20,]  0.57840259
## attr(,"scaled:center")
## [1] 15.3405
## attr(,"scaled:scale")
## [1] 4.649875

scale(Yraw, center = T, scale = T)

##             [,1]
##  [1,] -0.6203318
##  [2,] -0.5527953
##  [3,]  1.6476166
##  [4,]  1.6567432
##  [5,] -0.6043603
##  [6,] -0.6203318
##  [7,]  1.7343189
##  [8,] -0.6143995
##  [9,]  1.6384901
## [10,] -0.6143995
## [11,] -0.5550770
## [12,] -0.4660931
## [13,] -0.5482320
## [14,] -0.4592481
## [15,]  1.7525720
## [16,] -0.5441251
## [17,] -0.5445814
## [18,] -0.6143995
## [19,] -0.5245030
## [20,] -0.5468631
## attr(,"scaled:center")
## [1] 13.594
## attr(,"scaled:scale")
## [1] 21.91408

scale(Zraw, center = T, scale = T)

##             [,1]
##  [1,] -1.3964314
##  [2,] -0.7922063
##  [3,] -0.5460405
##  [4,]  0.1700782
##  [5,]  0.9533330
##  [6,] -2.0230352
##  [7,]  0.4162440
##  [8,]  0.1476995
##  [9,]  0.7966820
## [10,]  0.1029421
## [11,]  0.6400310
## [12,] -0.3893895
## [13,] -0.7250701
## [14,]  0.3043504
## [15,]  1.4680432
## [16,] -1.1950230
## [17,] -0.6579340
## [18,] -0.3446321
## [19,]  1.8708600
## [20,]  1.1994987
## attr(,"scaled:center")
## [1] 175.4
## attr(,"scaled:scale")
## [1] 44.68533

3-D scatter plot of raw data

# This chunk creates a 3D plot of the raw data

#  main plot with scatterplot3d() - know how this works
plot1 <- scatterplot3d(x = Xraw, 
                       y = Yraw, 
                       z = Zraw,
              color = 2, 
              pch = 16, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275),
              main = "Raw data")


# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
      p2 <- plot1$xyz.convert(0,-40,0)
      p3 <- plot1$xyz.convert(0,60,0)
      segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
      p2 <- plot1$xyz.convert(-20,0,0)
      p3 <- plot1$xyz.convert(30,0,0)
      segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

3-D scatter plot of centered data

# This chunk creates a 3D plot of the centered data

#  main plot - know how this works
plot2 <- scatterplot3d(x = Xcent, y = Ycent, z = Zcent,
              color = 2, 
              pch = 16, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275), main = "Centered")

# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
p2 <- plot2$xyz.convert(0,-40,0)
p3 <- plot2$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot2$xyz.convert(-20,0,0)
p3 <- plot2$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

3-D scatter plot of scaled (NOT centered) data

# This chunk creates a 3D plot of the scaled (NOT centered) data

#  main plot - know how this works
plot3 <- scatterplot3d(x = Xscale0, y = Yscale0, z = Zscale0,
              color = 3, 
              pch = 12, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275), 
              main = "Scaled (NOT centered)")

p2 <- plot3$xyz.convert(0,-40,0)
p3 <- plot3$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot3$xyz.convert(-20,0,0)
p3 <- plot3$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)

3-D scatter plot of scaled and centered data

#  main plot - know how this works
plot4 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
              color = 3, 
              pch = 16, type = "h",
              xlim = c(-25,25),
              ylim = c(-50,50),
              zlim = c(-100,275),
              main = "Centered & Scaled")

p2 <- plot4$xyz.convert(0,-40,0)
p3 <- plot4$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot4$xyz.convert(-20,0,0)
p3 <- plot4$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)