Setup

Install any needed packages.

#install.packages("lsa")
#install.packages("mdendro")
#install.packages("dendextend")

Load all packages

# packages we've used a lot
library(ggplot2)
library(ggpubr)

# packages we've only recently started using
library(scatterplot3d)
library(pander) 
library(vegan)

## Loading required package: permute

## Loading required package: lattice

## This is vegan 2.5-7

# NEW packages- make sure you installed them!
library(dendextend)

## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan

## 
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:permute':
## 
##     shuffle

## The following object is masked from 'package:ggpubr':
## 
##     rotate

## The following object is masked from 'package:stats':
## 
##     cutree

library(lsa)

## Loading required package: SnowballC

PART 1: Data preparation

NOTE: These vectors may be useful for exploring the functionality of R functions during lecture, but they are not used in the analysis itself or added to the dataframe.

# vectors for exploring R functions (20 values each; not used in the analysis)
MW.da <- c(
  89, 121, 133, 146, 165, 75, 155, 131, 146, 131,
  149, 132, 115, 147, 174, 105, 119, 117, 204, 181
)
polar.req <- c(
  7, 4.8, 13, 12.5, 5, 7.9, 8.4, 4.9, 10.1, 4.9,
  5.3, 10, 6.6, 8.6, 9.1, 7.5, 6.6, 5.6, 5.2, 5.4
)
freq <- c(
  7.8, 1.1, 5.19, 6.72, 4.39, 6.77, 2.03, 6.95, 6.32, 10.15,
  2.28, 4.37, 4.26, 3.45, 5.23, 6.46, 5.12, 7.01, 1.09, 3.3
)

The following vectors are the columns that make up Table 2.2 of Higgs and Attwood 2005, Chapter 2.

# main data vectors: one entry per amino acid, all in the order given by aa.1
aa.1 <- c(
  "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
  "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"
)
vol <- c(
  67, 86, 91, 109, 135, 48, 118, 124, 135, 124,
  124, 96, 90, 114, 148, 73, 93, 105, 163, 141
)
bulk <- c(
  11.5, 13.46, 11.68, 13.57, 19.8, 3.4, 13.69, 21.4, 15.71, 21.4,
  16.25, 12.28, 17.43, 14.45, 14.28, 9.47, 15.77, 21.57, 21.67, 18.03
)
pol <- c(
  0, 1.48, 49.7, 49.9, 0.35, 0, 51.6, 0.13, 49.5, 0.13,
  1.43, 3.38, 1.58, 3.53, 52, 1.67, 1.66, 0.13, 2.1, 1.61
)
isoelec <- c(
  6, 5.07, 2.77, 3.22, 5.48, 5.97, 7.59, 6.02, 9.74, 5.98,
  5.74, 5.41, 6.3, 5.65, 10.76, 5.68, 6.16, 5.96, 5.89, 5.66
)
H2Opho.34 <- c(
  1.8, 2.5, -3.5, -3.5, 2.8, -0.4, -3.2, 4.5, -3.9, 3.8,
  1.9, -3.5, -1.6, -3.5, -4.5, -0.8, -0.7, 4.2, -0.9, -1.3
)
H2Opho.35 <- c(
  1.6, 2, -9.2, -8.2, 3.7, 1, -3, 3.1, -8.8, 2.8,
  3.4, -4.8, -0.2, -4.1, -12.3, 0.6, 1.2, 2.6, 1.9, -0.7
)
saaH2O <- c(
  113, 140, 151, 183, 218, 85, 194, 182, 211, 180,
  204, 158, 143, 189, 241, 122, 146, 160, 259, 229
)
faal.fold <- c(
  0.74, 0.91, 0.62, 0.62, 0.88, 0.72, 0.78, 0.88, 0.52, 0.85,
  0.85, 0.63, 0.64, 0.62, 0.64, 0.66, 0.7, 0.86, 0.85, 0.76
)

TODO: Create a vector of the full names and the 3-letter codes of the amino acids, then add it to the data.frame.

# assemble the amino acid property table

## TODO:
## create a vector with the FULL names of all the amino acids
aa.full <- NA

## TODO:
## create a vector with the 3-letter codes for amino acids
aa.3 <- NA

## one row per amino acid; the 1-letter codes double as row names,
## which will help later when we do PCA
habk.df <- data.frame(
  aa.full      = aa.full,
  aa.3         = aa.3,
  aa.1         = aa.1,
  Vol          = vol,
  Bulk         = bulk,
  Polarity     = pol,
  pI           = isoelec,
  Hyd.1        = H2Opho.34,
  Hyd.2        = H2Opho.35,
  Surface_area = saaH2O,
  Fract_area   = faal.fold,
  row.names    = aa.1
)

# render the table with pander
pander(habk.df)

Table continues below
	aa.full	aa.3	aa.1	Vol	Bulk	Polarity	pI	Hyd.1
A	NA	NA	A	67	11.5	0	6	1.8
C	NA	NA	C	86	13.46	1.48	5.07	2.5
D	NA	NA	D	91	11.68	49.7	2.77	-3.5
E	NA	NA	E	109	13.57	49.9	3.22	-3.5
F	NA	NA	F	135	19.8	0.35	5.48	2.8
G	NA	NA	G	48	3.4	0	5.97	-0.4
H	NA	NA	H	118	13.69	51.6	7.59	-3.2
I	NA	NA	I	124	21.4	0.13	6.02	4.5
K	NA	NA	K	135	15.71	49.5	9.74	-3.9
L	NA	NA	L	124	21.4	0.13	5.98	3.8
M	NA	NA	M	124	16.25	1.43	5.74	1.9
N	NA	NA	N	96	12.28	3.38	5.41	-3.5
P	NA	NA	P	90	17.43	1.58	6.3	-1.6
Q	NA	NA	Q	114	14.45	3.53	5.65	-3.5
R	NA	NA	R	148	14.28	52	10.76	-4.5
S	NA	NA	S	73	9.47	1.67	5.68	-0.8
T	NA	NA	T	93	15.77	1.66	6.16	-0.7
V	NA	NA	V	105	21.57	0.13	5.96	4.2
W	NA	NA	W	163	21.67	2.1	5.89	-0.9
Y	NA	NA	Y	141	18.03	1.61	5.66	-1.3

	Hyd.2	Surface_area	Fract_area
A	1.6	113	0.74
C	2	140	0.91
D	-9.2	151	0.62
E	-8.2	183	0.62
F	3.7	218	0.88
G	1	85	0.72
H	-3	194	0.78
I	3.1	182	0.88
K	-8.8	211	0.52
L	2.8	180	0.85
M	3.4	204	0.85
N	-4.8	158	0.63
P	-0.2	143	0.64
Q	-4.1	189	0.62
R	-12.3	241	0.64
S	0.6	122	0.66
T	1.2	146	0.7
V	2.6	160	0.86
W	1.9	259	0.85
Y	-0.7	229	0.76

PART 2: DATA DICTIONARY

A data dictionary is… (For exam, you need to be able to complete this sentence, describe why a data dictionary is needed, what makes good column names, what makes bad column names, etc.)

# (For exam you need to be able to...
# describe why a data dictionary is needed, 
# what makes good column names, 
# what makes bad column names, etc.)


# data dictionary columns
# Each vector has 8 entries, one per numeric column of habk.df, listed in
# the same order as those columns appear in the dataframe.

## column name as used in habk.df
column.names <- c("Vol", "Bulk", "Polarity","pI", "Hyd.1", "Hyd.2", 
                  "Surface_area","Fract_area")

## human-readable description of each column
name.full <- c("Volume", "Bulkiness", "Polarity", 
               "pH at Isoelectric point",  "Hydrophobicity scale-1",
               "Hydrophobicity scale-2","SA accessible to H2O","Fractional area")

## name of the raw vector each column was built from
vector.names <- c("vol", "bulk", "pol","isoelec", "H2Opho.34",
                  "H2Opho.35", "saaH2O","faal.fold")

## literature source for each column
## (the last entry previously had a stray leading space: " Rose et al. 1985")
reference <- c("Creighton 1993", "Zimmerman et al. 1968", "Zimmerman et al. 1968", 
               "Zimmerman et al. 1968", "Kyte & Doolittle 1982", "Engelman et al. 1986",
               "Miller et al. 1987","Rose et al. 1985")

TODO: Make a dataframe to hold the DATA DICTIONARY information

# TODO: Make a dataframe to hold DATA DICTIONARY information
# make dataframe
# (placeholder value: stays NA until the TODO above is completed)
data.dictionary <- NA

# Display data dictionary
# (renders whatever data.dictionary currently holds using pander)
pander(data.dictionary)

PART 3: Remake Higgs and Attwood Figure 2.8: 2-dimensional Data visualization

TODO: Using ggpubr, remake AND update/augment Figure 2.8 of Higgs and Attwood 2005, Chapter 2, page 26.

Plot the appropriate variables from the original figure against each other, on the appropriate axes
Add the caption from the original paper to the “title =” section of ggscatter
Include a newline character in the title to split it across two lines
Add a 3rd variable via color-coding using color = “…”
Add a 4th variable via size-coding using size = “…”
Label the axes with xlab and ylab
Set ylim to c(0, 180) and xlim to c(1,12)

# Make scatterplot with appropriate ggpubr function
# Plots Vol against pI from habk.df; Polarity is mapped to colour, Hyd.1 to
# point size, and each point is labelled with its 1-letter code.
# The title is a plain string: the "#TODO" marker that had ended up inside
# the quotes (and so was printed on the figure) is now a trailing comment,
# and the typos "volumue" / "again" are corrected.
ggscatter(x = "pI",        #TODO
          y = "Vol",       #TODO
          data = habk.df,  #TODO
          color = "Polarity", #TODO
          size = "Hyd.1",     #TODO
          label = "aa.1",     #TODO
          ylab = "Volume (Angstroms)", #TODO
          xlab = "pI (pH at Isoelectric point)", #TODO
          title = "Figure 2.8: Plot of amino acid volume against pI, two properties\nthought to be important to protein folding", #TODO
          ylim = c(0, 180), #TODO
          xlim = c(1,12)) #TODO

PART 4: Data centering and scaling

For the exam you need to be able to understand what data centering and scaling are, how to do them in R with and without specialized functions, how to do them to an individual vector, how to do them to a column in a dataframe, and how to do them using a for loop. Data centering is… Data scaling is…

Demonstrate centering / scaling on a single vector

# For the exam you need to be able to understand what data centering and scaling are,
# how to do them in R with and without specialized functions, how to do them to an
# individual vector, how to do them to a column in a dataframe,
# and how to do them using a for loop.


# Subset data
## inspect the size of the full table first
dim(habk.df)

## [1] 20 11

ncol(habk.df)

## [1] 11

# shortest form: drop the three label columns by negative index
habk.df.numeric <- habk.df[, -(1:3)]

# same result, keeping columns 4 through 11 by positive index
habk.df.numeric <- habk.df[, c(4, 5, 6, 7, 8, 9, 10, 11)]
habk.df.numeric <- habk.df[, 4:11]

Set up data:

# Pull out one column as a plain vector to practise centering / scaling on
X <- habk.df.numeric[["Vol"]]

What is this calculating?

# total of X divided by the number of elements in X
sum(X) / length(X)

## [1] 109.2

Challenge: what is this calculating?

X <- habk.df.numeric[["Vol"]]
# squared deviations from the mean, summed, divided by n, then square-rooted
sqrt(sum((X - mean(X))^2) / length(X))

## [1] 28.31007

Demonstrate centering:

# What is this?
X <- habk.df.numeric$Vol
u <- mean(X)
# Centering subtracts the mean from every value (X - u, not u - X), so
# values above the mean come out positive. This matches the sign
# convention of scale(X, center = TRUE, scale = FALSE).
X - u

##  [1] -42.2 -23.2 -18.2  -0.2  25.8 -61.2   8.8  14.8  25.8  14.8  14.8 -13.2
## [13] -19.2   4.8  38.8 -36.2 -16.2  -4.2  53.8  31.8

Demonstrate scaling (without centering):

# What is this?
X <- habk.df.numeric[["Vol"]]
s <- sd(X)
# divide every value by the standard deviation (the mean is not removed)
X / s

##  [1] 2.306724 2.960870 3.133014 3.752730 4.647877 1.652579 4.062589 4.269161
##  [9] 4.647877 4.269161 4.269161 3.305157 3.098585 3.924874 5.095451 2.513297
## [17] 3.201871 3.615016 5.611881 4.854450

Demonstrate scaling (with centering):

# What is this?
X <- habk.df.numeric$Vol
u <- mean(X)
s <- sd(X)
# Subtract the mean first, then divide by the sd. Written as X - u
# (not u - X) so the sign agrees with scale(X): values above the mean
# get a positive z.
z <- (X - u)/s

Compare the distributions of these variables

# what is this chunk doing?
# Draws four histograms in a 2 x 2 grid, each with a vertical reference line.
# par() returns the previous settings, so they are saved here and restored
# at the end instead of being overwritten with hard-coded values.
old.par <- par(mfrow = c(2,2), mar = c(3,1,3,1))

# original values; line at their mean
hist(X)
abline(v = mean(X), col = 2)

# divided by the sd only; line at the mean of the rescaled values
hist(X/s)
abline(v = mean(X/s), col = 3)

# mean subtracted only (X - u, so the histogram is not mirrored); line at 0
hist(X - u)
abline(v = 0, col = 4)

# z as computed in the earlier chunk; line at 0
hist(z)
abline(v = 0, col = 5)

# put the graphics settings back the way they were
par(old.par)

Use the scale() function

# What is this?  
# Why are each of these lines different?
# (TRUE/FALSE are spelled out: T and F are ordinary variables that can be
#  reassigned, whereas TRUE and FALSE are reserved words)
scale(X, center = TRUE, scale = FALSE)

##        [,1]
##  [1,] -42.2
##  [2,] -23.2
##  [3,] -18.2
##  [4,]  -0.2
##  [5,]  25.8
##  [6,] -61.2
##  [7,]   8.8
##  [8,]  14.8
##  [9,]  25.8
## [10,]  14.8
## [11,]  14.8
## [12,] -13.2
## [13,] -19.2
## [14,]   4.8
## [15,]  38.8
## [16,] -36.2
## [17,] -16.2
## [18,]  -4.2
## [19,]  53.8
## [20,]  31.8
## attr(,"scaled:center")
## [1] 109.2

# no centering; TRUE/FALSE spelled out because T and F can be reassigned
scale(X, center = FALSE, scale = TRUE)

##            [,1]
##  [1,] 0.5788805
##  [2,] 0.7430407
##  [3,] 0.7862407
##  [4,] 0.9417609
##  [5,] 1.1664011
##  [6,] 0.4147204
##  [7,] 1.0195209
##  [8,] 1.0713610
##  [9,] 1.1664011
## [10,] 1.0713610
## [11,] 1.0713610
## [12,] 0.8294408
## [13,] 0.7776007
## [14,] 0.9849609
## [15,] 1.2787212
## [16,] 0.6307206
## [17,] 0.8035207
## [18,] 0.9072008
## [19,] 1.4083213
## [20,] 1.2182411
## attr(,"scaled:scale")
## [1] 115.7406

# center and scale; TRUE spelled out because T can be reassigned
scale(X, center = TRUE, scale = TRUE)

##               [,1]
##  [1,] -1.452891984
##  [2,] -0.798746304
##  [3,] -0.626602704
##  [4,] -0.006885744
##  [5,]  0.888260976
##  [6,] -2.107037664
##  [7,]  0.302972736
##  [8,]  0.509545056
##  [9,]  0.888260976
## [10,]  0.509545056
## [11,]  0.509545056
## [12,] -0.454459104
## [13,] -0.661031424
## [14,]  0.165257856
## [15,]  1.335834336
## [16,] -1.246319664
## [17,] -0.557745264
## [18,] -0.144600624
## [19,]  1.852265136
## [20,]  1.094833296
## attr(,"scaled:center")
## [1] 109.2
## attr(,"scaled:scale")
## [1] 29.04552

Scale an entire matrix

CHALLENGE: What is this calculating?

# total of the Vol column divided by its number of entries
sum(habk.df.numeric[["Vol"]]) / length(habk.df.numeric[["Vol"]])

## [1] 109.2

Center a single column

# make a copy of the data so the original numeric table stays untouched
habk.df.centered <- habk.df.numeric


# Vol: check original mean
mean(habk.df.numeric$Vol)

## [1] 109.2

# subtract the column mean, then confirm the new mean is (numerically) zero
habk.df.centered$Vol <- habk.df.centered$Vol - mean(habk.df.centered$Vol)
mean(habk.df.centered$Vol)

## [1] -2.842518e-15

# Bulk: same steps - original mean, center, confirm
mean(habk.df.numeric$Bulk)

## [1] 15.3405

habk.df.centered$Bulk <- habk.df.centered$Bulk - mean(habk.df.centered$Bulk)
# The check must read the CENTERED copy (it previously re-read the original
# table, so it could never show the effect of centering). The centered mean
# is zero only up to floating-point error, so compare with a tolerance.
isTRUE(all.equal(mean(habk.df.centered$Bulk), 0))

## [1] TRUE

TODO: Center another column using the same approach as in the previous chunk

# center another column as in the previous chunk

Center ALL variables with for loop

Center ALL variables with for loop

# What does this for() loop do?
# Visits every column of habk.df.centered and subtracts that column's mean.
# The subtraction is value - mean (not mean - value), so the sign matches
# scale(); columns that are already centered are left essentially unchanged.
# seq_len() is used so the loop body is skipped if there are zero columns.

for(i in seq_len(ncol(habk.df.centered))){
  mean.i <- mean(habk.df.centered[,i])
  habk.df.centered[,i] <- habk.df.centered[,i] - mean.i
}

summary(habk.df.centered)

##       Vol              Bulk             Polarity             pI         
##  Min.   :-61.20   Min.   :-11.9405   Min.   :-13.594   Min.   :-3.2825  
##  1st Qu.:-18.45   1st Qu.: -2.1755   1st Qu.:-13.299   1st Qu.:-0.4450  
##  Median :  2.30   Median : -0.2605   Median :-11.959   Median :-0.1275  
##  Mean   :  0.00   Mean   :  0.0000   Mean   :  0.000   Mean   : 0.0000  
##  3rd Qu.: 17.55   3rd Qu.:  3.1320   3rd Qu.:  1.429   3rd Qu.: 0.0025  
##  Max.   : 53.80   Max.   :  6.3295   Max.   : 38.406   Max.   : 4.7075  
##      Hyd.1           Hyd.2          Surface_area      Fract_area     
##  Min.   :-4.01   Min.   :-10.930   Min.   :-90.40   Min.   :-0.2165  
##  1st Qu.:-3.01   1st Qu.: -2.905   1st Qu.:-30.15   1st Qu.:-0.0990  
##  Median :-0.36   Median :  2.170   Median :  5.60   Median :-0.0065  
##  Mean   : 0.00   Mean   :  0.000   Mean   :  0.00   Mean   : 0.0000  
##  3rd Qu.: 2.54   3rd Qu.:  3.520   3rd Qu.: 30.35   3rd Qu.: 0.1135  
##  Max.   : 4.99   Max.   :  5.070   Max.   : 83.60   Max.   : 0.1735

TODO: rewrite the for() loop above so that the variable mean.i doesn’t have to be created and only a single line of code is executed within the for() loop.

Check …

# What is happening in this chunk?
# The sd of Vol is computed before and after centering and the two are compared.
sd(habk.df.centered$Vol)

## [1] 29.04552

sd(habk.df.numeric$Vol)

## [1] 29.04552

# compare with a numeric tolerance: == on floating-point values can report
# FALSE for results that differ only by rounding error
isTRUE(all.equal(sd(habk.df.centered$Vol), sd(habk.df.numeric$Vol)))

## [1] TRUE

TODO: Write logical checks for the standard deviation (sd) of another variable

# logical check

Standardize a single column

# What is happening in this chunk?

# start from a copy of the centered table
habk.df.standard <- habk.df.centered

# divide the Vol column by its own standard deviation
habk.df.standard[["Vol"]] <- habk.df.standard[["Vol"]] / sd(habk.df.standard[["Vol"]])

# sd of the rescaled column
sd(habk.df.standard[["Vol"]])

## [1] 1

TODO: Standardize another column as done in the previous chunk.

# Standardize another column

Standardize an entire matrix with a for() loop

# What does this chunk do?
# Visits every column of habk.df.standard and divides it by its own sd.
# seq_len() replaces 1:ncol(...) so the loop body is skipped (rather than
# run for i = 1 and i = 0) if the table ever has zero columns.

for(i in seq_len(ncol(habk.df.standard))){
  sd.i <- sd(habk.df.standard[,i])
  habk.df.standard[,i] <- habk.df.standard[,i]/sd.i
}

summary(habk.df.standard)

##       Vol                Bulk             Polarity              pI           
##  Min.   :-1.85227   Min.   :-1.36122   Min.   :-1.75257   Min.   :-2.664603  
##  1st Qu.:-0.60422   1st Qu.:-0.67357   1st Qu.:-0.06519   1st Qu.:-0.001415  
##  Median :-0.07919   Median : 0.05602   Median : 0.54572   Median : 0.072169  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.63521   3rd Qu.: 0.46786   3rd Qu.: 0.60687   3rd Qu.: 0.251885  
##  Max.   : 2.10704   Max.   : 2.56792   Max.   : 0.62033   Max.   : 1.858005  
##      Hyd.1             Hyd.2          Surface_area       Fract_area      
##  Min.   :-1.6706   Min.   :-1.0362   Min.   :-1.8709   Min.   :-1.50457  
##  1st Qu.:-0.8504   1st Qu.:-0.7194   1st Qu.:-0.6792   1st Qu.:-0.98426  
##  Median : 0.1205   Median :-0.4435   Median :-0.1253   Median : 0.05637  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 1.0077   3rd Qu.: 0.5937   3rd Qu.: 0.6747   3rd Qu.: 0.85852  
##  Max.   : 1.3425   Max.   : 2.2338   Max.   : 2.0230   Max.   : 1.87746

TODO: Shorten code in the for() loop so that the object sd.i doesn’t need to be created

# for() loop without sd.i line

TODO: Write a for() loop that does BOTH tasks in a single loop.

# center
# scale by sd

Write a for() loop that….

# What does this chunk do?
# Restarts from the raw numeric table and standardizes each column with scale().

habk.df.standard <- habk.df.numeric
for(i in seq_len(ncol(habk.df.standard))){
  # scale() returns a one-column matrix; storing that matrix in the dataframe
  # makes summary() label the columns "Vol.V1", "Bulk.V1", ...
  # as.numeric() keeps just the values so each column stays a plain vector.
  habk.df.standard[,i] <- as.numeric(scale(habk.df.standard[,i]))
}

summary(habk.df.standard)

##       Vol                Bulk             Polarity              pI           
##  Min.   :-2.10704   Min.   :-2.56792   Min.   :-0.62033   Min.   :-1.858005  
##  1st Qu.:-0.63521   1st Qu.:-0.46786   1st Qu.:-0.60687   1st Qu.:-0.251885  
##  Median : 0.07919   Median :-0.05602   Median :-0.54572   Median :-0.072169  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.60422   3rd Qu.: 0.67357   3rd Qu.: 0.06519   3rd Qu.: 0.001415  
##  Max.   : 1.85227   Max.   : 1.36122   Max.   : 1.75257   Max.   : 2.664603  
##      Hyd.1             Hyd.2          Surface_area       Fract_area      
##  Min.   :-1.3425   Min.   :-2.2338   Min.   :-2.0230   Min.   :-1.87746  
##  1st Qu.:-1.0077   1st Qu.:-0.5937   1st Qu.:-0.6747   1st Qu.:-0.85852  
##  Median :-0.1205   Median : 0.4435   Median : 0.1253   Median :-0.05637  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.8504   3rd Qu.: 0.7194   3rd Qu.: 0.6792   3rd Qu.: 0.98426  
##  Max.   : 1.6706   Max.   : 1.0362   Max.   : 1.8709   Max.   : 1.50457

CHALLENGE (will NOT be on exam). How do these lines of code work:

# CHALLENGE: What does this do?

# apply mean() down each column (MARGIN = 2) of the numeric table
X.bar <- apply(habk.df.numeric, MARGIN = 2, FUN = mean)
summary(habk.df.numeric)

##       Vol              Bulk          Polarity            pI        
##  Min.   : 48.00   Min.   : 3.40   Min.   : 0.000   Min.   : 2.770  
##  1st Qu.: 90.75   1st Qu.:13.16   1st Qu.: 0.295   1st Qu.: 5.607  
##  Median :111.50   Median :15.08   Median : 1.635   Median : 5.925  
##  Mean   :109.20   Mean   :15.34   Mean   :13.594   Mean   : 6.053  
##  3rd Qu.:126.75   3rd Qu.:18.47   3rd Qu.:15.023   3rd Qu.: 6.055  
##  Max.   :163.00   Max.   :21.67   Max.   :52.000   Max.   :10.760  
##      Hyd.1           Hyd.2          Surface_area     Fract_area    
##  Min.   :-4.50   Min.   :-12.300   Min.   : 85.0   Min.   :0.5200  
##  1st Qu.:-3.50   1st Qu.: -4.275   1st Qu.:145.2   1st Qu.:0.6375  
##  Median :-0.85   Median :  0.800   Median :181.0   Median :0.7300  
##  Mean   :-0.49   Mean   : -1.370   Mean   :175.4   Mean   :0.7365  
##  3rd Qu.: 2.05   3rd Qu.:  2.150   3rd Qu.:205.8   3rd Qu.:0.8500  
##  Max.   : 4.50   Max.   :  3.700   Max.   :259.0   Max.   :0.9100

# apply scale() down each column (MARGIN = 2), then turn the resulting
# matrix back into a dataframe before summarising it
habk.df.standard <- data.frame(
  apply(habk.df.numeric, MARGIN = 2, FUN = scale)
)
summary(habk.df.standard)

##       Vol                Bulk             Polarity              pI           
##  Min.   :-2.10704   Min.   :-2.56792   Min.   :-0.62033   Min.   :-1.858005  
##  1st Qu.:-0.63521   1st Qu.:-0.46786   1st Qu.:-0.60687   1st Qu.:-0.251885  
##  Median : 0.07919   Median :-0.05602   Median :-0.54572   Median :-0.072169  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.60422   3rd Qu.: 0.67357   3rd Qu.: 0.06519   3rd Qu.: 0.001415  
##  Max.   : 1.85227   Max.   : 1.36122   Max.   : 1.75257   Max.   : 2.664603  
##      Hyd.1             Hyd.2          Surface_area       Fract_area      
##  Min.   :-1.3425   Min.   :-2.2338   Min.   :-2.0230   Min.   :-1.87746  
##  1st Qu.:-1.0077   1st Qu.:-0.5937   1st Qu.:-0.6747   1st Qu.:-0.85852  
##  Median :-0.1205   Median : 0.4435   Median : 0.1253   Median :-0.05637  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.8504   3rd Qu.: 0.7194   3rd Qu.: 0.6792   3rd Qu.: 0.98426  
##  Max.   : 1.6706   Max.   : 1.0362   Max.   : 1.8709   Max.   : 1.50457

PART 5: 3D visualization of centering and scaling

Prepare data:
scatterplot3d does NOT have a data = argument. Make separate vectors to work with to make the code cleaner. ….

# Build four versions of each plotting variable: raw, centered,
# scaled without centering, and centered + scaled.

# x axis: Bulk
Xraw    <- habk.df.numeric[["Bulk"]]
Xcent   <- Xraw - mean(Xraw)
Xscale0 <- Xraw / sd(Xraw)
Xscale  <- (Xraw - mean(Xraw)) / sd(Xraw)

# y axis: Polarity
Yraw    <- habk.df.numeric[["Polarity"]]
Ycent   <- Yraw - mean(Yraw)
Yscale0 <- Yraw / sd(Yraw)
Yscale  <- (Yraw - mean(Yraw)) / sd(Yraw)

# z axis: Surface_area
Zraw    <- habk.df.numeric[["Surface_area"]]
Zcent   <- Zraw - mean(Zraw)
Zscale0 <- Zraw / sd(Zraw)
Zscale  <- (Zraw - mean(Zraw)) / sd(Zraw)

TODO: create ALL of the scaled vectors created above using the scale() function.

# vectors using R function()

3-D scatter plot of….

# what does this chunk do?

# main plot with scatterplot3d() - know how this works
# (raw vectors, drawn as vertical lines topped with filled points)
plot1 <- scatterplot3d(
  x = Xraw, y = Yraw, z = Zraw,
  main = "Raw data",
  type = "h", pch = 16, color = 2,
  xlim = c(-15, 25),
  ylim = c(-30, 50),
  zlim = c(-100, 275)
)

# add-ons: no need to know how these work
# xyz.convert() maps 3-D coordinates to positions on the 2-D plot

# segment along the y direction at x = 0, z = 0
p2 <- plot1$xyz.convert(0, -40, 0)
p3 <- plot1$xyz.convert(0, 60, 0)
segments(x0 = p2$x, y0 = p2$y, x1 = p3$x, y1 = p3$y, col = 2, lwd = 2)

# segment along the x direction at y = 0, z = 0
p2 <- plot1$xyz.convert(-20, 0, 0)
p3 <- plot1$xyz.convert(30, 0, 0)
segments(x0 = p2$x, y0 = p2$y, x1 = p3$x, y1 = p3$y, col = 2, lwd = 2)

3-D scatter plot of….

# what does this chunk do?


# main plot - know how this works
# (centered vectors, same axis limits as the raw-data plot)
plot2 <- scatterplot3d(
  x = Xcent, y = Ycent, z = Zcent,
  main = "Centered",
  type = "h", pch = 16, color = 2,
  xlim = c(-15, 25),
  ylim = c(-30, 50),
  zlim = c(-100, 275)
)

# add-ons: no need to know how these work
# segment along the y direction at x = 0, z = 0
p2 <- plot2$xyz.convert(0, -40, 0)
p3 <- plot2$xyz.convert(0, 60, 0)
segments(x0 = p2$x, y0 = p2$y, x1 = p3$x, y1 = p3$y, col = 2, lwd = 2)

# segment along the x direction at y = 0, z = 0
p2 <- plot2$xyz.convert(-20, 0, 0)
p3 <- plot2$xyz.convert(30, 0, 0)
segments(x0 = p2$x, y0 = p2$y, x1 = p3$x, y1 = p3$y, col = 2, lwd = 2)

3-D scatter plot of….

# what does this chunk do?

# main plot - know how this works
# (vectors divided by their sd but not centered)
# NOTE(review): pch = 12 here while the other 3-D plots use pch = 16 -
# confirm whether the different symbol is intended
plot3 <- scatterplot3d(
  x = Xscale0, y = Yscale0, z = Zscale0,
  main = "Scaled (NOT centered)",
  type = "h", pch = 12, color = 3,
  xlim = c(-15, 25),
  ylim = c(-30, 50),
  zlim = c(-100, 275)
)

# segment along the y direction at x = 0, z = 0
p2 <- plot3$xyz.convert(0, -40, 0)
p3 <- plot3$xyz.convert(0, 60, 0)
segments(x0 = p2$x, y0 = p2$y, x1 = p3$x, y1 = p3$y, col = 2, lwd = 2)

# segment along the x direction at y = 0, z = 0
p2 <- plot3$xyz.convert(-20, 0, 0)
p3 <- plot3$xyz.convert(30, 0, 0)
segments(x0 = p2$x, y0 = p2$y, x1 = p3$x, y1 = p3$y, col = 2, lwd = 2)

3-D scatter plot of….

# main plot - know how this works
# (centered and scaled vectors, drawn on wide fixed axis limits)
plot4 <- scatterplot3d(
  x = Xscale, y = Yscale, z = Zscale,
  main = "Centered & Scaled",
  type = "h", pch = 16, color = 3,
  xlim = c(-25, 25),
  ylim = c(-50, 50),
  zlim = c(-100, 275)
)

# segment along the y direction at x = 0, z = 0
p2 <- plot4$xyz.convert(0, -40, 0)
p3 <- plot4$xyz.convert(0, 60, 0)
segments(x0 = p2$x, y0 = p2$y, x1 = p3$x, y1 = p3$y, col = 2, lwd = 2)

# segment along the x direction at y = 0, z = 0
p2 <- plot4$xyz.convert(-20, 0, 0)
p3 <- plot4$xyz.convert(30, 0, 0)
segments(x0 = p2$x, y0 = p2$y, x1 = p3$x, y1 = p3$y, col = 2, lwd = 2)

3-D scatter plot of….

# main plot - know how this works
# (centered and scaled vectors; axis limits are left for scatterplot3d
#  to choose, so the fixed limits stay commented out)
plot5 <- scatterplot3d(
  x = Xscale, y = Yscale, z = Zscale,
  main = "Centered & Scaled",
  type = "h", pch = 16, color = 3
  # xlim = c(-25,25),
  # ylim = c(-50,50),
  # zlim = c(-100,275)
)

# segment along the y direction at x = 0, z = 0
p2 <- plot5$xyz.convert(0, -40, 0)
p3 <- plot5$xyz.convert(0, 60, 0)
segments(x0 = p2$x, y0 = p2$y, x1 = p3$x, y1 = p3$y, col = 2, lwd = 2)

# segment along the x direction at y = 0, z = 0
p2 <- plot5$xyz.convert(-20, 0, 0)
p3 <- plot5$xyz.convert(30, 0, 0)
segments(x0 = p2$x, y0 = p2$y, x1 = p3$x, y1 = p3$y, col = 2, lwd = 2)

# what does this chunk do?
# which of the transformations is NOT being shown

# Helper: draw the two red reference segments on a scatterplot3d panel.
# plot3d is the object returned by scatterplot3d(); its xyz.convert()
# maps 3-D coordinates to positions on the 2-D plot.
# (add-on: no need to know how this works)
add.zero.lines <- function(plot3d) {
  # segment along the y direction at x = 0, z = 0
  from <- plot3d$xyz.convert(0, -40, 0)
  to   <- plot3d$xyz.convert(0, 60, 0)
  segments(from$x, from$y, to$x, to$y, lwd = 2, col = 2)

  # segment along the x direction at y = 0, z = 0
  from <- plot3d$xyz.convert(-20, 0, 0)
  to   <- plot3d$xyz.convert(30, 0, 0)
  segments(from$x, from$y, to$x, to$y, lwd = 2, col = 2)
}

# 2 x 2 grid of panels with small margins. par() returns the previous
# settings; they are saved so the layout can be restored afterwards
# (previously the 2 x 2 layout was left in place for later plots).
old.par <- par(mfrow = c(2,2), mar = c(1,1,1,1))

# panel 1: raw vectors
plot1 <- scatterplot3d(x = Xraw, 
                       y = Yraw, 
                       z = Zraw,
              color = 2, 
              pch = 16, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275),
              main = "")
add.zero.lines(plot1)

# panel 2: centered vectors, same axis limits as panel 1
plot2 <- scatterplot3d(x = Xcent, y = Ycent, z = Zcent,
              color = 2, 
              pch = 16, type = "h",
              xlim = c(-15,25),
              ylim = c(-30,50),
              zlim = c(-100,275), main = "")
add.zero.lines(plot2)

# panel 3: centered and scaled vectors on wide fixed axis limits
plot4 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
              color = 3, 
              pch = 16, type = "h",
              xlim = c(-25,25),
              ylim = c(-50,50),
              zlim = c(-100,275),
              main = "")
add.zero.lines(plot4)

# panel 4: centered and scaled vectors, axis limits chosen by scatterplot3d
plot5 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
              color = 3, 
              pch = 16, type = "h",
              #xlim = c(-25,25),
              #ylim = c(-50,50),
              #zlim = c(-100,275),
              main = "")
add.zero.lines(plot5)

# restore the graphics settings that were in effect before this chunk
par(old.par)

Replication: Multivariate analysis of amino acid chemical properties

Nathan Brouwer

11/29/2021

Setup

PART 1: Data preparation

PART 2: DATA DICTIONARY

PART 3: Remake Higgs and Attwood Figure 2.8: 2-dimensional Data visualization

PART 4: Data centering and scaling

Demonstrate centering / scaling on a single vector

Scale an entire matrix

Center a single column

Center ALL variables with for loop

PART 5: 3D visualization of centering and scaling