Install any needed packages.
#install.packages("lsa")
#install.packages("mdendro")
#install.packages("dendextend")
Load all packages
# packages we've used a lot
library(ggplot2)
library(ggpubr)
# packages we've only recently started using
library(scatterplot3d)
library(pander)
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
# NEW packages- make sure you installed them!
library(dendextend)
## Registered S3 method overwritten by 'dendextend':
## method from
## rev.hclust vegan
##
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:permute':
##
## shuffle
## The following object is masked from 'package:ggpubr':
##
## rotate
## The following object is masked from 'package:stats':
##
## cutree
library(lsa)
## Loading required package: SnowballC
NOTE: This vectors may be useful for exploring the functionality during lecture of R functions but are not used in the analysis itself or added to the dataframe.
# data exploration vectors
MW.da <- c(89, 121, 133, 146, 165, 75, 155, 131, 146, 131,
149, 132, 115, 147, 174, 105, 119, 117, 204, 181)
polar.req <-c(7, 4.8, 13, 12.5, 5, 7.9, 8.4, 4.9, 10.1, 4.9,
5.3, 10, 6.6, 8.6,9.1,7.5,6.6,5.6,5.2,5.4)
freq <-c(7.8, 1.1, 5.19 ,6.72, 4.39, 6.77, 2.03, 6.95,
6.32, 10.15, 2.28, 4.37, 4.26, 3.45, 5.23,
6.46, 5.12, 7.01, 1.09, 3.3)
The following vectors are the columns that make up Table 2.2 of Higgs and Attwood 2005, Chapter 2.
# main data vectors
aa.1 <-c('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L','M','N',
'P','Q','R','S','T','V','W','Y')
vol <- c(67, 86, 91, 109, 135, 48, 118, 124, 135, 124, 124, 96, 90, 114,
148, 73, 93, 105, 163, 141)
bulk <-c(11.5, 13.46, 11.68, 13.57, 19.8, 3.4, 13.69, 21.4, 15.71, 21.4, 16.25,
12.28,17.43, 14.45, 14.28, 9.47, 15.77, 21.57, 21.67, 18.03)
pol <-c(0,1.48,49.7,49.9,0.35,0,51.6,0.13,49.5,0.13,
1.43,3.38,1.58,3.53,52,1.67,1.66,0.13,2.1,1.61)
isoelec <-c(6,5.07,2.77,3.22,5.48,5.97,7.59,6.02,9.74,5.98, 5.74, 5.41, 6.3,
5.65, 10.76, 5.68, 6.16, 5.96, 5.89, 5.66)
H2Opho.34 <-c(1.8,2.5,-3.5,-3.5,2.8,-0.4,-3.2,4.5, -3.9, 3.8, 1.9, -3.5, -1.6,
-3.5, -4.5, -0.8, -0.7, 4.2, -0.9, -1.3)
H2Opho.35 <-c(1.6,2,-9.2,-8.2,3.7,1,-3,3.1,-8.8,2.8,3.4,-4.8, -0.2,-4.1,-12.3,
0.6,1.2,2.6,1.9,-0.7)
saaH2O <-c(113, 140,151,183,218,85,194,182,211,180, 204, 158, 143, 189,
241, 122, 146,160,259,229)
faal.fold <-c(0.74,0.91,0.62,0.62,0.88,0.72,0.78,0.88,0.52, 0.85, 0.85, 0.63,
0.64,0.62,0.64,0.66,0.7, 0.86, 0.85, 0.76)
TODO: Create a vector of the full names and the 3-letter codes of the amino acids, then add it to the data.frame.
# build dataframe
## create a vector with the FULL names of all the amino acids
aa.full <- c("alanine", "cysteine", "aspartic acid", "glutamic acid", "phenylalanine", "glycine", "histidine", "isoleucine", "lysine", "leucine", "methionine", "asparagine", "proline", "glutamine", "arginine", "serine", "threonine", "valine", "tryptophan", "tyrosine")
## create a vector with the 3-letter codes for amino acids
aa.3 <- c("ala", "cys", "asp", "glu", "phe", "gly", "his", "ile", "lys", "leu", "met", "asn", "pro", "gln", "asn", "ser", "thr", "val", "trp", "tyr")
## create dataframe
habk.df <- data.frame(aa.full = aa.full,
aa.3 = aa.3,
aa.1 = aa.1,
Vol = vol, Bulk = bulk, Polarity = pol,
pI = isoelec, Hyd.1 = H2Opho.34,
Hyd.2 = H2Opho.35, Surface_area = saaH2O,
Fract_area = faal.fold)
# name rows - this will help later
# when we do PCA
row.names(habk.df) <- aa.1
# display with pander
pander(habk.df)
| aa.full | aa.3 | aa.1 | Vol | Bulk | Polarity | pI | Hyd.1 | |
|---|---|---|---|---|---|---|---|---|
| A | alanine | ala | A | 67 | 11.5 | 0 | 6 | 1.8 |
| C | cysteine | cys | C | 86 | 13.46 | 1.48 | 5.07 | 2.5 |
| D | aspartic acid | asp | D | 91 | 11.68 | 49.7 | 2.77 | -3.5 |
| E | glutamic acid | glu | E | 109 | 13.57 | 49.9 | 3.22 | -3.5 |
| F | phenylalanine | phe | F | 135 | 19.8 | 0.35 | 5.48 | 2.8 |
| G | glycine | gly | G | 48 | 3.4 | 0 | 5.97 | -0.4 |
| H | histidine | his | H | 118 | 13.69 | 51.6 | 7.59 | -3.2 |
| I | isoleucine | ile | I | 124 | 21.4 | 0.13 | 6.02 | 4.5 |
| K | lysine | lys | K | 135 | 15.71 | 49.5 | 9.74 | -3.9 |
| L | leucine | leu | L | 124 | 21.4 | 0.13 | 5.98 | 3.8 |
| M | methionine | met | M | 124 | 16.25 | 1.43 | 5.74 | 1.9 |
| N | asparagine | asn | N | 96 | 12.28 | 3.38 | 5.41 | -3.5 |
| P | proline | pro | P | 90 | 17.43 | 1.58 | 6.3 | -1.6 |
| Q | glutamine | gln | Q | 114 | 14.45 | 3.53 | 5.65 | -3.5 |
| R | arginine | asn | R | 148 | 14.28 | 52 | 10.76 | -4.5 |
| S | serine | ser | S | 73 | 9.47 | 1.67 | 5.68 | -0.8 |
| T | threonine | thr | T | 93 | 15.77 | 1.66 | 6.16 | -0.7 |
| V | valine | val | V | 105 | 21.57 | 0.13 | 5.96 | 4.2 |
| W | tryptophan | trp | W | 163 | 21.67 | 2.1 | 5.89 | -0.9 |
| Y | tyrosine | tyr | Y | 141 | 18.03 | 1.61 | 5.66 | -1.3 |
| Hyd.2 | Surface_area | Fract_area | |
|---|---|---|---|
| A | 1.6 | 113 | 0.74 |
| C | 2 | 140 | 0.91 |
| D | -9.2 | 151 | 0.62 |
| E | -8.2 | 183 | 0.62 |
| F | 3.7 | 218 | 0.88 |
| G | 1 | 85 | 0.72 |
| H | -3 | 194 | 0.78 |
| I | 3.1 | 182 | 0.88 |
| K | -8.8 | 211 | 0.52 |
| L | 2.8 | 180 | 0.85 |
| M | 3.4 | 204 | 0.85 |
| N | -4.8 | 158 | 0.63 |
| P | -0.2 | 143 | 0.64 |
| Q | -4.1 | 189 | 0.62 |
| R | -12.3 | 241 | 0.64 |
| S | 0.6 | 122 | 0.66 |
| T | 1.2 | 146 | 0.7 |
| V | 2.6 | 160 | 0.86 |
| W | 1.9 | 259 | 0.85 |
| Y | -0.7 | 229 | 0.76 |
A data dictionary is a guide to what the code means, what the columns are, what the variables are, etcetera. Metadata is data about data. A good column name is one that is short and concise, while a bad column name is one that is long.
https://library.ucmerced.edu/data-dictionaries
# (For exam you need to be able to...
# describe why a data dictionary is needed,
# what makes good column names,
# what makes bad column names, etc.)
# data dictionary columns
column.names <- c("Vol", "Bulk", "Polarity","pI", "Hyd.1", "Hyd.2",
"Surface_area","Fract_area")
name.full <- c("Volume", "Bulkiness", "Polarity",
"pH at Isoelectric point", "Hydrophobicity scale-1",
"Hydrophobicity scale-2","SA accessible to H2O","Fractional area")
vector.names <- c("vol", "bulk", "pol","isoelec", "H2Opho.34",
"H2Opho.35", "saaH2O","faal.fold")
reference <- c("Creighton 1993", "Zimmerman et al. 1968", "Zimmerman et al. 1968",
"Zimmerman et al. 1968", "Kyte & Doolittle 1982", "Engelman et al. 1986",
"Miller et al. 1987"," Rose et al. 1985")
Make a dataframe to hold the DATA DICTIONARY information
# TODO: Make a dataframe to hold DATA DICTIONARY information
# make dataframe
data.dictionary <- data.frame( ColumnNames = column.names,
FullName = name.full,
VectorNames = vector.names,
References = reference)
# Display data dictionary
pander(data.dictionary)
| ColumnNames | FullName | VectorNames | References |
|---|---|---|---|
| Vol | Volume | vol | Creighton 1993 |
| Bulk | Bulkiness | bulk | Zimmerman et al. 1968 |
| Polarity | Polarity | pol | Zimmerman et al. 1968 |
| pI | pH at Isoelectric point | isoelec | Zimmerman et al. 1968 |
| Hyd.1 | Hydrophobicity scale-1 | H2Opho.34 | Kyte & Doolittle 1982 |
| Hyd.2 | Hydrophobicity scale-2 | H2Opho.35 | Engelman et al. 1986 |
| Surface_area | SA accessible to H2O | saaH2O | Miller et al. 1987 |
| Fract_area | Fractional area | faal.fold | Rose et al. 1985 |
TODO: Using ggpubr, remake AND update/augment Figure 2.8 of Higgs and Attwood 2005, Chapter 2, page 26.
# Make scatterplot with appropriate ggpubr function
ggscatter(x = "pI",
y = "Vol",
data = habk.df,
color = "Polarity",
size = "Hyd.1",
label = "aa.1",
ylab = "Volume (Angstorms)",
xlab = "pI (pH at isoelectric point)",
title = "Plot of Amino Acid Volume agaist pI, two properies \n thought to important to protein folding",
ylim = c(0, 180),
xlim = c(1,12))
For the exam you need to be able to understand what data centering and scaling is, how do do them in R with and without specialized functions, how to do them to an individual vector, how to do them to a column in a dataframe, how to do them using a for loop.
Data centering is centering around the mean (mean goes to 0) Data scaling is dividing by the SD
# For the exam you need to be able to understand what data centering and scaling is,
# how do do them in R with and without specialized functions, how to do them to an
#individual vector, how to do them to a column in a dataframe,
# how to do them using a for loop.
# Subset data
## get info
dim(habk.df)
## [1] 20 11
ncol(habk.df)
## [1] 11
# easiest
habk.df.numeric <- habk.df[ , c(-1,-2,-3)]
# equivalent
habk.df.numeric <- habk.df[ , c(4,5,6,7,8,9,10,11)]
habk.df.numeric <- habk.df[ , c(4:11)]
Set up data:
# Select a single column to explore how to do centering / scaling
X <- habk.df.numeric$Vol
This is calculating the mean
sum(X)/length(X)
## [1] 109.2
This is calculating the standard deviation
X <- habk.df.numeric$Vol
sqrt(sum((X-mean(X))^2)/length(X))
## [1] 28.31007
# This is centering the data
X <- habk.df.numeric$Vol
u <- mean(X)
u-X
## [1] 42.2 23.2 18.2 0.2 -25.8 61.2 -8.8 -14.8 -25.8 -14.8 -14.8 13.2
## [13] 19.2 -4.8 -38.8 36.2 16.2 4.2 -53.8 -31.8
# This is scaling the data without centering
X <- habk.df.numeric$Vol
s <- sd(X)
X/s
## [1] 2.306724 2.960870 3.133014 3.752730 4.647877 1.652579 4.062589 4.269161
## [9] 4.647877 4.269161 4.269161 3.305157 3.098585 3.924874 5.095451 2.513297
## [17] 3.201871 3.615016 5.611881 4.854450
# This is scaling the data with centering
X <- habk.df.numeric$Vol
u <- mean(X)
s <- sd(X)
z <- (u-X)/s
# This is creating histograms with different variations of scaling and centering
par(mfrow = c(2,2), mar = c(3,1,3,1))
hist(X)
abline(v = mean(X), col = 2)
hist(X/s)
abline(v = mean(X/s), col = 3)
hist(u-X)
abline(v = 0, col = 4)
hist(z)
abline(v = 0, col = 5)
par(mfrow = c(1,1), mar = c(4,4,4,4))
# This uses the scale function to scale and center
# Each of these lines tests different variations of scaling and centering
scale(X, center = T, scale = F)
## [,1]
## [1,] -42.2
## [2,] -23.2
## [3,] -18.2
## [4,] -0.2
## [5,] 25.8
## [6,] -61.2
## [7,] 8.8
## [8,] 14.8
## [9,] 25.8
## [10,] 14.8
## [11,] 14.8
## [12,] -13.2
## [13,] -19.2
## [14,] 4.8
## [15,] 38.8
## [16,] -36.2
## [17,] -16.2
## [18,] -4.2
## [19,] 53.8
## [20,] 31.8
## attr(,"scaled:center")
## [1] 109.2
scale(X, center = F, scale = T)
## [,1]
## [1,] 0.5788805
## [2,] 0.7430407
## [3,] 0.7862407
## [4,] 0.9417609
## [5,] 1.1664011
## [6,] 0.4147204
## [7,] 1.0195209
## [8,] 1.0713610
## [9,] 1.1664011
## [10,] 1.0713610
## [11,] 1.0713610
## [12,] 0.8294408
## [13,] 0.7776007
## [14,] 0.9849609
## [15,] 1.2787212
## [16,] 0.6307206
## [17,] 0.8035207
## [18,] 0.9072008
## [19,] 1.4083213
## [20,] 1.2182411
## attr(,"scaled:scale")
## [1] 115.7406
scale(X, center = T, scale = T)
## [,1]
## [1,] -1.452891984
## [2,] -0.798746304
## [3,] -0.626602704
## [4,] -0.006885744
## [5,] 0.888260976
## [6,] -2.107037664
## [7,] 0.302972736
## [8,] 0.509545056
## [9,] 0.888260976
## [10,] 0.509545056
## [11,] 0.509545056
## [12,] -0.454459104
## [13,] -0.661031424
## [14,] 0.165257856
## [15,] 1.335834336
## [16,] -1.246319664
## [17,] -0.557745264
## [18,] -0.144600624
## [19,] 1.852265136
## [20,] 1.094833296
## attr(,"scaled:center")
## [1] 109.2
## attr(,"scaled:scale")
## [1] 29.04552
This is calculating the mean
sum(habk.df.numeric$Vol)/length(habk.df.numeric$Vol)
## [1] 109.2
# make a copy of the data
habk.df.centered <- habk.df.numeric
# check original mean
mean(habk.df.numeric$Vol)
## [1] 109.2
habk.df.centered$Vol <- habk.df.centered$Vol - mean(habk.df.centered$Vol)
mean(habk.df.centered$Vol)
## [1] -2.842518e-15
# Bulk
habk.df.centered$Bulk <- habk.df.centered$Bulk - mean(habk.df.centered$Bulk)
mean(habk.df.numeric$Bulk)
## [1] 15.3405
TODO: Center another column using the entire dataframe
# make a copy of the data
habk.df.centered <- habk.df.numeric
# check original mean
mean(habk.df.numeric$Polarity)
## [1] 13.594
habk.df.centered$Polarity <- habk.df.centered$Polarity - mean(habk.df.centered$Polarity)
mean(habk.df.centered$Polarity)
## [1] 3.56052e-16
# pI
habk.df.centered$pI <- habk.df.centered$pI - mean(habk.df.centered$pI)
mean(habk.df.numeric$pI)
## [1] 6.0525
# This for loop creates a summary of all of the data
for(i in 1:ncol(habk.df.centered)){
mean.i <- mean(habk.df.centered[,i])
habk.df.centered[,i] <- mean.i-habk.df.centered[,i]
}
summary(habk.df.centered)
## Vol Bulk Polarity pI
## Min. :-53.80 Min. :-6.3295 Min. :-38.406 Min. :-4.7075
## 1st Qu.:-17.55 1st Qu.:-3.1320 1st Qu.: -1.429 1st Qu.:-0.0025
## Median : -2.30 Median : 0.2605 Median : 11.959 Median : 0.1275
## Mean : 0.00 Mean : 0.0000 Mean : 0.000 Mean : 0.0000
## 3rd Qu.: 18.45 3rd Qu.: 2.1755 3rd Qu.: 13.299 3rd Qu.: 0.4450
## Max. : 61.20 Max. :11.9405 Max. : 13.594 Max. : 3.2825
## Hyd.1 Hyd.2 Surface_area Fract_area
## Min. :-4.99 Min. :-5.070 Min. :-83.60 Min. :-0.1735
## 1st Qu.:-2.54 1st Qu.:-3.520 1st Qu.:-30.35 1st Qu.:-0.1135
## Median : 0.36 Median :-2.170 Median : -5.60 Median : 0.0065
## Mean : 0.00 Mean : 0.000 Mean : 0.00 Mean : 0.0000
## 3rd Qu.: 3.01 3rd Qu.: 2.905 3rd Qu.: 30.15 3rd Qu.: 0.0990
## Max. : 4.01 Max. :10.930 Max. : 90.40 Max. : 0.2165
Rewrite the for() loop above so that the variable mean.i doesn’t have to be created and only a single line of code is executed within the for() loop.
#for(i in 1:ncol(habk.df.centered)){
# habk.df.centered[,i] <- (mean(habk.df.centered[,i])-habk.df.centered[,i]
#}
#summary(habk.df.centered)
Check …
# This chunk is just checking to make sure that thd SD is the same between the centered data and the non-centered data
sd(habk.df.centered$Vol)
## [1] 29.04552
sd(habk.df.numeric$Vol)
## [1] 29.04552
sd(habk.df.centered$Vol) == sd(habk.df.numeric$Vol)
## [1] TRUE
TODO: Write logical checks for the standard deviation (sd) of another variable
# logical check
sd(habk.df.centered$Polarity)
## [1] 21.91408
sd(habk.df.numeric$Polarity)
## [1] 21.91408
sd(habk.df.centered$Polarity) == sd(habk.df.numeric$Polarity)
## [1] TRUE
# This chunk is creating a new dataframe with the centered data and then obtaining its standard deviation
# create new dataframe
habk.df.standard <- habk.df.centered
habk.df.standard$Vol <- habk.df.standard$Vol/sd(habk.df.standard$Vol)
sd(habk.df.standard$Vol)
## [1] 1
TODO: modify another column a done in the previous chunk
# Standardize another column
habk.df.standard <- habk.df.centered
habk.df.standard$Polarity <- habk.df.standard$Polarity/sd(habk.df.standard$Polarity)
sd(habk.df.standard$Polarity)
## [1] 1
# This chunk creates a summary of the standard data
for(i in 1:ncol(habk.df.standard)){
sd.i <- sd(habk.df.standard[,i])
habk.df.standard[,i] <- habk.df.standard[,i]/sd.i
}
summary(habk.df.standard)
## Vol Bulk Polarity pI
## Min. :-1.85227 Min. :-1.36122 Min. :-1.75257 Min. :-2.664603
## 1st Qu.:-0.60422 1st Qu.:-0.67357 1st Qu.:-0.06519 1st Qu.:-0.001415
## Median :-0.07919 Median : 0.05602 Median : 0.54572 Median : 0.072169
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.63521 3rd Qu.: 0.46786 3rd Qu.: 0.60687 3rd Qu.: 0.251885
## Max. : 2.10704 Max. : 2.56792 Max. : 0.62033 Max. : 1.858005
## Hyd.1 Hyd.2 Surface_area Fract_area
## Min. :-1.6706 Min. :-1.0362 Min. :-1.8709 Min. :-1.50457
## 1st Qu.:-0.8504 1st Qu.:-0.7194 1st Qu.:-0.6792 1st Qu.:-0.98426
## Median : 0.1205 Median :-0.4435 Median :-0.1253 Median : 0.05637
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 1.0077 3rd Qu.: 0.5937 3rd Qu.: 0.6747 3rd Qu.: 0.85852
## Max. : 1.3425 Max. : 2.2338 Max. : 2.0230 Max. : 1.87746
Shorten code in the for() loop so that the object sd.i doesn’t need to be created
# for() loop without sd.i line
for(i in 1:ncol(habk.df.standard)){
habk.df.standard[,i] <- habk.df.standard[,i]/sd(habk.df.standard[,i])
}
summary(habk.df.standard)
## Vol Bulk Polarity pI
## Min. :-1.85227 Min. :-1.36122 Min. :-1.75257 Min. :-2.664603
## 1st Qu.:-0.60422 1st Qu.:-0.67357 1st Qu.:-0.06519 1st Qu.:-0.001415
## Median :-0.07919 Median : 0.05602 Median : 0.54572 Median : 0.072169
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.63521 3rd Qu.: 0.46786 3rd Qu.: 0.60687 3rd Qu.: 0.251885
## Max. : 2.10704 Max. : 2.56792 Max. : 0.62033 Max. : 1.858005
## Hyd.1 Hyd.2 Surface_area Fract_area
## Min. :-1.6706 Min. :-1.0362 Min. :-1.8709 Min. :-1.50457
## 1st Qu.:-0.8504 1st Qu.:-0.7194 1st Qu.:-0.6792 1st Qu.:-0.98426
## Median : 0.1205 Median :-0.4435 Median :-0.1253 Median : 0.05637
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 1.0077 3rd Qu.: 0.5937 3rd Qu.: 0.6747 3rd Qu.: 0.85852
## Max. : 1.3425 Max. : 2.2338 Max. : 2.0230 Max. : 1.87746
Write a for() loop that does BOTH tasks in a single loop.
# center
# scale by sd
#for(i in 1:ncol(habk.df.centered)){
# habk.df.centered[,i] <- (mean(habk.df.centered[,i])-habk.df.centered[,i]
# habk.df.standard[,i] <- habk.df.standard[,i]/sd(habk.df.standard[,i])
#}
#summary(habk.df.centered)
#summary(habk.df.standard)
Write a for() loop that….
# This chunk scales numeric data using a for loop
habk.df.standard <- habk.df.numeric
for(i in 1:ncol(habk.df.standard)){
habk.df.standard[,i] <- scale(habk.df.standard[,i])
}
summary(habk.df.standard)
## Vol.V1 Bulk.V1 Polarity.V1
## Min. :-2.1070377 Min. :-2.5679182 Min. :-0.6203318
## 1st Qu.:-0.6352099 1st Qu.:-0.4678620 1st Qu.:-0.6068701
## Median : 0.0791861 Median :-0.0560230 Median :-0.5457222
## Mean : 0.0000000 Mean : 0.0000000 Mean : 0.0000000
## 3rd Qu.: 0.6042240 3rd Qu.: 0.6735664 3rd Qu.: 0.0651864
## Max. : 1.8522651 Max. : 1.3612193 Max. : 1.7525720
## pI.V1 Hyd.1.V1 Hyd.2.V1
## Min. :-1.8580053 Min. :-1.3424968 Min. :-2.2338170
## 1st Qu.:-0.2518850 1st Qu.:-1.0077096 1st Qu.:-0.5937089
## Median :-0.0721693 Median :-0.1205234 Median : 0.4434934
## Mean : 0.0000000 Mean : 0.0000000 Mean : 0.0000000
## 3rd Qu.: 0.0014151 3rd Qu.: 0.8503596 3rd Qu.: 0.7193994
## Max. : 2.6646032 Max. : 1.6705883 Max. : 1.0361804
## Surface_area.V1 Fract_area.V1
## Min. :-2.0230352 Min. :-1.8774603
## 1st Qu.:-0.6747180 1st Qu.:-0.8585153
## Median : 0.1253208 Median :-0.0563672
## Mean : 0.0000000 Mean : 0.0000000
## 3rd Qu.: 0.6791938 3rd Qu.: 0.9842575
## Max. : 1.8708600 Max. : 1.5045698
CHALLENGE (will NOT be on exam). How do these lines of code work:
# This chunk applies the mean function to the numeric data and scales the standardized data
X.bar <- apply(habk.df.numeric, 2, mean)
summary(habk.df.numeric)
## Vol Bulk Polarity pI
## Min. : 48.00 Min. : 3.40 Min. : 0.000 Min. : 2.770
## 1st Qu.: 90.75 1st Qu.:13.16 1st Qu.: 0.295 1st Qu.: 5.607
## Median :111.50 Median :15.08 Median : 1.635 Median : 5.925
## Mean :109.20 Mean :15.34 Mean :13.594 Mean : 6.053
## 3rd Qu.:126.75 3rd Qu.:18.47 3rd Qu.:15.023 3rd Qu.: 6.055
## Max. :163.00 Max. :21.67 Max. :52.000 Max. :10.760
## Hyd.1 Hyd.2 Surface_area Fract_area
## Min. :-4.50 Min. :-12.300 Min. : 85.0 Min. :0.5200
## 1st Qu.:-3.50 1st Qu.: -4.275 1st Qu.:145.2 1st Qu.:0.6375
## Median :-0.85 Median : 0.800 Median :181.0 Median :0.7300
## Mean :-0.49 Mean : -1.370 Mean :175.4 Mean :0.7365
## 3rd Qu.: 2.05 3rd Qu.: 2.150 3rd Qu.:205.8 3rd Qu.:0.8500
## Max. : 4.50 Max. : 3.700 Max. :259.0 Max. :0.9100
habk.df.standard <- apply(habk.df.numeric, 2, scale)
habk.df.standard <- data.frame(habk.df.standard)
summary(habk.df.standard)
## Vol Bulk Polarity pI
## Min. :-2.10704 Min. :-2.56792 Min. :-0.62033 Min. :-1.858005
## 1st Qu.:-0.63521 1st Qu.:-0.46786 1st Qu.:-0.60687 1st Qu.:-0.251885
## Median : 0.07919 Median :-0.05602 Median :-0.54572 Median :-0.072169
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.60422 3rd Qu.: 0.67357 3rd Qu.: 0.06519 3rd Qu.: 0.001415
## Max. : 1.85227 Max. : 1.36122 Max. : 1.75257 Max. : 2.664603
## Hyd.1 Hyd.2 Surface_area Fract_area
## Min. :-1.3425 Min. :-2.2338 Min. :-2.0230 Min. :-1.87746
## 1st Qu.:-1.0077 1st Qu.:-0.5937 1st Qu.:-0.6747 1st Qu.:-0.85852
## Median :-0.1205 Median : 0.4435 Median : 0.1253 Median :-0.05637
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.8504 3rd Qu.: 0.7194 3rd Qu.: 0.6792 3rd Qu.: 0.98426
## Max. : 1.6706 Max. : 1.0362 Max. : 1.8709 Max. : 1.50457
Prepare data:
scatterplot3d does NOT have a data = argument. Make seperate vectors to work with to make code cleaner. ….
# raw vectors
Xraw <- habk.df.numeric$Bulk
Yraw <- habk.df.numeric$Polarity
Zraw <- habk.df.numeric$Surface_area
# center
Xcent <- Xraw - mean(Xraw)
Ycent <- Yraw - mean(Yraw)
Zcent <- Zraw - mean(Zraw)
# scale, without centering
Xscale0 <- Xraw/sd(Xraw)
Yscale0 <- Yraw/sd(Yraw)
Zscale0 <- Zraw/sd(Zraw)
# scale AND center
Xscale <- (Xraw - mean(Xraw))/sd(Xraw)
Yscale <- (Yraw - mean(Yraw))/sd(Yraw)
Zscale <- (Zraw - mean(Zraw))/sd(Zraw)
Create ALL of the scaled vectors created above using the scale() function.
# vectors using R function()
scale(Xraw, center = T, scale = F)
## [,1]
## [1,] -3.8405
## [2,] -1.8805
## [3,] -3.6605
## [4,] -1.7705
## [5,] 4.4595
## [6,] -11.9405
## [7,] -1.6505
## [8,] 6.0595
## [9,] 0.3695
## [10,] 6.0595
## [11,] 0.9095
## [12,] -3.0605
## [13,] 2.0895
## [14,] -0.8905
## [15,] -1.0605
## [16,] -5.8705
## [17,] 0.4295
## [18,] 6.2295
## [19,] 6.3295
## [20,] 2.6895
## attr(,"scaled:center")
## [1] 15.3405
scale(Yraw, center = T, scale = F)
## [,1]
## [1,] -13.594
## [2,] -12.114
## [3,] 36.106
## [4,] 36.306
## [5,] -13.244
## [6,] -13.594
## [7,] 38.006
## [8,] -13.464
## [9,] 35.906
## [10,] -13.464
## [11,] -12.164
## [12,] -10.214
## [13,] -12.014
## [14,] -10.064
## [15,] 38.406
## [16,] -11.924
## [17,] -11.934
## [18,] -13.464
## [19,] -11.494
## [20,] -11.984
## attr(,"scaled:center")
## [1] 13.594
scale(Zraw, center = T, scale = F)
## [,1]
## [1,] -62.4
## [2,] -35.4
## [3,] -24.4
## [4,] 7.6
## [5,] 42.6
## [6,] -90.4
## [7,] 18.6
## [8,] 6.6
## [9,] 35.6
## [10,] 4.6
## [11,] 28.6
## [12,] -17.4
## [13,] -32.4
## [14,] 13.6
## [15,] 65.6
## [16,] -53.4
## [17,] -29.4
## [18,] -15.4
## [19,] 83.6
## [20,] 53.6
## attr(,"scaled:center")
## [1] 175.4
scale(Xraw, center = F, scale = T)
## [,1]
## [1,] 0.7007271
## [2,] 0.8201554
## [3,] 0.7116950
## [4,] 0.8268580
## [5,] 1.2064693
## [6,] 0.2071715
## [7,] 0.8341699
## [8,] 1.3039618
## [9,] 0.9572542
## [10,] 1.3039618
## [11,] 0.9901579
## [12,] 0.7482547
## [13,] 1.0620586
## [14,] 0.8804789
## [15,] 0.8701203
## [16,] 0.5770336
## [17,] 0.9609101
## [18,] 1.3143203
## [19,] 1.3204136
## [20,] 1.0986183
## attr(,"scaled:scale")
## [1] 16.41152
scale(Yraw, center = F, scale = T)
## [,1]
## [1,] 0.00000000
## [2,] 0.05697579
## [3,] 1.91330848
## [4,] 1.92100791
## [5,] 0.01347400
## [6,] 0.00000000
## [7,] 1.98645307
## [8,] 0.00500463
## [9,] 1.90560905
## [10,] 0.00500463
## [11,] 0.05505093
## [12,] 0.13012038
## [13,] 0.06082550
## [14,] 0.13589495
## [15,] 2.00185193
## [16,] 0.06429024
## [17,] 0.06390527
## [18,] 0.00500463
## [19,] 0.08084402
## [20,] 0.06198042
## attr(,"scaled:scale")
## [1] 25.97595
scale(Zraw, center = F, scale = T)
## [,1]
## [1,] 0.6094221
## [2,] 0.7550362
## [3,] 0.8143605
## [4,] 0.9869402
## [5,] 1.1756993
## [6,] 0.4584148
## [7,] 1.0462645
## [8,] 0.9815471
## [9,] 1.1379474
## [10,] 0.9707609
## [11,] 1.1001956
## [12,] 0.8521123
## [13,] 0.7712156
## [14,] 1.0192989
## [15,] 1.2997409
## [16,] 0.6579601
## [17,] 0.7873949
## [18,] 0.8628985
## [19,] 1.3968170
## [20,] 1.2350235
## attr(,"scaled:scale")
## [1] 185.4216
scale(Xraw, center = T, scale = T)
## [,1]
## [1,] -0.82593610
## [2,] -0.40441943
## [3,] -0.78722538
## [4,] -0.38076289
## [5,] 0.95905794
## [6,] -2.56791824
## [7,] -0.35495574
## [8,] 1.30315318
## [9,] 0.07946449
## [10,] 1.30315318
## [11,] 0.19559664
## [12,] -0.65818967
## [13,] 0.44936687
## [14,] -0.19151051
## [15,] -0.22807062
## [16,] -1.26250693
## [17,] 0.09236807
## [18,] 1.33971330
## [19,] 1.36121925
## [20,] 0.57840259
## attr(,"scaled:center")
## [1] 15.3405
## attr(,"scaled:scale")
## [1] 4.649875
scale(Yraw, center = T, scale = T)
## [,1]
## [1,] -0.6203318
## [2,] -0.5527953
## [3,] 1.6476166
## [4,] 1.6567432
## [5,] -0.6043603
## [6,] -0.6203318
## [7,] 1.7343189
## [8,] -0.6143995
## [9,] 1.6384901
## [10,] -0.6143995
## [11,] -0.5550770
## [12,] -0.4660931
## [13,] -0.5482320
## [14,] -0.4592481
## [15,] 1.7525720
## [16,] -0.5441251
## [17,] -0.5445814
## [18,] -0.6143995
## [19,] -0.5245030
## [20,] -0.5468631
## attr(,"scaled:center")
## [1] 13.594
## attr(,"scaled:scale")
## [1] 21.91408
scale(Zraw, center = T, scale = T)
## [,1]
## [1,] -1.3964314
## [2,] -0.7922063
## [3,] -0.5460405
## [4,] 0.1700782
## [5,] 0.9533330
## [6,] -2.0230352
## [7,] 0.4162440
## [8,] 0.1476995
## [9,] 0.7966820
## [10,] 0.1029421
## [11,] 0.6400310
## [12,] -0.3893895
## [13,] -0.7250701
## [14,] 0.3043504
## [15,] 1.4680432
## [16,] -1.1950230
## [17,] -0.6579340
## [18,] -0.3446321
## [19,] 1.8708600
## [20,] 1.1994987
## attr(,"scaled:center")
## [1] 175.4
## attr(,"scaled:scale")
## [1] 44.68533
3-D scatter plot of raw data
# This chunk creates a 3D plot of the raw data
# main plot with scatterplot3d() - know how this works
plot1 <- scatterplot3d(x = Xraw,
y = Yraw,
z = Zraw,
color = 2,
pch = 16, type = "h",
xlim = c(-15,25),
ylim = c(-30,50),
zlim = c(-100,275),
main = "Raw data")
# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
p2 <- plot1$xyz.convert(0,-40,0)
p3 <- plot1$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot1$xyz.convert(-20,0,0)
p3 <- plot1$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
3-D scatter plot of centered data
# This chunk creates a 3D plot of the centered data
# main plot - know how this works
plot2 <- scatterplot3d(x = Xcent, y = Ycent, z = Zcent,
color = 2,
pch = 16, type = "h",
xlim = c(-15,25),
ylim = c(-30,50),
zlim = c(-100,275), main = "Centered")
# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
p2 <- plot2$xyz.convert(0,-40,0)
p3 <- plot2$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot2$xyz.convert(-20,0,0)
p3 <- plot2$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
3-D scatter plot of scaled (NOT centered) data
# This chunk creates a 3D plot of the scaled (NOT centered) data
# main plot - know how this works
plot3 <- scatterplot3d(x = Xscale0, y = Yscale0, z = Zscale0,
color = 3,
pch = 12, type = "h",
xlim = c(-15,25),
ylim = c(-30,50),
zlim = c(-100,275),
main = "Scaled (NOT centered)")
p2 <- plot3$xyz.convert(0,-40,0)
p3 <- plot3$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot3$xyz.convert(-20,0,0)
p3 <- plot3$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
3-D scatter plot of scaled and centered data
# main plot - know how this works
plot4 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
color = 3,
pch = 16, type = "h",
xlim = c(-25,25),
ylim = c(-50,50),
zlim = c(-100,275),
main = "Centered & Scaled")
p2 <- plot4$xyz.convert(0,-40,0)
p3 <- plot4$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot4$xyz.convert(-20,0,0)
p3 <- plot4$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
3-D scatter plot of scaled and centered data
# main plot - know how this works
plot5 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
color = 3,
pch = 16, type = "h",
#xlim = c(-25,25),
#ylim = c(-50,50),
#zlim = c(-100,275),
main = "Centered & Scaled")
p2 <- plot5$xyz.convert(0,-40,0)
p3 <- plot5$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot5$xyz.convert(-20,0,0)
p3 <- plot5$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
# This chunk displays four of the plots at once
# The 3-D scatter plot of the scaled (NOT centered) data is not shown here
par(mfrow = c(2,2), mar = c(1,1,1,1))
plot1 <- scatterplot3d(x = Xraw,
y = Yraw,
z = Zraw,
color = 2,
pch = 16, type = "h",
xlim = c(-15,25),
ylim = c(-30,50),
zlim = c(-100,275),
main = "")
# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
p2 <- plot1$xyz.convert(0,-40,0)
p3 <- plot1$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot1$xyz.convert(-20,0,0)
p3 <- plot1$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
# main plot - know how this works
plot2 <- scatterplot3d(x = Xcent, y = Ycent, z = Zcent,
color = 2,
pch = 16, type = "h",
xlim = c(-15,25),
ylim = c(-30,50),
zlim = c(-100,275), main = "")
# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
p2 <- plot2$xyz.convert(0,-40,0)
p3 <- plot2$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot2$xyz.convert(-20,0,0)
p3 <- plot2$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
plot4 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
color = 3,
pch = 16, type = "h",
xlim = c(-25,25),
ylim = c(-50,50),
zlim = c(-100,275),
main = "")
p2 <- plot4$xyz.convert(0,-40,0)
p3 <- plot4$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot4$xyz.convert(-20,0,0)
p3 <- plot4$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
# main plot - know how this works
plot5 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
color = 3,
pch = 16, type = "h",
#xlim = c(-25,25),
#ylim = c(-50,50),
#zlim = c(-100,275),
main = "")
p2 <- plot5$xyz.convert(0,-40,0)
p3 <- plot5$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot5$xyz.convert(-20,0,0)
p3 <- plot5$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)