Install any needed packages.
#install.packages("lsa")
#install.packages("mdendro")
#install.packages("dendextend")
Load all packages
# packages we've used a lot
library(ggplot2)
library(ggpubr)
# packages we've only recently started using
library(scatterplot3d)
library(pander)
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
# NEW packages- make sure you installed them!
library(dendextend)
## Registered S3 method overwritten by 'dendextend':
## method from
## rev.hclust vegan
##
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:permute':
##
## shuffle
## The following object is masked from 'package:ggpubr':
##
## rotate
## The following object is masked from 'package:stats':
##
## cutree
library(lsa)
## Loading required package: SnowballC
NOTE: These vectors may be useful for exploring the functionality of R functions but are not used in the analysis itself or added to the dataframe.
# data exploration vectors
MW.da <- c(89, 121, 133, 146, 165, 75, 155, 131, 146, 131,
149, 132, 115, 147, 174, 105, 119, 117, 204, 181)
polar.req <-c(7, 4.8, 13, 12.5, 5, 7.9, 8.4, 4.9, 10.1, 4.9,
5.3, 10, 6.6, 8.6,9.1,7.5,6.6,5.6,5.2,5.4)
freq <-c(7.8, 1.1, 5.19 ,6.72, 4.39, 6.77, 2.03, 6.95,
6.32, 10.15, 2.28, 4.37, 4.26, 3.45, 5.23,
6.46, 5.12, 7.01, 1.09, 3.3)
The following vectors are the columns that make up Table 2.2 of Higgs and Attwood 2005, Chapter 2.
# main data vectors
aa.1 <-c('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L','M','N',
'P','Q','R','S','T','V','W','Y')
vol <- c(67, 86, 91, 109, 135, 48, 118, 124, 135, 124, 124, 96, 90, 114,
148, 73, 93, 105, 163, 141)
bulk <-c(11.5, 13.46, 11.68, 13.57, 19.8, 3.4, 13.69, 21.4, 15.71, 21.4, 16.25,
12.28,17.43, 14.45, 14.28, 9.47, 15.77, 21.57, 21.67, 18.03)
pol <-c(0,1.48,49.7,49.9,0.35,0,51.6,0.13,49.5,0.13,
1.43,3.38,1.58,3.53,52,1.67,1.66,0.13,2.1,1.61)
isoelec <-c(6,5.07,2.77,3.22,5.48,5.97,7.59,6.02,9.74,5.98, 5.74, 5.41, 6.3,
5.65, 10.76, 5.68, 6.16, 5.96, 5.89, 5.66)
H2Opho.34 <-c(1.8,2.5,-3.5,-3.5,2.8,-0.4,-3.2,4.5, -3.9, 3.8, 1.9, -3.5, -1.6,
-3.5, -4.5, -0.8, -0.7, 4.2, -0.9, -1.3)
H2Opho.35 <-c(1.6,2,-9.2,-8.2,3.7,1,-3,3.1,-8.8,2.8,3.4,-4.8, -0.2,-4.1,-12.3,
0.6,1.2,2.6,1.9,-0.7)
saaH2O <-c(113, 140,151,183,218,85,194,182,211,180, 204, 158, 143, 189,
241, 122, 146,160,259,229)
faal.fold <-c(0.74,0.91,0.62,0.62,0.88,0.72,0.78,0.88,0.52, 0.85, 0.85, 0.63,
0.64,0.62,0.64,0.66,0.7, 0.86, 0.85, 0.76)
TODO: Create a vector of the full names and the 3-letter codes of the amino acids, then add it to the data.frame.
# build dataframe
aa.full <- c("Alanine", "Cysteine", "Aspartic Acid", "Glutamic Acid", "Phenylanlanine", "Glycine", "Histidine", "Isolucine", "Lysine", "Leucine", "Methionine",
"Aspagine","Proline", "Glutamine", "Arginine", "Serine", "Threonine", "Valine", "Tryptophan", "Tyrosine")
aa.3 <- c("Ala", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", "Lys", "Leu", "Met", "Asn",
"Pro", "Gln", "Arg", "Ser", "Thr", "Val", "Trp", "Tyr")
## create dataframe
habk.df <- data.frame(aa.full = aa.full,
aa.3 = aa.3,
aa.1 = aa.1,
Vol = vol, Bulk = bulk, Polarity = pol,
pI = isoelec, Hyd.1 = H2Opho.34,
Hyd.2 = H2Opho.35, Surface_area = saaH2O,
Fract_area = faal.fold)
# name rows - this will help later
# when we do PCA
row.names(habk.df) <- aa.1
# display with pander
pander(habk.df)
| aa.full | aa.3 | aa.1 | Vol | Bulk | Polarity | pI | Hyd.1 | |
|---|---|---|---|---|---|---|---|---|
| A | Alanine | Ala | A | 67 | 11.5 | 0 | 6 | 1.8 |
| C | Cysteine | Cys | C | 86 | 13.46 | 1.48 | 5.07 | 2.5 |
| D | Aspartic Acid | Asp | D | 91 | 11.68 | 49.7 | 2.77 | -3.5 |
| E | Glutamic Acid | Glu | E | 109 | 13.57 | 49.9 | 3.22 | -3.5 |
| F | Phenylanlanine | Phe | F | 135 | 19.8 | 0.35 | 5.48 | 2.8 |
| G | Glycine | Gly | G | 48 | 3.4 | 0 | 5.97 | -0.4 |
| H | Histidine | His | H | 118 | 13.69 | 51.6 | 7.59 | -3.2 |
| I | Isolucine | Ile | I | 124 | 21.4 | 0.13 | 6.02 | 4.5 |
| K | Lysine | Lys | K | 135 | 15.71 | 49.5 | 9.74 | -3.9 |
| L | Leucine | Leu | L | 124 | 21.4 | 0.13 | 5.98 | 3.8 |
| M | Methionine | Met | M | 124 | 16.25 | 1.43 | 5.74 | 1.9 |
| N | Aspagine | Asn | N | 96 | 12.28 | 3.38 | 5.41 | -3.5 |
| P | Proline | Pro | P | 90 | 17.43 | 1.58 | 6.3 | -1.6 |
| Q | Glutamine | Gln | Q | 114 | 14.45 | 3.53 | 5.65 | -3.5 |
| R | Arginine | Arg | R | 148 | 14.28 | 52 | 10.76 | -4.5 |
| S | Serine | Ser | S | 73 | 9.47 | 1.67 | 5.68 | -0.8 |
| T | Threonine | Thr | T | 93 | 15.77 | 1.66 | 6.16 | -0.7 |
| V | Valine | Val | V | 105 | 21.57 | 0.13 | 5.96 | 4.2 |
| W | Tryptophan | Trp | W | 163 | 21.67 | 2.1 | 5.89 | -0.9 |
| Y | Tyrosine | Tyr | Y | 141 | 18.03 | 1.61 | 5.66 | -1.3 |
| Hyd.2 | Surface_area | Fract_area | |
|---|---|---|---|
| A | 1.6 | 113 | 0.74 |
| C | 2 | 140 | 0.91 |
| D | -9.2 | 151 | 0.62 |
| E | -8.2 | 183 | 0.62 |
| F | 3.7 | 218 | 0.88 |
| G | 1 | 85 | 0.72 |
| H | -3 | 194 | 0.78 |
| I | 3.1 | 182 | 0.88 |
| K | -8.8 | 211 | 0.52 |
| L | 2.8 | 180 | 0.85 |
| M | 3.4 | 204 | 0.85 |
| N | -4.8 | 158 | 0.63 |
| P | -0.2 | 143 | 0.64 |
| Q | -4.1 | 189 | 0.62 |
| R | -12.3 | 241 | 0.64 |
| S | 0.6 | 122 | 0.66 |
| T | 1.2 | 146 | 0.7 |
| V | 2.6 | 160 | 0.86 |
| W | 1.9 | 259 | 0.85 |
| Y | -0.7 | 229 | 0.76 |
A data dictionary is a dataframe that contains extra information about the data being used. For example, what the column names represent, what the variable/vector name that corresponds to each data is, and the references. It is best practice to include a data table that creates a connection between what is in written as code and what that means: for example variable names. Variable names should be short enough to be repeated easily without error, but descriptive enough to explain what is being stored. “Vol” as a column for “Volume” makes sense, but “pI” for “ph isoelectric point” isn’t intuitive.
# data dictionary columns
column.names <- c("Vol", "Bulk", "Polarity","pI", "Hyd.1", "Hyd.2",
"Surface_area","Fract_area")
name.full <- c("Volume", "Bulkiness", "Polarity",
"pH at Isoelectric point", "Hydrophobicity scale-1",
"Hydrophobicity scale-2","SA accessible to H2O","Fractional area")
vector.names <- c("vol", "bulk", "pol","isoelec", "H2Opho.34",
"H2Opho.35", "saaH2O","faal.fold")
reference <- c("Creighton 1993", "Zimmerman et al. 1968", "Zimmerman et al. 1968",
"Zimmerman et al. 1968", "Kyte & Doolittle 1982", "Engelman et al. 1986",
"Miller et al. 1987"," Rose et al. 1985")
# TODO: Make a dataframe to hold DATA DICTIONARY information
# make dataframe
data.dictionary <- data.frame(Column = column.names,
Full = name.full,
Vector = vector.names,
Refs = reference)
# Display data dictionary
pander(data.dictionary)
| Column | Full | Vector | Refs |
|---|---|---|---|
| Vol | Volume | vol | Creighton 1993 |
| Bulk | Bulkiness | bulk | Zimmerman et al. 1968 |
| Polarity | Polarity | pol | Zimmerman et al. 1968 |
| pI | pH at Isoelectric point | isoelec | Zimmerman et al. 1968 |
| Hyd.1 | Hydrophobicity scale-1 | H2Opho.34 | Kyte & Doolittle 1982 |
| Hyd.2 | Hydrophobicity scale-2 | H2Opho.35 | Engelman et al. 1986 |
| Surface_area | SA accessible to H2O | saaH2O | Miller et al. 1987 |
| Fract_area | Fractional area | faal.fold | Rose et al. 1985 |
# Make scatterplot
ggscatter(x = "pI",
y = "Vol",
data = habk.df,
color = "Polarity",
size = "Bulk",
label = habk.df$aa.1,
ylab = "Volume (Å cubed)",
xlab = "pI",
title = "Plot of amino acid volume against pI -
\ntwo properties thought to be important in protein folding.",
ylim = c(0, 180),
xlim = c(1,12)
)
Data centering is moving the data to have a mean of 0. data point - mean Data scaling is shifting the distribution of the data. ratio of data to standard deviation
# Subset data
## get info
dim(habk.df)
## [1] 20 11
ncol(habk.df)
## [1] 11
# easiest
habk.df.numeric <- habk.df[ , c(-1,-2,-3)]
# equivalent
habk.df.numeric <- habk.df[ , c(4,5,6,7,8,9,10,11)]
habk.df.numeric <- habk.df[ , c(4:11)]
Set up data:
# Select a single column to explore how to do centering / scaling
X <- habk.df.numeric$Vol
Calculate the mean manually:
sum(X)/length(X)
## [1] 109.2
Challenge: what is this calculating?
X <- habk.df.numeric$Vol
sqrt(sum((X-mean(X))^2)/length(X))
## [1] 28.31007
# data centering:
X <- habk.df.numeric$Vol
u <- mean(X)
u-X
## [1] 42.2 23.2 18.2 0.2 -25.8 61.2 -8.8 -14.8 -25.8 -14.8 -14.8 13.2
## [13] 19.2 -4.8 -38.8 36.2 16.2 4.2 -53.8 -31.8
# data scaling:
X <- habk.df.numeric$Vol
s <- sd(X)
X/s
## [1] 2.306724 2.960870 3.133014 3.752730 4.647877 1.652579 4.062589 4.269161
## [9] 4.647877 4.269161 4.269161 3.305157 3.098585 3.924874 5.095451 2.513297
## [17] 3.201871 3.615016 5.611881 4.854450
# scale and center data with built in methods:
X <- habk.df.numeric$Vol
u <- mean(X)
s <- sd(X)
z <- (u-X)/s
# Compare the data via histogram as original, centered, scaled, and both.
par(mfrow = c(2,2), mar = c(3,1,3,1))
hist(X)
abline(v = mean(X), col = 2)
hist(X/s)
abline(v = mean(X/s), col = 3)
hist(u-X)
abline(v = 0, col = 4)
hist(z)
abline(v = 0, col = 5)
par(mfrow = c(1,1), mar = c(4,4,4,4))
# scale() is a generic function that centers and or scales data
scale(X, center = T, scale = F) # center only
## [,1]
## [1,] -42.2
## [2,] -23.2
## [3,] -18.2
## [4,] -0.2
## [5,] 25.8
## [6,] -61.2
## [7,] 8.8
## [8,] 14.8
## [9,] 25.8
## [10,] 14.8
## [11,] 14.8
## [12,] -13.2
## [13,] -19.2
## [14,] 4.8
## [15,] 38.8
## [16,] -36.2
## [17,] -16.2
## [18,] -4.2
## [19,] 53.8
## [20,] 31.8
## attr(,"scaled:center")
## [1] 109.2
scale(X, center = F, scale = T) # scale only
## [,1]
## [1,] 0.5788805
## [2,] 0.7430407
## [3,] 0.7862407
## [4,] 0.9417609
## [5,] 1.1664011
## [6,] 0.4147204
## [7,] 1.0195209
## [8,] 1.0713610
## [9,] 1.1664011
## [10,] 1.0713610
## [11,] 1.0713610
## [12,] 0.8294408
## [13,] 0.7776007
## [14,] 0.9849609
## [15,] 1.2787212
## [16,] 0.6307206
## [17,] 0.8035207
## [18,] 0.9072008
## [19,] 1.4083213
## [20,] 1.2182411
## attr(,"scaled:scale")
## [1] 115.7406
scale(X, center = T, scale = T) # both
## [,1]
## [1,] -1.452891984
## [2,] -0.798746304
## [3,] -0.626602704
## [4,] -0.006885744
## [5,] 0.888260976
## [6,] -2.107037664
## [7,] 0.302972736
## [8,] 0.509545056
## [9,] 0.888260976
## [10,] 0.509545056
## [11,] 0.509545056
## [12,] -0.454459104
## [13,] -0.661031424
## [14,] 0.165257856
## [15,] 1.335834336
## [16,] -1.246319664
## [17,] -0.557745264
## [18,] -0.144600624
## [19,] 1.852265136
## [20,] 1.094833296
## attr(,"scaled:center")
## [1] 109.2
## attr(,"scaled:scale")
## [1] 29.04552
Total mean of the volume row:
sum(habk.df.numeric$Vol)/length(habk.df.numeric$Vol)
## [1] 109.2
# make a copy of the data
habk.df.centered <- habk.df.numeric
# check original mean
mean(habk.df.numeric$Vol)
## [1] 109.2
habk.df.centered$Vol <- habk.df.centered$Vol - mean(habk.df.centered$Vol)
mean(habk.df.centered$Vol)
## [1] -2.842518e-15
# Bulk
habk.df.centered$Bulk <- habk.df.centered$Bulk - mean(habk.df.centered$Bulk)
mean(habk.df.numeric$Bulk)
## [1] 15.3405
# fractional area
habk.df.centered$Fract_area <- habk.df.centered$Fract_area - mean(habk.df.centered$Fract_area)
mean(habk.df.centered$Fract_area)
## [1] -4.44057e-17
# What does this for() loop do?
for(i in 1:ncol(habk.df.centered)){
mean.i <- mean(habk.df.centered[,i])
habk.df.centered[,i] <- mean.i-habk.df.centered[,i]
}
summary(habk.df.centered)
## Vol Bulk Polarity pI
## Min. :-53.80 Min. :-6.3295 Min. :-38.406 Min. :-4.7075
## 1st Qu.:-17.55 1st Qu.:-3.1320 1st Qu.: -1.429 1st Qu.:-0.0025
## Median : -2.30 Median : 0.2605 Median : 11.959 Median : 0.1275
## Mean : 0.00 Mean : 0.0000 Mean : 0.000 Mean : 0.0000
## 3rd Qu.: 18.45 3rd Qu.: 2.1755 3rd Qu.: 13.299 3rd Qu.: 0.4450
## Max. : 61.20 Max. :11.9405 Max. : 13.594 Max. : 3.2825
## Hyd.1 Hyd.2 Surface_area Fract_area
## Min. :-4.99 Min. :-5.070 Min. :-83.60 Min. :-0.1735
## 1st Qu.:-2.54 1st Qu.:-3.520 1st Qu.:-30.35 1st Qu.:-0.1135
## Median : 0.36 Median :-2.170 Median : -5.60 Median : 0.0065
## Mean : 0.00 Mean : 0.000 Mean : 0.00 Mean : 0.0000
## 3rd Qu.: 3.01 3rd Qu.: 2.905 3rd Qu.: 30.15 3rd Qu.: 0.0990
## Max. : 4.01 Max. :10.930 Max. : 90.40 Max. : 0.2165
#
for(i in 1:ncol(habk.df.centered)){
habk.df.centered[,i] <- mean(habk.df.centered[,i])-habk.df.centered[,i]
}
summary(habk.df.centered)
## Vol Bulk Polarity pI
## Min. :-61.20 Min. :-11.9405 Min. :-13.594 Min. :-3.2825
## 1st Qu.:-18.45 1st Qu.: -2.1755 1st Qu.:-13.299 1st Qu.:-0.4450
## Median : 2.30 Median : -0.2605 Median :-11.959 Median :-0.1275
## Mean : 0.00 Mean : 0.0000 Mean : 0.000 Mean : 0.0000
## 3rd Qu.: 17.55 3rd Qu.: 3.1320 3rd Qu.: 1.429 3rd Qu.: 0.0025
## Max. : 53.80 Max. : 6.3295 Max. : 38.406 Max. : 4.7075
## Hyd.1 Hyd.2 Surface_area Fract_area
## Min. :-4.01 Min. :-10.930 Min. :-90.40 Min. :-0.2165
## 1st Qu.:-3.01 1st Qu.: -2.905 1st Qu.:-30.15 1st Qu.:-0.0990
## Median :-0.36 Median : 2.170 Median : 5.60 Median :-0.0065
## Mean : 0.00 Mean : 0.000 Mean : 0.00 Mean : 0.0000
## 3rd Qu.: 2.54 3rd Qu.: 3.520 3rd Qu.: 30.35 3rd Qu.: 0.1135
## Max. : 4.99 Max. : 5.070 Max. : 83.60 Max. : 0.1735
Check …
# do a logical check to make sure the standard deviation of centered and numeric data is the same
sd(habk.df.centered$Vol)
## [1] 29.04552
sd(habk.df.numeric$Vol)
## [1] 29.04552
sd(habk.df.centered$Vol) == sd(habk.df.numeric$Vol)
## [1] TRUE
# logical check
cpisd <- sd(habk.df.centered$pI)
npisd <- sd(habk.df.numeric$pI)
cpisd == npisd
## [1] TRUE
# Standardize the data
# create new dataframe
habk.df.standard <- habk.df.centered
habk.df.standard$Vol <- habk.df.standard$Vol/sd(habk.df.standard$Vol)
sd(habk.df.standard$Vol)
## [1] 1
TODO: modify another column as done in the previous chunk
# Standardize another column
habk.df.standard$pI <- habk.df.standard$pI/sd(habk.df.standard$pI)
sd(habk.df.standard$pI)
## [1] 1
# for loop for standardizing
for(i in 1:ncol(habk.df.standard)){
sd.i <- sd(habk.df.standard[,i])
habk.df.standard[,i] <- habk.df.standard[,i]/sd.i
}
summary(habk.df.standard)
## Vol Bulk Polarity pI
## Min. :-2.10704 Min. :-2.56792 Min. :-0.62033 Min. :-1.858005
## 1st Qu.:-0.63521 1st Qu.:-0.46786 1st Qu.:-0.60687 1st Qu.:-0.251885
## Median : 0.07919 Median :-0.05602 Median :-0.54572 Median :-0.072169
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.60422 3rd Qu.: 0.67357 3rd Qu.: 0.06519 3rd Qu.: 0.001415
## Max. : 1.85227 Max. : 1.36122 Max. : 1.75257 Max. : 2.664603
## Hyd.1 Hyd.2 Surface_area Fract_area
## Min. :-1.3425 Min. :-2.2338 Min. :-2.0230 Min. :-1.87746
## 1st Qu.:-1.0077 1st Qu.:-0.5937 1st Qu.:-0.6747 1st Qu.:-0.85852
## Median :-0.1205 Median : 0.4435 Median : 0.1253 Median :-0.05637
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.8504 3rd Qu.: 0.7194 3rd Qu.: 0.6792 3rd Qu.: 0.98426
## Max. : 1.6706 Max. : 1.0362 Max. : 1.8709 Max. : 1.50457
TODO: Shorten code in the for() loop so that the object sd.i doesn’t need to be created
# for() loop without sd.i line
for(i in 1:ncol(habk.df.standard)){
habk.df.standard[,i] <- habk.df.standard[,i]/sd(habk.df.standard[,i])
}
summary(habk.df.standard)
## Vol Bulk Polarity pI
## Min. :-2.10704 Min. :-2.56792 Min. :-0.62033 Min. :-1.858005
## 1st Qu.:-0.63521 1st Qu.:-0.46786 1st Qu.:-0.60687 1st Qu.:-0.251885
## Median : 0.07919 Median :-0.05602 Median :-0.54572 Median :-0.072169
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.60422 3rd Qu.: 0.67357 3rd Qu.: 0.06519 3rd Qu.: 0.001415
## Max. : 1.85227 Max. : 1.36122 Max. : 1.75257 Max. : 2.664603
## Hyd.1 Hyd.2 Surface_area Fract_area
## Min. :-1.3425 Min. :-2.2338 Min. :-2.0230 Min. :-1.87746
## 1st Qu.:-1.0077 1st Qu.:-0.5937 1st Qu.:-0.6747 1st Qu.:-0.85852
## Median :-0.1205 Median : 0.4435 Median : 0.1253 Median :-0.05637
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.8504 3rd Qu.: 0.7194 3rd Qu.: 0.6792 3rd Qu.: 0.98426
## Max. : 1.6706 Max. : 1.0362 Max. : 1.8709 Max. : 1.50457
TODO: Write a for() loop that does BOTH tasks in a single loop.
# center
# scale by sd
for(i in 1:ncol(habk.df.standard)){
habk.df.standard[,i] <- habk.df.standard[,i]/sd(habk.df.standard[,i])
habk.df.standard[,i] <- mean(habk.df.standard[,i])-habk.df.standard[,i]
}
summary(habk.df.standard)
## Vol Bulk Polarity pI
## Min. :-1.85227 Min. :-1.36122 Min. :-1.75257 Min. :-2.664603
## 1st Qu.:-0.60422 1st Qu.:-0.67357 1st Qu.:-0.06519 1st Qu.:-0.001415
## Median :-0.07919 Median : 0.05602 Median : 0.54572 Median : 0.072169
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.63521 3rd Qu.: 0.46786 3rd Qu.: 0.60687 3rd Qu.: 0.251885
## Max. : 2.10704 Max. : 2.56792 Max. : 0.62033 Max. : 1.858005
## Hyd.1 Hyd.2 Surface_area Fract_area
## Min. :-1.6706 Min. :-1.0362 Min. :-1.8709 Min. :-1.50457
## 1st Qu.:-0.8504 1st Qu.:-0.7194 1st Qu.:-0.6792 1st Qu.:-0.98426
## Median : 0.1205 Median :-0.4435 Median :-0.1253 Median : 0.05637
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 1.0077 3rd Qu.: 0.5937 3rd Qu.: 0.6747 3rd Qu.: 0.85852
## Max. : 1.3425 Max. : 2.2338 Max. : 2.0230 Max. : 1.87746
Write a for() loop that….
# scale data via scale() function
habk.df.standard <- habk.df.numeric
for(i in 1:ncol(habk.df.standard)){
habk.df.standard[,i] <- scale(habk.df.standard[,i])
}
summary(habk.df.standard)
## Vol.V1 Bulk.V1 Polarity.V1
## Min. :-2.1070377 Min. :-2.5679182 Min. :-0.6203318
## 1st Qu.:-0.6352099 1st Qu.:-0.4678620 1st Qu.:-0.6068701
## Median : 0.0791861 Median :-0.0560230 Median :-0.5457222
## Mean : 0.0000000 Mean : 0.0000000 Mean : 0.0000000
## 3rd Qu.: 0.6042240 3rd Qu.: 0.6735664 3rd Qu.: 0.0651864
## Max. : 1.8522651 Max. : 1.3612193 Max. : 1.7525720
## pI.V1 Hyd.1.V1 Hyd.2.V1
## Min. :-1.8580053 Min. :-1.3424968 Min. :-2.2338170
## 1st Qu.:-0.2518850 1st Qu.:-1.0077096 1st Qu.:-0.5937089
## Median :-0.0721693 Median :-0.1205234 Median : 0.4434934
## Mean : 0.0000000 Mean : 0.0000000 Mean : 0.0000000
## 3rd Qu.: 0.0014151 3rd Qu.: 0.8503596 3rd Qu.: 0.7193994
## Max. : 2.6646032 Max. : 1.6705883 Max. : 1.0361804
## Surface_area.V1 Fract_area.V1
## Min. :-2.0230352 Min. :-1.8774603
## 1st Qu.:-0.6747180 1st Qu.:-0.8585153
## Median : 0.1253208 Median :-0.0563672
## Mean : 0.0000000 Mean : 0.0000000
## 3rd Qu.: 0.6791938 3rd Qu.: 0.9842575
## Max. : 1.8708600 Max. : 1.5045698
CHALLENGE (will NOT be on exam). How do these lines of code work:
# CHALLENGE: What does this do?
X.bar <- apply(habk.df.numeric, 2, mean)
summary(habk.df.numeric)
## Vol Bulk Polarity pI
## Min. : 48.00 Min. : 3.40 Min. : 0.000 Min. : 2.770
## 1st Qu.: 90.75 1st Qu.:13.16 1st Qu.: 0.295 1st Qu.: 5.607
## Median :111.50 Median :15.08 Median : 1.635 Median : 5.925
## Mean :109.20 Mean :15.34 Mean :13.594 Mean : 6.053
## 3rd Qu.:126.75 3rd Qu.:18.47 3rd Qu.:15.023 3rd Qu.: 6.055
## Max. :163.00 Max. :21.67 Max. :52.000 Max. :10.760
## Hyd.1 Hyd.2 Surface_area Fract_area
## Min. :-4.50 Min. :-12.300 Min. : 85.0 Min. :0.5200
## 1st Qu.:-3.50 1st Qu.: -4.275 1st Qu.:145.2 1st Qu.:0.6375
## Median :-0.85 Median : 0.800 Median :181.0 Median :0.7300
## Mean :-0.49 Mean : -1.370 Mean :175.4 Mean :0.7365
## 3rd Qu.: 2.05 3rd Qu.: 2.150 3rd Qu.:205.8 3rd Qu.:0.8500
## Max. : 4.50 Max. : 3.700 Max. :259.0 Max. :0.9100
habk.df.standard <- apply(habk.df.numeric, 2, scale)
habk.df.standard <- data.frame(habk.df.standard)
summary(habk.df.standard)
## Vol Bulk Polarity pI
## Min. :-2.10704 Min. :-2.56792 Min. :-0.62033 Min. :-1.858005
## 1st Qu.:-0.63521 1st Qu.:-0.46786 1st Qu.:-0.60687 1st Qu.:-0.251885
## Median : 0.07919 Median :-0.05602 Median :-0.54572 Median :-0.072169
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.60422 3rd Qu.: 0.67357 3rd Qu.: 0.06519 3rd Qu.: 0.001415
## Max. : 1.85227 Max. : 1.36122 Max. : 1.75257 Max. : 2.664603
## Hyd.1 Hyd.2 Surface_area Fract_area
## Min. :-1.3425 Min. :-2.2338 Min. :-2.0230 Min. :-1.87746
## 1st Qu.:-1.0077 1st Qu.:-0.5937 1st Qu.:-0.6747 1st Qu.:-0.85852
## Median :-0.1205 Median : 0.4435 Median : 0.1253 Median :-0.05637
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.8504 3rd Qu.: 0.7194 3rd Qu.: 0.6792 3rd Qu.: 0.98426
## Max. : 1.6706 Max. : 1.0362 Max. : 1.8709 Max. : 1.50457
Prepare data:
scatterplot3d does NOT have a data = argument. Make seperate vectors to work with to make code cleanr. ….
# raw vectors
Xraw <- habk.df.numeric$Bulk
Yraw <- habk.df.numeric$Polarity
Zraw <- habk.df.numeric$Surface_area
# center
Xcent <- Xraw - mean(Xraw)
Ycent <- Yraw - mean(Yraw)
Zcent <- Zraw - mean(Zraw)
# scale, without centering
Xscale0 <- Xraw/sd(Xraw)
Yscale0 <- Yraw/sd(Yraw)
Zscale0 <- Zraw/sd(Zraw)
# scale AND center
Xscale <- (Xraw - mean(Xraw))/sd(Xraw)
Yscale <- (Yraw - mean(Yraw))/sd(Yraw)
Zscale <- (Zraw - mean(Zraw))/sd(Zraw)
TODO: create ALL of the scaled vectors created above using the scale() function.
# vectors using R function()
# center
xcenter <- scale(Xraw, center = TRUE, scale = FALSE)
ycenter <- scale(Yraw, center = TRUE, scale = FALSE)
zcenter <- scale(Zraw, center = TRUE, scale = FALSE)
# scale without centering
xscaleo <- scale(Xraw, center = FALSE, scale = TRUE)
yscaleo <- scale(Yraw, center = FALSE, scale = TRUE)
zscaleo <- scale(Zraw, center = FALSE, scale = TRUE)
# scale and center
xscale <- scale(Xraw, center = TRUE, scale = TRUE)
yscale <- scale(Yraw, center = TRUE, scale = TRUE)
zscale <- scale(Zraw, center = TRUE, scale = TRUE)
3-D scatter plot of….
# scatterplot with raw data
# main plot with scatterplot3d() - know how this works
plot1 <- scatterplot3d(x = Xraw,
y = Yraw,
z = Zraw,
color = 2,
pch = 16, type = "h",
xlim = c(-15,25),
ylim = c(-30,50),
zlim = c(-100,275),
main = "Raw data")
# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
p2 <- plot1$xyz.convert(0,-40,0)
p3 <- plot1$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot1$xyz.convert(-20,0,0)
p3 <- plot1$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
3-D scatter plot of….
# scatterplot of centered data
# main plot - know how this works
plot2 <- scatterplot3d(x = Xcent, y = Ycent, z = Zcent,
color = 2,
pch = 16, type = "h",
xlim = c(-15,25),
ylim = c(-30,50),
zlim = c(-100,275), main = "Centered")
# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
p2 <- plot2$xyz.convert(0,-40,0)
p3 <- plot2$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot2$xyz.convert(-20,0,0)
p3 <- plot2$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
3-D scatter plot of….
# scatterplot of scaled data without centering
# main plot - know how this works
plot3 <- scatterplot3d(x = Xscale0, y = Yscale0, z = Zscale0,
color = 3,
pch = 12, type = "h",
xlim = c(-15,25),
ylim = c(-30,50),
zlim = c(-100,275),
main = "Scaled (NOT centered)")
p2 <- plot3$xyz.convert(0,-40,0)
p3 <- plot3$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot3$xyz.convert(-20,0,0)
p3 <- plot3$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
3-D scatter plot of….
# main plot - know how this works
plot4 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
color = 3,
pch = 16, type = "h",
xlim = c(-25,25),
ylim = c(-50,50),
zlim = c(-100,275),
main = "Centered & Scaled")
p2 <- plot4$xyz.convert(0,-40,0)
p3 <- plot4$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot4$xyz.convert(-20,0,0)
p3 <- plot4$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
3-D scatter plot of….
# main plot - know how this works
plot5 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
color = 3,
pch = 16, type = "h",
#xlim = c(-25,25),
#ylim = c(-50,50),
#zlim = c(-100,275),
main = "Centered & Scaled")
p2 <- plot5$xyz.convert(0,-40,0)
p3 <- plot5$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot5$xyz.convert(-20,0,0)
p3 <- plot5$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
# plot all scatterplots as one image
# which of the transformations is NOT being shown
par(mfrow = c(2,2), mar = c(1,1,1,1))
plot1 <- scatterplot3d(x = Xraw,
y = Yraw,
z = Zraw,
color = 2,
pch = 16, type = "h",
xlim = c(-15,25),
ylim = c(-30,50),
zlim = c(-100,275),
main = "")
# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
p2 <- plot1$xyz.convert(0,-40,0)
p3 <- plot1$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot1$xyz.convert(-20,0,0)
p3 <- plot1$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
# main plot - know how this works
plot2 <- scatterplot3d(x = Xcent, y = Ycent, z = Zcent,
color = 2,
pch = 16, type = "h",
xlim = c(-15,25),
ylim = c(-30,50),
zlim = c(-100,275), main = "")
# add-ons don't need to know how these work
# These are lines at x = 0, y = 0, z = 0
p2 <- plot2$xyz.convert(0,-40,0)
p3 <- plot2$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot2$xyz.convert(-20,0,0)
p3 <- plot2$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
plot4 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
color = 3,
pch = 16, type = "h",
xlim = c(-25,25),
ylim = c(-50,50),
zlim = c(-100,275),
main = "")
p2 <- plot4$xyz.convert(0,-40,0)
p3 <- plot4$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot4$xyz.convert(-20,0,0)
p3 <- plot4$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
# main plot - know how this works
plot5 <- scatterplot3d(x = Xscale, y = Yscale, z = Zscale,
color = 3,
pch = 16, type = "h",
#xlim = c(-25,25),
#ylim = c(-50,50),
#zlim = c(-100,275),
main = "")
p2 <- plot5$xyz.convert(0,-40,0)
p3 <- plot5$xyz.convert(0,60,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)
p2 <- plot5$xyz.convert(-20,0,0)
p3 <- plot5$xyz.convert(30,0,0)
segments(p2$x,p2$y,p3$x,p3$y,lwd=2,col=2)