Install & Load these packages
Library Installation
# tidyverse: provides ggplot2 and the other data-manipulation and plotting tools used throughout this course
# Uncomment the install.packages() calls below to install any packages you are missing
# install.packages("tidyverse")
# install.packages("devtools")
# install.packages("doParallel")
# install.packages("vegan")
# install.packages("cluster")
# install.packages("factoextra")
Load relevant libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(devtools)
## Loading required package: usethis
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: iterators
## Loading required package: parallel
library(vegan)
## Loading required package: permute
##
## Attaching package: 'permute'
##
## The following object is masked from 'package:devtools':
##
## check
##
## Loading required package: lattice
## This is vegan 2.6-4
library(cluster)
library(factoextra) # clustering algorithms & visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
Template datasets
data(iris); str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
data(USArrests); str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
Week 2 - Ordination and visualization of data structures
PCA - Principal Components Analysis
- Use the prcomp function
- Basic PCA plot using biplot
#pairwise plots of the numeric variables with smoothed trend lines
pairs(USArrests, panel = panel.smooth, main = "USArrests data")

#calculate principal components
results <- prcomp(USArrests, scale = TRUE)
#reverse the signs (PCA loadings are defined only up to sign)
results$rotation <- -1*results$rotation
#display principal components
results$rotation
## PC1 PC2 PC3 PC4
## Murder 0.5358995 -0.4181809 0.3412327 -0.64922780
## Assault 0.5831836 -0.1879856 0.2681484 0.74340748
## UrbanPop 0.2781909 0.8728062 0.3780158 -0.13387773
## Rape 0.5434321 0.1673186 -0.8177779 -0.08902432
#Observation Scores
#reverse the signs of the scores
results$x <- -1*results$x
#display the first six scores
head(results$x)
## PC1 PC2 PC3 PC4
## Alabama 0.9756604 -1.1220012 0.43980366 -0.154696581
## Alaska 1.9305379 -1.0624269 -2.01950027 0.434175454
## Arizona 1.7454429 0.7384595 -0.05423025 0.826264240
## Arkansas -0.1399989 -1.1085423 -0.11342217 0.180973554
## California 2.4986128 1.5274267 -0.59254100 0.338559240
## Colorado 1.4993407 0.9776297 -1.08400162 -0.001450164
biplot(results, scale = 0)

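A quick check worth adding here: how much variance each component explains. A minimal sketch using the results object from above (var_explained is just a convenience name):
#proportion of variance explained by each principal component
var_explained <- results$sdev^2 / sum(results$sdev^2)
var_explained
#scree plot of the proportions
plot(var_explained, type = "b", xlab = "Principal component",
     ylab = "Proportion of variance explained")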
Fancy Plotting Using ggbiplot with the iris dataset
# log transform
log.ir <- log(iris[, 1:4])
ir.species <- iris[, 5]
# apply PCA - scale. = TRUE is highly
# advisable, but default is FALSE.
ir.pca <- prcomp(log.ir, center = TRUE, scale. = TRUE)
plot(ir.pca, type = "l")

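Before moving to the biplot, it can help to confirm how much variance the components capture; a brief sketch using the ir.pca object and the factoextra package loaded earlier:
#proportion of variance captured by each component
summary(ir.pca)
#scree plot (percentage of explained variance) via factoextra
fviz_eig(ir.pca)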
# Install and load ggbiplot
# if (!("devtools" %in% rownames(installed.packages()))) {
#   install.packages("devtools") }
# library(doParallel)
# library(devtools)
# install_github("vqv/ggbiplot")
library(ggbiplot)
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: grid
g <- ggbiplot(ir.pca, obs.scale = 1, var.scale = 1,
              groups = ir.species, ellipse = TRUE,
              circle = TRUE)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal',
               legend.position = 'top')
print(g)

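If ggbiplot will not install, a similar score plot can be sketched directly with ggplot2 (the scores data frame below is just an illustrative helper built from ir.pca and ir.species):
#data frame of the first two PC scores plus species labels
scores <- as.data.frame(ir.pca$x[, 1:2])
scores$Species <- ir.species
ggplot(scores, aes(PC1, PC2, color = Species)) +
  geom_point() + stat_ellipse() +
  ggtitle("iris PCA scores (ggplot2 only)")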
Non-Metric Multidimensional Scaling (NMDS) using vegan
library(vegan)
set.seed(2)
#simulate a community-by-species matrix: 10 communities x 30 species of random abundances
community_matrix <- matrix(
  sample(1:100, 300, replace = TRUE), nrow = 10,
  dimnames = list(paste("community", 1:10, sep = ""), paste("sp", 1:30, sep = "")))
example_NMDS <- metaMDS(community_matrix, # our community-by-species matrix
                        k = 2)            # the number of reduced dimensions
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.1486475
## Run 1 stress 0.1913689
## Run 2 stress 0.1908303
## Run 3 stress 0.1908305
## Run 4 stress 0.1849153
## Run 5 stress 0.1906937
## Run 6 stress 0.1486477
## ... Procrustes: rmse 0.0002739181 max resid 0.000426955
## ... Similar to previous best
## Run 7 stress 0.1849759
## Run 8 stress 0.1688829
## Run 9 stress 0.1650751
## Run 10 stress 0.1486475
## ... Procrustes: rmse 8.715875e-05 max resid 0.0001400462
## ... Similar to previous best
## Run 11 stress 0.164544
## Run 12 stress 0.3153591
## Run 13 stress 0.2198889
## Run 14 stress 0.2237308
## Run 15 stress 0.1699872
## Run 16 stress 0.1650804
## Run 17 stress 0.1906937
## Run 18 stress 0.1906937
## Run 19 stress 0.1494912
## Run 20 stress 0.1650751
## *** Best solution repeated 2 times
example_NMDS <- metaMDS(community_matrix, k = 2, trymax = 100) # allow up to 100 random starts
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.1486475
## Run 1 stress 0.1906937
## Run 2 stress 0.1486475
## ... New best solution
## ... Procrustes: rmse 4.504062e-05 max resid 7.46737e-05
## ... Similar to previous best
## Run 3 stress 0.164544
## Run 4 stress 0.2043556
## Run 5 stress 0.1486475
## ... New best solution
## ... Procrustes: rmse 3.786411e-05 max resid 5.798541e-05
## ... Similar to previous best
## Run 6 stress 0.1686179
## Run 7 stress 0.1486487
## ... Procrustes: rmse 0.001296765 max resid 0.002408379
## ... Similar to previous best
## Run 8 stress 0.1881025
## Run 9 stress 0.1751078
## Run 10 stress 0.1650804
## Run 11 stress 0.1486475
## ... Procrustes: rmse 0.0001191068 max resid 0.0001867147
## ... Similar to previous best
## Run 12 stress 0.1663641
## Run 13 stress 0.1744082
## Run 14 stress 0.1717124
## Run 15 stress 0.1751081
## Run 16 stress 0.1769791
## Run 17 stress 0.1906937
## Run 18 stress 0.1974143
## Run 19 stress 0.1650751
## Run 20 stress 0.1486475
## ... Procrustes: rmse 0.0001122687 max resid 0.0001737893
## ... Similar to previous best
## *** Best solution repeated 4 times
#stress plot (Shepard diagram): how well the ordination distances reproduce the observed dissimilarities
stressplot(example_NMDS)

# basic plot
plot(example_NMDS)

#The plot shows both the communities ("sites", open circles) and the species (red crosses), but not which circle corresponds to which community or which cross to which species.
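One way to add labels is vegan's orditorp() on top of an empty ordination frame; a minimal sketch:
#empty ordination frame, then label species (red) and communities
ordiplot(example_NMDS, type = "n")
orditorp(example_NMDS, display = "species", col = "red", air = 0.01)
orditorp(example_NMDS, display = "sites", cex = 1.25, air = 0.01)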
CA and DCA - Correspondence Analyses
#require (vegan)
danube.spe <- read.delim ('https://raw.githubusercontent.com/zdealveindy/anadat-r/master/data/danube.spe.txt', row.names = 1)
#str(danube.spe)  # uncomment to inspect; the structure listing is long
#use the cca() function in vegan
CA <- cca(danube.spe)
CA
## Call: cca(X = danube.spe)
##
## Inertia Rank
## Total 3.043
## Unconstrained 3.043 24
## Inertia is scaled Chi-square
##
## Eigenvalues for unconstrained axes:
## CA1 CA2 CA3 CA4 CA5 CA6 CA7 CA8
## 0.5718 0.4944 0.2950 0.2472 0.2057 0.1764 0.1528 0.1418
## (Showing 8 of 24 unconstrained eigenvalues)
#plot it:
ordiplot(CA)

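The heading also mentions DCA; vegan provides detrended correspondence analysis through decorana(). A brief sketch on the same data (DCA is just an object name):
#detrended correspondence analysis on the same community data
DCA <- decorana(danube.spe)
DCA
#ordination diagram of the DCA result
ordiplot(DCA)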
Useful links for understanding PCA and Ordination:
Week 3: Cluster and distance analyses
Basic Machine Learning Methods
#basic scatter plot of the iris petal data
ggplot(iris, aes(Petal.Length, Petal.Width, col = Species)) +
  geom_point() + ggtitle("Iris scatter plot")

#set a seed so the random centroid initialization is reproducible
set.seed(55)
#main function for k-means clustering
#?kmeans  # uncomment to view the kmeans() help page (syntax and examples)
cluster.iris <- kmeans(iris[, 3:4], 3, nstart = 20)
cluster.iris
## K-means clustering with 3 clusters of sizes 50, 52, 48
##
## Cluster means:
## Petal.Length Petal.Width
## 1 1.462000 0.246000
## 2 4.269231 1.342308
## 3 5.595833 2.037500
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [75] 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 2 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3
## [149] 3 3
##
## Within cluster sum of squares by cluster:
## [1] 2.02200 13.05769 16.29167
## (between_SS / total_SS = 94.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
table(cluster.iris$cluster, iris$Species)
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 48 4
## 3 0 2 46
#plot the points colored by their assigned cluster (nearest centroid)
cluster.iris$cluster <- as.factor(cluster.iris$cluster)
ggplot(iris, aes(Petal.Length, Petal.Width, color = cluster.iris$cluster)) +
  geom_point() + ggtitle("iris cluster plot")

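factoextra (loaded earlier) can draw the same clusters with convex ellipses; a minimal sketch that refits k-means so the cluster component stays numeric (km.res is an illustrative name):
#refit k-means on the petal measurements and visualize with factoextra
km.res <- kmeans(iris[, 3:4], centers = 3, nstart = 20)
fviz_cluster(km.res, data = iris[, 3:4], geom = "point") +
  ggtitle("iris k-means clusters (factoextra)")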
Unsupervised Method - Unknown number of clusters
#require(factoextra)
df <- USArrests
df <- na.omit(df)
#Scale the data: why? So every variable contributes equally (mean 0, sd 1) to the distance calculations
#?scale  # uncomment for details on scale()
df <- scale(df)
head(df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
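With the data scaled, factoextra's fviz_nbclust() offers a quick look at how many clusters to request; a sketch using the elbow and average-silhouette criteria:
#elbow method: look for a bend in the total within-cluster sum of squares
fviz_nbclust(df, kmeans, method = "wss")
#average silhouette width: larger values indicate better-separated clusters
fviz_nbclust(df, kmeans, method = "silhouette")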