Install & Load these packages
Library Installation
# tidyverse: provides ggplot2 and the other data-manipulation and plotting tools used throughout this course
# Uncomment the install.packages() calls below to install any packages you are missing
# install.packages("tidyverse")
# install.packages("devtools")
# install.packages("doParallel")
# install.packages("vegan")
# install.packages("cluster")
# install.packages("factoextra")
Load relevant libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(devtools)
## Loading required package: usethis
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: iterators
## Loading required package: parallel
library(vegan)
## Loading required package: permute
##
## Attaching package: 'permute'
##
## The following object is masked from 'package:devtools':
##
## check
##
## Loading required package: lattice
## This is vegan 2.6-4
library(cluster)
library(factoextra) # clustering algorithms & visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
Template datasets
data(iris); str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
data(USArrests); str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
Week 2 - Ordination and visualization of data structures
PCA - Principal Components Analysis
- Use the prcomp function
- Basic PCA plot using biplot
#pairwise plots of the numeric variables with smoothed trend lines
pairs(USArrests, panel = panel.smooth, main = "USArrests data")

#calculate principal components
results <- prcomp(USArrests, scale = TRUE)
#reverse the signs (PCA loadings are defined only up to sign)
results$rotation <- -1*results$rotation
#display principal components
results$rotation
## PC1 PC2 PC3 PC4
## Murder 0.5358995 -0.4181809 0.3412327 -0.64922780
## Assault 0.5831836 -0.1879856 0.2681484 0.74340748
## UrbanPop 0.2781909 0.8728062 0.3780158 -0.13387773
## Rape 0.5434321 0.1673186 -0.8177779 -0.08902432
#Observation Scores
#reverse the signs of the scores
results$x <- -1*results$x
#display the first six scores
head(results$x)
## PC1 PC2 PC3 PC4
## Alabama 0.9756604 -1.1220012 0.43980366 -0.154696581
## Alaska 1.9305379 -1.0624269 -2.01950027 0.434175454
## Arizona 1.7454429 0.7384595 -0.05423025 0.826264240
## Arkansas -0.1399989 -1.1085423 -0.11342217 0.180973554
## California 2.4986128 1.5274267 -0.59254100 0.338559240
## Colorado 1.4993407 0.9776297 -1.08400162 -0.001450164
biplot(results, scale = 0)

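A quick check worth adding here: how much variance each component explains. A minimal sketch using the results object from above (var_explained is just a convenience name):
#proportion of variance explained by each principal component
var_explained <- results$sdev^2 / sum(results$sdev^2)
var_explained
#scree plot of the proportions
plot(var_explained, type = "b", xlab = "Principal component",
     ylab = "Proportion of variance explained")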
Fancy Plotting Using ggbiplot with the iris dataset
# log transform
log.ir <- log(iris[, 1:4])
ir.species <- iris[, 5]
# apply PCA - scale. = TRUE is highly
# advisable, but default is FALSE.
ir.pca <- prcomp(log.ir, center = TRUE, scale. = TRUE)
plot(ir.pca, type = "l")

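Before moving to the biplot, it can help to confirm how much variance the components capture; a brief sketch using the ir.pca object and the factoextra package loaded earlier:
#proportion of variance captured by each component
summary(ir.pca)
#scree plot (percentage of explained variance) via factoextra
fviz_eig(ir.pca)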
# Install and load ggbiplot
# if (!("devtools" %in% rownames(installed.packages()))) {
#   install.packages("devtools") }
# library(doParallel)
# library(devtools)
# install_github("vqv/ggbiplot")
library(ggbiplot)
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: grid
g <- ggbiplot(ir.pca, obs.scale = 1, var.scale = 1,
              groups = ir.species, ellipse = TRUE,
              circle = TRUE)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal',
               legend.position = 'top')
print(g)

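If ggbiplot will not install, a similar score plot can be sketched directly with ggplot2 (the scores data frame below is just an illustrative helper built from ir.pca and ir.species):
#data frame of the first two PC scores plus species labels
scores <- as.data.frame(ir.pca$x[, 1:2])
scores$Species <- ir.species
ggplot(scores, aes(PC1, PC2, color = Species)) +
  geom_point() + stat_ellipse() +
  ggtitle("iris PCA scores (ggplot2 only)")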
Non-Metric Multidimensional Scaling (NMDS) using vegan
library(vegan)
set.seed(2)
#simulate a community-by-species matrix: 10 communities x 30 species of random abundances
community_matrix <- matrix(
  sample(1:100, 300, replace = TRUE), nrow = 10,
  dimnames = list(paste("community", 1:10, sep = ""), paste("sp", 1:30, sep = "")))
example_NMDS <- metaMDS(community_matrix, # our community-by-species matrix
                        k = 2)            # the number of reduced dimensions
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.1486475
## Run 1 stress 0.1913689
## Run 2 stress 0.1908303
## Run 3 stress 0.1908305
## Run 4 stress 0.1849153
## Run 5 stress 0.1906937
## Run 6 stress 0.1486477
## ... Procrustes: rmse 0.0002739181 max resid 0.000426955
## ... Similar to previous best
## Run 7 stress 0.1849759
## Run 8 stress 0.1688829
## Run 9 stress 0.1650751
## Run 10 stress 0.1486475
## ... Procrustes: rmse 8.715875e-05 max resid 0.0001400462
## ... Similar to previous best
## Run 11 stress 0.164544
## Run 12 stress 0.3153591
## Run 13 stress 0.2198889
## Run 14 stress 0.2237308
## Run 15 stress 0.1699872
## Run 16 stress 0.1650804
## Run 17 stress 0.1906937
## Run 18 stress 0.1906937
## Run 19 stress 0.1494912
## Run 20 stress 0.1650751
## *** Best solution repeated 2 times
example_NMDS <- metaMDS(community_matrix, k = 2, trymax = 100) # allow up to 100 random starts
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.1486475
## Run 1 stress 0.1906937
## Run 2 stress 0.1486475
## ... New best solution
## ... Procrustes: rmse 4.504062e-05 max resid 7.46737e-05
## ... Similar to previous best
## Run 3 stress 0.164544
## Run 4 stress 0.2043556
## Run 5 stress 0.1486475
## ... New best solution
## ... Procrustes: rmse 3.786411e-05 max resid 5.798541e-05
## ... Similar to previous best
## Run 6 stress 0.1686179
## Run 7 stress 0.1486487
## ... Procrustes: rmse 0.001296765 max resid 0.002408379
## ... Similar to previous best
## Run 8 stress 0.1881025
## Run 9 stress 0.1751078
## Run 10 stress 0.1650804
## Run 11 stress 0.1486475
## ... Procrustes: rmse 0.0001191068 max resid 0.0001867147
## ... Similar to previous best
## Run 12 stress 0.1663641
## Run 13 stress 0.1744082
## Run 14 stress 0.1717124
## Run 15 stress 0.1751081
## Run 16 stress 0.1769791
## Run 17 stress 0.1906937
## Run 18 stress 0.1974143
## Run 19 stress 0.1650751
## Run 20 stress 0.1486475
## ... Procrustes: rmse 0.0001122687 max resid 0.0001737893
## ... Similar to previous best
## *** Best solution repeated 4 times
#stress plot (Shepard diagram): how well the ordination distances reproduce the observed dissimilarities
stressplot(example_NMDS)

# basic plot
plot(example_NMDS)

#The plot shows both the communities ("sites", open circles) and the species (red crosses), but not which circle corresponds to which community or which cross to which species.
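One way to add labels is vegan's orditorp() on top of an empty ordination frame; a minimal sketch:
#empty ordination frame, then label species (red) and communities
ordiplot(example_NMDS, type = "n")
orditorp(example_NMDS, display = "species", col = "red", air = 0.01)
orditorp(example_NMDS, display = "sites", cex = 1.25, air = 0.01)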
CA and DCA - Correspondence Analyses
#require (vegan)
danube.spe <- read.delim ('https://raw.githubusercontent.com/zdealveindy/anadat-r/master/data/danube.spe.txt', row.names = 1)
#str(danube.spe)  # uncomment to inspect; the structure listing is long
#use the cca() function in vegan
CA <- cca(danube.spe)
CA
## Call: cca(X = danube.spe)
##
## Inertia Rank
## Total 3.043
## Unconstrained 3.043 24
## Inertia is scaled Chi-square
##
## Eigenvalues for unconstrained axes:
## CA1 CA2 CA3 CA4 CA5 CA6 CA7 CA8
## 0.5718 0.4944 0.2950 0.2472 0.2057 0.1764 0.1528 0.1418
## (Showing 8 of 24 unconstrained eigenvalues)
#plot it:
ordiplot(CA)

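The heading also mentions DCA; vegan provides detrended correspondence analysis through decorana(). A brief sketch on the same data (DCA is just an object name):
#detrended correspondence analysis on the same community data
DCA <- decorana(danube.spe)
DCA
#ordination diagram of the DCA result
ordiplot(DCA)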
Useful links for understanding PCA and Ordination:
Week 3: Cluster and distance analyses
Basic Machine Learning Methods
#basic scatter plot of the iris petal data
ggplot(iris, aes(Petal.Length, Petal.Width, col = Species)) +
  geom_point() + ggtitle("Iris scatter plot")

#set a seed so the random centroid initialization is reproducible
set.seed(55)
#main function for k-means clustering
#?kmeans  # uncomment to view the kmeans() help page (syntax and examples)
cluster.iris <- kmeans(iris[, 3:4], 3, nstart = 20)
cluster.iris
## K-means clustering with 3 clusters of sizes 50, 52, 48
##
## Cluster means:
## Petal.Length Petal.Width
## 1 1.462000 0.246000
## 2 4.269231 1.342308
## 3 5.595833 2.037500
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [75] 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 2 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3
## [149] 3 3
##
## Within cluster sum of squares by cluster:
## [1] 2.02200 13.05769 16.29167
## (between_SS / total_SS = 94.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
table(cluster.iris$cluster, iris$Species)
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 48 4
## 3 0 2 46
#plot the points colored by their assigned cluster (nearest centroid)
cluster.iris$cluster <- as.factor(cluster.iris$cluster)
ggplot(iris, aes(Petal.Length, Petal.Width, color = cluster.iris$cluster)) +
  geom_point() + ggtitle("iris cluster plot")

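factoextra (loaded earlier) can draw the same clusters with convex ellipses; a minimal sketch that refits k-means so the cluster component stays numeric (km.res is an illustrative name):
#refit k-means on the petal measurements and visualize with factoextra
km.res <- kmeans(iris[, 3:4], centers = 3, nstart = 20)
fviz_cluster(km.res, data = iris[, 3:4], geom = "point") +
  ggtitle("iris k-means clusters (factoextra)")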
Unsupervised Method - Unknown number of clusters
#require(factoextra)
df <- USArrests
df <- na.omit(df)
#Scale the data: why? So every variable contributes equally (mean 0, sd 1) to the distance calculations
#?scale  # uncomment for details on scale()
df <- scale(df)
head(df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
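With the data scaled, factoextra's fviz_nbclust() offers a quick look at how many clusters to request; a sketch using the elbow and average-silhouette criteria:
#elbow method: look for a bend in the total within-cluster sum of squares
fviz_nbclust(df, kmeans, method = "wss")
#average silhouette width: larger values indicate better-separated clusters
fviz_nbclust(df, kmeans, method = "silhouette")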