These are the codes I used to run PCA for my metric data calculated using the MBSStools package (Leppo, 2021). Again, I welcome any feedback if there is any to further improve my codes.
Data can be found here.
library(readxl)
metric_data <- read_excel("~/thesis-codes/Part b (09-19)/Main Matrices No Zero.xlsx",
sheet = "MetricIBI")
metric_data_su <- metric_data[,4:ncol(metric_data)] #exclude the first column that contains SU information
metric.pca <- princomp(metric_data_su, cor=T)
summary(metric.pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.1572307 1.0087384 0.78629939 0.66376068 0.42932883
## Proportion of Variance 0.6648063 0.1453647 0.08832382 0.06293975 0.02633189
## Cumulative Proportion 0.6648063 0.8101710 0.89849486 0.96143461 0.98776650
## Comp.6 Comp.7
## Standard deviation 0.24705751 0.156834615
## Proportion of Variance 0.00871963 0.003513871
## Cumulative Proportion 0.99648613 1.000000000
metric.pca$loadings #check the loading of each variable on each PC
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## Number of Taxa 0.293 0.666 0.263 0.401 0.422 0.158 0.196
## Number of EPT Taxa 0.427 0.294 0.117 -0.163 -0.242 -0.698 -0.380
## Number of Ephemeroptera taxa 0.402 0.184 -0.702 0.141 0.540
## Percent intolerant urban 0.313 0.103 -0.925 0.149 0.109
## Percent Chironomidae -0.391 0.306 -0.554 0.527 -0.403
## Percent Clinger 0.337 -0.603 0.148 0.106 0.657 -0.210 -0.120
## IBI 0.453 -0.178 0.648 -0.580
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.143 0.143 0.143 0.143 0.143 0.143 0.143
## Cumulative Var 0.143 0.286 0.429 0.571 0.714 0.857 1.000
This step is optional. The previous step already showed you how many percentage of variance is explained by each PC. This is to visualize by using package factoextra (Kassambara and Mundt, 2020).
library(factoextra)
fviz_eig(metric.pca)
Another way to add columns to the existing dataset is to use pipes from the dplyr package (Wickham et al., 2022).
library(dplyr) #load the package
data.scores <- as.data.frame(metric.pca$scores[,1:2]) #extract PC scores
data.scores.dplyr <- data.scores %>% mutate(Site = metric_data$Site,Year=metric_data$Year,
Subwatershed=metric_data$Subwatershed)
head(data.scores.dplyr)
## Comp.1 Comp.2 Site Year Subwatershed
## 1 1.8041284 -0.4360372 B1.15 2015 B1
## 2 0.2389307 -0.4469983 B5.15 2015 B1
## 3 1.7497789 2.0585837 B7.15 2015 B7
## 4 2.6965766 0.6352829 B10.15 2015 B9A
## 5 -2.0422203 -0.8068779 REF.15 2015 B1
## 6 0.7586916 -0.9930358 B1.17 2017 B1
library(ggplot2)
ggplot(data.scores.dplyr,aes(Comp.1, Comp.2))+
geom_point(size = 4,aes( shape = as.character(Year))) +
theme(axis.text.y = element_text(colour = "black", size = 12, face = "bold"),
axis.text.x = element_text(colour = "black", face = "bold", size = 12),
legend.text = element_text(size = 12, face ="bold", colour ="black"),
legend.position = "right", axis.title.y = element_text(face = "bold", size = 14),
axis.title.x = element_text(face = "bold", size = 14, colour = "black"),
legend.title = element_text(size = 14, colour = "black", face = "bold"),
panel.background = element_blank(), panel.border = element_rect(colour = "black", fill = NA, size = 1.2),
legend.key=element_blank(),
plot.title = element_text(color = "black", size = 30, face = "bold", hjust = 0.5)) +
labs(
title = "PCA graph of metrics and IBI scores") +
theme(axis.title.x = element_text(margin=margin(t=10)), #add margin to x-axis title
axis.title.y = element_text(margin=margin(r=10)))+
labs(x = "PC1 (66.5%)", y = "PC2 (14.5%)", shape = "Year") +
geom_text(aes(label=Site),hjust=-0.15, vjust=1)