Introduction

These are the codes I used to run PCA for my metric data calculated using the MBSStools package (Leppo, 2021). Again, I welcome any feedback if there is any to further improve my codes.

Load data

Data can be found here.

library(readxl)
metric_data <- read_excel("~/thesis-codes/Part b (09-19)/Main Matrices No Zero.xlsx", 
    sheet = "MetricIBI")

Make a matrix from raw data

metric_data_su <- metric_data[,4:ncol(metric_data)] #exclude the first column that contains SU information
metric.pca <- princomp(metric_data_su, cor=T)
summary(metric.pca)
## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4     Comp.5
## Standard deviation     2.1572307 1.0087384 0.78629939 0.66376068 0.42932883
## Proportion of Variance 0.6648063 0.1453647 0.08832382 0.06293975 0.02633189
## Cumulative Proportion  0.6648063 0.8101710 0.89849486 0.96143461 0.98776650
##                            Comp.6      Comp.7
## Standard deviation     0.24705751 0.156834615
## Proportion of Variance 0.00871963 0.003513871
## Cumulative Proportion  0.99648613 1.000000000
metric.pca$loadings #check the loading of each variable on each PC
## 
## Loadings:
##                              Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## Number of Taxa                0.293  0.666  0.263  0.401  0.422  0.158  0.196
## Number of EPT Taxa            0.427  0.294  0.117 -0.163 -0.242 -0.698 -0.380
## Number of Ephemeroptera taxa  0.402         0.184 -0.702         0.141  0.540
## Percent intolerant urban      0.313  0.103 -0.925         0.149         0.109
## Percent Chironomidae         -0.391  0.306        -0.554  0.527        -0.403
## Percent Clinger               0.337 -0.603  0.148  0.106  0.657 -0.210 -0.120
## IBI                           0.453                      -0.178  0.648 -0.580
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.143  0.143  0.143  0.143  0.143  0.143  0.143
## Cumulative Var  0.143  0.286  0.429  0.571  0.714  0.857  1.000

Plot Screeplot

This step is optional. The previous step already showed you how many percentage of variance is explained by each PC. This is to visualize by using package factoextra (Kassambara and Mundt, 2020).

library(factoextra)
fviz_eig(metric.pca)

Add columns of information to the result

Another way to add columns to the existing dataset is to use pipes from the dplyr package (Wickham et al., 2022).

library(dplyr) #load the package
data.scores <- as.data.frame(metric.pca$scores[,1:2]) #extract PC scores
data.scores.dplyr <- data.scores %>% mutate(Site = metric_data$Site,Year=metric_data$Year,
                                            Subwatershed=metric_data$Subwatershed)
head(data.scores.dplyr)
##       Comp.1     Comp.2   Site Year Subwatershed
## 1  1.8041284 -0.4360372  B1.15 2015           B1
## 2  0.2389307 -0.4469983  B5.15 2015           B1
## 3  1.7497789  2.0585837  B7.15 2015           B7
## 4  2.6965766  0.6352829 B10.15 2015          B9A
## 5 -2.0422203 -0.8068779 REF.15 2015           B1
## 6  0.7586916 -0.9930358  B1.17 2017           B1

Plot PCA graph

library(ggplot2)

ggplot(data.scores.dplyr,aes(Comp.1, Comp.2))+ 
    geom_point(size = 4,aes( shape = as.character(Year))) +
    theme(axis.text.y = element_text(colour = "black", size = 12, face = "bold"), 
    axis.text.x = element_text(colour = "black", face = "bold", size = 12), 
    legend.text = element_text(size = 12, face ="bold", colour ="black"), 
    legend.position = "right", axis.title.y = element_text(face = "bold", size = 14), 
    axis.title.x = element_text(face = "bold", size = 14, colour = "black"), 
    legend.title = element_text(size = 14, colour = "black", face = "bold"), 
    panel.background = element_blank(), panel.border = element_rect(colour = "black", fill = NA, size = 1.2),
    legend.key=element_blank(),
    plot.title = element_text(color = "black", size = 30, face = "bold", hjust = 0.5)) + 
    labs(
    title = "PCA graph of metrics and IBI scores") + 
    theme(axis.title.x = element_text(margin=margin(t=10)), #add margin to x-axis title
        axis.title.y = element_text(margin=margin(r=10)))+
    labs(x = "PC1 (66.5%)", y = "PC2 (14.5%)", shape = "Year") +
    geom_text(aes(label=Site),hjust=-0.15, vjust=1)

References

  1. Erik W. Leppo (2021). MBSStools: MBSS tool suite for calculations and data manipulation. R package version 1.1.0.9053. https://github.com/leppott/MBSStools
  2. Alboukadel Kassambara and Fabian Mundt (2020). factoextra: Extract and Visualize the Results of Multivariate Data Analyses. R package version 1.0.7. https://CRAN.R-project.org/package=factoextra
  3. Hadley Wickham, Romain François, Lionel Henry and Kirill Müller (2022). dplyr: A Grammar of Data Manipulation. R package version 1.0.8. https://CRAN.R-project.org/package=dplyr