Introduction

These are the codes I used to run PCA of my averaged water chemistry data. Instead of using the princomp() function, I am going to try out the rda() function from the vegan package (Oksanen et al. 2012). Any comment is welcome.

Load data

My dataset can be found here.

library(readxl)
wc <- read_excel("F:/Thesis stuff/Data Analysis/Part a (87-09)/PCA (Cl, Ca, SC)/Data for PCA.xlsx")

Make a matrix from raw data

wc_wo_SU <- wc[,4:ncol(wc)] #exclude the first columns that contains SU information
#This dataframe includes all variables

#wc_wo_SU <- wc[,4:6] #exclude the first columns that contains SU information
#This data frame includes only Cl, Ca, SC (excluding nutrient parameters)

m_wc <- as.matrix(wc_wo_SU)

library(moments) #load moments package to check for kurtosis and skewness
skewness(m_wc)
##         Cl         Ca         SC        TKN         TP        TSS 
##  0.8365580  0.8086261  0.7306859  1.1604220 -1.1824354  0.6407346
kurtosis(m_wc)
##       Cl       Ca       SC      TKN       TP      TSS 
## 1.967369 2.068923 1.836838 3.261054 2.809512 3.695279

Run PCA using rda()

library(vegan) #load the package
wc.pca <- rda(m_wc,scale=T) #scale=T will standardize the variables
summary(eigenvals(wc.pca)) #summarize eigenvalues and proportion explained by each PC
## Importance of components:
##                          PC1    PC2     PC3     PC4      PC5      PC6
## Eigenvalue            3.9457 1.3346 0.36425 0.29652 0.051383 0.007499
## Proportion Explained  0.6576 0.2224 0.06071 0.04942 0.008564 0.001250
## Cumulative Proportion 0.6576 0.8801 0.94077 0.99019 0.998750 1.000000

Add columns of information to the result

Another way to add columns to the existing dataset is to use pipes from the dplyr package (Wickham et al., 2022).

library(dplyr)
library(vegan) #load the package
site.scores <- as.data.frame(scores(wc.pca,display = "sites")) #extract site scores
site.scores.dplyr <- site.scores %>% mutate(Site = wc$Site,Year=wc$Year,
                                            Subwatershed=wc$Subwatershed)
head(site.scores.dplyr)
##             PC1         PC2 Site      Year Subwatershed
## sit1 -0.3329020  1.72156307  RR1 1987-1988     B1 (19%)
## sit2 -0.5150767  0.58659590  RR2 1987-1988     B1 (19%)
## sit3 -0.3940687  0.04159008  RR3 1987-1988    B9A (13%)
## sit4 -0.7672729 -0.21760806  RR4 1987-1988      B9 (2%)
## sit5 -0.9397326 -0.98100452  RR5 1987-1988     B7 (12%)
## sit6 -0.8772582 -1.43304048  RR6 1987-1988     B7 (12%)
wc.scores <- as.data.frame(scores(wc.pca,display = "species")) #extract "species" scores (in this case, score for each water chemistry variable)

Plot PCA graph

This is going to produce a PCA graph for water chemistry variables. See below for codes to produce a biplot to examine which water chemistry measurement is driving the changes/differences among sites.

library(ggplot2)

wc_pca_plot <- ggplot(site.scores.dplyr,aes(PC1, PC2))+ 
    geom_point(size = 6,aes(color=Subwatershed, shape = as.character(Year))) +
    theme(axis.text.y = element_text(colour = "black", size = 20, face = "bold"), 
    axis.text.x = element_text(colour = "black", face = "bold", size = 20), 
    legend.text = element_text(size = 18, face ="bold", colour ="black"), 
    legend.position = "right", axis.title.y = element_text(face = "bold", size = 14), 
    axis.title.x = element_text(face = "bold", size = 25, colour = "black"),
    axis.title.y.left = element_text(face = "bold", size = 25, colour = "black"),
    legend.title = element_text(size = 20, colour = "black", face = "bold"), 
    panel.background = element_blank(), panel.border = element_rect(colour = "black", fill = NA, size = 1.2),
    legend.key=element_blank(),
    plot.title = element_text(color = "black", size = 30, face = "bold", hjust = 0.5)) + 
    labs(
    title = "PCA graph of water chemistry measurements") + 
    theme(axis.title.x = element_text(margin=margin(t=10)), #add margin to x-axis title
        axis.title.y = element_text(margin=margin(r=10)))+
    labs(x = "PC1 (65%)", y = "PC2 (23%)", shape = "Year") +
    scale_colour_manual(values = c("#b10026", "#feb14c","#ffeea1","#fc4d2a")) +
    geom_text(aes(label=Site),hjust=0.4, vjust=1.75,size=5)

wc_pca_plot

Plot a PCA biplot

The job now is to add some arrows that correspond to the “species” scores extract from above.

wc_pca_biplot <- wc_pca_plot + 
  geom_segment(data=wc.scores, aes(x=0, xend=PC1, y=0, yend=PC2), 
               color="black", arrow=arrow(length=unit(0.01,"npc"))) + #add arrow
  geom_text(data=wc.scores, 
            aes(x=PC1,y=PC2,label=rownames(wc.scores),
                hjust=-0.75,vjust=0.5*(1-sign(PC2))), 
            color="black", size=4) + #add text to arrow
  geom_hline(yintercept=0, linetype="dashed", 
                color = "black", size=1) + #add horizontal and vertical lines at 0
  geom_vline(xintercept=0, linetype="dashed", 
                color = "black", size=1)

wc_pca_biplot

ggsave("PCA WC Biplot.jpeg", wc_pca_biplot,width=15,height=8)

Black and White PCA plots for Pubs purposes

library(ggplot2)

site.scores.dplyr$Subwatershed = factor(site.scores.dplyr$Subwatershed,levels=c("B9 (2%)", "B7 (12%)","B9A (13%)","B1 (19%)"))

wc_pca_plot_bw <- ggplot(site.scores.dplyr,aes(PC1, PC2))+ 
    geom_point(size = 9,aes(shape = Subwatershed, fill = as.character(Year))) +
    scale_fill_manual(values = c("#cecece","#040404")) +
    scale_shape_manual(values = c(21,22,23,24)) +  
    theme(axis.text.y = element_text(colour = "black", size = 20, face = "bold"), 
    axis.text.x = element_text(colour = "black", face = "bold", size = 20), 
    legend.text = element_text(size = 18, face ="bold", colour ="black"), 
    legend.position = "right", axis.title.y = element_text(face = "bold", size = 14), 
    axis.title.x = element_text(face = "bold", size = 25, colour = "black"),
    axis.title.y.left = element_text(face = "bold", size = 25, colour = "black"),
    legend.title = element_text(size = 20, colour = "black", face = "bold"), 
    panel.background = element_blank(), panel.border = element_rect(colour = "black", fill = NA, size = 1.2),
    legend.key=element_blank(),
    plot.title = element_text(color = "black", size = 30, face = "bold", hjust = 0.5)) + 
    labs(
    title = "PCA graph of water chemistry measurements") + 
    theme(axis.title.x = element_text(margin=margin(t=10)), #add margin to x-axis title
        axis.title.y = element_text(margin=margin(r=10)))+
    labs(x = "PC1 (65%)", y = "PC2 (23%)", shape="Sub-watershed (% ISA)",fill = "Year") +
    geom_text(aes(label=Site),hjust=0.4, vjust=1.85,size=5)

wc_pca_plot_bw

#Biplot

wc_pca_biplot_bw <- wc_pca_plot_bw + 
  geom_segment(data=wc.scores, aes(x=0, xend=PC1, y=0, yend=PC2), 
               color="black", arrow=arrow(length=unit(0.01,"npc"))) + #add arrow
  geom_text(data=wc.scores, 
            aes(x=PC1,y=PC2,label=rownames(wc.scores),
                hjust=-0.75,vjust=0.5*(1-sign(PC2))), 
            color="black", size=4) + #add text to arrow
  geom_hline(yintercept=0, linetype="dashed", 
                color = "black", size=1) + #add horizontal and vertical lines at 0
  geom_vline(xintercept=0, linetype="dashed", 
                color = "black", size=1)

wc_pca_biplot_bw

ggsave("PCA WC Biplot_BW.jpeg", wc_pca_biplot_bw,width=15,height=8)

References

  1. Jari Oksanen, Gavin L. Simpson, F. Guillaume Blanchet, Roeland Kindt, Pierre Legendre, Peter R. Minchin, R.B. O’Hara, Peter Solymos, M. Henry H. Stevens, Eduard Szoecs, Helene Wagner, Matt Barbour, Michael Bedward, Ben Bolker, Daniel Borcard, Gustavo Carvalho, Michael Chirico, Miquel De Caceres, Sebastien Durand, Heloisa Beatriz Antoniazi Evangelista, Rich FitzJohn, Michael Friendly, Brendan Furneaux, Geoffrey Hannigan, Mark O. Hill, Leo Lahti, Dan McGlinn, Marie-Helene Ouellette, Eduardo Ribeiro Cunha, Tyler Smith, Adrian Stier, Cajo J.F. Ter Braak and James Weedon (2022). vegan: Community Ecology Package. R package version 2.6-2. https://CRAN.R-project.org/package=vegan
  2. Hadley Wickham, Romain François, Lionel Henry and Kirill Müller (2022). dplyr: A Grammar of Data Manipulation. R package version 1.0.8. https://CRAN.R-project.org/package=dplyr
  3. Hadley Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2016.