These are the codes I used to run PCA of my averaged water chemistry data. Instead of using the princomp() function, I am going to try out the rda() function from the vegan package (Oksanen et al. 2012). Any comment is welcome.
My dataset can be found here.
library(readxl)
wc <- read_excel("F:/Thesis stuff/Data Analysis/Part a (87-09)/PCA (Cl, Ca, SC)/Data for PCA.xlsx")
wc_wo_SU <- wc[,4:ncol(wc)] #exclude the first columns that contains SU information
#This dataframe includes all variables
#wc_wo_SU <- wc[,4:6] #exclude the first columns that contains SU information
#This data frame includes only Cl, Ca, SC (excluding nutrient parameters)
m_wc <- as.matrix(wc_wo_SU)
library(moments) #load moments package to check for kurtosis and skewness
skewness(m_wc)
## Cl Ca SC TKN TP TSS
## 0.8365580 0.8086261 0.7306859 1.1604220 -1.1824354 0.6407346
kurtosis(m_wc)
## Cl Ca SC TKN TP TSS
## 1.967369 2.068923 1.836838 3.261054 2.809512 3.695279
library(vegan) #load the package
wc.pca <- rda(m_wc,scale=T) #scale=T will standardize the variables
summary(eigenvals(wc.pca)) #summarize eigenvalues and proportion explained by each PC
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Eigenvalue 3.9457 1.3346 0.36425 0.29652 0.051383 0.007499
## Proportion Explained 0.6576 0.2224 0.06071 0.04942 0.008564 0.001250
## Cumulative Proportion 0.6576 0.8801 0.94077 0.99019 0.998750 1.000000
Another way to add columns to the existing dataset is to use pipes from the dplyr package (Wickham et al., 2022).
library(dplyr)
library(vegan) #load the package
site.scores <- as.data.frame(scores(wc.pca,display = "sites")) #extract site scores
site.scores.dplyr <- site.scores %>% mutate(Site = wc$Site,Year=wc$Year,
Subwatershed=wc$Subwatershed)
head(site.scores.dplyr)
## PC1 PC2 Site Year Subwatershed
## sit1 -0.3329020 1.72156307 RR1 1987-1988 B1 (19%)
## sit2 -0.5150767 0.58659590 RR2 1987-1988 B1 (19%)
## sit3 -0.3940687 0.04159008 RR3 1987-1988 B9A (13%)
## sit4 -0.7672729 -0.21760806 RR4 1987-1988 B9 (2%)
## sit5 -0.9397326 -0.98100452 RR5 1987-1988 B7 (12%)
## sit6 -0.8772582 -1.43304048 RR6 1987-1988 B7 (12%)
wc.scores <- as.data.frame(scores(wc.pca,display = "species")) #extract "species" scores (in this case, score for each water chemistry variable)
This is going to produce a PCA graph for water chemistry variables. See below for codes to produce a biplot to examine which water chemistry measurement is driving the changes/differences among sites.
library(ggplot2)
wc_pca_plot <- ggplot(site.scores.dplyr,aes(PC1, PC2))+
geom_point(size = 6,aes(color=Subwatershed, shape = as.character(Year))) +
theme(axis.text.y = element_text(colour = "black", size = 20, face = "bold"),
axis.text.x = element_text(colour = "black", face = "bold", size = 20),
legend.text = element_text(size = 18, face ="bold", colour ="black"),
legend.position = "right", axis.title.y = element_text(face = "bold", size = 14),
axis.title.x = element_text(face = "bold", size = 25, colour = "black"),
axis.title.y.left = element_text(face = "bold", size = 25, colour = "black"),
legend.title = element_text(size = 20, colour = "black", face = "bold"),
panel.background = element_blank(), panel.border = element_rect(colour = "black", fill = NA, size = 1.2),
legend.key=element_blank(),
plot.title = element_text(color = "black", size = 30, face = "bold", hjust = 0.5)) +
labs(
title = "PCA graph of water chemistry measurements") +
theme(axis.title.x = element_text(margin=margin(t=10)), #add margin to x-axis title
axis.title.y = element_text(margin=margin(r=10)))+
labs(x = "PC1 (65%)", y = "PC2 (23%)", shape = "Year") +
scale_colour_manual(values = c("#b10026", "#feb14c","#ffeea1","#fc4d2a")) +
geom_text(aes(label=Site),hjust=0.4, vjust=1.75,size=5)
wc_pca_plot
The job now is to add some arrows that correspond to the “species” scores extract from above.
wc_pca_biplot <- wc_pca_plot +
geom_segment(data=wc.scores, aes(x=0, xend=PC1, y=0, yend=PC2),
color="black", arrow=arrow(length=unit(0.01,"npc"))) + #add arrow
geom_text(data=wc.scores,
aes(x=PC1,y=PC2,label=rownames(wc.scores),
hjust=-0.75,vjust=0.5*(1-sign(PC2))),
color="black", size=4) + #add text to arrow
geom_hline(yintercept=0, linetype="dashed",
color = "black", size=1) + #add horizontal and vertical lines at 0
geom_vline(xintercept=0, linetype="dashed",
color = "black", size=1)
wc_pca_biplot
ggsave("PCA WC Biplot.jpeg", wc_pca_biplot,width=15,height=8)
library(ggplot2)
site.scores.dplyr$Subwatershed = factor(site.scores.dplyr$Subwatershed,levels=c("B9 (2%)", "B7 (12%)","B9A (13%)","B1 (19%)"))
wc_pca_plot_bw <- ggplot(site.scores.dplyr,aes(PC1, PC2))+
geom_point(size = 9,aes(shape = Subwatershed, fill = as.character(Year))) +
scale_fill_manual(values = c("#cecece","#040404")) +
scale_shape_manual(values = c(21,22,23,24)) +
theme(axis.text.y = element_text(colour = "black", size = 20, face = "bold"),
axis.text.x = element_text(colour = "black", face = "bold", size = 20),
legend.text = element_text(size = 18, face ="bold", colour ="black"),
legend.position = "right", axis.title.y = element_text(face = "bold", size = 14),
axis.title.x = element_text(face = "bold", size = 25, colour = "black"),
axis.title.y.left = element_text(face = "bold", size = 25, colour = "black"),
legend.title = element_text(size = 20, colour = "black", face = "bold"),
panel.background = element_blank(), panel.border = element_rect(colour = "black", fill = NA, size = 1.2),
legend.key=element_blank(),
plot.title = element_text(color = "black", size = 30, face = "bold", hjust = 0.5)) +
labs(
title = "PCA graph of water chemistry measurements") +
theme(axis.title.x = element_text(margin=margin(t=10)), #add margin to x-axis title
axis.title.y = element_text(margin=margin(r=10)))+
labs(x = "PC1 (65%)", y = "PC2 (23%)", shape="Sub-watershed (% ISA)",fill = "Year") +
geom_text(aes(label=Site),hjust=0.4, vjust=1.85,size=5)
wc_pca_plot_bw
#Biplot
wc_pca_biplot_bw <- wc_pca_plot_bw +
geom_segment(data=wc.scores, aes(x=0, xend=PC1, y=0, yend=PC2),
color="black", arrow=arrow(length=unit(0.01,"npc"))) + #add arrow
geom_text(data=wc.scores,
aes(x=PC1,y=PC2,label=rownames(wc.scores),
hjust=-0.75,vjust=0.5*(1-sign(PC2))),
color="black", size=4) + #add text to arrow
geom_hline(yintercept=0, linetype="dashed",
color = "black", size=1) + #add horizontal and vertical lines at 0
geom_vline(xintercept=0, linetype="dashed",
color = "black", size=1)
wc_pca_biplot_bw
ggsave("PCA WC Biplot_BW.jpeg", wc_pca_biplot_bw,width=15,height=8)