The PCA test was done using the pcaMethods package written by Stacklies et.al (2007). This package can automatically impute the missing values in the dataset. Previous attempts to use prcomp() and princomp() gained error messages as there are several NAs in the dataset. Arguments of:

were used, but unsuccesful.

The pcaMethods package has several pca methods:

We applied the pca on the dataset, which have been divided in to three groups:

For each group we used the following work flow:

0. Loading libraries and preparing dataset

source("http://bioconductor.org/biocLite.R")
biocLite("pcaMethods")
require("pcaMethods")
## Loading required package: pcaMethods
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist
## 
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Loading required package: Rcpp
## 
## Attaching package: 'pcaMethods'
## 
## The following object is masked from 'package:stats':
## 
##     loadings
data <- as.data.frame(read.csv("0806alldata.csv",
                      header = TRUE))
attach(data)
## The following object is masked from package:datasets:
## 
##     CO2
group1 <- data[,c("x", "y", "type", "ec", "elv",  
                  "ph", "hard", "tds", "temp",
                  "eh", "Q", "cumrain", "lag1")]

group2 <- data[,c("x", "y", "type", "ec", "elv", "aq", 
                  "Ca", "Mg", 
                  "Fe", "Mn",  
                  "K", "Na",  
                  "cumrain", "lag1")]

group3 <- data[,c("x", "y", "type", "ec", "elv", "aq", 
                  "CO3", "HCO3", 
                  "CO2", "Cl", 
                  "SO4", "NO2", 
                  "NO3", "SiO2",  
                  "cumrain", "lag1")]

1. Run the PCA on Group1

pcaSvd1 <- pca(group1, 
            method = "svdImpute", 
            scale = "uv",
            center = T,
            nPcs = 5,
            evalPcs = 1:5)

pcaNipals1 <- pca(group1, 
                method = "nipals", 
                scale = "uv",
                center = T,
                nPcs = 5,
                evalPcs = 1:5)
sDevs1 <- cbind(sDev(pcaSvd1), sDev(pcaNipals1))
matplot(sDevs1, type = 'l', 
        xlab="Eigenvalues", ylab="Standard deviation of PC", 
        lwd = 3)
legend(x="topright", legend=c("SVD Impute", "Nipals"), 
       lty=1:2, 
       col=1:2, 
       lwd=3)

plot of chunk unnamed-chunk-4

summary(pcaSvd1)
## svdImpute calculated PCA
## Importance of component(s):
##                  PC1     PC2     PC3     PC4     PC5
## R2            0.8627 0.03331 0.02638 0.01715 0.01305
## Cumulative R2 0.8627 0.89600 0.92238 0.93953 0.95259
summary(pcaNipals1)
## nipals calculated PCA
## Importance of component(s):
##                  PC1    PC2    PC3    PC4     PC5
## R2            0.2507 0.1427 0.1092 0.1007 0.07587
## Cumulative R2 0.2507 0.3935 0.5026 0.6033 0.67916
summaries1 <- cbind(summary(pcaSvd1), summary(pcaNipals1))
## svdImpute calculated PCA
## Importance of component(s):
##                  PC1     PC2     PC3     PC4     PC5
## R2            0.8627 0.03331 0.02638 0.01715 0.01305
## Cumulative R2 0.8627 0.89600 0.92238 0.93953 0.95259
## nipals calculated PCA
## Importance of component(s):
##                  PC1    PC2    PC3    PC4     PC5
## R2            0.2507 0.1427 0.1092 0.1007 0.07587
## Cumulative R2 0.2507 0.3935 0.5026 0.6033 0.67916
loadings(pcaSvd1)
##               PC1       PC2      PC3       PC4       PC5
## x       -0.055590 -0.258083 -0.10862  0.141716 -0.137169
## y        0.081965  0.237907 -0.07555 -0.186311  0.369474
## ec       0.013444  0.454477  0.02570  0.170224 -0.489954
## elv      0.021887 -0.479781 -0.01203  0.297556 -0.525475
## ph      -0.028040  0.002813 -0.58880 -0.067608 -0.048484
## hard    -0.120396 -0.045594 -0.02631  0.004933 -0.006708
## tds      0.025296  0.501309 -0.07703  0.027373 -0.147327
## temp     0.059726  0.399000 -0.15027  0.076562 -0.299507
## eh      -0.080122  0.089972  0.67051 -0.029060  0.048412
## Q        0.981641 -0.065646  0.04575  0.003528  0.002783
## cumrain -0.001139 -0.045669  0.30513 -0.580484 -0.438093
## lag1    -0.001920 -0.113866 -0.24702 -0.691855 -0.142872
loadings(pcaNipals1)
##              PC1      PC2      PC3      PC4       PC5
## x        0.34744  0.11832  0.10234  0.08131  0.102080
## y       -0.40791 -0.03421 -0.27768 -0.09470 -0.380929
## ec      -0.32578  0.15654  0.38744  0.02015  0.495119
## elv      0.24351 -0.26548 -0.34322  0.44942  0.549732
## ph       0.10725  0.62606 -0.19861 -0.04525  0.116537
## hard     0.39517  0.28306  0.46879 -0.18382 -0.018107
## tds     -0.39953  0.22325  0.24730 -0.06777  0.122003
## temp    -0.45991  0.09961 -0.09020  0.13021  0.319312
## eh      -0.01243 -0.41665  0.20386 -0.16159 -0.005983
## Q       -0.04835 -0.23190  0.09382  0.10677 -0.065828
## cumrain  0.02829 -0.33303  0.06510 -0.62748  0.344662
## lag1     0.07322  0.14040 -0.51128 -0.54167  0.207871
par(mfrow=c(1, 2))
plot(loadings(pcaSvd1), 
     pch = 20,
     main = "Variable loadings",
     sub = "Svd on Group1")
text(loadings(pcaSvd1), 
     row.names(loadings(pcaSvd1)),
     cex=0.6, 
     pos=1, 
     col="red")
plot(loadings(pcaNipals1), 
     pch = 20,
     main = "Variable loadings",
     sub = "Nipals on Group1")
text(loadings(pcaNipals1), row.names(loadings(pcaNipals1)),
     cex=0.6, 
     pos=1, 
     col="red")

plot of chunk unnamed-chunk-6

scores(pcaSvd1)
scores(pcaNipals1)
par(mfrow=c(1, 1))
plot(scores(pcaSvd1), 
     pch = c(group1$type), 
     col = c(group1$type),
     main = "Case scores",
     sub = "Svd on Group1")
legend(x = "topleft", 
        c("Groundwater", "River Water"),
        title = "Water type:",
        pch = c(1, 2), 
        col = c("black", "red"))

plot of chunk unnamed-chunk-8

plot(scores(pcaNipals1), 
     pch = c(group1$type), 
     col = c(group1$type),
     main = "Case scores",
     sub = "Nipals on Group1")
legend(x = "bottomright", 
       c("Groundwater", "River Water"),
       title = "Water type:",
       pch = c(1, 2), 
       col = c("black", "red"))

plot of chunk unnamed-chunk-8

2. Run the PCA on Group2

pcaSvd2 <- pca(group2, 
               method = "svdImpute", 
               scale = "uv",
               center = T,
               nPcs = 5,
               evalPcs = 1:5)

pcaNipals2 <- pca(group2, 
                  method = "nipals", 
                  scale = "uv",
                  center = T,
                  nPcs = 5,
                  evalPcs = 1:5)
sDevs2 <- cbind(sDev(pcaSvd2), 
                sDev(pcaNipals2))

matplot(sDevs2, type = 'l', 
        xlab="Eigenvalues", ylab="Standard deviation of PC", 
        lwd = 3)
legend(x="topright", legend=c("SVD Impute", "Nipals"), 
       lty=1:2, 
       col=1:2, 
       lwd=3)

plot of chunk unnamed-chunk-10

summary(pcaSvd2)
## svdImpute calculated PCA
## Importance of component(s):
##                  PC1    PC2     PC3     PC4     PC5
## R2            0.2812 0.1402 0.09752 0.08543 0.07816
## Cumulative R2 0.2812 0.4214 0.51889 0.60432 0.68249
summary(pcaNipals2)
## nipals calculated PCA
## Importance of component(s):
##                  PC1    PC2     PC3     PC4     PC5
## R2            0.2812 0.1402 0.09752 0.08543 0.07816
## Cumulative R2 0.2812 0.4214 0.51889 0.60432 0.68249
loadings(pcaSvd2, pcaNipals2)
##              PC1      PC2      PC3      PC4      PC5
## x       -0.24485  0.43968 -0.06425  0.06130 -0.19926
## y        0.25851 -0.41408 -0.03813 -0.11925 -0.15694
## ec       0.32826  0.11734  0.18515 -0.11466  0.44728
## elv     -0.30501  0.02586  0.21251 -0.04192  0.03601
## Ca       0.41508  0.23971 -0.13006  0.07614 -0.14927
## Mg       0.42517  0.26012  0.02015  0.06070  0.14408
## Fe       0.05489  0.35676  0.09350 -0.73759  0.16457
## Mn       0.20924 -0.58165  0.18621 -0.22286  0.04176
## K        0.31366  0.05992 -0.27434  0.17778 -0.43282
## Na       0.40999  0.11887  0.04121  0.05847 -0.01283
## cumrain -0.04763 -0.08078 -0.59370  0.24220  0.66366
## lag1    -0.06194 -0.07626 -0.65176 -0.51986 -0.18575
par(mfrow=c(1, 2))
plot(loadings(pcaSvd2), 
     pch = 20,
     main = "Variable loadings",
     sub = "Svd on Group2")
text(loadings(pcaSvd2), 
     row.names(loadings(pcaSvd2)),
     cex=0.6, 
     pos=1, 
     col="red")
plot(loadings(pcaNipals2), 
     pch = 20,
     main = "Variable loadings", 
     sub = "Nipals on Group2")
text(loadings(pcaNipals2), 
     row.names(loadings(pcaNipals2)),
     cex=0.6, 
     pos=1, 
     col="red")

plot of chunk unnamed-chunk-12

scores(pcaSvd2)
scores(pcaNipals2)

plot.new()
par(mfrow=c(1, 1))
plot(scores(pcaSvd2), 
     pch = c(group2$type), 
     col = c(group2$type),
     main = "Case scores",
     sub = "Svd on Group2")
legend(x = "bottomright", 
       c("Groundwater", "River Water"),
       title = "Water type:",
       pch = c(1, 2), 
       col = c("black", "red"))

plot(scores(pcaNipals2), 
     pch = c(group2$type), 
     col = c(group2$type),
     main = "Case scores",
     sub = "Nipals on Group2")
legend(x = "bottomleft", 
       c("Groundwater", "River Water"),
       title = "Water type:",
       pch = c(1, 2), 
       col = c("black", "red"))

3. Run the PCA on Group3

pcaSvd3 <- pca(group3, 
               method = "svdImpute", 
               scale = "uv",
               center = T,
               nPcs = 5,
               evalPcs = 1:5)

pcaNipals3 <- pca(group3, 
                  method = "nipals", 
                  scale = "uv",
                  center = T,
                  nPcs = 5,
                  evalPcs = 1:5)
sDevs3 <- cbind(sDev(pcaSvd3), 
                sDev(pcaNipals3))

plot.new()
matplot(sDevs3, type = 'l', 
        xlab="Eigenvalues", ylab="Standard deviation of PC", 
        lwd = 3)
legend(x="topright", 
       legend=c("SVD Impute", "Nipals"), 
       lty=1:2, 
       col=1:2, 
       lwd=3)

plot of chunk unnamed-chunk-15

summary(pcaSvd3)
## svdImpute calculated PCA
## Importance of component(s):
##                  PC1    PC2     PC3    PC4     PC5
## R2            0.1985 0.1331 0.09616 0.0824 0.07553
## Cumulative R2 0.1985 0.3316 0.42777 0.5102 0.58570
summary(pcaNipals3)
## nipals calculated PCA
## Importance of component(s):
##                  PC1    PC2     PC3    PC4     PC5
## R2            0.1985 0.1331 0.09616 0.0824 0.07553
## Cumulative R2 0.1985 0.3316 0.42777 0.5102 0.58570
loadings(pcaSvd3)
##                PC1      PC2      PC3      PC4      PC5
## x       -2.742e-01  0.22823 -0.13022  0.03249  0.37242
## y        3.105e-01 -0.38729 -0.26044 -0.26450 -0.01471
## ec       3.799e-01  0.11277  0.05207  0.34225 -0.02558
## elv     -3.609e-01 -0.13550 -0.02988  0.39823  0.14732
## CO3     -5.996e-05 -0.41039  0.09646  0.26083  0.11118
## HCO3     4.288e-01  0.16297 -0.08358 -0.01585 -0.08037
## CO2      7.661e-02 -0.08233  0.57116 -0.06414 -0.38893
## Cl       4.922e-01  0.04829  0.04742  0.19269  0.19472
## SO4      3.229e-01  0.03797  0.21588  0.12078  0.41303
## NO2     -4.244e-02  0.58458  0.18893  0.11726  0.03898
## NO3     -1.683e-02  0.40910  0.12562 -0.25883 -0.02617
## SiO2     1.026e-01  0.10191 -0.35887 -0.48167  0.23097
## cumrain -1.957e-02 -0.14252  0.30957 -0.11399  0.63083
## lag1    -6.750e-02 -0.14666  0.49411 -0.45102  0.08141
loadings(pcaNipals3)
##                PC1      PC2      PC3      PC4      PC5
## x        2.742e-01  0.22823  0.13023  0.03268  0.37263
## y       -3.105e-01 -0.38724  0.26041 -0.26458 -0.01451
## ec      -3.799e-01  0.11280 -0.05199  0.34225 -0.02586
## elv      3.609e-01 -0.13553  0.03002  0.39831  0.14716
## CO3      2.518e-05 -0.41040 -0.09633  0.26093  0.11074
## HCO3    -4.288e-01  0.16302  0.08355 -0.01592 -0.08027
## CO2     -7.662e-02 -0.08240 -0.57119 -0.06421 -0.38872
## Cl      -4.922e-01  0.04833 -0.04736  0.19282  0.19465
## SO4     -3.229e-01  0.03797 -0.21582  0.12109  0.41308
## NO2      4.249e-02  0.58455 -0.18896  0.11734  0.03880
## NO3      1.687e-02  0.40909 -0.12575 -0.25881 -0.02618
## SiO2    -1.026e-01  0.10196  0.35874 -0.48163  0.23115
## cumrain  1.956e-02 -0.14256 -0.30954 -0.11351  0.63084
## lag1     6.749e-02 -0.14672 -0.49421 -0.45083  0.08176
par(mfrow=c(1, 2))
plot(loadings(pcaSvd3), 
     pch = 20,
     main = "Variable loadings",
     sub = "Svd on Group3")
text(loadings(pcaSvd3), 
     row.names(loadings(pcaSvd3)),
     cex=0.6, 
     pos=1, 
     col="red")
plot(loadings(pcaNipals3), 
     pch = 20,
     main = "Variable loadings",
     sub = "Nipals on Group3")
text(loadings(pcaNipals3), 
     row.names(loadings(pcaNipals3)),
     cex=0.6, 
     pos=1, 
     col="red")

plot of chunk unnamed-chunk-17

scores(pcaSvd3)
scores(pcaNipals3)

plot.new()
par(mfrow=c(1,1))
plot(scores(pcaSvd3), 
     pch = c(group3$type), 
     col = c(group3$type),
     main = "Case scores",
     sub = "SVD on Group3")
legend(x = "topleft", 
       c("Groundwater", "River Water"),
       title = "Water type:",
       pch = c(1, 2), 
       col = c("black", "red"))

plot(scores(pcaNipals3), 
     pch = c(group3$type), 
     col = c(group3$type),
     main = "Case scores",
     sub = "Nipals on Group3")
legend(x = "topleft", 
       c("Groundwater", "River Water"),
       title = "Water type:",
       pch = c(1, 2), 
       col = c("black", "red"))

4. Preliminary results

4.1 Group 1

  • The eigenvalues suggests two PCs to be retained, from svdImpute method. Both PCs explaine 89.6% of total variance. We drop the Nipals method, based on the poor PC result.

  • The loading plot from svd shows good separation of the variables. All variables, except Q is aligned in vertical (PC1) direction, while the Nipals loading plot shows no pattern.

  • The case score plot from svd shows good separation of groundwater data (black) from the river data (red). The groundwater data is aligned in vertical axis, while the river is in diagonal pattern.

4.2 Group 2

  • The eigenvalues suggests three PCs to be retained, from svdImpute method. All PCs explain 51.9% of total variance. Here’s we see that the Nipals method shows similar result.

  • The loading plot from both methods shows similar distribution of variables in respect to PC1 and PC2. Nipals plot show a vertical flipped distribution from the svd plot.
    • The spatial variables (x, y, elv) are not closely distributed.
    • The rain variables (cumrain and lag1) fall in one group.
    • We also see the EC has stronger interaction with Ca, Na, Mg, K, and Mn, while Fe is separated from the group.
  • The case score plot from both method show similar distribution. The Nipals plot is a vertical flipped of the svd plot. The plots show a clear separation of groundwater data (black) and river data (red).

4.3 Group 3

  • The eigenvalues from both methods suggest three PCs to be retained. Both methods show 42.7% of total variance explained by the PCs.

  • The loading plot from both methods shows similar distribution of variables in respect to PC1 and PC2. Nipals plot show a vertical flipped distribution from the svd plot.
    • The spatial variables (x, y, elv) again are not closely distributed.
    • The rain variables (cumrain and lag1) dan CO2 fall in one group.
    • We also see the EC has stronger interaction with the HCO3, CO3, CL, SO4, and SiO2, while NO2 and NO3 is separated from the group.
  • The case score plot from svd shows good separation of groundwater data (black) from the river data (red). The river data is aligned in vertical axis, while the river is in scaterred diagonal pattern.

References

citation("base")
## 
## To cite R in publications use:
## 
##   R Core Team (2014). R: A language and environment for
##   statistical computing. R Foundation for Statistical Computing,
##   Vienna, Austria. URL http://www.R-project.org/.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2014},
##     url = {http://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please
## cite it when using it for data analysis. See also
## 'citation("pkgname")' for citing R packages.
citation("knitr")
## 
## To cite the 'knitr' package in publications use:
## 
##   Yihui Xie (2014). knitr: A general-purpose package for dynamic
##   report generation in R. R package version 1.6.
## 
##   Yihui Xie (2013) Dynamic Documents with R and knitr. Chapman and
##   Hall/CRC. ISBN 978-1482203530
## 
##   Yihui Xie (2014) knitr: A Comprehensive Tool for Reproducible
##   Research in R. In Victoria Stodden, Friedrich Leisch and Roger
##   D. Peng, editors, Implementing Reproducible Computational
##   Research. Chapman and Hall/CRC. ISBN 978-1466561595
citation("pcaMethods")
## 
## The pcaMethods package implement algorithms found in several
## different publication. Refer to function documentation for
## reference to the original articles.
## 
##   Stacklies, W., Redestig, H., Scholz, M., Walther, D. and Selbig,
##   J.  pcaMethods -- a Bioconductor package providing PCA methods
##   for incomplete data. Bioinformatics, 2007, 23, 1164-1167
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {pcaMethods -- a Bioconductor package providing PCA methods for incomplete   data},
##     author = {Wolfram Stacklies and Henning Redestig and Matthias Scholz and Dirk Walther and Joachim Selbig},
##     journal = {Bioinformatics},
##     year = {2007},
##     pages = {1164--1167},
##     volume = {23},
##   }
## 
## This free open-source software implements academic research by the
## authors and co-workers. If you use it, please support the project
## by citing the appropriate journal articles.