Data exploration of FTIR spectra (mineral-associated SOC) using ChemoSpec (Hanson, 2012)

This is a preliminary exploration of FTIR data of FTIR spectra of the minera-associated fraction of SOC samples from transects collected at Cedar Mountain (2010) and Franklin Basin (2011), at Utah. In te beginning, I am only looking for differences between species

setwd("~/Desktop/CSV files/S&C CM&FB processed averaged")

library("R.utils")
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.4.2 (2012-06-22) successfully loaded. See ?R.methodsS3 for
## help.
## R.oo v1.11.7 (2013-01-08) successfully loaded. See ?R.oo for help.
## Attaching package: 'R.oo'
## The following object(s) are masked from 'package:methods':
## 
## getClasses, getMethods
## The following object(s) are masked from 'package:base':
## 
## attach, detach, gc, load, save
## R.utils v1.19.5 (2013-01-11) successfully loaded. See ?R.utils for help.
## Attaching package: 'R.utils'
## The following object(s) are masked from 'package:utils':
## 
## timestamp
## The following object(s) are masked from 'package:base':
## 
## cat, commandArgs, getOption, inherits, isOpen, parse, warnings
library(ChemoSpec)
## Warning: replacing previous import 'panel.lines' when loading 'lattice'
## Package SparseM (0.96) loaded.  To cite, see citation("SparseM")

### loading Data, grouped by vegetation type ('A' aspen, 'M' mixed, 'C'
### conifer)
getManyCsv(gr.crit = c("aspen", "mixed", "conifer"), gr.cols = c("green3", "darkcyan", 
    "red"), freq.unit = "wavenumber", int.unit = "Absorbance intensity", descrip = "Vegetation effect on S&C-SOC spectra", 
    format = "csv", out.file = "SC")
### loading the data
SC <- loadObject("SC.RData")
#### Survey of spectra, looking for areas of interest, and noise:
specSurvey(SC, method = "iqr", title = "Mineral associated SOC (< 53 microm) 0-5 cm", 
    by.gr = TRUE, xlim = c(600, 3800))
## Warning: Invalid or ambiguous component names:

plot of chunk unnamed-chunk-1

### you can zoon in areas of interest
specSurvey(SC, method = "iqr", title = "Mineral associated SOC (< 53 microm) 0-5 cm", 
    by.gr = TRUE, xlim = c(950, 1700), ylim = c(-0.1, 0.35))
## Warning: Invalid or ambiguous component names:

plot of chunk unnamed-chunk-1

###### I remove the area between 1700-2670 cm-1 because of C02 noise,
###### below 950 cm-1 and above 3000 cm-1 for mineral matrix (silica,clay)
###### and adsorbed water noise:

SC1 <- removeFreq(SC, rem.freq = SC$freq > 1700 & SC$freq < 2670)
SC2 <- removeFreq(SC1, rem.freq = SC1$freq < 950)
SC3 <- removeFreq(SC2, rem.freq = SC2$freq > 3000)
sumSpectra(SC3)
## 
##  Vegetation effect on S&C-SOC spectra 
## 
##  There are 46 spectra in this set.
##  The y-axis unit is Absorbance intensity.
## 
##  The frequency scale runs from 950.7 to 3000 wavenumber
##  There are 1121 frequency (x-axis) data points.
##  The frequency resolution is 0.9642 wavenumber/point.
## 
##  This data set is not continuous along the frequency axis.
##  Here are the data chunks:
## 
##   beg.freq end.freq  size beg.indx end.indx
## 1    950.7     1700 749.2        1      778
## 2   2670.0     3000 329.8      779     1121
## 
##  The spectra are divided into 3 groups: 
## 
##     group no.    color symbol alt.sym
## 1   aspen  17   green3      1       a
## 2 conifer  13      red      3       c
## 3   mixed  16 darkcyan      2       b
## 
## *** Note: this data is an S3 object of class 'Spectra'

#### Representation of the areas in the x axis (wavenumber regions) used
#### ofr analysis
check4Gaps(SC3$freq, SC3$data[1, ], plot = TRUE)

plot of chunk unnamed-chunk-1

##   beg.freq end.freq  size beg.indx end.indx
## 1    950.7     1700 749.2        1      778
## 2   2670.0     3000 329.8      779     1121

#### cluster analysis
hcaSpectra(SC3, title = "Mineral-associated SOC (< 53 microm) 0-5 cm")

plot of chunk unnamed-chunk-1


### Classic pca
class <- classPCA(SC3, choice = "noscale")
plotScores(SC3, title = "Mineral-associated SOC (< 53 microm) 0-5 cm", class, 
    pcs = c(1, 2), ellipse = "rob", tol = 0.05)

plot of chunk unnamed-chunk-1

### Robust pca
robust <- robPCA(SC3, choice = "noscale")
plotScores(SC3, title = "Mineral-associated SOC (< 53 microm) 0-5 cm", robust, 
    pcs = c(1, 2), ellipse = "rob", tol = 0.05)

plot of chunk unnamed-chunk-1

plotScores(SC3, title = "Mineral-associated SOC (< 53 microm) 0-5 cm", robust, 
    pcs = c(1, 3), ellipse = "rob", tol = 0.05)

plot of chunk unnamed-chunk-1



#### Visualization of loadings
plotLoadings(SC3, robust, title = "Mineral-associated SOC (< 53 microm) 0-5 cm", 
    loads = c(1, 2, 3), ref = 9)

plot of chunk unnamed-chunk-1



### Checking for outliers
diagnostics <- pcaDiag(SC3, robust, pcs = 2, plot = "OD")

plot of chunk unnamed-chunk-1

diagnostics <- pcaDiag(SC3, robust, pcs = 2, plot = "SD")

plot of chunk unnamed-chunk-1


#### How many pca components are needed?
plotScree(robust, title = "Mineral-associated SOC (< 53 microm) 0-5 cm")

plot of chunk unnamed-chunk-1

plotScree2(robust, title = "Mineral-associated SOC (< 53 microm) 0-5 cm")

plot of chunk unnamed-chunk-1

out <- pcaBoot(SC3, pcs = 5, choice = "noscale")

plot of chunk unnamed-chunk-1



### hca on the clean data
hcaScores(SC3, robust, scores = c(1:3), title = "Mineral-associated SOC (< 53 microm) 0-5 cm")

plot of chunk unnamed-chunk-1

### After performing pca and cluster analysis, I may be able to extract
### the intensities of the wavenumber points indicative of functional
### groups of interest (e.g., 1030 cm-1 -> polysaccharides, 2930 cm-1 ->
### aliphatic C,etc)

wavenumbers <- as.vector(SC3$freq)
### gives me the index no. for all wavenumbers: e.g., 1030.7820 cm-1 is
### the x point no. 84. If I am able to extract from the element SC3$data,
### the column no. 85, I´ll have automatically the intensities of my 46
### spectra For example, from SC3$data, the column no.85 corresponds to
### the intensities for the wavenumber 1030 cm-1 and column 221 corresonds
### to wavenumber 1162 cm-1
AI_1030 <- as.vector(SC3$data[, 85])
AI_1162 <- as.vector(SC3$data[, 221])
ID <- as.vector(SC3$names)
Vegetation <- c("aspen", "aspen", "aspen", "mixed", "mixed", "mixed", "aspen", 
    "aspen", "aspen", "conifer", "conifer", "conifer", "mixed", "mixed", "mixed", 
    "aspen", "aspen", "conifer", "conifer", "mixed", "mixed", "aspen", "aspen", 
    "aspen", "conifer", "conifer", "mixed", "mixed", "aspen", "aspen", "aspen", 
    "conifer", "conifer", "conifer", "mixed", "mixed", "mixed", "aspen", "aspen", 
    "aspen", "conifer", "conifer", "conifer", "mixed", "mixed", "mixed")


#### I prepare a dataframe just with the sample ID, the vegetation type (I
#### could add the site too) and the intensities for the wavenumber 1030
#### cm-1 and 1162 cm-1
data <- data.frame(ID = ID, Vegetation = Vegetation, AI_1030 = AI_1030, AI_1162 = AI_1162)
head(data)
##            ID Vegetation AI_1030 AI_1162
## 1 MA_Q_aspen1      aspen  0.3112 0.07570
## 2 MA_Q_aspen2      aspen  0.2379 0.05915
## 3 MA_Q_aspen3      aspen  0.2344 0.06064
## 4 MA_Q_mixed1      mixed  0.3118 0.06936
## 5 MA_Q_mixed2      mixed  0.2926 0.07305
## 6 MA_Q_mixed3      mixed  0.3014 0.06870

### just very quick, I´ll do an ANOVA (wrong design, only one factor, when
### in reality I have a mixed model, but it jut to illustrate) One Way
### Anova (Completely Randomized Design)
fit <- aov(AI_1030 ~ Vegetation, data = data)
summary(fit)
##             Df Sum Sq Mean Sq F value Pr(>F)
## Vegetation   2 0.0055 0.00274    1.16   0.32
## Residuals   43 0.1014 0.00236
boxplot(AI_1030 ~ Vegetation, data = data, main = "Vegetation effect on absorbance @1030 cm-1", 
    ylab = " Absorbance", xlab = "Vegetation")

plot of chunk unnamed-chunk-1

boxplot(AI_1162 ~ Vegetation, data = data, main = "Vegetation effect on absorbance @1162 cm-1", 
    ylab = " Absorbance", xlab = "Vegetation")

plot of chunk unnamed-chunk-1

#### As we see, this does not inform of great differences among
#### vegetation. But it would be better to look at the loadings from pca,
#### for example, because they capture the greater variability
names(robust)
##  [1] "sdev"     "loadings" "k"        "obj"      "n.obs"    "args"    
##  [7] "scale"    "center"   "call"     "pc.order" "scores"   "method"  
## [13] "rotation" "x"
pca1 <- robust$scores[, 1]
data <- data.frame(ID = ID, Vegetation = Vegetation, pca1 = pca1)
fit <- aov(pca1 ~ Vegetation, data = data)
summary(fit)
##             Df Sum Sq Mean Sq F value Pr(>F)
## Vegetation   2   0.40   0.201    1.44   0.25
## Residuals   43   6.01   0.140
plot(fit)

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

boxplot(pca1 ~ Vegetation, data = data, main = "Vegetation effect on scores of pc1", 
    ylab = " Scores ", xlab = "Vegetation")

plot of chunk unnamed-chunk-1

### Still not significant differences

References: Bryan A. Hanson (2012) ChemoSpec: Exploratory Chemometrics for Spectroscopy. R package version 1.51-2, academic.depauw.edu/~hanson/ChemoSpec/ChemoSpec.html