library(pls)
##
## Attaching package: 'pls'
##
## The following object is masked from 'package:stats':
##
## loadings
data(gasoline)
options(digits=4)
## The main reason for using PCR or PLSR is that they have been designed to confront the situation that there are many, possibly correlated, predictor variables, and relatively few samples
## Method of generic functions like predict, update and coef have more specialised functions like scores, loadings, RMSEP.
## for visualization pls package has a number of plot functions for plotting scores, loadings, predictions, coefficients and RMSEP estimates
## One specific version of PLSR, called SIMPLS, it describe covarience between X and Y, where PCR concentrates on the variance of X. No difference when only one Y variable.
## typically, PLSR needs fewer latent variables than PCR. In other words, with same number of latent variables, PLSR will cover more of the variance in Y and PCR will cover more of C.
gasTrain <- gasoline[1:50,]
gasTest <- gasoline[51:60,]
gas1 <- plsr(octane ~ NIR, ncomp=10, data=gasTrain, validation = "LOO") # this fits a model iwth 10 components, and includes leave-one-out (LOO) cross-validated predictions.
summary(gas1) # Summary provides overview of fit and validation results with the summary method
## Data: X dimension: 50 401
## Y dimension: 50 1
## Fit method: kernelpls
## Number of components considered: 10
##
## VALIDATION: RMSEP
## Cross-validated using 50 leave-one-out segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 1.545 1.357 0.2966 0.2524 0.2476 0.2398 0.2319
## adjCV 1.545 1.356 0.2947 0.2521 0.2478 0.2388 0.2313
## 7 comps 8 comps 9 comps 10 comps
## CV 0.2386 0.2316 0.2449 0.2673
## adjCV 0.2377 0.2308 0.2438 0.2657
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 78.17 85.58 93.41 96.06 96.94 97.89 98.38
## octane 29.39 96.85 97.89 98.26 98.86 98.96 99.09
## 8 comps 9 comps 10 comps
## X 98.85 99.02 99.19
## octane 99.16 99.28 99.39
# The validation results here are root mean squared error of prediction (RMSEP). There are 2 CV estimates: CV is the ordinary CV estimate, amd adjCV is a bias-corrected estimate. For a LOO CV, there is virutally no difference)
plot(RMSEP(gas1), legendspos = "topright") #
## Warning in plot.window(...): "legendspos" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "legendspos" is not a graphical
## parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legendspos"
## is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legendspos"
## is not a graphical parameter
## Warning in box(...): "legendspos" is not a graphical parameter
## Warning in title(...): "legendspos" is not a graphical parameter

# Two components seem to be enough. This gives an RMSEP of 0.297. PCR would need three components to achieve the same RMSEPplot(gas1, ncomp = 2, asp = 1, line = TRUE)
plot(gas1, ncomp = 2, asp = 1, line = TRUE) # this shows the CV predictions with two components versus measured values. The points follow the target line quite nicely

plot(gas1, plottype = "scores", comps = 1:3) # this gives pairwise plot of the score values for 3 components. Score plots are often used to look for patterns, groups, or outliers in the data

# The number in the parentheses after the component labels are the relative amoiunt of X variance explained by each component.
explvar(gas1) #The explained vaiance can be extracted explicitly with this
## Comp 1 Comp 2 Comp 3 Comp 4 Comp 5 Comp 6 Comp 7 Comp 8 Comp 9
## 78.1708 7.4122 7.8242 2.6578 0.8768 0.9466 0.4922 0.4723 0.1688
## Comp 10
## 0.1694
plot(gas1, "loadings", comps = 1:2, legendpos = "topleft", labels="numbers", xlab="nm") # The loading plot. Look for known spectral peaks or profiles
abline(h=0)

#The labels="numbers" argument makes the plot function try to interpret the variable names as numbers, and use them as X axis labels
predict(gas1, ncomp=2, newdata=gasTest) #fitted model is often used to predict the response values of new observations.
## , , 2 comps
##
## octane
## 51 87.94
## 52 87.25
## 53 88.16
## 54 84.97
## 55 85.15
## 56 84.51
## 57 87.56
## 58 86.85
## 59 89.19
## 60 87.09
#Since we know true response values for these samples, we can calculate the test set RMSEP
RMSEP(gas1, newdata=gasTest) # for two components, we get 0.244 which is quite close to the CV estimates above (0.297)
## (Intercept) 1 comps 2 comps 3 comps 4 comps
## 1.5369 1.1696 0.2445 0.2341 0.3287
## 5 comps 6 comps 7 comps 8 comps 9 comps
## 0.2780 0.2703 0.3301 0.3571 0.4090
## 10 comps
## 0.6116