This is an R Markdown Notebook to illustrate how to retrieve a dataset from the EcoSIS spectral database, choose the “optimal” number of plsr components, and fit a plsr model for leaf nitrogen content (Narea, g/m2)
list.of.packages <- c("pls","dplyr","here","plotrix","ggplot2","gridExtra","spectratrait")
invisible(lapply(list.of.packages, library, character.only = TRUE))
Attaching package: ‘pls’
The following object is masked from ‘package:stats’:
loadings
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
here() starts at /Users/sserbin/Data/GitHub/spectratrait
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
### Setup options
# Script options
pls::pls.options(plsralg = "oscorespls")
pls::pls.options("plsralg")
$plsralg
[1] "oscorespls"
# Default par options
opar <- par(no.readonly = T)
# What is the target variable?
inVar <- "Narea_g_m2"
# What is the source dataset from EcoSIS?
ecosis_id <- "9db4c5a2-7eac-4e1e-8859-009233648e89"
# Specify output directory, output_dir
# Options:
# tempdir - use a OS-specified temporary directory
# user defined PATH - e.g. "~/scratch/PLSR"
output_dir <- "tempdir"
The working directory was changed to /private/var/folders/xp/h3k9vf3n2jx181ts786_yjrn9c2gjq/T/RtmpdFG9hz inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
[1] "/private/var/folders/xp/h3k9vf3n2jx181ts786_yjrn9c2gjq/T/RtmpdFG9hz"
print(paste0("Output directory: ",getwd())) # check wd
[1] "Output directory: /Users/sserbin/Data/GitHub/spectratrait/vignettes"
dat_raw <- spectratrait::get_ecosis_data(ecosis_id = ecosis_id)
[1] "**** Downloading Ecosis data ****"
Downloading data...
Rows: 256 Columns: 2164── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): Latin Species, ids, plot code, species code
dbl (2160): Cw/EWT (cm3/cm2), Leaf area (mm2), Leaf calcium content per leaf area (mg/mm2), Leaf magnesium content per leaf area (mg/mm2), Leaf ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Download complete!
head(dat_raw)
names(dat_raw)[1:40]
[1] "Cw/EWT (cm3/cm2)" "Latin Species"
[3] "Leaf area (mm2)" "Leaf calcium content per leaf area (mg/mm2)"
[5] "Leaf magnesium content per leaf area (mg/mm2)" "Leaf mass per area (g/cm2)"
[7] "Leaf nitrogen content per leaf area (mg/mm2)" "Leaf phosphorus content per leaf area (mg/mm2)"
[9] "Leaf potassium content per leaf area (mg/mm2)" "Plant height vegetative (cm)"
[11] "ids" "plot code"
[13] "species code" "350"
[15] "351" "352"
[17] "353" "354"
[19] "355" "356"
[21] "357" "358"
[23] "359" "360"
[25] "361" "362"
[27] "363" "364"
[29] "365" "366"
[31] "367" "368"
[33] "369" "370"
[35] "371" "372"
[37] "373" "374"
[39] "375" "376"
### Create plsr dataset
Start.wave <- 500
End.wave <- 2400
wv <- seq(Start.wave,End.wave,1)
Spectra <- as.matrix(dat_raw[,names(dat_raw) %in% wv])
colnames(Spectra) <- c(paste0("Wave_",wv))
sample_info <- dat_raw[,names(dat_raw) %notin% seq(350,2500,1)]
head(sample_info)
sample_info2 <- sample_info %>%
select(Plant_Species=`Latin Species`,Species_Code=`species code`,Plot=`plot code`,
Narea_mg_mm2=`Leaf nitrogen content per leaf area (mg/mm2)`)
sample_info2 <- sample_info2 %>%
# mutate(Narea_g_m2=Narea_mg_mm2*(0.001/1e-6)) # based on orig units should be this but conversion wrong
mutate(Narea_g_m2=Narea_mg_mm2*100) # this assumes orig units were g/mm2 or mg/cm2
head(sample_info2)
plsr_data <- data.frame(sample_info2,Spectra)
rm(sample_info,sample_info2,Spectra)
#### End user needs to do what's appropriate for their data. This may be an iterative process.
# Keep only complete rows of inVar and spec data before fitting
plsr_data <- plsr_data[complete.cases(plsr_data[,names(plsr_data) %in%
c(inVar,paste0("Wave_",wv))]),]
### Create cal/val datasets
## Make a stratified random sampling in the strata USDA_Species_Code and Domain
method <- "dplyr" #base/dplyr
# base R - a bit slow
# dplyr - much faster
split_data <- spectratrait::create_data_split(dataset=plsr_data, approach=method, split_seed=1245565,
prop=0.8, group_variables="Species_Code")
names(split_data)
[1] "cal_data" "val_data"
cal.plsr.data <- split_data$cal_data
head(cal.plsr.data)[1:8]
val.plsr.data <- split_data$val_data
head(val.plsr.data)[1:8]
rm(split_data)
# Datasets:
print(paste("Cal observations: ",dim(cal.plsr.data)[1],sep=""))
[1] "Cal observations: 183"
print(paste("Val observations: ",dim(val.plsr.data)[1],sep=""))
[1] "Val observations: 73"
cal_hist_plot <- qplot(cal.plsr.data[,paste0(inVar)],geom="histogram",
main = paste0("Cal. Histogram for ",inVar),
xlab = paste0(inVar),ylab = "Count",fill=I("grey50"),col=I("black"),
alpha=I(.7))
val_hist_plot <- qplot(val.plsr.data[,paste0(inVar)],geom="histogram",
main = paste0("Val. Histogram for ",inVar),
xlab = paste0(inVar),ylab = "Count",fill=I("grey50"),col=I("black"),
alpha=I(.7))
histograms <- grid.arrange(cal_hist_plot, val_hist_plot, ncol=2)
ggsave(filename = file.path(outdir,paste0(inVar,"_Cal_Val_Histograms.png")), plot = histograms,
device="png", width = 30,
height = 12, units = "cm",
dpi = 300)
# output cal/val data
write.csv(cal.plsr.data,file=file.path(outdir,paste0(inVar,'_Cal_PLSR_Dataset.csv')),
row.names=FALSE)
write.csv(val.plsr.data,file=file.path(outdir,paste0(inVar,'_Val_PLSR_Dataset.csv')),
row.names=FALSE)
### Format PLSR data for model fitting
cal_spec <- as.matrix(cal.plsr.data[, which(names(cal.plsr.data) %in% paste0("Wave_",wv))])
cal.plsr.data <- data.frame(cal.plsr.data[, which(names(cal.plsr.data) %notin% paste0("Wave_",wv))],
Spectra=I(cal_spec))
head(cal.plsr.data)[1:5]
val_spec <- as.matrix(val.plsr.data[, which(names(val.plsr.data) %in% paste0("Wave_",wv))])
val.plsr.data <- data.frame(val.plsr.data[, which(names(val.plsr.data) %notin% paste0("Wave_",wv))],
Spectra=I(val_spec))
head(val.plsr.data)[1:5]
par(mfrow=c(1,2)) # B, L, T, R
spectratrait::f.plot.spec(Z=cal.plsr.data$Spectra,wv=wv,plot_label="Calibration")
spectratrait::f.plot.spec(Z=val.plsr.data$Spectra,wv=wv,plot_label="Validation")
dev.copy(png,file.path(outdir,paste0(inVar,'_Cal_Val_Spectra.png')),
height=2500,width=4900, res=340)
quartz_off_screen
3
dev.off();
quartz_off_screen
2
par(mfrow=c(1,1))
### Use permutation to determine the optimal number of components
if(grepl("Windows", sessionInfo()$running)){
pls.options(parallel = NULL)
} else {
pls.options(parallel = parallel::detectCores()-1)
}
method <- "pls" #pls, firstPlateau, firstMin
random_seed <- 1245565
seg <- 50
maxComps <- 16
iterations <- 80
prop <- 0.70
if (method=="pls") {
# pls package approach - faster but estimates more components....
nComps <- spectratrait::find_optimal_components(dataset=cal.plsr.data, targetVariable=inVar,
method=method,
maxComps=maxComps, seg=seg,
random_seed=random_seed)
print(paste0("*** Optimal number of components: ", nComps))
} else {
nComps <- spectratrait::find_optimal_components(dataset=cal.plsr.data, targetVariable=inVar,
method=method,
maxComps=maxComps, iterations=iterations,
seg=seg, prop=prop,
random_seed=random_seed)
}
[1] "*** Identifying optimal number of PLSR components ***"
[1] "*** Running PLS permutation test ***"
[1] "*** Optimal number of components: 10"
dev.copy(png,file.path(outdir,paste0(paste0(inVar,"_PLSR_Component_Selection.png"))),
height=2800, width=3400, res=340)
quartz_off_screen
3
dev.off();
quartz_off_screen
2
plsr.out <- plsr(as.formula(paste(inVar,"~","Spectra")),scale=FALSE,ncomp=nComps,validation="LOO",
trace=FALSE,data=cal.plsr.data)
fit <- plsr.out$fitted.values[,1,nComps]
pls.options(parallel = NULL)
# External validation fit stats
par(mfrow=c(1,2)) # B, L, T, R
pls::RMSEP(plsr.out, newdata = val.plsr.data)
(Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps 9 comps 10 comps
0.5594 0.6034 0.5448 0.3842 0.3481 0.3027 0.2429 0.2268 0.2852 0.2818 0.2780
plot(pls::RMSEP(plsr.out,estimate=c("test"),newdata = val.plsr.data), main="MODEL RMSEP",
xlab="Number of Components",ylab="Model Validation RMSEP",lty=1,col="black",cex=1.5,lwd=2)
box(lwd=2.2)
pls::R2(plsr.out, newdata = val.plsr.data)
(Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps 9 comps 10 comps
-0.007544 -0.172296 0.044153 0.524579 0.609920 0.704963 0.809962 0.834383 0.738093 0.744325 0.751224
plot(pls::R2(plsr.out,estimate=c("test"),newdata = val.plsr.data), main="MODEL R2",
xlab="Number of Components",ylab="Model Validation R2",lty=1,col="black",cex=1.5,lwd=2)
box(lwd=2.2)
dev.copy(png,file.path(outdir,paste0(paste0(inVar,"_Validation_RMSEP_R2_by_Component.png"))),
height=2800, width=4800, res=340)
quartz_off_screen
3
dev.off();
quartz_off_screen
2
par(opar)
#calibration
cal.plsr.output <- data.frame(cal.plsr.data[, which(names(cal.plsr.data) %notin% "Spectra")],
PLSR_Predicted=fit,
PLSR_CV_Predicted=as.vector(plsr.out$validation$pred[,,nComps]))
cal.plsr.output <- cal.plsr.output %>%
mutate(PLSR_CV_Residuals = PLSR_CV_Predicted-get(inVar))
head(cal.plsr.output)
cal.R2 <- round(pls::R2(plsr.out,intercept=F)[[1]][nComps],2)
cal.RMSEP <- round(sqrt(mean(cal.plsr.output$PLSR_CV_Residuals^2)),2)
val.plsr.output <- data.frame(val.plsr.data[, which(names(val.plsr.data) %notin% "Spectra")],
PLSR_Predicted=as.vector(predict(plsr.out,
newdata = val.plsr.data,
ncomp=nComps, type="response")[,,1]))
val.plsr.output <- val.plsr.output %>%
mutate(PLSR_Residuals = PLSR_Predicted-get(inVar))
head(val.plsr.output)
val.R2 <- round(pls::R2(plsr.out,newdata=val.plsr.data,intercept=F)[[1]][nComps],2)
val.RMSEP <- round(sqrt(mean(val.plsr.output$PLSR_Residuals^2)),2)
rng_quant <- quantile(cal.plsr.output[,inVar], probs = c(0.001, 0.999))
cal_scatter_plot <- ggplot(cal.plsr.output, aes(x=PLSR_CV_Predicted, y=get(inVar))) +
theme_bw() + geom_point() + geom_abline(intercept = 0, slope = 1, color="dark grey",
linetype="dashed", size=1.5) + xlim(rng_quant[1],
rng_quant[2]) +
ylim(rng_quant[1], rng_quant[2]) +
labs(x=paste0("Predicted ", paste(inVar), " (units)"),
y=paste0("Observed ", paste(inVar), " (units)"),
title=paste0("Calibration: ", paste0("Rsq = ", cal.R2), "; ", paste0("RMSEP = ",
cal.RMSEP))) +
theme(axis.text=element_text(size=18), legend.position="none",
axis.title=element_text(size=20, face="bold"),
axis.text.x = element_text(angle = 0,vjust = 0.5),
panel.border = element_rect(linetype = "solid", fill = NA, size=1.5))
cal_resid_histogram <- ggplot(cal.plsr.output, aes(x=PLSR_CV_Residuals)) +
geom_histogram(alpha=.5, position="identity") +
geom_vline(xintercept = 0, color="black",
linetype="dashed", size=1) + theme_bw() +
theme(axis.text=element_text(size=18), legend.position="none",
axis.title=element_text(size=20, face="bold"),
axis.text.x = element_text(angle = 0,vjust = 0.5),
panel.border = element_rect(linetype = "solid", fill = NA, size=1.5))
rng_quant <- quantile(val.plsr.output[,inVar], probs = c(0.001, 0.999))
val_scatter_plot <- ggplot(val.plsr.output, aes(x=PLSR_Predicted, y=get(inVar))) +
theme_bw() + geom_point() + geom_abline(intercept = 0, slope = 1, color="dark grey",
linetype="dashed", size=1.5) + xlim(rng_quant[1],
rng_quant[2]) +
ylim(rng_quant[1], rng_quant[2]) +
labs(x=paste0("Predicted ", paste(inVar), " (units)"),
y=paste0("Observed ", paste(inVar), " (units)"),
title=paste0("Validation: ", paste0("Rsq = ", val.R2), "; ", paste0("RMSEP = ",
val.RMSEP))) +
theme(axis.text=element_text(size=18), legend.position="none",
axis.title=element_text(size=20, face="bold"),
axis.text.x = element_text(angle = 0,vjust = 0.5),
panel.border = element_rect(linetype = "solid", fill = NA, size=1.5))
val_resid_histogram <- ggplot(val.plsr.output, aes(x=PLSR_Residuals)) +
geom_histogram(alpha=.5, position="identity") +
geom_vline(xintercept = 0, color="black",
linetype="dashed", size=1) + theme_bw() +
theme(axis.text=element_text(size=18), legend.position="none",
axis.title=element_text(size=20, face="bold"),
axis.text.x = element_text(angle = 0,vjust = 0.5),
panel.border = element_rect(linetype = "solid", fill = NA, size=1.5))
# plot cal/val side-by-side
scatterplots <- grid.arrange(cal_scatter_plot, val_scatter_plot, cal_resid_histogram,
val_resid_histogram, nrow=2,ncol=2)
ggsave(filename = file.path(outdir,paste0(inVar,"_Cal_Val_Scatterplots.png")),
plot = scatterplots, device="png",
width = 32,
height = 30, units = "cm",
dpi = 300)
vips <- spectratrait::VIP(plsr.out)[nComps,]
par(mfrow=c(2,1))
plot(plsr.out, plottype = "coef",xlab="Wavelength (nm)",
ylab="Regression coefficients",legendpos = "bottomright",
ncomp=nComps,lwd=2)
box(lwd=2.2)
plot(seq(Start.wave,End.wave,1),vips,xlab="Wavelength (nm)",ylab="VIP",cex=0.01)
lines(seq(Start.wave,End.wave,1),vips,lwd=3)
abline(h=0.8,lty=2,col="dark grey")
box(lwd=2.2)
dev.copy(png,file.path(outdir,paste0(inVar,'_Coefficient_VIP_plot.png')),
height=3100, width=4100, res=340)
quartz_off_screen
3
dev.off();
quartz_off_screen
2
if(grepl("Windows", sessionInfo()$running)){
pls.options(parallel =NULL)
} else {
pls.options(parallel = parallel::detectCores()-1)
}
jk.plsr.out <- pls::plsr(as.formula(paste(inVar,"~","Spectra")), scale=FALSE,
center=TRUE, ncomp=nComps, validation="LOO", trace=FALSE,
jackknife=TRUE,
data=cal.plsr.data)
pls.options(parallel = NULL)
Jackknife_coef <- spectratrait::f.coef.valid(plsr.out = jk.plsr.out, data_plsr = cal.plsr.data,
ncomp = nComps, inVar=inVar)
Jackknife_intercept <- Jackknife_coef[1,,,]
Jackknife_coef <- Jackknife_coef[2:dim(Jackknife_coef)[1],,,]
interval <- c(0.025,0.975)
Jackknife_Pred <- val.plsr.data$Spectra %*% Jackknife_coef +
matrix(rep(Jackknife_intercept, length(val.plsr.data[,inVar])), byrow=TRUE,
ncol=length(Jackknife_intercept))
Interval_Conf <- apply(X = Jackknife_Pred, MARGIN = 1, FUN = quantile,
probs=c(interval[1], interval[2]))
sd_mean <- apply(X = Jackknife_Pred, MARGIN = 1, FUN =sd)
sd_res <- sd(val.plsr.output$PLSR_Residuals)
sd_tot <- sqrt(sd_mean^2+sd_res^2)
val.plsr.output$LCI <- Interval_Conf[1,]
val.plsr.output$UCI <- Interval_Conf[2,]
val.plsr.output$LPI <- val.plsr.output$PLSR_Predicted-1.96*sd_tot
val.plsr.output$UPI <- val.plsr.output$PLSR_Predicted+1.96*sd_tot
head(val.plsr.output)
val.plsr.output$LPI <- val.plsr.output$PLSR_Predicted-1.96*sd_tot
val.plsr.output$UPI <- val.plsr.output$PLSR_Predicted+1.96*sd_tot
head(val.plsr.output)
spectratrait::f.plot.coef(Z = t(Jackknife_coef), wv = wv,
plot_label="Jackknife regression coefficients",position = 'bottomleft')
abline(h=0,lty=2,col="grey50")
box(lwd=2.2)
dev.copy(png,file.path(outdir,paste0(inVar,'_Jackknife_Regression_Coefficients.png')),
height=2100, width=3800, res=340)
quartz_off_screen
3
dev.off();
quartz_off_screen
2
rmsep_percrmsep <- spectratrait::percent_rmse(plsr_dataset = val.plsr.output,
inVar = inVar,
residuals = val.plsr.output$PLSR_Residuals,
range="full")
RMSEP <- rmsep_percrmsep$rmse
perc_RMSEP <- rmsep_percrmsep$perc_rmse
r2 <- round(pls::R2(plsr.out, newdata = val.plsr.data,intercept=F)$val[nComps],2)
expr <- vector("expression", 3)
expr[[1]] <- bquote(R^2==.(r2))
expr[[2]] <- bquote(RMSEP==.(round(RMSEP,2)))
expr[[3]] <- bquote("%RMSEP"==.(round(perc_RMSEP,2)))
rng_vals <- c(min(val.plsr.output$LPI), max(val.plsr.output$UPI))
par(mfrow=c(1,1), mar=c(4.2,5.3,1,0.4), oma=c(0, 0.1, 0, 0.2))
plotrix::plotCI(val.plsr.output$PLSR_Predicted,val.plsr.output[,inVar],
li=val.plsr.output$LPI, ui=val.plsr.output$UPI, gap=0.009,sfrac=0.004,
lwd=1.6, xlim=c(rng_vals[1], rng_vals[2]), ylim=c(rng_vals[1], rng_vals[2]),
err="x", pch=21, col="black", pt.bg=scales::alpha("grey70",0.7), scol="grey50",
cex=2, xlab=paste0("Predicted ", paste(inVar), " (units)"),
ylab=paste0("Observed ", paste(inVar), " (units)"),
cex.axis=1.5,cex.lab=1.8)
abline(0,1,lty=2,lw=2)
legend("topleft", legend=expr, bty="n", cex=1.5)
box(lwd=2.2)
dev.copy(png,file.path(outdir,paste0(inVar,"_PLSR_Validation_Scatterplot.png")),
height=2800, width=3200, res=340)
quartz_off_screen
3
dev.off();
quartz_off_screen
2
out.jk.coefs <- data.frame(Iteration=seq(1,length(Jackknife_intercept),1),
Intercept=Jackknife_intercept,t(Jackknife_coef))
head(out.jk.coefs)[1:6]
write.csv(out.jk.coefs,file=file.path(outdir,
paste0(inVar,
'_Jackkife_PLSR_Coefficients.csv')),
row.names=FALSE)
print(paste("Output directory: ", outdir))
[1] "Output directory: /var/folders/xp/h3k9vf3n2jx181ts786_yjrn9c2gjq/T//RtmpdFG9hz"
# Observed versus predicted
write.csv(cal.plsr.output,file=file.path(outdir,
paste0(inVar,'_Observed_PLSR_CV_Pred_',
nComps,'comp.csv')),
row.names=FALSE)
# Validation data
write.csv(val.plsr.output,file=file.path(outdir,
paste0(inVar,'_Validation_PLSR_Pred_',
nComps,'comp.csv')),
row.names=FALSE)
# Model coefficients
coefs <- coef(plsr.out,ncomp=nComps,intercept=TRUE)
write.csv(coefs,file=file.path(outdir,
paste0(inVar,'_PLSR_Coefficients_',
nComps,'comp.csv')),
row.names=TRUE)
# PLSR VIP
write.csv(vips,file=file.path(outdir,
paste0(inVar,'_PLSR_VIPs_',
nComps,'comp.csv')))
print("**** PLSR output files: ")
[1] "**** PLSR output files: "
print(list.files(outdir)[grep(pattern = inVar, list.files(outdir))])
[1] "Narea_g_m2_Cal_PLSR_Dataset.csv" "Narea_g_m2_Cal_Val_Histograms.png"
[3] "Narea_g_m2_Cal_Val_Scatterplots.png" "Narea_g_m2_Cal_Val_Spectra.png"
[5] "Narea_g_m2_Coefficient_VIP_plot.png" "Narea_g_m2_Jackkife_PLSR_Coefficients.csv"
[7] "Narea_g_m2_Jackknife_Regression_Coefficients.png" "Narea_g_m2_Observed_PLSR_CV_Pred_10comp.csv"
[9] "Narea_g_m2_PLSR_Coefficients_10comp.csv" "Narea_g_m2_PLSR_Component_Selection.png"
[11] "Narea_g_m2_PLSR_Validation_Scatterplot.png" "Narea_g_m2_PLSR_VIPs_10comp.csv"
[13] "Narea_g_m2_Val_PLSR_Dataset.csv" "Narea_g_m2_Validation_PLSR_Pred_10comp.csv"
[15] "Narea_g_m2_Validation_RMSEP_R2_by_Component.png"