suppressPackageStartupMessages({
  library(caret)
  library(pls)
  library(VIM)
  library(AppliedPredictiveModeling)
  library(dplyr)
  library(tidyr)
})
Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:
\[ y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2) \]
where the \(x\) values are random variables uniformly distributed on [0, 1] (the simulation also creates five additional non-informative variables). The package mlbench contains a function called mlbench.friedman1 that simulates these data.
library(mlbench)
set.seed(1989)
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'.
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame.
## One reason is that this gives the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Tune several models on these data. For example:
library(caret)
set.seed(1989)
knnModel_1 <- train(x = trainingData$x,
                    y = trainingData$y,
                    method = "knn",
                    preProc = c("center", "scale"),
                    tuneLength = 10)
knnModel_1
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.673263 0.4914916 2.950154
## 7 3.611938 0.5171046 2.910604
## 9 3.613619 0.5261556 2.948502
## 11 3.614597 0.5355296 2.945352
## 13 3.602264 0.5509632 2.939492
## 15 3.604075 0.5670811 2.955562
## 17 3.609379 0.5781473 2.959968
## 19 3.612490 0.5927001 2.973828
## 21 3.615889 0.6065100 2.976307
## 23 3.624355 0.6147256 2.989350
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
knnPred_1 <- predict(knnModel_1, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred_1, obs = testData$y)
## RMSE Rsquared MAE
## 3.1629975 0.6561682 2.5411946
# flatten the simulated list into one data frame (columns x.X1-x.X10 plus y)
trainingData <- as.data.frame(trainingData)
# check for highly correlated predictors before fitting a neural network
tooHigh <- findCorrelation(cor(trainingData[-11]), cutoff = 0.75)
tooHigh
## integer(0)
library(nnet)
set.seed(1989)
# single hidden layer with 5 units, mild weight decay, and a linear
# output unit for regression
nnetModel_1 <- nnet(trainingData[1:10], trainingData$y,
                    size = 5,
                    decay = 0.01,
                    linout = TRUE,
                    trace = FALSE,
                    maxit = 500)
summary(nnetModel_1)
## a 10-5-1 network with 61 weights
## options were - linear output units decay=0.01
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1 i9->h1
## 1.35 -0.87 -0.81 -6.86 0.91 -1.80 0.67 -3.35 -0.08 -1.02
## i10->h1
## 0.22
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2 i9->h2
## -4.10 2.26 1.74 1.19 2.56 1.13 -0.08 0.03 -0.07 -0.20
## i10->h2
## 0.06
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3 i9->h3
## -14.37 -3.59 21.49 9.01 -1.57 2.93 5.13 -2.83 2.33 5.50
## i10->h3
## -0.10
## b->h4 i1->h4 i2->h4 i3->h4 i4->h4 i5->h4 i6->h4 i7->h4 i8->h4 i9->h4
## -18.24 11.79 10.11 -0.26 -1.43 -0.52 -1.09 0.82 -1.05 0.45
## i10->h4
## 0.90
## b->h5 i1->h5 i2->h5 i3->h5 i4->h5 i5->h5 i6->h5 i7->h5 i8->h5 i9->h5
## -4.80 0.14 -5.55 -18.95 6.17 5.69 -2.34 2.95 2.38 4.88
## i10->h5
## 0.27
## b->o h1->o h2->o h3->o h4->o h5->o
## 0.26 14.31 19.35 3.71 -12.07 5.61
nnetPred_1 <- predict(nnetModel_1, newdata = testData$x)
postResample(pred = nnetPred_1, obs = testData$y)
## RMSE Rsquared MAE
## 2.2891126 0.7984834 1.7669998
library(earth)
set.seed(1989)
marsModel_1 <- earth(trainingData[1:10], trainingData$y)
summary(marsModel_1)
## Call: earth(x=trainingData[1:10], y=trainingData$y)
##
## coefficients
## (Intercept) 6.497745
## h(x.X1-0.422926) -7.772862
## h(0.779509-x.X1) -12.426485
## h(x.X2-0.34687) -3.028095
## h(0.659947-x.X2) -12.830365
## h(x.X3-0.0681259) 49.362553
## h(0.510262-x.X3) 55.423704
## h(x.X3-0.510262) -45.518326
## h(x.X3-0.773716) 13.836121
## h(0.948905-x.X4) -11.421149
## h(0.867854-x.X5) -5.572901
##
## Selected 11 of 16 terms, and 5 of 10 predictors
## Termination condition: Reached nk 21
## Importance: x.X4, x.X2, x.X1, x.X5, x.X3, x.X6-unused, x.X7-unused, ...
## Number of terms at each degree of interaction: 1 10 (additive model)
## GCV 3.013579 RSS 482.7905 GRSq 0.8852932 RSq 0.9071912
marsPred_1 <- predict(marsModel_1, newdata = testData$x)
postResample(pred = marsPred_1, obs = testData$y)
## RMSE Rsquared MAE
## 1.7805275 0.8742564 1.4034468
library(kernlab)
set.seed(1989)
svmModel_1 <- ksvm(x = as.matrix(trainingData[1:10]), y = trainingData$y,
                   kernel = "rbfdot", kpar = "automatic",
                   C = 1, epsilon = 0.1)
svmModel_1
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 1
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0636554327040298
##
## Number of Support Vectors : 163
##
## Objective Function Value : -43.9197
## Training error : 0.086662
svmPred_1 <- predict(svmModel_1, newdata = testData$x)
postResample(pred = svmPred_1, obs = testData$y)
## RMSE Rsquared MAE
## 2.2753581 0.7863505 1.7573860
Which models appear to give the best performance? Does MARS select the informative predictors (those named X1-X5)?
The most accurate model is the MARS model, which does select the informative predictors X1 through X5 and leaves X6 through X10 unused. Ranked by decreasing test-set \(R^2\), the models are MARS (0.87), the neural network (0.80), the support vector machine (0.79), and k-nearest neighbors (0.66).
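For a side-by-side view, the test-set metrics above can be stacked into one table. This is a small sketch that reuses the prediction objects already computed:
# gather the test-set metrics computed above into one matrix
results_1 <- rbind(
  KNN  = postResample(pred = knnPred_1,  obs = testData$y),
  NNet = postResample(pred = nnetPred_1, obs = testData$y),
  MARS = postResample(pred = marsPred_1, obs = testData$y),
  SVM  = postResample(pred = svmPred_1,  obs = testData$y))
# order from best (lowest RMSE) to worst
results_1[order(results_1[, "RMSE"]), ]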
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
data(ChemicalManufacturingProcess)
# impute missing values with k-nearest neighbors (k = 5); note that VIM::kNN
# also appends logical indicator columns marking imputed cells, most of which
# are removed by the near-zero-variance filter below
ChemicalManufacturingProcess <- kNN(ChemicalManufacturingProcess, k = 5)
# remove predictors with near-zero variance
cmp_near_zero_var <- nearZeroVar(ChemicalManufacturingProcess)
ChemicalManufacturingProcess <- ChemicalManufacturingProcess[, -cmp_near_zero_var]
# center and scale all columns (including Yield)
ChemicalManufacturingProcess <- as.data.frame(scale(ChemicalManufacturingProcess))
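A quick sanity check (assuming VIM's default "_imp" suffix for its indicator columns) confirms that no missing values remain and lists any indicator columns that survived the variance filter:
# sanity check: kNN imputation should leave no missing values
stopifnot(!anyNA(ChemicalManufacturingProcess))
# list any imputation-indicator columns that survived the filter
grep("_imp$", names(ChemicalManufacturingProcess), value = TRUE)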
# create an 80/20 train/test split stratified on Yield
set.seed(1989)
trainingRows <- createDataPartition(ChemicalManufacturingProcess$Yield, p = 0.80, list = FALSE)
cmp_train <- ChemicalManufacturingProcess[trainingRows, ]
cmp_test <- ChemicalManufacturingProcess[-trainingRows, ]
# check the chemical training predictors for high pairwise correlations
# before fitting the neural network
tooHigh <- findCorrelation(cor(cmp_train[-1]), cutoff = 0.75)
tooHigh
library(doParallel)
# set up a parallel backend, leaving one core free for the OS
num_cores <- detectCores()
cl <- makeCluster(num_cores - 1)
registerDoParallel(cl)
train_control <- trainControl(method = "cv", number = 5, allowParallel = TRUE)
nnetGrid <- expand.grid(.decay = c(0, 0.01, 0.1),
                        .size = 1:10,
                        .bag = FALSE)
set.seed(1989)
nnetTune <- train(cmp_train[-1], cmp_train$Yield,
                  method = "avNNet",
                  tuneGrid = nnetGrid,
                  trControl = train_control,
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  # enough weights for the largest network in the grid
                  MaxNWts = 10 * (ncol(cmp_train[-1]) + 1) + 10 + 1,
                  maxit = 500)
nnetTune$bestTune
## size decay bag
## 27 7 0.1 FALSE
nnetPred_2 <- predict(nnetTune, newdata = cmp_test[-1])
postResample(pred = nnetPred_2, obs = cmp_test$Yield)
## RMSE Rsquared MAE
## 0.7763265 0.5407323 0.6048169
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(1989)
marsTuned <- train(cmp_train[-1], cmp_train$Yield,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = train_control)
summary(marsTuned)
## Call: earth(x=data.frame[144,59], y=c(1.226,1.004,0...), keepxy=TRUE, degree=2,
## nprune=5)
##
## coefficients
## (Intercept) -0.6151759
## h(0.72417-ManufacturingProcess09) -0.4124863
## h(ManufacturingProcess30-0.244377) 0.6272757
## h(ManufacturingProcess32- -1.198) 0.6308404
## h(0.0450647-ManufacturingProcess17) * h(0.244377-ManufacturingProcess30) 0.9258183
##
## Selected 5 of 50 terms, and 4 of 59 predictors (nprune=5)
## Termination condition: RSq changed by less than 0.001 at 50 terms
## Importance: ManufacturingProcess32, ManufacturingProcess09, ...
## Number of terms at each degree of interaction: 1 3 1
## GCV 0.3531445 RSS 43.38037 GRSq 0.6239723 RSq 0.6747247
marsPred_2 <- predict(marsTuned, newdata = cmp_test[-1])
postResample(pred = marsPred_2, obs = cmp_test$Yield)
## RMSE Rsquared MAE
## 0.8140276 0.4853746 0.6099804
set.seed(1989)
svmRTuned <- train(cmp_train[-1], cmp_train$Yield,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = train_control)
svmPred_2 <- predict(svmRTuned, newdata = cmp_test[-1])
postResample(pred = svmPred_2, obs = cmp_test$Yield)
## RMSE Rsquared MAE
## 0.7858505 0.5738751 0.6200308
set.seed(1989)
knnModel_2 <- train(x = cmp_train[-1],
                    y = cmp_train$Yield,
                    method = "knn",
                    tuneLength = 10)
knnModel_2
## k-Nearest Neighbors
##
## 144 samples
## 59 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.7776339 0.4094798 0.6191288
## 7 0.7721779 0.4124874 0.6197428
## 9 0.7741258 0.4145199 0.6216486
## 11 0.7740105 0.4208849 0.6223671
## 13 0.7788807 0.4193827 0.6278057
## 15 0.7774926 0.4319648 0.6228322
## 17 0.7843950 0.4304737 0.6283642
## 19 0.7898261 0.4309829 0.6332886
## 21 0.7951545 0.4274195 0.6363676
## 23 0.8023689 0.4216891 0.6436688
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 7.
knnPred_2 <- predict(knnModel_2, newdata = cmp_test[-1])
postResample(pred = knnPred_2, obs = cmp_test$Yield)
## RMSE Rsquared MAE
## 0.8472461 0.5004522 0.6683126
Which nonlinear regression model gives the optimal resampling and test set performance?
For these data the support vector machine gives the best test-set performance (\(R^2\) of 0.57), followed by the neural network (0.54), k-nearest neighbors (0.50), and MARS (0.49).
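As with the simulated data, stacking the test-set metrics makes the ranking explicit; this sketch reuses the prediction objects computed above:
# gather the chemical-process test-set metrics into one matrix
results_2 <- rbind(
  NNet = postResample(pred = nnetPred_2, obs = cmp_test$Yield),
  MARS = postResample(pred = marsPred_2, obs = cmp_test$Yield),
  SVM  = postResample(pred = svmPred_2,  obs = cmp_test$Yield),
  KNN  = postResample(pred = knnPred_2,  obs = cmp_test$Yield))
# order from highest to lowest R-squared
results_2[order(results_2[, "Rsquared"], decreasing = TRUE), ]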
Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
varImp(svmRTuned)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 59)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 91.04
## BiologicalMaterial06 83.27
## BiologicalMaterial03 78.82
## ManufacturingProcess09 77.17
## ManufacturingProcess36 72.61
## ManufacturingProcess17 70.90
## ManufacturingProcess31 65.67
## BiologicalMaterial12 63.15
## ManufacturingProcess11 58.65
## ManufacturingProcess06 56.17
## BiologicalMaterial02 50.42
## BiologicalMaterial09 47.54
## BiologicalMaterial11 47.19
## ManufacturingProcess33 45.24
## BiologicalMaterial04 41.47
## ManufacturingProcess30 37.83
## BiologicalMaterial08 36.27
## BiologicalMaterial01 34.80
## ManufacturingProcess12 33.63
The most important predictors are manufacturing processes 32 and 13 and biological materials 6 and 3. Manufacturing process variables dominate the top of the list, although proportionally more of the biological materials appear: the 20 variables shown include 9 of the 12 biological materials but only 11 of the 38 manufacturing processes.
Eight of the top ten predictors also appear in the top ten of the optimal linear (partial least squares) model from Homework 7: manufacturing processes 32, 13, 9, 36, 17, and 11, and biological materials 6 and 3. In both models, manufacturing process 32 is the most important predictor.
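To make that comparison concrete, the top ten names can be pulled directly from the SVM importance scores. This is a sketch using only the model fit above; the Homework 7 PLS model is not refit here, so its top ten is compared by eye:
# extract the ten highest-scoring predictors from the SVM importance object
svm_imp <- varImp(svmRTuned)$importance
head(rownames(svm_imp)[order(svm_imp$Overall, decreasing = TRUE)], 10)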
Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
Two of the top ten predictors in the optimal nonlinear model do not appear in the optimal linear model's top ten: manufacturing process 31 and biological material 12.
library(ggplot2)
ggplot(cmp_train, aes(x = ManufacturingProcess31, y = Yield)) +
  geom_point() +
  geom_smooth(method = "loess", color = "blue") +
  labs(title = "Relationship between Manufacturing Process 31 and Yield")
Manufacturing Process 31 appears to have a negative influence on yield.
ggplot(cmp_train, aes(x = BiologicalMaterial12, y = Yield)) +
  geom_point() +
  geom_smooth(method = "loess", color = "blue") +
  labs(title = "Relationship between Biological Material 12 and Yield")
Biological Material 12 appears to have a positive influence on yield over roughly the interval from 0 to 1.5 (in scaled units), and a negative influence beyond that.
# shut down the parallel workers and return to sequential execution
stopCluster(cl)
registerDoSEQ()