Exercises from Chapter 7 of textbook Applied Predictive Modeling by Kuhn & Johnson
\(y = 10 \sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)\)
where the x values are random variables uniformly distributed on [0, 1] (the simulation also creates 5 other non-informative variables). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
## mlbench provides mlbench.friedman1(); load it explicitly so this
## section does not depend on the package having been attached elsewhere.
library(mlbench)
## Simulate the training set. mlbench.friedman1() creates a list with a
## vector 'y' and a matrix of predictors 'x'.
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame.
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using featurePlot() or other methods.
featurePlot(trainingData$x, trainingData$y)
## Also simulate a large test set to estimate the true error rate
## with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
For example:
KNN
## Tune k-nearest neighbours on centered and scaled predictors over 10
## candidate values of k. No trControl is supplied, so train() falls back
## to its default resampling scheme.
## The argument is spelled out as 'preProcess' (previously the partial
## match 'preProc'), in line with the SVM fit further down.
knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10
)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
## Predict the large simulated test set with the tuned KNN model.
knnPred <- predict(knnModel, testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(knnPred, testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
Neural Network
library(nnet)
## Fit a single-hidden-layer neural network with 5 hidden units, weight
## decay 0.01 and a linear output unit (regression).
## The weight limit is size * (p + 1) + size + 1 for p predictors.
## nnet() names this argument 'MaxNWts'; the earlier spelling 'maxNWts'
## matched no formal argument and was silently absorbed by '...', so the
## intended limit was never applied.
nnetModel <- nnet(
  trainingData$x, trainingData$y,
  size = 5,
  decay = 0.01,
  linout = TRUE,
  trace = FALSE,
  maxit = 500,
  MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1
)
## Test-set predictions and performance.
nnetPred <- predict(nnetModel, testData$x)
postResample(pred = nnetPred, obs = testData$y)
## RMSE Rsquared MAE
## 2.8564065 0.7091704 1.9600728
SVM
## Radial-basis-function SVM on centered and scaled predictors,
## evaluating 20 candidate tuning values.
svmRModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneLength = 20
)
svmRModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.428871 0.7804492 1.929638
## 0.50 2.238909 0.7955102 1.762495
## 1.00 2.132591 0.8087265 1.677426
## 2.00 2.069153 0.8184683 1.623414
## 4.00 2.040282 0.8232454 1.596516
## 8.00 2.023722 0.8260529 1.582890
## 16.00 2.023316 0.8261109 1.581729
## 32.00 2.023316 0.8261109 1.581729
## 64.00 2.023316 0.8261109 1.581729
## 128.00 2.023316 0.8261109 1.581729
## 256.00 2.023316 0.8261109 1.581729
## 512.00 2.023316 0.8261109 1.581729
## 1024.00 2.023316 0.8261109 1.581729
## 2048.00 2.023316 0.8261109 1.581729
## 4096.00 2.023316 0.8261109 1.581729
## 8192.00 2.023316 0.8261109 1.581729
## 16384.00 2.023316 0.8261109 1.581729
## 32768.00 2.023316 0.8261109 1.581729
## 65536.00 2.023316 0.8261109 1.581729
## 131072.00 2.023316 0.8261109 1.581729
##
## Tuning parameter 'sigma' was held constant at a value of 0.05897103
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.05897103 and C = 16.
## Test-set predictions and performance for the radial SVM.
svmRPred <- predict(svmRModel, testData$x)
svmRPR <- postResample(svmRPred, testData$y)
svmRPR
## RMSE Rsquared MAE
## 2.0623398 0.8275173 1.5669732
MARS
## Untuned MARS fit using earth()'s default settings.
marsFit <- earth(trainingData$x, trainingData$y)
## Candidate tuning values: degree 1 or 2, with 2 to 38 retained terms.
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
## Tune MARS over that grid with 10-fold cross-validation.
marsModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  tuneGrid = marsGrid,
  trControl = trainControl(method = "cv", number = 10)
)
## Show the earth model refit with the selected tuning values.
marsModel$finalModel
## Selected 16 of 18 terms, and 5 of 10 predictors (nprune=16)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6-unused, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 4
## GCV 1.61518 RSS 210.6377 GRSq 0.934423 RSq 0.9568093
## Predict the test set from the underlying earth fit and summarise
## the test-set performance.
pred_mars <- predict(marsModel$finalModel, newdata = testData$x)
PR_mars <- postResample(pred_mars, testData$y)
PR_mars
## RMSE Rsquared MAE
## 1.1492504 0.9471145 0.9158382
Comparing the test-set R-squared values of the models above, MARS has the best performance (R-squared of about 0.947 and the lowest RMSE, about 1.15). MARS also selected only X1-X5, the five informative predictors, as its top 5 predictors and left the remaining predictors unused.
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
dim(ChemicalManufacturingProcess)
## [1] 176 58
## Impute missing values with k-nearest-neighbour imputation. The fitted
## pre-processor gets its own name so it is not confused with the KNN
## regression model that is trained later.
## NOTE(review): Yield is part of the data frame passed to preProcess(),
## so the response is transformed together with the predictors, and the
## imputation is fitted before the train/test split -- confirm that this
## is intended.
knn_imputer <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
df <- predict(knn_imputer, ChemicalManufacturingProcess)
## Drop near-zero-variance columns. The guard keeps all columns when
## nearZeroVar() flags nothing.
nzv_cols <- nearZeroVar(df)
if (length(nzv_cols) > 0) {
  df <- df[, -nzv_cols, drop = FALSE]
}
## Hold out 20% of the rows as a test set, partitioned on Yield.
in_train <- createDataPartition(df$Yield, times = 1, p = 0.8, list = FALSE)
train_df <- df[in_train, ]
test_df <- df[-in_train, ]
KNN
## Tune KNN with 10-fold cross-validation over 25 candidate values of k.
## 'center' and 'scale' are not train() arguments: they fell through
## '...' and no pre-processing was applied. Centering and scaling are
## now requested through 'preProcess'.
knn_model <- train(
  Yield ~ .,
  data = train_df,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = trainControl("cv", number = 10),
  tuneLength = 25
)
knn_model
## k-Nearest Neighbors
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 131, 128, 130, 129, 129, 130, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.7552782 0.4628101 0.6057732
## 7 0.7534105 0.4850720 0.6127549
## 9 0.7616480 0.4742228 0.6171891
## 11 0.7486736 0.4896930 0.6100923
## 13 0.7425255 0.5191948 0.6083387
## 15 0.7474571 0.5129337 0.6159349
## 17 0.7581577 0.5081583 0.6251681
## 19 0.7592865 0.5101938 0.6280549
## 21 0.7653372 0.5018649 0.6323289
## 23 0.7720588 0.5005638 0.6343030
## 25 0.7823691 0.4868838 0.6418086
## 27 0.7877512 0.4853252 0.6485491
## 29 0.7985177 0.4735718 0.6558193
## 31 0.8077861 0.4556674 0.6619603
## 33 0.8140949 0.4470729 0.6642140
## 35 0.8239989 0.4420473 0.6699565
## 37 0.8322604 0.4283147 0.6778513
## 39 0.8359550 0.4272952 0.6810802
## 41 0.8382319 0.4293695 0.6831565
## 43 0.8421727 0.4291687 0.6881367
## 45 0.8466110 0.4268836 0.6935972
## 47 0.8519136 0.4197597 0.6974306
## 49 0.8540572 0.4203750 0.6983138
## 51 0.8594393 0.4140496 0.7031984
## 53 0.8625312 0.4120098 0.7078943
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
## Held-out performance of the tuned KNN model.
knn_predictions <- predict(knn_model, newdata = test_df)
postResample(knn_predictions, test_df$Yield)
## RMSE Rsquared MAE
## 0.6885473 0.4418770 0.5485172
SVM
## Tune a radial-basis SVM with cross-validation over 25 candidate
## tuning values.
## 'center' and 'scale' are not train() arguments: they fell through
## '...' and train() applied no pre-processing. Centering and scaling
## are now requested through 'preProcess'.
SVM_model <- train(
  Yield ~ .,
  data = train_df,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv"),
  tuneLength = 25
)
SVM_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 129, 130, 131, 129, 130, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.7654300 0.5237836 0.6191198
## 0.50 0.6967728 0.5751820 0.5598469
## 1.00 0.6359584 0.6351269 0.5035381
## 2.00 0.6121325 0.6600614 0.4768313
## 4.00 0.6009953 0.6730173 0.4702624
## 8.00 0.5965196 0.6792734 0.4657764
## 16.00 0.5953751 0.6803098 0.4642481
## 32.00 0.5953751 0.6803098 0.4642481
## 64.00 0.5953751 0.6803098 0.4642481
## 128.00 0.5953751 0.6803098 0.4642481
## 256.00 0.5953751 0.6803098 0.4642481
## 512.00 0.5953751 0.6803098 0.4642481
## 1024.00 0.5953751 0.6803098 0.4642481
## 2048.00 0.5953751 0.6803098 0.4642481
## 4096.00 0.5953751 0.6803098 0.4642481
## 8192.00 0.5953751 0.6803098 0.4642481
## 16384.00 0.5953751 0.6803098 0.4642481
## 32768.00 0.5953751 0.6803098 0.4642481
## 65536.00 0.5953751 0.6803098 0.4642481
## 131072.00 0.5953751 0.6803098 0.4642481
## 262144.00 0.5953751 0.6803098 0.4642481
## 524288.00 0.5953751 0.6803098 0.4642481
## 1048576.00 0.5953751 0.6803098 0.4642481
## 2097152.00 0.5953751 0.6803098 0.4642481
## 4194304.00 0.5953751 0.6803098 0.4642481
##
## Tuning parameter 'sigma' was held constant at a value of 0.01427805
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01427805 and C = 16.
## Held-out performance of the tuned SVM.
svm_predictions <- predict(SVM_model, newdata = test_df)
postResample(svm_predictions, test_df$Yield)
## RMSE Rsquared MAE
## 0.5796161 0.5838355 0.4715688
Neural Network
## Candidate hidden-layer sizes and weight-decay values for the
## model-averaged neural network (bagging switched off).
nnet_sizes <- 1:10
nnet_grid <- expand.grid(
  .decay = c(0, 0.01, .1),
  .size = nnet_sizes,
  .bag = FALSE
)
## A network with 'size' hidden units and one output needs
## size * (p + 1) + size + 1 weights for p predictors. The limit must
## cover the LARGEST size in the grid; it used to be computed for 5
## hidden units only, so every fit with 6-10 units exceeded it and its
## resampled metrics came back as NaN.
n_predictors <- ncol(train_df) - 1 # every column except Yield
nnet_maxnwts <- max(nnet_sizes) * (n_predictors + 1) + max(nnet_sizes) + 1
## 'center' and 'scale' are not train() arguments: they fell through
## '...' and no pre-processing was applied. Centering and scaling are
## now requested through 'preProcess'.
nnet_model <- train(
  Yield ~ .,
  data = train_df,
  method = "avNNet",
  preProcess = c("center", "scale"),
  tuneGrid = nnet_grid,
  trControl = trainControl(method = "cv"),
  linout = TRUE,
  trace = FALSE,
  MaxNWts = nnet_maxnwts,
  maxit = 500
)
nnet_model
## Model Averaged Neural Network
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 128, 131, 129, 129, 131, 128, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 0.7911611 0.4576647 0.6631840
## 0.00 2 0.7597567 0.5413880 0.6198021
## 0.00 3 0.7738294 0.5401306 0.6228561
## 0.00 4 0.8151203 0.5072658 0.6515518
## 0.00 5 0.7630283 0.5601924 0.6092753
## 0.00 6 NaN NaN NaN
## 0.00 7 NaN NaN NaN
## 0.00 8 NaN NaN NaN
## 0.00 9 NaN NaN NaN
## 0.00 10 NaN NaN NaN
## 0.01 1 0.8417799 0.5139426 0.6680004
## 0.01 2 0.8317788 0.5177218 0.6544578
## 0.01 3 0.7683433 0.5754437 0.6249086
## 0.01 4 0.7288591 0.6270687 0.5890148
## 0.01 5 0.6960698 0.6404665 0.5535684
## 0.01 6 NaN NaN NaN
## 0.01 7 NaN NaN NaN
## 0.01 8 NaN NaN NaN
## 0.01 9 NaN NaN NaN
## 0.01 10 NaN NaN NaN
## 0.10 1 0.7690293 0.5671247 0.6175350
## 0.10 2 0.6848541 0.6151229 0.5511681
## 0.10 3 0.7341391 0.6180884 0.5874503
## 0.10 4 0.6611062 0.6551770 0.5354342
## 0.10 5 0.6901760 0.6345200 0.5501578
## 0.10 6 NaN NaN NaN
## 0.10 7 NaN NaN NaN
## 0.10 8 NaN NaN NaN
## 0.10 9 NaN NaN NaN
## 0.10 10 NaN NaN NaN
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 4, decay = 0.1 and bag = FALSE.
## Held-out performance of the tuned neural network.
nnet_predictions <- predict(nnet_model, newdata = test_df)
postResample(nnet_predictions, test_df$Yield)
## RMSE Rsquared MAE
## 0.6251864 0.5198087 0.4823207
## The second argument of varImp() for train objects is 'useModel', not
## a row count, so the stray positional 10 is dropped. To list fewer
## rows use print(varImp(knn_model), top = 10).
varImp(knn_model)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 93.85
## BiologicalMaterial06 86.96
## BiologicalMaterial03 76.26
## ManufacturingProcess36 75.79
## ManufacturingProcess17 73.93
## BiologicalMaterial12 71.29
## ManufacturingProcess09 71.17
## BiologicalMaterial02 71.07
## ManufacturingProcess31 64.58
## ManufacturingProcess11 54.15
## ManufacturingProcess06 53.77
## ManufacturingProcess33 48.74
## BiologicalMaterial04 46.98
## BiologicalMaterial11 46.25
## ManufacturingProcess02 42.59
## ManufacturingProcess29 40.77
## ManufacturingProcess30 40.72
## BiologicalMaterial08 37.74
## BiologicalMaterial01 35.52
## The second argument of varImp() for train objects is 'useModel', not
## a row count, so the stray positional 10 is dropped. To list fewer
## rows use print(varImp(SVM_model), top = 10).
varImp(SVM_model)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 93.85
## BiologicalMaterial06 86.96
## BiologicalMaterial03 76.26
## ManufacturingProcess36 75.79
## ManufacturingProcess17 73.93
## BiologicalMaterial12 71.29
## ManufacturingProcess09 71.17
## BiologicalMaterial02 71.07
## ManufacturingProcess31 64.58
## ManufacturingProcess11 54.15
## ManufacturingProcess06 53.77
## ManufacturingProcess33 48.74
## BiologicalMaterial04 46.98
## BiologicalMaterial11 46.25
## ManufacturingProcess02 42.59
## ManufacturingProcess29 40.77
## ManufacturingProcess30 40.72
## BiologicalMaterial08 37.74
## BiologicalMaterial01 35.52
## The second argument of varImp() for train objects is 'useModel', not
## a row count, so the stray positional 10 is dropped. To list fewer
## rows use print(varImp(nnet_model), top = 10).
varImp(nnet_model)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 93.85
## BiologicalMaterial06 86.96
## BiologicalMaterial03 76.26
## ManufacturingProcess36 75.79
## ManufacturingProcess17 73.93
## BiologicalMaterial12 71.29
## ManufacturingProcess09 71.17
## BiologicalMaterial02 71.07
## ManufacturingProcess31 64.58
## ManufacturingProcess11 54.15
## ManufacturingProcess06 53.77
## ManufacturingProcess33 48.74
## BiologicalMaterial04 46.98
## BiologicalMaterial11 46.25
## ManufacturingProcess02 42.59
## ManufacturingProcess29 40.77
## ManufacturingProcess30 40.72
## BiologicalMaterial08 37.74
## BiologicalMaterial01 35.52
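From the above I can see that the SVM has the highest test-set R-squared, followed by the neural network. The three importance tables are identical because each one reports the same "loess r-squared" importance, which is computed from the predictors and the response rather than from the fitted model. For the SVM, the top two predictors are process variables, followed by two biological materials. This ranking is close to that of the optimal linear model.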
## Scatterplots of Yield against each of the four highest-ranked
## predictors, in order of importance.
ggplot(train_df, aes(x = ManufacturingProcess32, y = Yield)) +
  geom_point()
ggplot(train_df, aes(x = ManufacturingProcess13, y = Yield)) +
  geom_point()
ggplot(train_df, aes(x = BiologicalMaterial06, y = Yield)) +
  geom_point()
ggplot(train_df, aes(x = BiologicalMaterial03, y = Yield)) +
  geom_point()
Checking the top predictors for the SVM, listed with their overall importance scores:
ManufacturingProcess32 (100.00000) has a positive correlation with Yield, ManufacturingProcess13 (93.84888) has a negative correlation,
BiologicalMaterial06 (86.95832) has a positive correlation,
BiologicalMaterial03 (76.26035) has a positive correlation.
Therefore, from the above I can see that the manufacturing process predictors are mixed, one correlating positively and one negatively with Yield, while the biological material predictors are both positively correlated.