Load libraries
library(tidyverse)
library(fpp3)
library(caret)
library(RANN)
library(mlbench)
library(nnet)
library(earth)
Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create the data:

y = 10 sin(pi x1 x2) + 20 (x3 - 0.5)^2 + 10 x4 + 5 x5 + N(0, sigma^2)

where the x values are random variables uniformly distributed on [0, 1] (the simulation also creates five other non-informative variables). The mlbench package contains a function called mlbench.friedman1 that simulates these data:
set.seed(200)
# create training data
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
# plot the predictors against the response
featurePlot(trainingData$x, trainingData$y)
# create test data
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Tune several models on these data. For example:
# knn model with centering and scaling
knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
# predict on the test set using the knn model
knnPred <- predict(knnModel, newdata = testData$x)
# evaluate model performance using postResample
knn_performance <- postResample(pred = knnPred, obs = testData$y)
print(knn_performance)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
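The same test-set metrics can be computed by hand as a sanity check on postResample; note that caret's Rsquared is the squared correlation between predictions and observed values:

# manual check of the metrics reported by postResample
sqrt(mean((knnPred - testData$y)^2))   # RMSE
cor(knnPred, testData$y)^2             # R-squared (squared correlation)
mean(abs(knnPred - testData$y))        # MAE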
Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?
The MARS model gives the best performance: it has the lowest test-set RMSE and the highest R-squared. MARS did select the informative predictors X1-X5 first, with X1 and X4 receiving the highest importance scores; the only non-informative predictor retained was X6, with very low importance.
# Multivariate Adaptive Regression Splines (MARS)
set.seed(200)
# Train a MARS model
mars_model <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  tuneLength = 10,
  trControl = trainControl(method = "cv", number = 10))
# print MARS model
print(mars_model)
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 4.188280 0.3042527 3.460689
## 3 3.551182 0.4999832 2.837116
## 4 2.653143 0.7167280 2.128222
## 6 2.295006 0.7754603 1.853199
## 7 1.771950 0.8611767 1.391357
## 9 1.609816 0.8837307 1.299705
## 10 1.635035 0.8798236 1.309436
## 12 1.571561 0.8898750 1.253077
## 13 1.567577 0.8906927 1.250795
## 15 1.571673 0.8909652 1.245508
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 13 and degree = 1.
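With tuneLength = 10, caret varies only nprune and holds degree at 1. To tune the interaction degree as well, an explicit grid can be passed; a minimal sketch (the grid values are illustrative):

# hedged sketch: tune the number of retained terms and the interaction degree jointly
marsGrid <- expand.grid(degree = 1:2, nprune = 2:20)
mars_tuned <- train(x = trainingData$x,
                    y = trainingData$y,
                    method = "earth",
                    tuneGrid = marsGrid,
                    trControl = trainControl(method = "cv", number = 10))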
#summary MARS model
summary(mars_model)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=1,
## nprune=13)
##
## coefficients
## (Intercept) 18.451984
## h(0.621722-X1) -11.074396
## h(0.601063-X2) -10.744225
## h(X3-0.281766) 20.607853
## h(0.447442-X3) 17.880232
## h(X3-0.447442) -23.282007
## h(X3-0.636458) 15.150350
## h(0.734892-X4) -10.027487
## h(X4-0.734892) 9.092045
## h(0.850094-X5) -4.723407
## h(X5-0.850094) 10.832932
## h(X6-0.361791) -1.956821
##
## Selected 12 of 18 terms, and 6 of 10 predictors (nprune=13)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
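Each h(.) term above is a hinge basis function, h(u) = max(0, u), so every term is zero on one side of its knot and linear on the other. A minimal sketch evaluating the first non-intercept term:

# hinge function used by earth: max(0, u)
h <- function(u) pmax(0, u)
x1 <- c(0.2, 0.6, 0.9)         # example X1 values
-11.074396 * h(0.621722 - x1)  # contributes only when X1 < 0.621722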
#prediction using mars model
mars_prediction <- predict(mars_model, newdata = testData$x)
# performance using postResample
mars_performance <- postResample(pred = mars_prediction, obs = testData$y)
print(mars_performance)
## RMSE Rsquared MAE
## 1.8136467 0.8677298 1.3911836
# variable importance from the final MARS model
mars_final <- mars_model$finalModel
varImp_mars <- evimp(mars_final)
print(varImp_mars)
## nsubsets gcv rss
## X1 11 100.0 100.0
## X4 10 84.2 83.7
## X2 9 67.2 66.5
## X5 8 45.4 45.4
## X3 7 34.6 35.0
## X6 4 11.9 14.2
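Here nsubsets counts how many of the pruned model subsets include each predictor, and the gcv and rss columns rescale importance so the top variable is 100. caret's varImp reports essentially the same ranking for the train object; a quick cross-check:

# cross-check the earth importance through caret (scaled 0-100)
varImp(mars_model)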
# Neural Network model
set.seed(200)
# single-hidden-layer network via nnet
nnet_model <- nnet(
  x = trainingData$x,
  y = trainingData$y,
  size = 5,
  decay = 0.01,
  linout = TRUE,  # linear output units for regression (the logistic default bounds predictions)
  trace = FALSE,
  maxit = 500,
  MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)  # enough weights for 5 hidden units
# predictions using the neural network model
nnet_prediction <- predict(nnet_model, newdata = testData$x)
# performance using postResample
nn_performance <- postResample(pred = nnet_prediction, obs = testData$y)
print(nn_performance)
## RMSE Rsquared MAE
## 14.276935 0.288859 13.386918
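A single untuned nnet with unscaled inputs fits these data poorly. A tuned, resampled network in the style of the other models would likely perform far better; a minimal sketch using caret's avNNet method (the grid values are illustrative):

# hedged sketch: a tuned, model-averaged neural network via caret
nnetGrid <- expand.grid(size = c(1, 5, 10), decay = c(0, 0.01, 0.1), bag = FALSE)
nnet_tuned <- train(x = trainingData$x,
                    y = trainingData$y,
                    method = "avNNet",
                    tuneGrid = nnetGrid,
                    preProc = c("center", "scale"),
                    linout = TRUE,
                    trace = FALSE,
                    maxit = 500,
                    MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1)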
#Support Vector Machines
set.seed(200)
# SVM model with a radial basis function kernel
svm_model <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "svmRadial",
  preProc = c("center", "scale"),
  tuneLength = 14,
  trControl = trainControl(method = "cv"))
#print svm
print(svm_model)
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.525164 0.7810576 2.010680
## 0.50 2.270567 0.7944850 1.794902
## 1.00 2.099319 0.8155594 1.659342
## 2.00 2.005858 0.8302852 1.578799
## 4.00 1.934650 0.8435677 1.528373
## 8.00 1.915672 0.8475581 1.528584
## 16.00 1.923960 0.8463016 1.536051
## 32.00 1.923960 0.8463016 1.536051
## 64.00 1.923960 0.8463016 1.536051
## 128.00 1.923960 0.8463016 1.536051
## 256.00 1.923960 0.8463016 1.536051
## 512.00 1.923960 0.8463016 1.536051
## 1024.00 1.923960 0.8463016 1.536051
## 2048.00 1.923960 0.8463016 1.536051
##
## Tuning parameter 'sigma' was held constant at a value of 0.06299324
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06299324 and C = 8.
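Note that sigma is not tuned here: caret estimates it once from the training data (via kernlab's sigest) and varies only C, which is why the RMSE plateaus once C is large. To tune both parameters, pass an explicit grid; a hedged sketch (grid values illustrative):

# hedged sketch: tune sigma and C jointly
svmGrid <- expand.grid(sigma = c(0.01, 0.05, 0.1), C = 2^(-2:6))
svm_tuned <- train(x = trainingData$x,
                   y = trainingData$y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneGrid = svmGrid,
                   trControl = trainControl(method = "cv"))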
# predictions using the SVM model on the test set
svm_prediction <- predict(svm_model, newdata = testData$x)
# performance using postResample
svm_performance <- postResample(pred = svm_prediction, obs = testData$y)
print(svm_performance)
## RMSE Rsquared MAE
## 2.0541197 0.8290353 1.5586411
svm_model$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 8
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0629932410345396
##
## Number of Support Vectors : 152
##
## Objective Function Value : -72.63
## Training error : 0.009177
# Compare performances
performance_results <- rbind(
  KNN = knn_performance,
  MARS = mars_performance,
  SVM = svm_performance,
  NeuralNetwork = nn_performance)
print(performance_results)
## RMSE Rsquared MAE
## KNN 3.204059 0.6819919 2.568346
## MARS 1.813647 0.8677298 1.391184
## SVM 2.054120 0.8290353 1.558641
## NeuralNetwork 14.276935 0.2888590 13.386918
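Sorting the table by RMSE makes the ranking explicit:

# order models from best (lowest RMSE) to worst
performance_results[order(performance_results[, "RMSE"]), ]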
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
The MARS model again gives the optimal performance, with the lowest test-set RMSE and the highest R-squared.
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
# separate the predictors and the response
predictor <- ChemicalManufacturingProcess[, -1]
response <- ChemicalManufacturingProcess$Yield
# impute missing values and pre-process (knnImpute also centers and scales the predictors)
imputation <- preProcess(predictor, method = c("center", "scale", "knnImpute"))
pred_che <- predict(imputation, predictor)
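A quick check that the k-nearest-neighbor imputation left no missing values:

# confirm that imputation removed all missing values
sum(is.na(pred_che))  # should be 0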
set.seed(123)
#split data
trainindex <- createDataPartition(response, p = 0.8, list = FALSE)
#create training and test set
train_response <- response[trainindex] # train response
test_response <- response[-trainindex] # test response
train_predictor <- pred_che[trainindex,] # train predictor
test_predictor <- pred_che[-trainindex,] # Test predictor
# KNN Model
knn_model <- train(
  x = train_predictor,
  y = train_response,
  method = "knn",
  tuneLength = 10)
print(knn_model)
## k-Nearest Neighbors
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.404328 0.4362944 1.120535
## 7 1.421869 0.4273089 1.140008
## 9 1.427747 0.4198231 1.152886
## 11 1.426694 0.4234220 1.153227
## 13 1.439947 0.4139886 1.164513
## 15 1.440685 0.4169647 1.167104
## 17 1.449785 0.4128142 1.170281
## 19 1.456103 0.4091016 1.174277
## 21 1.463941 0.4060540 1.179770
## 23 1.471751 0.4028304 1.185189
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
knn_pred <- predict(knn_model, newdata = test_predictor)
knn_performances <- postResample(pred = knn_pred, obs = test_response)
print(knn_performances)
## RMSE Rsquared MAE
## 1.4058635 0.4203612 1.1572500
# Neural Network model
set.seed(200)
# single-hidden-layer network via nnet
nnet_model2 <- nnet(
  x = train_predictor,
  y = train_response,
  size = 5,
  decay = 0.01,
  linout = TRUE,  # linear output units for regression
  trace = FALSE,
  maxit = 500,
  MaxNWts = 5 * (ncol(train_predictor) + 1) + 5 + 1)
# predictions using the neural network model
nnet_prediction2 <- predict(nnet_model2, newdata = test_predictor)
# performance using postResample
nn_performances <- postResample(pred = nnet_prediction2, obs = test_response)
print(nn_performances)
## RMSE Rsquared MAE
## 14.277987503 0.000167585 13.387911203
# Multivariate Adaptive Regression Splines (MARS)
set.seed(200)
# Train a MARS model
mars_model2 <- train(
  x = train_predictor,
  y = train_response,
  method = "earth",
  tuneLength = 10,
  trControl = trainControl(method = "cv", number = 10))
# print MARS model
print(mars_model2)
## Multivariate Adaptive Regression Spline
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 129, 129, 130, 131, 131, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 1.422973 0.4663591 1.1152847
## 4 1.214079 0.5968870 0.9765712
## 6 1.280296 0.5458428 1.0198042
## 8 1.298122 0.5328182 1.0511910
## 10 1.337865 0.5363937 1.0626002
## 13 1.413456 0.4921121 1.1144966
## 15 1.419967 0.5082017 1.0872101
## 17 1.417457 0.5161495 1.0820099
## 19 1.418712 0.5156862 1.0845296
## 22 1.418712 0.5156862 1.0845296
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
#summary MARS model
summary(mars_model2)
## Call: earth(x=data.frame[144,57], y=c(42.44,42.03,4...), keepxy=TRUE, degree=1,
## nprune=4)
##
## coefficients
## (Intercept) 39.561204
## h(-1.38676-ManufacturingProcess13) 3.489821
## h(ManufacturingProcess17- -0.916315) -0.699822
## h(ManufacturingProcess32- -0.827442) 1.330065
##
## Selected 4 of 27 terms, and 3 of 57 predictors (nprune=4)
## Termination condition: RSq changed by less than 0.001 at 27 terms
## Importance: ManufacturingProcess32, ManufacturingProcess17, ...
## Number of terms at each degree of interaction: 1 3 (additive model)
## GCV 1.455657 RSS 189.7308 GRSq 0.5784267 RSq 0.6130613
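Because the predictors were centered and scaled before modeling, the knots above are in standard-deviation units. Assuming the preProcess object exposes its centering and scaling vectors as mean and std, a hedged sketch maps the ManufacturingProcess32 knot back to the raw scale:

# hedged: convert the standardized knot for ManufacturingProcess32 to raw units
knot_std <- -0.827442
knot_std * imputation$std["ManufacturingProcess32"] +
  imputation$mean["ManufacturingProcess32"]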
#prediction using mars model
mars_prediction2 <- predict(mars_model2, newdata = test_predictor)
# performance using postResample
mars_performances <- postResample(pred = mars_prediction2, obs = test_response)
print(mars_performances)
## RMSE Rsquared MAE
## 1.0995748 0.6604107 0.9190909
#Support Vector Machines
set.seed(200)
# SVM model with a radial basis function kernel
svm_model2 <- train(
  x = train_predictor,
  y = train_response,
  method = "svmRadial",
  preProc = c("center", "scale"),
  tuneLength = 14,
  trControl = trainControl(method = "cv"))
#print svm
print(svm_model2)
## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 129, 129, 130, 131, 131, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.368998 0.5420547 1.1209359
## 0.50 1.267554 0.5811406 1.0415826
## 1.00 1.196243 0.6298251 0.9759420
## 2.00 1.149146 0.6632057 0.9416576
## 4.00 1.126931 0.6692433 0.9147001
## 8.00 1.102570 0.6793603 0.9035725
## 16.00 1.102223 0.6794431 0.9034194
## 32.00 1.102223 0.6794431 0.9034194
## 64.00 1.102223 0.6794431 0.9034194
## 128.00 1.102223 0.6794431 0.9034194
## 256.00 1.102223 0.6794431 0.9034194
## 512.00 1.102223 0.6794431 0.9034194
## 1024.00 1.102223 0.6794431 0.9034194
## 2048.00 1.102223 0.6794431 0.9034194
##
## Tuning parameter 'sigma' was held constant at a value of 0.01578546
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01578546 and C = 16.
# predictions using the SVM model on the test set
svm_prediction2 <- predict(svm_model2, newdata = test_predictor)
# performance using postResample
svm_performances <- postResample(pred = svm_prediction2, obs = test_response)
print(svm_performances)
## RMSE Rsquared MAE
## 1.2289765 0.5534509 1.0363530
svm_model2$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 16
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0157854649377838
##
## Number of Support Vectors : 126
##
## Objective Function Value : -78.2082
## Training error : 0.009227
# Compare performances
performance_results2 <- rbind(
  KNN = knn_performances,
  MARS = mars_performances,
  SVM = svm_performances,
  NeuralNetwork = nn_performances)
print(performance_results2)
## RMSE Rsquared MAE
## KNN 1.405864 0.420361195 1.1572500
## MARS 1.099575 0.660410699 0.9190909
## SVM 1.228976 0.553450940 1.0363530
## NeuralNetwork 14.277988 0.000167585 13.3879112
The most important predictors for the MARS model are ManufacturingProcess32, ManufacturingProcess17, and ManufacturingProcess13. The most important predictors for the linear model were ManufacturingProcess37, ManufacturingProcess29, and ManufacturingProcess32. Process variables dominate both lists, sitting above the biological variables. The MARS model ranks ManufacturingProcess32 first, while the linear model ranks it third, and MARS retains far fewer predictors than the linear model.
# Get variable importance for MARS model
mars_model3 <- mars_model2$finalModel
mars_import <- varImp(mars_model3)
print(mars_import)
## Overall
## ManufacturingProcess32 100.00000
## ManufacturingProcess17 55.97625
## ManufacturingProcess13 28.37452
# linear model
linear_model <- train(
  x = train_predictor,
  y = train_response,
  method = "lm",
  trControl = trainControl(method = "cv", number = 10))
# importance for the linear model
linear_importance <- varImp(linear_model)
print(linear_importance)
## lm variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess37 100.00
## ManufacturingProcess29 88.14
## ManufacturingProcess32 72.81
## ManufacturingProcess45 71.20
## BiologicalMaterial05 62.70
## ManufacturingProcess43 61.09
## ManufacturingProcess04 60.95
## ManufacturingProcess28 60.77
## ManufacturingProcess06 51.67
## BiologicalMaterial03 48.16
## BiologicalMaterial09 45.96
## ManufacturingProcess33 45.87
## ManufacturingProcess31 45.24
## ManufacturingProcess27 44.82
## ManufacturingProcess08 41.44
## ManufacturingProcess20 39.30
## ManufacturingProcess09 39.10
## ManufacturingProcess18 37.56
## ManufacturingProcess38 35.76
## ManufacturingProcess05 35.13
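The overlap between the two importance lists can be computed directly; a minimal sketch (lin_imp and top_linear are helper names introduced here):

# compare the MARS predictors with the top ten linear-model predictors
lin_imp <- linear_importance$importance
top_linear <- rownames(lin_imp)[order(lin_imp$Overall, decreasing = TRUE)][1:10]
intersect(rownames(mars_import), top_linear)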
There is a positive relationship between ManufacturingProcess32 and Yield: the points trend upward, so Yield increases as ManufacturingProcess32 increases. Some of the other predictors also show a relationship with Yield, though none as strong as ManufacturingProcess32, and for predictors such as ManufacturingProcess37 and ManufacturingProcess29 the relationship is not clear.
# Plot relationships between the top MARS and linear-model predictors and the response
# arrange the five charts in a 2 x 3 grid
par(mfrow = c(2, 3))
plot(train_predictor$ManufacturingProcess32, train_response,
     main = "ManufacturingProcess32 vs Yield",
     xlab = "ManufacturingProcess32",
     ylab = "Yield",
     pch = 16, col = "blue")
plot(train_predictor$ManufacturingProcess17, train_response,
     main = "ManufacturingProcess17 vs Yield",
     xlab = "ManufacturingProcess17",
     ylab = "Yield",
     pch = 16, col = "red")
plot(train_predictor$ManufacturingProcess13, train_response,
     main = "ManufacturingProcess13 vs Yield",
     xlab = "ManufacturingProcess13",
     ylab = "Yield",
     pch = 16, col = "green")
plot(train_predictor$ManufacturingProcess37, train_response,
     main = "ManufacturingProcess37 vs Yield",
     xlab = "ManufacturingProcess37",
     ylab = "Yield",
     pch = 16, col = "purple")
plot(train_predictor$ManufacturingProcess29, train_response,
     main = "ManufacturingProcess29 vs Yield",
     xlab = "ManufacturingProcess29",
     ylab = "Yield",
     pch = 16, col = "orange")
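Overlaying a least-squares line makes the direction of each relationship explicit; a minimal sketch for the strongest predictor:

# reset the plotting grid and add a trend line for ManufacturingProcess32
par(mfrow = c(1, 1))
plot(train_predictor$ManufacturingProcess32, train_response,
     xlab = "ManufacturingProcess32", ylab = "Yield", pch = 16)
abline(lm(train_response ~ train_predictor$ManufacturingProcess32),
       col = "blue", lwd = 2)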