library(tidyr)
library(dplyr)
library(knitr)
library(utils)
library(ggplot2)
library(corrplot)
library(mice)
library(psych)
library(gridExtra)
library(MASS)
library(caret)
library(earth)
library(nnet)
library(NeuralNetTools)
library(mlbench)
library(kernlab)
library(AppliedPredictiveModeling)
library(ModelMetrics)
Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:
\[ y = 10sin(\pi x_1x_2) + 20(x_3 - 0.5)^2+ 10x_4 + 5x_5 + N(0, \sigma^2) \]
where the \(x\) values are random variables uniformly distributed between \([0, 1]\). (There are also 5 other non-informative variables created in the simulation).
The package \(mlbench\) contains a function called \(mlbench.friedman1\) that simulates these data. Tune several models on these data.
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
featurePlot(trainingData$x, trainingData$y)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
set.seed(123)
df.train = trainingData$x
df.train$y = trainingData$y
svmFit <- kernlab::ksvm(y ~ .,
data = df.train,
kernel ="rbfdot",
kpar = "automatic",
C = 1,
epsilon = 0.1)
svmFit
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 1
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0697199382297723
##
## Number of Support Vectors : 158
##
## Objective Function Value : -39.7314
## Training error : 0.064648
rmse.svm = rmse(trainingData$y, predict(svmFit, trainingData$x))
cat("For SVM, Resampled RMSE on training data =", rmse.svm)
## For SVM, Resampled RMSE on training data = 1.258707
set.seed(123)
knnModel <- train(y ~ .,
data = df.train,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.617103 0.4815238 2.932357
## 7 3.469391 0.5268823 2.827466
## 9 3.404273 0.5542161 2.764704
## 11 3.367277 0.5745575 2.726791
## 13 3.313918 0.6022923 2.681996
## 15 3.310264 0.6142757 2.687478
## 17 3.308316 0.6266659 2.686031
## 19 3.306431 0.6392056 2.690283
## 21 3.317481 0.6435421 2.700244
## 23 3.323616 0.6521553 2.708896
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 19.
rmse.knn = rmse(trainingData$y, predict(knnModel, trainingData$x))
cat("For KNN, Resampled RMSE on training data =", rmse.knn)
## For KNN, Resampled RMSE on training data = 2.960453
The lowest RMSE obtained by KNN is with \(k=19\). The RMSE value is \(3.306431\).
set.seed(100)
marsFit <- earth(y ~ ., data = df.train)
marsFit
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
summary(marsFit)
## Call: earth(formula=y~., data=df.train)
##
## coefficients
## (Intercept) 18.451984
## h(0.621722-X1) -11.074396
## h(0.601063-X2) -10.744225
## h(X3-0.281766) 20.607853
## h(0.447442-X3) 17.880232
## h(X3-0.447442) -23.282007
## h(X3-0.636458) 15.150350
## h(0.734892-X4) -10.027487
## h(X4-0.734892) 9.092045
## h(0.850094-X5) -4.723407
## h(X5-0.850094) 10.832932
## h(X6-0.361791) -1.956821
##
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
rmse.mars = rmse(trainingData$y, predict(marsFit, trainingData$x))
cat("For MARS, Resampled RMSE on training data =", rmse.mars)
## For MARS, Resampled RMSE on training data = 1.410612
Which models appear to give the best performance? Does MARS select the informative predictors (those named \(X1-X5\))?
We rank the performance of the models according to the lowest RMSE produced.
model.rank = data.frame(model.type = c("SVM", "KNN", "MARS"),
resample.rmse=c(rmse.svm, rmse.knn, rmse.mars))
model.rank = model.rank[order(model.rank$resample.rmse),]
print(model.rank)
## model.type resample.rmse
## 1 SVM 1.258707
## 3 MARS 1.410612
## 2 KNN 2.960453
Show the relative performance as a bar plot.
ggplot(data = model.rank, aes(x=model.type, y=resample.rmse)) +
geom_bar(stat="identity", width=0.5) +
ggtitle("Comparing Model RMSE on Training Data")
The SVM model appears to give the best performance. The MARS model does select the informative predictors (\(X1 - X6\)).
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
data(ChemicalManufacturingProcess)
chem.full = ChemicalManufacturingProcess
# Use MICE library for imputation of missing data.
imputed.1 <- mice(chem.full, m=5, maxit = 5, seed = 500, printFlag=F)
chem = complete(imputed.1, 2)
set.seed(100)
index = sample(1:floor(0.7 * nrow(chem)))
chem.train = chem[index,] %>% dplyr::select(-Yield)
chem.train.Y = chem[index,]$Yield
chem.test = chem[-index,] %>% dplyr::select(-Yield)
chem.test.Y = chem[-index,]$Yield
chem.train.df = chem.train
chem.train.df$y = chem.train.Y
chem.test.df = chem.test
chem.test.df$y = chem.test.Y
modelperf = data.frame(matrix(ncol=3, nrow=6))
colnames(modelperf) = c("Dataset", "Model", "RMSE")
set.seed(123)
knnModel <- train(y ~ .,
data = chem.train.df,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 123 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 123, 123, 123, 123, 123, 123, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.342986 0.4785928 1.049471
## 7 1.368232 0.4623670 1.074551
## 9 1.373570 0.4626582 1.095199
## 11 1.386473 0.4583350 1.109486
## 13 1.394085 0.4594401 1.114403
## 15 1.402909 0.4598781 1.126768
## 17 1.411375 0.4598978 1.132618
## 19 1.412601 0.4647614 1.130958
## 21 1.417229 0.4673599 1.134196
## 23 1.429313 0.4647713 1.144086
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
rmse.train.knn = rmse(chem.train.Y, predict(knnModel, chem.train))
rmse.test.knn = rmse(chem.test.Y, predict(knnModel, chem.test))
modelperf[1,] = list("Train", "KNN", rmse.train.knn)
modelperf[2,] = list("Test", "KNN", rmse.test.knn)
cat("For KNN, the RMSE on train and test data are", rmse.train.knn, "and", rmse.test.knn, "respectively.")
## For KNN, the RMSE on train and test data are 0.9343585 and 1.708756 respectively.
set.seed(100)
marsFit <- earth(y ~ ., data = chem.train.df)
marsFit
## Selected 14 of 21 terms, and 10 of 57 predictors
## Termination condition: RSq changed by less than 0.001 at 21 terms
## Importance: ManufacturingProcess32, ManufacturingProcess13, ...
## Number of terms at each degree of interaction: 1 13 (additive model)
## GCV 0.6125845 RSS 45.89901 GRSq 0.822342 RSq 0.8899962
summary(marsFit)
## Call: earth(formula=y~., data=chem.train.df)
##
## coefficients
## (Intercept) 35.734029
## h(58.85-BiologicalMaterial02) 0.365274
## h(BiologicalMaterial03-70.01) -0.381412
## h(48.64-BiologicalMaterial06) -0.506089
## h(BiologicalMaterial06-48.64) 0.384289
## h(BiologicalMaterial09-13) 2.384785
## h(ManufacturingProcess01-9.7) 0.476082
## h(43.73-ManufacturingProcess09) -0.774393
## h(ManufacturingProcess09-43.73) 0.509586
## h(33.8-ManufacturingProcess13) 1.581220
## h(ManufacturingProcess13-33.8) 0.369047
## h(10.7-ManufacturingProcess28) 0.112272
## h(ManufacturingProcess32-156) 0.222192
## h(ManufacturingProcess45-2.3) 3.333380
##
## Selected 14 of 21 terms, and 10 of 57 predictors
## Termination condition: RSq changed by less than 0.001 at 21 terms
## Importance: ManufacturingProcess32, ManufacturingProcess13, ...
## Number of terms at each degree of interaction: 1 13 (additive model)
## GCV 0.6125845 RSS 45.89901 GRSq 0.822342 RSq 0.8899962
rmse.train.mars = rmse(chem.train.Y, predict(marsFit, chem.train))
rmse.test.mars = rmse(chem.test.Y, predict(marsFit, chem.test))
modelperf[3,] = list("Train", "MARS", rmse.train.mars)
modelperf[4,] = list("Test", "MARS", rmse.test.mars)
cat("For MARS, the RMSE on train and test data are", rmse.train.mars, "and", rmse.test.mars, "respectively.")
## For MARS, the RMSE on train and test data are 0.6108705 and 2.095379 respectively.
set.seed(123)
nnetFit <- nnet(chem.train, chem.train.Y,
size = 4,
decay = 0.01,
linout = TRUE,
trace = FALSE,
maxit = 500, # Iterations
## Number of parameters used by the model
MaxNWts= 4 * (ncol(chem.train) + 1) + 5 + 1)
nnetFit
## a 57-4-1 network with 237 weights
## options were - linear output units decay=0.01
rmse.train.nnet = rmse(chem.train.Y, predict(nnetFit, chem.train))
rmse.test.nnet = rmse(chem.test.Y, predict(nnetFit, chem.test))
modelperf[5,] = list("Train", "ANN", rmse.train.nnet)
modelperf[6,] = list("Test", "ANN", rmse.test.nnet)
cat("For Neural Net, the RMSE on train and test data are",
rmse.train.nnet, "and", rmse.test.nnet, "respectively.")
## For Neural Net, the RMSE on train and test data are 1.455118 and 1.60426 respectively.
#plotnet(nnetFit, skip=TRUE)
By experimenting with different values for the number of hidden layers (controlled by the \(size\) parameter in the \(nnet()\) function), it was seen that \(size = 4\) produced the lowest RMSE.
ggplot(data=modelperf, aes(x=Model, y=RMSE, fill=Dataset)) +
geom_bar(stat="identity", position=position_dodge()) +
ggtitle("Model Performance for Chemical Manufacturing Process data")
The neural network model produces the smallest RMSE values on the test data. On the train data, the MARS model outperforms other models.
There is no straightforward way to determine the most important predictors in a fitted neural network model. The following is the list of the most important predictors from the MARS model.
head(varImp(marsFit), n=12)
## Overall
## ManufacturingProcess32 100.00000
## ManufacturingProcess13 75.19997
## ManufacturingProcess09 55.08487
## ManufacturingProcess28 37.43464
## ManufacturingProcess01 29.55994
## BiologicalMaterial03 27.98040
## BiologicalMaterial06 27.98040
## BiologicalMaterial09 27.98040
## BiologicalMaterial02 21.64894
## ManufacturingProcess45 11.97840
## BiologicalMaterial01 0.00000
## BiologicalMaterial04 0.00000
The following scatter plots show relationships between predictors of the MARS model and the response variable \(Yield\).
set.seed(123)
# Select the top predictors from the MARS model:
chem2 = chem.train.df %>% dplyr::select(ManufacturingProcess32, ManufacturingProcess13,
ManufacturingProcess09, ManufacturingProcess28,
ManufacturingProcess01, BiologicalMaterial03,
BiologicalMaterial06, BiologicalMaterial09,
BiologicalMaterial02, ManufacturingProcess45)
# Abbreviate names for display
names(chem2) = c("MP32", "MP13", "MP09", "MP28", "MP01", "BM03", "BM06", "BM09", "BM02", "MP45")
chem2$Yield = chem.train.df$y
p = list()
df = list()
v = colnames(chem2)
for (i in 1:10) {
df[[i]] = chem2 %>% dplyr::select(v[i], Yield)
}
# This loop doesn't work because of R's rules regarding lazy evaluation.
#for (i in 1:10) {
# p[[i]] = ggplot(df[[i]], aes(x=df[[i]][,1], y=Yield)) + xlab(v[i]) + geom_point()
#}
p[[1]] = ggplot(df[[1]], aes(x=df[[1]][,1], y=Yield)) + xlab(v[1]) + geom_point()
p[[2]] = ggplot(df[[2]], aes(x=df[[2]][,1], y=Yield)) + xlab(v[2]) + geom_point()
p[[3]] = ggplot(df[[3]], aes(x=df[[3]][,1], y=Yield)) + xlab(v[3]) + geom_point()
p[[4]] = ggplot(df[[4]], aes(x=df[[4]][,1], y=Yield)) + xlab(v[4]) + geom_point()
p[[5]] = ggplot(df[[5]], aes(x=df[[5]][,1], y=Yield)) + xlab(v[5]) + geom_point()
p[[6]] = ggplot(df[[6]], aes(x=df[[6]][,1], y=Yield)) + xlab(v[6]) + geom_point()
p[[7]] = ggplot(df[[7]], aes(x=df[[7]][,1], y=Yield)) + xlab(v[7]) + geom_point()
p[[8]] = ggplot(df[[8]], aes(x=df[[8]][,1], y=Yield)) + xlab(v[8]) + geom_point()
p[[9]] = ggplot(df[[9]], aes(x=df[[9]][,1], y=Yield)) + xlab(v[9]) + geom_point()
p[[10]] = ggplot(df[[10]], aes(x=df[[10]][,1], y=Yield)) + xlab(v[10]) + geom_point()
grid.arrange(p[[1]], p[[2]], p[[3]], p[[4]])
grid.arrange(p[[5]], p[[6]], p[[7]], p[[8]])
grid.arrange(p[[9]], p[[10]])