Chapter 7

library(tidyr)
library(dplyr)
library(knitr)
library(utils)
library(ggplot2)
library(corrplot)
library(mice)
library(psych)
library(gridExtra)
library(MASS)
library(caret)
library(earth)
library(nnet)
library(NeuralNetTools)
library(mlbench)
library(kernlab)
library(AppliedPredictiveModeling)
library(ModelMetrics)

7.2 Nonlinear Models on Simulation Data

Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:

\[ y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2) \]

where the \(x\) values are random variables uniformly distributed on \([0, 1]\). (The simulation also creates five additional non-informative predictors.)

The package \(mlbench\) contains a function called \(mlbench.friedman1\) that simulates these data. Tune several models on these data.

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
featurePlot(trainingData$x, trainingData$y)

testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

SVM Model for Simulation Data

set.seed(123)
df.train = trainingData$x
df.train$y = trainingData$y

svmFit <- kernlab::ksvm(y ~ .,
                        data = df.train,
                        kernel ="rbfdot",
                        kpar = "automatic",
                        C = 1,
                        epsilon = 0.1)
svmFit
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 1 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.0697199382297723 
## 
## Number of Support Vectors : 158 
## 
## Objective Function Value : -39.7314 
## Training error : 0.064648
rmse.svm = rmse(trainingData$y, predict(svmFit, trainingData$x))
cat("For SVM, RMSE on the training data =", rmse.svm)
## For SVM, RMSE on the training data = 1.258707
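
The \(ksvm\) fit above uses fixed values \(C = 1\) and \(\epsilon = 0.1\). If the cost and kernel width are to be tuned by resampling instead, a minimal sketch with \(caret\) could look like the following (the object name \(svmTuned\) is ours; results not shown):

set.seed(123)
# Sketch: cross-validated tuning of a radial-basis SVM over a grid of cost values
svmTuned <- train(y ~ ., data = df.train,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 10,
                  trControl = trainControl(method = "cv", number = 10))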

KNN Model for Simulation Data

set.seed(123)
knnModel <- train(y ~ .,
                  data = df.train,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.617103  0.4815238  2.932357
##    7  3.469391  0.5268823  2.827466
##    9  3.404273  0.5542161  2.764704
##   11  3.367277  0.5745575  2.726791
##   13  3.313918  0.6022923  2.681996
##   15  3.310264  0.6142757  2.687478
##   17  3.308316  0.6266659  2.686031
##   19  3.306431  0.6392056  2.690283
##   21  3.317481  0.6435421  2.700244
##   23  3.323616  0.6521553  2.708896
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 19.
rmse.knn = rmse(trainingData$y, predict(knnModel, trainingData$x))
cat("For KNN, RMSE on the training data =", rmse.knn)
## For KNN, RMSE on the training data = 2.960453

The lowest resampled RMSE for KNN, \(3.306431\), is obtained with \(k = 19\).
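
The resampling profile behind this choice can be inspected directly with \(caret\)'s plot method (a quick sketch; the figure is not shown here):

# Sketch: bootstrap RMSE as a function of the number of neighbors
plot(knnModel)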

MARS Model for Simulation Data

set.seed(100)
marsFit <- earth(y ~ ., data = df.train)
marsFit
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556    RSS 397.9654    GRSq 0.8968524    RSq 0.9183982
summary(marsFit)
## Call: earth(formula=y~., data=df.train)
## 
##                coefficients
## (Intercept)       18.451984
## h(0.621722-X1)   -11.074396
## h(0.601063-X2)   -10.744225
## h(X3-0.281766)    20.607853
## h(0.447442-X3)    17.880232
## h(X3-0.447442)   -23.282007
## h(X3-0.636458)    15.150350
## h(0.734892-X4)   -10.027487
## h(X4-0.734892)     9.092045
## h(0.850094-X5)    -4.723407
## h(X5-0.850094)    10.832932
## h(X6-0.361791)    -1.956821
## 
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556    RSS 397.9654    GRSq 0.8968524    RSq 0.9183982
rmse.mars = rmse(trainingData$y, predict(marsFit, trainingData$x))
cat("For MARS, RMSE on the training data =", rmse.mars)
## For MARS, RMSE on the training data = 1.410612

Which models appear to give the best performance? Does MARS select the informative predictors (those named \(X1-X5\))?

We rank the models by the RMSE they achieve on the training data, lowest first.

model.rank = data.frame(model.type = c("SVM", "KNN", "MARS"),
                        resample.rmse=c(rmse.svm, rmse.knn, rmse.mars))
model.rank = model.rank[order(model.rank$resample.rmse),]
print(model.rank)
##   model.type resample.rmse
## 1        SVM      1.258707
## 3       MARS      1.410612
## 2        KNN      2.960453

Show the relative performance as a bar plot.

ggplot(data = model.rank, aes(x=model.type, y=resample.rmse)) +
    geom_bar(stat="identity", width=0.5) +
    ggtitle("Comparing Model RMSE on Training Data")

On the training data, the SVM model gives the best performance. MARS does select all of the informative predictors (\(X1\)–\(X5\)), although it also retains the non-informative \(X6\).
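
The held-out test set simulated earlier (\(testData\)) was not used above. A minimal sketch for checking this ranking on the test set, reusing the same \(rmse()\) helper (the rmse.test.* object names are ours; values not shown), is:

# Sketch: test-set RMSE for each fitted model
rmse.test.svm  = rmse(testData$y, predict(svmFit, testData$x))
rmse.test.knn  = rmse(testData$y, predict(knnModel, testData$x))
rmse.test.mars = rmse(testData$y, as.vector(predict(marsFit, testData$x)))
data.frame(model.type = c("SVM", "KNN", "MARS"),
           test.rmse = c(rmse.test.svm, rmse.test.knn, rmse.test.mars))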

7.5 Nonlinear Models on Chemical Manufacturing Process Data

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

data(ChemicalManufacturingProcess)
chem.full = ChemicalManufacturingProcess

# Use MICE library for imputation of missing data.
imputed.1 <- mice(chem.full, m=5, maxit = 5, seed = 500, printFlag=F)
chem = complete(imputed.1, 2)
set.seed(100)
# Randomly select 70% of the rows for the training set
index = sample(nrow(chem), size = floor(0.7 * nrow(chem)))

chem.train = chem[index,] %>% dplyr::select(-Yield)
chem.train.Y = chem[index,]$Yield
chem.test = chem[-index,] %>% dplyr::select(-Yield)
chem.test.Y = chem[-index,]$Yield

chem.train.df = chem.train
chem.train.df$y = chem.train.Y

chem.test.df = chem.test
chem.test.df$y = chem.test.Y

KNN Model for Chemical Manufacturing Data

modelperf = data.frame(matrix(ncol=3, nrow=6))
colnames(modelperf) = c("Dataset", "Model", "RMSE")

set.seed(123)
knnModel <- train(y ~ .,
                  data = chem.train.df,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 123 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 123, 123, 123, 123, 123, 123, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  1.342986  0.4785928  1.049471
##    7  1.368232  0.4623670  1.074551
##    9  1.373570  0.4626582  1.095199
##   11  1.386473  0.4583350  1.109486
##   13  1.394085  0.4594401  1.114403
##   15  1.402909  0.4598781  1.126768
##   17  1.411375  0.4598978  1.132618
##   19  1.412601  0.4647614  1.130958
##   21  1.417229  0.4673599  1.134196
##   23  1.429313  0.4647713  1.144086
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
rmse.train.knn = rmse(chem.train.Y, predict(knnModel, chem.train))
rmse.test.knn = rmse(chem.test.Y, predict(knnModel, chem.test))

modelperf[1,] = list("Train", "KNN", rmse.train.knn)
modelperf[2,] = list("Test", "KNN", rmse.test.knn)
cat("For KNN, the RMSE on train and test data are", rmse.train.knn, "and", rmse.test.knn, "respectively.")
## For KNN, the RMSE on train and test data are 0.9343585 and 1.708756 respectively.

MARS Model for Chemical Manufacturing Data

set.seed(100)
marsFit <- earth(y ~ ., data = chem.train.df)
marsFit
## Selected 14 of 21 terms, and 10 of 57 predictors
## Termination condition: RSq changed by less than 0.001 at 21 terms
## Importance: ManufacturingProcess32, ManufacturingProcess13, ...
## Number of terms at each degree of interaction: 1 13 (additive model)
## GCV 0.6125845    RSS 45.89901    GRSq 0.822342    RSq 0.8899962
summary(marsFit)
## Call: earth(formula=y~., data=chem.train.df)
## 
##                                 coefficients
## (Intercept)                        35.734029
## h(58.85-BiologicalMaterial02)       0.365274
## h(BiologicalMaterial03-70.01)      -0.381412
## h(48.64-BiologicalMaterial06)      -0.506089
## h(BiologicalMaterial06-48.64)       0.384289
## h(BiologicalMaterial09-13)          2.384785
## h(ManufacturingProcess01-9.7)       0.476082
## h(43.73-ManufacturingProcess09)    -0.774393
## h(ManufacturingProcess09-43.73)     0.509586
## h(33.8-ManufacturingProcess13)      1.581220
## h(ManufacturingProcess13-33.8)      0.369047
## h(10.7-ManufacturingProcess28)      0.112272
## h(ManufacturingProcess32-156)       0.222192
## h(ManufacturingProcess45-2.3)       3.333380
## 
## Selected 14 of 21 terms, and 10 of 57 predictors
## Termination condition: RSq changed by less than 0.001 at 21 terms
## Importance: ManufacturingProcess32, ManufacturingProcess13, ...
## Number of terms at each degree of interaction: 1 13 (additive model)
## GCV 0.6125845    RSS 45.89901    GRSq 0.822342    RSq 0.8899962
rmse.train.mars = rmse(chem.train.Y, predict(marsFit, chem.train))
rmse.test.mars = rmse(chem.test.Y, predict(marsFit, chem.test))

modelperf[3,] = list("Train", "MARS", rmse.train.mars)
modelperf[4,] = list("Test", "MARS", rmse.test.mars)
cat("For MARS, the RMSE on train and test data are", rmse.train.mars, "and", rmse.test.mars, "respectively.")
## For MARS, the RMSE on train and test data are 0.6108705 and 2.095379 respectively.

NNET Model for Chemical Manufacturing Data

set.seed(123)
nnetFit <- nnet(chem.train, chem.train.Y,
                size = 4,
                decay = 0.01,
                linout = TRUE,
                trace = FALSE,
                maxit = 500, # Iterations
                ## Upper bound on the number of weights: size*(p+1) + size + 1
                MaxNWts = 4 * (ncol(chem.train) + 1) + 4 + 1)
nnetFit
## a 57-4-1 network with 237 weights
## options were - linear output units  decay=0.01
rmse.train.nnet = rmse(chem.train.Y, predict(nnetFit, chem.train))
rmse.test.nnet = rmse(chem.test.Y, predict(nnetFit, chem.test))

modelperf[5,] = list("Train", "ANN", rmse.train.nnet)
modelperf[6,] = list("Test", "ANN", rmse.test.nnet)
cat("For Neural Net, the RMSE on train and test data are",
    rmse.train.nnet, "and", rmse.test.nnet, "respectively.")
## For Neural Net, the RMSE on train and test data are 1.455118 and 1.60426 respectively.
#plotnet(nnetFit)  # network diagram (skip = TRUE removed; this fit has no skip-layer connections)

By experimenting with different values for the number of hidden units (controlled by the \(size\) argument of \(nnet()\)), it was found that \(size = 4\) produced the lowest RMSE; a sketch of that search follows.
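
A minimal sketch of that experiment, assuming the comparison is made on test-set RMSE with \(decay\) held at 0.01 (results not shown), is:

# Sketch: vary the number of hidden units and compare test-set RMSE
for (s in 1:6) {
  set.seed(123)
  fit <- nnet(chem.train, chem.train.Y,
              size = s, decay = 0.01, linout = TRUE,
              trace = FALSE, maxit = 500,
              MaxNWts = s * (ncol(chem.train) + 1) + s + 1)
  cat("size =", s, "-> test RMSE =", rmse(chem.test.Y, predict(fit, chem.test)), "\n")
}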

  1. Which nonlinear regression model gives the optimal resampling and test set performance?

ggplot(data=modelperf, aes(x=Model, y=RMSE, fill=Dataset)) +
    geom_bar(stat="identity", position=position_dodge()) +
    ggtitle("Model Performance for Chemical Manufacturing Process data")

The neural network model produces the smallest RMSE on the test data, while the MARS model performs best on the training data.

  1. Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

Extracting predictor importance from a fitted neural network is less straightforward than for MARS (an approximate approach is sketched after this list). The following is the list of the most important predictors from the MARS model; manufacturing process variables clearly dominate the top of the list.

head(varImp(marsFit), n=12)
##                          Overall
## ManufacturingProcess32 100.00000
## ManufacturingProcess13  75.19997
## ManufacturingProcess09  55.08487
## ManufacturingProcess28  37.43464
## ManufacturingProcess01  29.55994
## BiologicalMaterial03    27.98040
## BiologicalMaterial06    27.98040
## BiologicalMaterial09    27.98040
## BiologicalMaterial02    21.64894
## ManufacturingProcess45  11.97840
## BiologicalMaterial01     0.00000
## BiologicalMaterial04     0.00000
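
For the neural network itself, \(NeuralNetTools\) (loaded above) provides approximate importance measures. A minimal sketch using Olden's connection-weight method is shown below (output not shown; the predictor labels may be generic because \(nnetFit\) was fit with the x/y interface rather than a formula):

# Sketch: Olden's connection-weight importance for the fitted neural network
olden(nnetFit)
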
  1. Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

The following scatter plots show the relationships between the top MARS predictors and the response variable \(Yield\).

set.seed(123)
# Select the top predictors from the MARS model:
chem2 = chem.train.df %>% dplyr::select(ManufacturingProcess32, ManufacturingProcess13,
                                        ManufacturingProcess09, ManufacturingProcess28,
                                        ManufacturingProcess01, BiologicalMaterial03,
                                        BiologicalMaterial06, BiologicalMaterial09,
                                        BiologicalMaterial02, ManufacturingProcess45)
# Abbreviate names for display
names(chem2) = c("MP32", "MP13", "MP09", "MP28", "MP01", "BM03", "BM06", "BM09", "BM02", "MP45")
chem2$Yield = chem.train.df$y

p = list()
df = list()
v = colnames(chem2)
for (i in 1:10) {
  df[[i]] = chem2 %>% dplyr::select(dplyr::all_of(v[i]), Yield)
}

# A plain for loop doesn't work here: aes() evaluates lazily, so every plot
# would end up using the final value of i. (A working lapply alternative is
# sketched at the end of this section.)
#for (i in 1:10) {
#    p[[i]] = ggplot(df[[i]], aes(x=df[[i]][,1], y=Yield)) + xlab(v[i]) + geom_point()
#}

p[[1]] = ggplot(df[[1]], aes(x=df[[1]][,1], y=Yield)) + xlab(v[1]) + geom_point()
p[[2]] = ggplot(df[[2]], aes(x=df[[2]][,1], y=Yield)) + xlab(v[2]) + geom_point()
p[[3]] = ggplot(df[[3]], aes(x=df[[3]][,1], y=Yield)) + xlab(v[3]) + geom_point()
p[[4]] = ggplot(df[[4]], aes(x=df[[4]][,1], y=Yield)) + xlab(v[4]) + geom_point()
p[[5]] = ggplot(df[[5]], aes(x=df[[5]][,1], y=Yield)) + xlab(v[5]) + geom_point()
p[[6]] = ggplot(df[[6]], aes(x=df[[6]][,1], y=Yield)) + xlab(v[6]) + geom_point()
p[[7]] = ggplot(df[[7]], aes(x=df[[7]][,1], y=Yield)) + xlab(v[7]) + geom_point()
p[[8]] = ggplot(df[[8]], aes(x=df[[8]][,1], y=Yield)) + xlab(v[8]) + geom_point()
p[[9]] = ggplot(df[[9]], aes(x=df[[9]][,1], y=Yield)) + xlab(v[9]) + geom_point()
p[[10]] = ggplot(df[[10]], aes(x=df[[10]][,1], y=Yield)) + xlab(v[10]) + geom_point()

grid.arrange(p[[1]], p[[2]], p[[3]], p[[4]])

grid.arrange(p[[5]], p[[6]], p[[7]], p[[8]])

grid.arrange(p[[9]], p[[10]])
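
As noted above, the repetition can be avoided by building each plot inside a function, so that every call captures its own predictor name. A minimal sketch of that alternative (producing the same panels; \(.data\) is the tidy-eval pronoun used by \(ggplot2\)) is:

# Sketch: build the same scatter plots with lapply instead of copy-pasting
p <- lapply(v[1:10], function(var) {
  ggplot(chem2, aes(x = .data[[var]], y = Yield)) +
    geom_point() +
    xlab(var)
})
grid.arrange(grobs = p[1:4])
grid.arrange(grobs = p[5:8])
grid.arrange(grobs = p[9:10])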