Q1)

Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:

$y = 10 \sin(\pi x_1 x_2) + 20 (x_3 - 0.5)^2 + 10 x_4 + 5 x_5 + N(0, \sigma^2)$

where the x values are random variables uniformly distributed on [0, 1] (the simulation also creates five additional non-informative predictors). The mlbench package contains a function, mlbench.friedman1, that simulates these data:

library(mlbench)   # mlbench.friedman1
library(caret)     # featurePlot, train, postResample

set.seed(200)

trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)   # convert the predictor matrix to a data frame
featurePlot(trainingData$x, trainingData$y)

testData <- mlbench.friedman1(5000, sd = 1)    # large test set for estimating true performance
testData$x <- data.frame(testData$x)
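For intuition about what this function produces, here is a minimal hand-rolled sketch of the data-generating process from the formula above (a sketch only, not mlbench's exact implementation):

# Sketch of the Friedman (1991) simulation: 10 uniform predictors,
# of which only the first five enter the response
n <- 200
x <- matrix(runif(n * 10), ncol = 10)
y <- 10 * sin(pi * x[, 1] * x[, 2]) +
  20 * (x[, 3] - 0.5)^2 +
  10 * x[, 4] + 5 * x[, 5] +
  rnorm(n, sd = 1)   # the N(0, sigma^2) noise term, with sigma = 1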

Tune several models on these data. For example:

KNN Model

knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
knnRes <- postResample(pred = knnPred, obs = testData$y)

knnRes
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461
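The tuning profile can be plotted straight from the train object to confirm that resampled RMSE bottoms out near k = 17:

plot(knnModel)   # resampled RMSE as a function of k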

NNET Model

nnetModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "nnet",
  preProc = c("center", "scale"),
  linout = TRUE,   # for regression
  trace = FALSE,
  tuneLength = 5
)

nnetModel
## Neural Network 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  RMSE      Rsquared   MAE     
##   1     0e+00  2.755170  0.6903181  2.162767
##   1     1e-04  2.861942  0.6691057  2.270570
##   1     1e-03  2.573204  0.7363206  2.004196
##   1     1e-02  2.651700  0.7202967  2.080180
##   1     1e-01  2.520327  0.7486306  1.953338
##   3     0e+00  2.883931  0.6891171  2.283922
##   3     1e-04  2.863277  0.7017995  2.221925
##   3     1e-03  2.899866  0.6902986  2.237399
##   3     1e-02  2.749921  0.7278748  2.118012
##   3     1e-01  2.822126  0.6992650  2.217502
##   5     0e+00  3.944658  0.5302631  2.860531
##   5     1e-04  3.695260  0.5741159  2.722062
##   5     1e-03  3.141223  0.6465631  2.428903
##   5     1e-02  3.290679  0.6304601  2.511005
##   5     1e-01  2.914487  0.6843302  2.330338
##   7     0e+00  4.558038  0.4752433  3.213078
##   7     1e-04  4.165272  0.5334456  2.990985
##   7     1e-03  4.144512  0.5021363  3.079549
##   7     1e-02  3.983844  0.5482927  2.963599
##   7     1e-01  3.806945  0.5542832  2.936401
##   9     0e+00  3.621263  0.5687844  2.841848
##   9     1e-04  3.588814  0.5754440  2.833006
##   9     1e-03  3.910524  0.5415794  3.006935
##   9     1e-02  3.871555  0.5403639  3.007931
##   9     1e-01  3.296844  0.6314754  2.597155
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 0.1.
nnetPred <- predict(nnetModel, newdata = testData$x)
nnetRes <- postResample(pred = nnetPred, obs = testData$y)

nnetRes
##      RMSE  Rsquared       MAE 
## 2.6492285 0.7177439 2.0294441

MARS Model

marsModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  tuneLength = 10
)

marsModel
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE      Rsquared   MAE     
##    2      4.395533  0.2233305  3.613154
##    3      3.691032  0.4550445  2.977652
##    4      2.718330  0.7015680  2.172572
##    6      2.335844  0.7800163  1.865473
##    7      1.962732  0.8439657  1.547124
##    9      1.759399  0.8764918  1.378305
##   10      1.761784  0.8750228  1.382234
##   12      1.767343  0.8748448  1.383683
##   13      1.789648  0.8721771  1.406168
##   15      1.811550  0.8693218  1.421981
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 9 and degree = 1.
marsPred <- predict(marsModel, newdata = testData$x)
marsRes <- postResample(pred = marsPred, obs = testData$y)

marsRes
##      RMSE  Rsquared       MAE 
## 1.7901760 0.8705315 1.3712537
varImp(marsModel)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   82.92
## X2   64.47
## X5   40.67
## X3   28.65
## X6    0.00
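To see the hinge functions MARS actually built from those predictors, the final earth model can be printed (summary() here dispatches to the earth package):

summary(marsModel$finalModel)   # coefficients and hinge functions of the pruned model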

SVM Model

svmModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "svmRadial",
  preProc = c("center", "scale"),
  tuneLength = 10
)
svmModel
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  2.614200  0.7491782  2.061297
##     0.50  2.439262  0.7641973  1.913756
##     1.00  2.319798  0.7816128  1.815240
##     2.00  2.223075  0.7967671  1.731179
##     4.00  2.158347  0.8071546  1.676251
##     8.00  2.122226  0.8130736  1.654238
##    16.00  2.118799  0.8136443  1.651396
##    32.00  2.118799  0.8136443  1.651396
##    64.00  2.118799  0.8136443  1.651396
##   128.00  2.118799  0.8136443  1.651396
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06106608
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06106608 and C = 16.
svmPred <- predict(svmModel, newdata = testData$x)
svmRes <- postResample(pred = svmPred, obs = testData$y)

svmRes
##      RMSE  Rsquared       MAE 
## 2.0687980 0.8264478 1.5714873
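A side note on the sigma value that was "held constant": caret does not tune it but estimates it from the data using kernlab's sigest(). Something like the following reproduces the kind of estimate used:

kernlab::sigest(as.matrix(trainingData$x))   # low/median/high estimates of sigma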

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?

results <- data.frame(
  Model = c("KNN", "MARS", "Neural net", "Support Vector Machines"),
  RMSE = c(knnRes[1], marsRes[1], nnetRes[1], svmRes[1]),
  Rsquared = c(knnRes[2], marsRes[2], nnetRes[2], svmRes[2])
)

print(results)
##                     Model     RMSE  Rsquared
## 1                     KNN 3.204059 0.6819919
## 2                    MARS 1.790176 0.8705315
## 3              Neural net 2.649229 0.7177439
## 4 Support Vector Machines 2.068798 0.8264478

The MARS model does the best here, with the lowest RMSE and the highest R-squared, outperforming all other models. It is followed by SVM and the neural network, whose RMSEs are fairly close; KNN performs the worst. Looking at varImp(), MARS also correctly selects the informative predictors X1-X5 and ignores the noise variables.
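As a cross-check on the test-set ranking, the resampling distributions can also be compared with caret's resamples(); note that this comparison is only strictly apples-to-apples when the models share resampling indices (e.g., a common trainControl index), which the default bootstraps above do not guarantee:

# Compare the models' resampling distributions
resamps <- resamples(list(KNN = knnModel, NNet = nnetModel,
                          MARS = marsModel, SVM = svmModel))
summary(resamps)
bwplot(resamps, metric = "RMSE")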

Q2)

7.5. Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

Load the data and KNN-impute the missing values

data("ChemicalManufacturingProcess") 
trans <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
chemicalData <- predict(trans, ChemicalManufacturingProcess)

trainIndex <- createDataPartition(chemicalData$Yield, p = 0.8, list = FALSE)
chemTrain <- chemicalData[trainIndex, ]
chemTest  <- chemicalData[-trainIndex, ]

trainX <- chemTrain[, -which(names(chemTrain) == "Yield")]
trainY <- chemTrain$Yield
testX  <- chemTest[, -which(names(chemTest) == "Yield")]
testY  <- chemTest$Yield
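A quick sanity check that knnImpute filled every missing value before the models are trained:

stopifnot(!anyNA(chemicalData))   # imputation should have removed all NAs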

Training the models on the training set

# KNN
knnModel_chem <- train(
  x = trainX, y = trainY,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10
)

# MARS
marsModel_chem <- train(
  x = trainX, y = trainY,
  method = "earth",
  tuneLength = 10
)

# NNet
nnetModel_chem <- train(
  x = trainX, y = trainY,
  method = "nnet",
  preProcess = c("center", "scale"),
  linout = TRUE,   # linear output units for regression, as in Q1
  tuneLength = 5,
  trace = FALSE
)

# SVM
svmModel_chem <- train(
  x = trainX, y = trainY,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneLength = 10
)

(a)

Which nonlinear regression model gives the optimal re-sampling and test set performance?
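Resampling performance first; getTrainPerf() pulls each model's resampled performance at its chosen tuning values:

# Resampling performance at each model's optimal tuning parameters
rbind(getTrainPerf(knnModel_chem),
      getTrainPerf(marsModel_chem),
      getTrainPerf(nnetModel_chem),
      getTrainPerf(svmModel_chem))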

Now evaluate against the test set

pred_knn_chem  <- predict(knnModel_chem,  newdata = testX)
pred_mars_chem <- predict(marsModel_chem, newdata = testX)
pred_nnet_chem <- predict(nnetModel_chem, newdata = testX)
pred_svm_chem  <- predict(svmModel_chem,  newdata = testX)

res_knn_chem  <- postResample(pred_knn_chem,  testY)
res_mars_chem <- postResample(pred_mars_chem, testY)
res_nnet_chem <- postResample(pred_nnet_chem, testY)
res_svm_chem  <- postResample(pred_svm_chem,  testY)

chem_model_results <- data.frame(
  Model = c("KNN", "MARS", "Neural Net", "Support Vector Machine"),
  RMSE = c(res_knn_chem["RMSE"], res_mars_chem["RMSE"], res_nnet_chem["RMSE"], res_svm_chem["RMSE"]),
  Rsquared = c(res_knn_chem["Rsquared"], res_mars_chem["Rsquared"], res_nnet_chem["Rsquared"], res_svm_chem["Rsquared"])
)

print(chem_model_results)
##                    Model      RMSE  Rsquared
## 1                    KNN 0.5755098 0.5763120
## 2                   MARS 0.6507877 0.5691701
## 3             Neural Net 0.8144255 0.4705910
## 4 Support Vector Machine 0.7559147 0.4360587

By test-set RMSE and R-squared, KNN actually comes out on top, with MARS a close second at a nearly identical R-squared; the SVM and neural network trail behind. Since MARS essentially ties KNN while also performing variable selection, we carry it forward as the optimal nonlinear model for parts (b) and (c).

(b)

Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

print(varImp(marsModel_chem))
## earth variable importance
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess09   47.34
## ManufacturingProcess13    0.00

Here we can see that the MARS model concentrates its importance almost entirely on ManufacturingProcess32 and retains only three predictors in total. For a more representative spread of predictor importance, we can look at the next-best model, the SVM.

print(varImp(svmModel_chem))
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   85.44
## ManufacturingProcess36   81.07
## BiologicalMaterial06     79.13
## ManufacturingProcess17   74.39
## BiologicalMaterial03     72.93
## ManufacturingProcess31   70.27
## ManufacturingProcess09   66.92
## BiologicalMaterial12     63.50
## BiologicalMaterial02     59.49
## ManufacturingProcess06   53.55
## ManufacturingProcess33   52.59
## ManufacturingProcess11   50.68
## ManufacturingProcess29   45.57
## ManufacturingProcess30   45.18
## BiologicalMaterial04     43.34
## BiologicalMaterial11     41.05
## BiologicalMaterial09     37.14
## BiologicalMaterial08     35.53
## BiologicalMaterial01     30.64

We can compare this to the best-performing linear model from the last assignment, the lasso, whose top ten predictors by variable importance were:

ManufacturingProcess32, ManufacturingProcess13, BiologicalMaterial06, BiologicalMaterial03, ManufacturingProcess09, ManufacturingProcess17, ManufacturingProcess36, BiologicalMaterial02, BiologicalMaterial12, ManufacturingProcess31

Again, as we saw in 6.3, manufacturing processes dominate the importance list: six of the SVM model's top ten predictors are manufacturing processes versus four biological materials. Compared to the linear model, the list is remarkably stable. Not only are the top three predictors identical, but the same ten predictors appear in both top-ten lists, just in a slightly different order. The first real point of divergence is ManufacturingProcess06, which ranks 11th for the SVM but did not appear in the lasso top ten.
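The overlap is easy to verify in code; the lasso list below is hard-coded from the previous assignment's output:

svm_top10 <- c("ManufacturingProcess32", "ManufacturingProcess13",
               "ManufacturingProcess36", "BiologicalMaterial06",
               "ManufacturingProcess17", "BiologicalMaterial03",
               "ManufacturingProcess31", "ManufacturingProcess09",
               "BiologicalMaterial12",   "BiologicalMaterial02")
lasso_top10 <- c("ManufacturingProcess32", "ManufacturingProcess13",
                 "BiologicalMaterial06",   "BiologicalMaterial03",
                 "ManufacturingProcess09", "ManufacturingProcess17",
                 "ManufacturingProcess36", "BiologicalMaterial02",
                 "BiologicalMaterial12",   "ManufacturingProcess31")
setdiff(svm_top10, lasso_top10)   # character(0): the two sets are identical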

(c)

Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

Since the two top-ten lists contain the same predictors, the natural candidate is ManufacturingProcess06: the highest-ranked predictor in the SVM importance list that does not appear in the lasso top ten. Plotting it against Yield shows how it affects the response and why the linear model gave it less weight.

library(ggplot2)

ggplot(chemTrain, aes(x = ManufacturingProcess06, y = Yield)) +
  geom_point() +
  geom_smooth(method = "loess") +
  labs(title = "ManufacturingProcess06 vs Yield",
       x = "ManufacturingProcess06",
       y = "Yield")

This shows why ManufacturingProcess06 was not picked up among the linear model's top ten predictors for yield. Its effect is nonlinear: yield decreases over one part of the range, plateaus over most of it, and then increases toward the upper end. Because the effect is not constant across the range, a linear model that gave this predictor high importance would have either over- or under-fit it; the small changes at the lower and upper ends that do move yield are exactly what a straight-line fit cannot capture.

top9_predictors <- c(
  "ManufacturingProcess32",
  "ManufacturingProcess13",
  "BiologicalMaterial06",
  "ManufacturingProcess36",
  "ManufacturingProcess09",
  "ManufacturingProcess17",
  "ManufacturingProcess31",
  "BiologicalMaterial03",
  "BiologicalMaterial02"
)

library(gridExtra)   # grid.arrange

plot_list <- list()

for (var in top9_predictors) {
  p <- ggplot(chemTrain, aes(x = .data[[var]], y = Yield)) +   # aes_string() is deprecated
    geom_point() +
    geom_smooth(method = "loess") +
    labs(title = paste(var, "vs Yield"),
         x = var,
         y = "Yield")
  plot_list[[var]] <- p
}

do.call(grid.arrange, c(plot_list, ncol = 3))

Plotting the other top predictors shows that they, too, exhibit nonlinear tendencies, but their linear approximations are strong enough that a linear model can still fit them well and assign them high importance.
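One way to quantify "strong enough linear approximation" is the simple Pearson correlation of each top predictor with Yield (a rough sketch; the loess fits above tell the fuller story):

# Linear association of each top predictor with the response
sapply(chemTrain[, top9_predictors], function(x) cor(x, chemTrain$Yield))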