7.2

Comparing the models on the held-out test set, the neural network slightly outperforms the MARS and SVM models, while the kNN model trails the others. The MARS model does indeed select the informative predictors (x1 through x5).
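
For reference, the Friedman 1 benchmark generates ten independent uniform predictors, only the first five of which enter the response:

\[
y = 10 \sin(\pi x_1 x_2) + 20 (x_3 - 0.5)^2 + 10 x_4 + 5 x_5 + \varepsilon, \qquad \varepsilon \sim N(0, \sigma^2),
\]

so a well-specified model should concentrate its importance on x1 through x5.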

library(caret)    # createDataPartition(), train(), postResample()
library(mlbench)  # mlbench.friedman1()

set.seed(8675309)
gen_data <- mlbench.friedman1(5000, sd = 1)
gen_df <- data.frame(
  y = gen_data$y,  # response
  x = gen_data$x   # 10 uniform predictors; only x1-x5 are informative
)
names(gen_df) <- c("y", paste0("x", 1:10))
head(gen_df)
##          y        x1        x2        x3        x4          x5         x6
## 1 12.16672 0.1594836 0.0456208 0.3105660 0.8522188 0.754578808 0.09319804
## 2 16.23564 0.4781883 0.8097633 0.4086639 0.4443186 0.520000371 0.81068487
## 3 23.38607 0.7647987 0.5367907 0.7464495 0.8870306 0.859720897 0.30152153
## 4 15.54643 0.7696877 0.6598489 0.4006697 0.5267653 0.006821756 0.06784336
## 5 14.71833 0.2685485 0.9780525 0.2363302 0.4140515 0.412391180 0.38325321
## 6 18.37025 0.6730459 0.4601106 0.6829390 0.6643708 0.727322878 0.44678765
##          x7         x8        x9       x10
## 1 0.6740919 0.30365168 0.6153587 0.7954506
## 2 0.1719901 0.74346944 0.5178852 0.6728662
## 3 0.5724911 0.17574518 0.2752559 0.9421588
## 4 0.2884637 0.75233852 0.5584243 0.5066027
## 5 0.6904641 0.32679762 0.2907757 0.1011113
## 6 0.3860283 0.04385509 0.1524078 0.9159989
train_ind <- createDataPartition(gen_df$y, p = 0.8, list = FALSE)
train_data <- gen_df[train_ind, ]
test_data <- gen_df[-train_ind, ]

control <- trainControl(
  method = "cv",
  preProcOptions = list(thresh = 0.75, cutoff = 0.75),  # PCA variance threshold / correlation cutoff
  allowParallel = TRUE
)

grid <- expand.grid(decay = c(0, 0.01, 0.1),  # weight decay (regularization)
                    size = 1:10,              # hidden units
                    bag = FALSE)

kNN

The kNN model achieves \(RMSE=2.4368788\) and \(R^2=0.8060092\) on the test set.

set.seed(8675309)

knn_model <- train(y ~ .,
                  data = train_data,
                  method = "knn",
                  preProcess = c("center", "scale"),
                  tuneLength = 10)
knn_model
## k-Nearest Neighbors 
## 
## 4000 samples
##   10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 4000, 4000, 4000, 4000, 4000, 4000, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  2.671699  0.7069602  2.113720
##    7  2.571185  0.7327170  2.037292
##    9  2.516051  0.7496739  1.995026
##   11  2.487896  0.7607198  1.974287
##   13  2.473286  0.7687348  1.966906
##   15  2.465458  0.7749983  1.961537
##   17  2.457092  0.7812662  1.956374
##   19  2.460169  0.7844121  1.960243
##   21  2.460173  0.7880008  1.959089
##   23  2.464701  0.7903930  1.961893
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knn_pred <- predict(knn_model, newdata = test_data)
postResample(pred = knn_pred, obs = test_data$y)
##      RMSE  Rsquared       MAE 
## 2.4368788 0.8060092 1.9632221
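
The resampling profile across \(k\) can also be inspected visually (plot not shown):

plot(knn_model)  # resampled RMSE as a function of k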

Neural Network

The averaged neural network achieves \(RMSE=1.0701265\) and \(R^2=0.9546408\) on the test set.

set.seed(8675309)

nn_model <- train(
  y ~ .,
  data = train_data,
  method = "avNNet",
  tuneGrid = grid,
  trControl = control,
  preProc = c("center", "scale", "corr"),
  linout = TRUE,
  trace = FALSE,
  maxit = 500
)

nn_pred <- predict(nn_model, newdata = test_data)
postResample(pred = nn_pred, obs = test_data$y)
##      RMSE  Rsquared       MAE 
## 1.0701265 0.9546408 0.8472096
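
The full tuning output is omitted here; the winning hyperparameters can be pulled from the fit directly (a one-line sketch; the exact values depend on the resampling run):

nn_model$bestTune  # chosen size, decay, and bag for the averaged network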

MARS

The MARS model achieves \(RMSE=1.1228369\) and \(R^2=0.9500391\) on the test set.

set.seed(8675309)

mars_grid <- expand.grid(degree = 1:2,   # maximum interaction degree
                         nprune = 2:30)  # maximum terms retained after pruning

mars_model <- train(
  y ~ .,
  data = train_data,
  method = "earth",
  trControl = control,
  tuneGrid = mars_grid
)

mars_pred <- predict(mars_model, newdata = test_data)
postResample(pred = mars_pred, obs = test_data$y)
##      RMSE  Rsquared       MAE 
## 1.1228369 0.9500391 0.8961990
varImp(mars_model)
## earth variable importance
## 
##    Overall
## x4  100.00
## x1   65.84
## x2   43.73
## x5   16.55
## x3    0.00
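
To see which hinge functions were actually retained, we can inspect the final earth fit directly (a sketch; output omitted):

summary(mars_model$finalModel)  # selected basis functions and their coefficients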

SVM

The SVM model achieves \(RMSE=1.2036330\) and \(R^2=0.9426508\) on the test set.

set.seed(8675309)

svm_model <- train(
  y ~ .,
  data = train_data,
  method = "svmRadial",
  preProc = c("center", "scale"),
  trControl = control,
  tuneLength = 10
)

svm_pred <- predict(svm_model, newdata = test_data)
postResample(pred = svm_pred, obs = test_data$y)
##      RMSE  Rsquared       MAE 
## 1.2036330 0.9426508 0.9583541
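
As a compact summary of 7.2, the test-set metrics can be tabulated side by side (a sketch, assuming the four fitted models above are still in scope):

model_list <- list(kNN = knn_model, avNNet = nn_model,
                   MARS = mars_model, SVM = svm_model)
# One row per model: RMSE, Rsquared, MAE on the held-out test set
round(t(sapply(model_list, function(m)
  postResample(pred = predict(m, newdata = test_data), obs = test_data$y))), 4)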

7.5

We reuse the data imputation from the earlier linear-models exercise. Note that the preprocessing transforms Yield along with the predictors, so the RMSE values reported below are on the standardized scale rather than in raw yield units.

library(AppliedPredictiveModeling)  # ChemicalManufacturingProcess data

set.seed(8675309)
data(ChemicalManufacturingProcess)

# kNN-impute missing values; BoxCox, center, and scale first so the
# distance calculations behind the imputation are on a common scale
impute_model <- preProcess(ChemicalManufacturingProcess, 
                           method = c("knnImpute", "BoxCox", "center", "scale"))
imputed_chemicals <- predict(impute_model, ChemicalManufacturingProcess)

chem_train_ind <- createDataPartition(imputed_chemicals$Yield, p = 0.8, list = FALSE)
chem_train <- imputed_chemicals[chem_train_ind, ]
chem_test <- imputed_chemicals[-chem_train_ind, ]
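
A quick sanity check that the kNN imputation filled every missing value:

sum(is.na(imputed_chemicals))  # expect 0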

7.5.1

The SVM model performs best of those tested, with \(RMSE=0.6154820\) and \(R^2=0.6566040\) on the test set.

kNN

A kNN model with \(k=11\) is chosen as optimal, producing \(RMSE=0.6873036\) and \(R^2=0.5981203\) on the test set.

set.seed(8675309)

knn_chem_model <- train(Yield ~ .,
                  data = chem_train,
                  method = "knn",
                  preProcess = c("center", "scale"),
                  tuneLength = 10)
knn_chem_model
## k-Nearest Neighbors 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE      
##    5  0.8120707  0.3612825  0.6520544
##    7  0.8022716  0.3747582  0.6446315
##    9  0.7956103  0.3869750  0.6444232
##   11  0.7880382  0.4032646  0.6424597
##   13  0.7894890  0.4056726  0.6466429
##   15  0.7898989  0.4110172  0.6492793
##   17  0.7916958  0.4140582  0.6501608
##   19  0.8006142  0.4041168  0.6567263
##   21  0.8018545  0.4091936  0.6591687
##   23  0.8034769  0.4131538  0.6606181
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
knn_chem_pred <- predict(knn_chem_model, newdata = chem_test)
postResample(pred = knn_chem_pred, obs = chem_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.6873036 0.5981203 0.5660610
plot(knn_chem_model)

Neural Network

Reusing the tuning grid and control object from 7.2, the averaged neural network achieves \(RMSE=0.6872073\) and \(R^2=0.5427729\) on the test set.

set.seed(8675309)

nn_chem_model <- train(
  Yield ~ .,
  data = chem_train,
  method = "avNNet",
  tuneGrid = grid,
  trControl = control,
  preProc = c("center", "scale", "corr"),
  linout = TRUE,
  trace = FALSE,
  maxit = 500
)
nn_chem_model
## Model Averaged Neural Network 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (38), scaled (38), remove (19) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 130, 131, 131, 129, 128, 129, ... 
## Resampling results across tuning parameters:
## 
##   decay  size  RMSE       Rsquared   MAE      
##   0.00    1    0.7624246  0.4572574  0.6150850
##   0.00    2    0.7964896  0.4843724  0.6518018
##   0.00    3    0.8527236  0.4403980  0.6477832
##   0.00    4    0.7847627  0.4506978  0.6556870
##   0.00    5    0.8196040  0.4265807  0.6731684
##   0.00    6    0.7209539  0.5447221  0.6090403
##   0.00    7    0.7406530  0.5102422  0.6181129
##   0.00    8    0.6980460  0.5429157  0.5664921
##   0.00    9    0.7188841  0.5110517  0.5890599
##   0.00   10    0.6977562  0.5523332  0.5797705
##   0.01    1    0.7822054  0.4558792  0.6448806
##   0.01    2    0.7760821  0.4727718  0.6244145
##   0.01    3    0.7735140  0.4693177  0.6299371
##   0.01    4    0.7060164  0.5196379  0.5726472
##   0.01    5    0.6891186  0.5473394  0.5603039
##   0.01    6    0.7205574  0.5366551  0.5723960
##   0.01    7    0.6997525  0.5533907  0.5707565
##   0.01    8    0.6716858  0.5630579  0.5528613
##   0.01    9    0.6698427  0.5610157  0.5478894
##   0.01   10    0.6761683  0.5642592  0.5564284
##   0.10    1    0.7402947  0.4975795  0.6165588
##   0.10    2    0.7394963  0.4956531  0.5912123
##   0.10    3    0.7204346  0.5055812  0.5856247
##   0.10    4    0.7121002  0.5180237  0.5703578
##   0.10    5    0.6463951  0.5880689  0.5475386
##   0.10    6    0.6624545  0.5827605  0.5432281
##   0.10    7    0.6744944  0.5666035  0.5387947
##   0.10    8    0.6847442  0.5562500  0.5547336
##   0.10    9    0.7032737  0.5359503  0.5614829
##   0.10   10    0.6919493  0.5422710  0.5625249
## 
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 5, decay = 0.1 and bag = FALSE.
nn_chem_pred <- predict(nn_chem_model, newdata = chem_test)
postResample(pred = nn_chem_pred, obs = chem_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.6872073 0.5427729 0.5544380
plot(nn_chem_model)

MARS

Our MARS model achieves \(RMSE=0.6162499\) and \(R^2=0.6335505\) on the test set. Interestingly, only two variables appear in the final model, and of those only ManufacturingProcess32 carries any weight.

set.seed(8675309)

mars_grid <- expand.grid(degree = 1:5,   # allow higher-order interactions than in 7.2
                         nprune = 2:15)  # maximum terms retained after pruning

mars_chem_model <- train(
  Yield ~ .,
  data = chem_train,
  method = "earth",
  trControl = control,
  tuneGrid = mars_grid
)
mars_chem_model
## Multivariate Adaptive Regression Spline 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 130, 131, 131, 129, 128, 129, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE       Rsquared   MAE      
##   1        2      0.7838690  0.4223162  0.6219794
##   1        3      0.6889938  0.5481173  0.5707536
##   1        4      0.6780958  0.5760155  0.5670099
##   1        5      0.6943346  0.5544376  0.5675925
##   1        6      0.6689289  0.5841001  0.5405352
##   1        7      0.7041307  0.5457800  0.5756346
##   1        8      0.7000073  0.5495699  0.5676384
##   1        9      0.6895394  0.5608281  0.5445481
##   1       10      0.7177479  0.5265313  0.5810423
##   1       11      0.7269885  0.5174658  0.5843580
##   1       12      0.7157077  0.5264393  0.5766112
##   1       13      0.7334782  0.5080418  0.6007961
##   1       14      0.7463921  0.4927693  0.6076578
##   1       15      0.7478491  0.4922216  0.6078690
##   2        2      0.7838690  0.4223162  0.6219794
##   2        3      0.6950344  0.5239363  0.5743663
##   2        4      0.6346451  0.5938801  0.5272359
##   2        5      0.6704714  0.5539910  0.5588881
##   2        6      2.7329555  0.4141888  1.1371972
##   2        7      0.7879984  0.4633677  0.6351168
##   2        8      0.7803204  0.4768621  0.6319349
##   2        9      0.7985657  0.4614837  0.6374986
##   2       10      0.7899289  0.4858478  0.6209893
##   2       11      0.7939797  0.4824527  0.6205105
##   2       12      0.7739564  0.5028886  0.5966348
##   2       13      0.7752943  0.5129272  0.5965578
##   2       14      0.8322538  0.4835339  0.6522746
##   2       15      0.8383211  0.4829513  0.6470937
##   3        2      0.7838690  0.4223162  0.6219794
##   3        3      0.7256071  0.4528609  0.6038599
##   3        4      0.6533097  0.5620010  0.5446509
##   3        5      0.6417504  0.5820050  0.5358538
##   3        6      0.6950525  0.5386701  0.5548944
##   3        7      0.7032419  0.5241966  0.5659731
##   3        8      0.7246483  0.5080169  0.5692419
##   3        9      0.7393721  0.4997276  0.5800439
##   3       10      0.7244079  0.5125034  0.5698801
##   3       11      0.7474300  0.5029424  0.5809768
##   3       12      0.7729195  0.4934035  0.5952042
##   3       13      0.7889453  0.4845289  0.6126630
##   3       14      0.8213545  0.4660591  0.6381021
##   3       15      0.8293535  0.4752279  0.6411777
##   4        2      0.7838690  0.4223162  0.6219794
##   4        3      0.6874810  0.5155059  0.5604627
##   4        4      0.6514534  0.5611307  0.5424655
##   4        5      0.6361853  0.5835384  0.5338231
##   4        6      0.6642904  0.5555734  0.5496226
##   4        7      0.7465461  0.4832407  0.5977285
##   4        8      0.7542292  0.4712997  0.6034662
##   4        9      0.7555439  0.4644255  0.6083274
##   4       10      0.7581788  0.4734564  0.6003741
##   4       11      0.7795927  0.4738996  0.6128696
##   4       12      0.7424236  0.5104874  0.5818233
##   4       13      0.7680332  0.4952978  0.6004021
##   4       14      0.7692189  0.5011963  0.6052319
##   4       15      0.8008324  0.4806615  0.6230632
##   5        2      0.7838690  0.4223162  0.6219794
##   5        3      0.6801990  0.5278434  0.5634180
##   5        4      0.6497454  0.5635681  0.5413938
##   5        5      0.6360431  0.5844882  0.5345632
##   5        6      0.6674473  0.5508969  0.5533004
##   5        7      0.7497030  0.4785642  0.6014063
##   5        8      0.7505965  0.4761223  0.5973843
##   5        9      0.7512880  0.4741062  0.6050442
##   5       10      0.7752275  0.4475210  0.6192183
##   5       11      0.7919849  0.4557866  0.6245268
##   5       12      0.7545597  0.4958244  0.5954436
##   5       13      0.7794644  0.4806017  0.6122138
##   5       14      0.7778535  0.4848090  0.6110408
##   5       15      0.7858760  0.4853063  0.6097700
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 2.
mars_chem_pred <- predict(mars_chem_model, newdata = chem_test)
postResample(pred = mars_chem_pred, obs = chem_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.6162499 0.6335505 0.4667909
varImp(mars_chem_model)
## earth variable importance
## 
##                        Overall
## ManufacturingProcess32     100
## ManufacturingProcess09       0
plot(mars_chem_model)
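
earth's own importance measure tells the same story; it can be checked directly on the final model (a sketch; output omitted):

library(earth)  # evimp()
evimp(mars_chem_model$finalModel)  # nsubsets / GCV / RSS importance of retained predictors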

SVM

Our SVM model produces the best results with \(RMSE=0.6154820\) and \(R^2=0.6566040\).

set.seed(8675309)

svm_chem_model <- train(
  Yield ~ .,
  data = chem_train,
  method = "svmRadial",
  preProc = c("center", "scale"),
  trControl = control,
  tuneLength = 15
)
svm_chem_model
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 130, 131, 131, 129, 128, 129, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE       Rsquared   MAE      
##      0.25  0.7809957  0.4967160  0.6416941
##      0.50  0.7130285  0.5540011  0.5860119
##      1.00  0.6471502  0.6206222  0.5348848
##      2.00  0.6074826  0.6625370  0.5059835
##      4.00  0.5952101  0.6762467  0.4936047
##      8.00  0.5970473  0.6733021  0.4948937
##     16.00  0.5970473  0.6733021  0.4948937
##     32.00  0.5970473  0.6733021  0.4948937
##     64.00  0.5970473  0.6733021  0.4948937
##    128.00  0.5970473  0.6733021  0.4948937
##    256.00  0.5970473  0.6733021  0.4948937
##    512.00  0.5970473  0.6733021  0.4948937
##   1024.00  0.5970473  0.6733021  0.4948937
##   2048.00  0.5970473  0.6733021  0.4948937
##   4096.00  0.5970473  0.6733021  0.4948937
## 
## Tuning parameter 'sigma' was held constant at a value of 0.018078
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.018078 and C = 4.
svm_chem_pred <- predict(svm_chem_model, newdata = chem_test)
postResample(pred = svm_chem_pred, obs = chem_test$Yield)
##      RMSE  Rsquared       MAE 
## 0.6154820 0.6566040 0.4870489
plot(svm_chem_model)

7.5.2

The best performing model was the SVM model, and below are the most important predictors. Of the top ten predictors, seven are manufacturing process variables. The top ten predictors for our SVM model are identical to the top ten for the enet model, our optimal linear model, in both order and weighting. This is expected rather than a sign of trouble: caret has no model-specific importance measure for kernel SVMs, so varImp() falls back to a model-free loess R-squared filter (note the "loess r-squared variable importance" header below), which produces the same ranking for any model that lacks its own importance method.

varImp(svm_chem_model)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess09   93.79
## BiologicalMaterial06     90.22
## ManufacturingProcess13   87.57
## ManufacturingProcess17   87.45
## ManufacturingProcess36   85.75
## BiologicalMaterial03     85.58
## ManufacturingProcess06   79.27
## ManufacturingProcess11   75.05
## BiologicalMaterial12     72.82
## ManufacturingProcess31   65.77
## BiologicalMaterial02     62.86
## BiologicalMaterial11     53.68
## ManufacturingProcess29   50.92
## BiologicalMaterial04     49.63
## ManufacturingProcess18   48.73
## ManufacturingProcess25   46.19
## ManufacturingProcess33   46.00
## BiologicalMaterial08     44.69
## ManufacturingProcess30   44.61
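
One way to confirm the ranking is model-free is to compute the loess filter importance directly; it reproduces the ordering above without reference to any fitted model (a sketch):

# The same loess R^2 filter caret falls back to for kernel SVMs
filter_imp <- filterVarImp(x = chem_train[, setdiff(names(chem_train), "Yield")],
                           y = chem_train$Yield,
                           nonpara = TRUE)
head(filter_imp[order(-filter_imp$Overall), , drop = FALSE], 5)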

7.5.3

Since the importance scores come from the model-free loess filter, matching the optimal linear model by construction rather than by error, we will simply look at the top five predictors. The plots indicate our first three predictors are positively related to yield while the last two are negatively related. The relationships are largely linear, although there is a slight sinusoidal character to the first three predictors, most evident in ManufacturingProcess09. The last two predictors flatten out at the tails, indicating that beyond a certain value the predictor's marginal effect on yield rapidly diminishes.

library(pdp)  # partial() for partial dependence profiles

imp <- varImp(svm_chem_model)
top_predictors <- rownames(imp$importance)[order(imp$importance$Overall, decreasing = TRUE)][1:5]
par(mfrow = c(2, 3))  # 2 x 3 layout for the five panels

for (pred in top_predictors) {
  # Marginal (partial dependence) profile of predicted yield over this predictor
  partial_plot <- partial(
    svm_chem_model,
    pred.var = pred,
    train = chem_train,
    grid.resolution = 50
  )
  
  plot(partial_plot,
       main = pred,
       xlab = pred,
       ylab = "Partial Effect on Response")
}
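
As a quick numeric check on the directions described above, the raw correlations of the top predictors with yield carry the expected signs (a sketch, reusing top_predictors from above):

round(cor(chem_train[, top_predictors], chem_train$Yield), 3)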