Exercise 7.2

library(caret) 
## Loading required package: ggplot2
## Loading required package: lattice
library(mlbench)
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
library(nnet)
library(AppliedPredictiveModeling)
library(ggplot2)
library(pracma)

set.seed(200)
trainingData = mlbench.friedman1(200, sd=1)
trainingData$x = data.frame(trainingData$x)

featurePlot(trainingData$x,trainingData$y)

testData <- mlbench.friedman1(5000, sd=1)
testData$x <-data.frame(testData$x)


rmses <- c()
r2s <- c()
methods <- c()

K-NN model:

set.seed(0)
knn_Model = train(x=trainingData$x, 
                  y=trainingData$y, 
                  method="knn",
                 preProc=c("center", "scale"),
                 tuneLength=10)
knn_Pred = predict(knn_Model, newdata=testData$x)
knn_PR = postResample(pred=knn_Pred, obs=testData$y)

rmses = c(rmses, knn_PR[1])
r2s = c(r2s, knn_PR[2])
methods = c(methods, "KNN")

Neural Network model:

set.seed(0)
nnetModel = train(x=trainingData$x, y=trainingData$y, method="nnet",
                  preProc=c("center", "scale"),
                  linout=TRUE, trace=FALSE,
                  MaxNWts=10 * (ncol(trainingData$x)+1) + 10 + 1,
                  maxit=500)
nnetPred = predict(nnetModel, newdata=testData$x)
nnetPR = postResample(pred=nnetPred, obs=testData$y)
rmses = c(rmses, nnetPR[1])
r2s = c(r2s, nnetPR[2])
methods = c(methods, "NN")

MARS model:

marsGrid = expand.grid(.degree=1:2, .nprune=2:38)
set.seed(0)
marsModel = train(x=trainingData$x, y=trainingData$y, method="earth",
                  preProc=c("center", "scale"),
                  tuneGrid=marsGrid)
marsPred = predict(marsModel, newdata=testData$x)
marsPR = postResample(pred=marsPred, obs=testData$y)

summary(marsModel$finalModel)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=2,
##             nprune=13)
## 
##                                   coefficients
## (Intercept)                          21.690154
## h(0.507267-X1)                       -4.203744
## h(X1-0.507267)                        3.072355
## h(0.325504-X2)                       -5.314859
## h(-0.216741-X3)                       3.320304
## h(X3- -0.216741)                      2.321760
## h(0.953812-X4)                       -2.775288
## h(X4-0.953812)                        2.778320
## h(1.17878-X5)                        -1.607769
## h(X1-0.507267) * h(X2- -0.798188)    -3.199202
## h(0.606835-X1) * h(0.325504-X2)       2.030856
## h(0.325504-X2) * h(X3-0.795427)       1.369704
## 
## Selected 12 of 21 terms, and 5 of 10 predictors (nprune=13)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6-unused, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 8 3
## GCV 1.842426    RSS 270.9495    GRSq 0.9251967    RSq 0.9444425
rmses = c(rmses, marsPR[1])
r2s = c(r2s, marsPR[2])
methods = c(methods, "MARS")

Support Vector Machine (SVM):

set.seed(0)
svmRModel = train(x=trainingData$x, y=trainingData$y, method="svmRadial",
                  preProc=c("center", "scale"),
                  tuneLength=20)
svmRPred = predict(svmRModel, newdata=testData$x)
svmPR = postResample(pred=svmRPred, obs=testData$y)
rmses = c(rmses, svmPR[1])
r2s = c(r2s, svmPR[2])
methods = c(methods, "SVM")
res = data.frame(Model=methods, RMSE=rmses, Rsquared=r2s)
res = res[order(res$RMSE), ] 

print(res)
##   Model     RMSE  Rsquared
## 3  MARS 1.322734 0.9291489
## 4   SVM 2.059725 0.8279536
## 2    NN 2.649316 0.7177210
## 1   KNN 3.204059 0.6819919

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)? The MARS model clearly has the best performance with the lowest RMSE of 1.322734 and highest R-Squared at 0.9291489.

Looking at the summary of the MARS model that it used the informative predictor (X1-X5)

7.5(a) Which nonlinear regression model gives the optimal resampling and test set performance? The SVM model gives the optimal resampling and test set performance.

library(caret)
library(AppliedPredictiveModeling)

set.seed(0)

data(ChemicalManufacturingProcess)

proc_Predic= ChemicalManufacturingProcess[,2:58]
yield = ChemicalManufacturingProcess[,1]

n_samples = dim(proc_Predic)[1]
n_features = dim(proc_Predic)[2]


rep_na = sapply( proc_Predic, median, na.rm=TRUE )
for( ci in 1:n_features ){
  mis_indx = is.na( proc_Predic[,ci] )
  proc_Predic[mis_indx,ci] = rep_na[ci]
}

zero_var = nearZeroVar( proc_Predic )
print( sprintf("Found %d zero variance columns from %d",length(zero_var), dim(proc_Predic)[2] ) )
## [1] "Found 1 zero variance columns from 57"
proc_Predic = proc_Predic[,-zero_var] 

training = createDataPartition( yield, p=0.8 )

proc_Predic_train = proc_Predic[training$Resample1,]
yield_training = yield[training$Resample1]

proc_Predic_test = proc_Predic[-training$Resample1,]
yield_testing = yield[-training$Resample1]

pre_Proc = c("center","scale")

K-NN model

set.seed(0)
knnModel = train(x=proc_Predic_train, 
                 y=yield_training, 
                 method="knn", 
                 preProc=pre_Proc, 
                 tuneLength=10)
knnPred = predict(knnModel, newdata=proc_Predic_train)
knnPR = postResample(pred=knnPred, obs=yield_training)

rmses_training = c(knnPR[1])
r2s_training = c(knnPR[2])
methods = c("KNN")

knnPred = predict(knnModel, newdata=proc_Predic_test)
knnPR = postResample(pred=knnPred, obs=yield_testing)

rmses_testing = c(knnPR[1])
r2s_testing = c(knnPR[2])

Neural Network model

nnGrid = expand.grid( .decay=c(0,0.01,0.1), .size=1:10, .bag=FALSE )
set.seed(123)
nnetModel = train(x=proc_Predic_train, 
                  y=yield_training, 
                  method="nnet", 
                  preProc=pre_Proc,
                  linout=TRUE,trace=FALSE,MaxNWts=10 * 
                    (ncol(proc_Predic_train)+1) + 10 + 1, maxit=500)

nnetPred = predict(nnetModel, newdata=proc_Predic_train)
nnetPR = postResample(pred=nnetPred, obs=yield_training)

rmses_training = c(rmses_training,nnetPR[1])
r2s_training = c(r2s_training,nnetPR[2])
methods = c(methods,"NN")

nnetPred = predict(nnetModel, newdata=proc_Predic_test)
nnetPR = postResample(pred=nnetPred, obs=yield_testing)
rmses_testing = c(rmses_testing,nnetPR[1])
r2s_testing = c(r2s_testing,nnetPR[2])

MARS model

marsGrid = expand.grid(.degree=1:2, .nprune=2:38)
set.seed(0)
marsModel = train(x=proc_Predic_train, 
                  y=yield_training, 
                  method="earth", 
                  preProc=pre_Proc, 
                  tuneGrid=marsGrid)
      
marsPred = predict(marsModel, newdata=proc_Predic_train)
marsPR = postResample(pred=marsPred, obs=yield_training)
rmses_training = c(rmses_training,marsPR[1])
r2s_training = c(r2s_training,marsPR[2])
methods = c(methods,"MARS")

marsPred = predict(marsModel, newdata=proc_Predic_test)
marsPR = postResample(pred=marsPred, obs=yield_testing)
rmses_testing = c(rmses_testing,marsPR[1])
r2s_testing = c(r2s_testing,marsPR[2])

varImp(marsModel)
## earth variable importance
## 
##                        Overall
## ManufacturingProcess32   100.0
## ManufacturingProcess09    46.8
## ManufacturingProcess13     0.0

Support Vector Machine (SVM)

set.seed(0)
svmModel = train(x=proc_Predic_train, 
                 y=yield_training, 
                 method="svmRadial", 
                 preProc=pre_Proc, 
                 tuneLength=20)

svmPred = predict(svmModel, newdata=proc_Predic_train)
svmPR = postResample(pred=svmPred, obs=yield_training) 
rmses_training = c(rmses_training,svmPR[1])
r2s_training = c(r2s_training,svmPR[2])
methods = c(methods,"SVM")

svmPred = predict(svmModel, newdata=proc_Predic_test)
svmPR = postResample(pred=svmPred, obs=yield_testing)
rmses_testing = c(rmses_testing,svmPR[1])
r2s_testing = c(r2s_testing,svmPR[2])
res_train <- data.frame(rmse = rmses_training, r2 = r2s_training)
rownames(res_train) <- methods

trained_order <- order(-res_train$rmse)
res_train <- res_train[trained_order, ]
print(res_train)
##           rmse        r2
## KNN  1.2841290 0.5614593
## NN   1.1704102 0.6069835
## MARS 1.0835591 0.6629220
## SVM  0.3170933 0.9744619
res_test <- data.frame(rmse = rmses_testing, r2 = r2s_testing)
rownames(res_test) <- methods

res_test <- res_test[trained_order, ]
print(res_test)
##           rmse        r2
## KNN  1.0902785 0.6040095
## NN   1.4124514 0.3919552
## MARS 1.0265607 0.6430378
## SVM  0.9190973 0.7343000
resamp <- resamples(list(
  knn = knnModel,
  svm = svmModel,
  mars = marsModel,
  nnet = nnetModel
))
print(summary(resamp))
## 
## Call:
## summary.resamples(object = resamp)
## 
## Models: knn, svm, mars, nnet 
## Number of resamples: 25 
## 
## MAE 
##           Min.   1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## knn  0.9633275 1.1135415 1.209921 1.211159 1.293099 1.392785    0
## svm  0.8988523 0.9840216 1.047575 1.067544 1.119261 1.305342    0
## mars 0.9054659 0.9835820 1.021006 1.072569 1.180873 1.345141    0
## nnet 1.0860829 1.3047414 1.368715 1.377428 1.455976 1.693571    0
## 
## RMSE 
##          Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## knn  1.222936 1.434768 1.556406 1.533576 1.631582 1.777429    0
## svm  1.138352 1.249418 1.317529 1.345280 1.384116 1.755251    0
## mars 1.138482 1.228094 1.297889 1.359079 1.478925 1.792090    0
## nnet 1.395682 1.619169 1.732417 1.733286 1.814316 2.084816    0
## 
## Rsquared 
##            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## knn  0.15753257 0.3332294 0.3759240 0.3875231 0.4305426 0.6272761    0
## svm  0.34772020 0.4741270 0.5156888 0.5267953 0.5924426 0.6712511    0
## mars 0.21905724 0.4619202 0.5359658 0.5179635 0.5912253 0.6911642    0
## nnet 0.07587787 0.1756677 0.2182918 0.2580030 0.3455614 0.4751747    0
dotplot(resamp, metric = "RMSE")

print(summary(diff(resamp)))
## 
## Call:
## summary.diff.resamples(object = diff(resamp))
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## MAE 
##      knn       svm       mars      nnet     
## knn             0.143615  0.138590 -0.166269
## svm  1.400e-08           -0.005025 -0.309884
## mars 5.321e-05 1.0000000           -0.304859
## nnet 0.0003816 2.339e-09 9.396e-07          
## 
## RMSE 
##      knn       svm       mars      nnet   
## knn             0.1883    0.1745   -0.1997
## svm  3.145e-07           -0.0138   -0.3880
## mars 2.262e-05 1.000000            -0.3742
## nnet 0.000196  1.530e-10 5.282e-07        
## 
## Rsquared 
##      knn       svm       mars      nnet     
## knn            -0.139272 -0.130440  0.129520
## svm  3.570e-06            0.008832  0.268792
## mars 1.153e-05 1.000000             0.259961
## nnet 0.001247  4.868e-11 1.594e-07

7.5(b) Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model? The process variables most important predictors in this model.Process variables dominate the top predictors in the SVM model. There is more overlap in th e SVM model than the linear model.

varImp(svmModel)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   84.14
## ManufacturingProcess36   76.06
## BiologicalMaterial06     75.70
## ManufacturingProcess17   72.98
## BiologicalMaterial03     71.70
## BiologicalMaterial12     65.80
## ManufacturingProcess09   64.89
## BiologicalMaterial02     55.11
## ManufacturingProcess06   54.30
## ManufacturingProcess31   47.40
## BiologicalMaterial11     43.85
## BiologicalMaterial04     42.26
## ManufacturingProcess33   40.68
## ManufacturingProcess12   37.19
## ManufacturingProcess11   35.28
## BiologicalMaterial08     35.03
## BiologicalMaterial09     34.55
## BiologicalMaterial01     30.95
## ManufacturingProcess18   27.48

7.5(c) Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield? Yes, the plot shows intuition about process predictors and their relationship with yield. its shows upward trend in predicted yield as ManufacturingProcess32 increases suggests that optimizing this process variable may enhance production output.

p_range <- range(proc_Predic$ManufacturingProcess32)
variation <- seq(from = p_range[1], to = p_range[2], length.out = 100)

mean_predic <- colMeans(proc_Predic)

newdata <- repmat(as.double(mean_predic), length(variation), 1)
newdata <- data.frame(newdata)
colnames(newdata) <- colnames(proc_Predic)
newdata$ManufacturingProcess32 <- variation

xs <- variation
y_hat <- predict(svmModel, newdata = as.matrix(newdata))

plot(xs, y_hat,
     type = "l",
     xlab = "Variation in ManufacturingProcess32",
     ylab = "Predicted Yield",
     main = "Effect of ManufacturingProcess32 on Predicted Yield")
grid()