We will create four models (KNN, SVM, a neural network, and MARS) and evaluate their accuracy on the training and test data to see which model fits the data best.
library(mlbench)
library(kableExtra)
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: ggplot2
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
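For reference, mlbench.friedman1 simulates the Friedman benchmark regression problem: ten independent predictors are drawn uniformly on [0, 1], but only the first five enter the response,

y = 10 sin(pi * x1 * x2) + 20 * (x3 - 0.5)^2 + 10 * x4 + 5 * x5 + e,

where e is Gaussian noise with standard deviation given by the sd argument (1 here). The remaining five predictors are pure noise that the models below must learn to ignore.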
knn <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knn
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knn, newdata = testData$x)
knnAccuracy <- postResample(pred = knnPred, obs = testData$y)
svmRadial <- train(x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
tuneLength=10,
preProc = c("center", "scale"))
svmRadial
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.545335 0.7804647 2.015121
## 0.50 2.319786 0.7965148 1.830009
## 1.00 2.188349 0.8119636 1.726027
## 2.00 2.103655 0.8241314 1.655842
## 4.00 2.066879 0.8294322 1.631051
## 8.00 2.052681 0.8313929 1.623550
## 16.00 2.049867 0.8318312 1.621820
## 32.00 2.049867 0.8318312 1.621820
## 64.00 2.049867 0.8318312 1.621820
## 128.00 2.049867 0.8318312 1.621820
##
## Tuning parameter 'sigma' was held constant at a value of 0.06802164
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06802164 and C = 16.
svmRadialPred <- predict(svmRadial, newdata = testData$x)
svmRadialAccuracy <- postResample(pred = svmRadialPred, obs = testData$y)
nnetGrid <- expand.grid(.decay=c(0, 0.01, 0.1, 0.5, 0.9),
.size=c(1, 10, 15, 20),
.bag=FALSE)
nnet <- train(x = trainingData$x,
y = trainingData$y,
method = "avNNet",
tuneGrid = nnetGrid,
preProc = c("center", "scale"),
trace=FALSE,
linout=TRUE,
maxit=500)
## Warning: executing %dopar% sequentially: no parallel backend registered
nnet
## Model Averaged Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 2.589902 0.7324403 2.014450
## 0.00 10 3.082909 0.6581005 2.345346
## 0.00 15 2.666684 0.7202739 2.135432
## 0.00 20 2.634394 0.7273102 2.116201
## 0.01 1 2.567190 0.7363972 1.993021
## 0.01 10 2.714509 0.7118994 2.173845
## 0.01 15 2.435366 0.7634558 1.935705
## 0.01 20 2.346376 0.7801847 1.850974
## 0.10 1 2.580129 0.7336990 2.000459
## 0.10 10 2.528971 0.7492960 2.003431
## 0.10 15 2.309856 0.7879857 1.823430
## 0.10 20 2.289300 0.7922572 1.799799
## 0.50 1 2.620985 0.7251648 2.034073
## 0.50 10 2.389468 0.7734132 1.893293
## 0.50 15 2.248817 0.7979988 1.778851
## 0.50 20 2.257951 0.7973906 1.768133
## 0.90 1 2.649162 0.7195330 2.057453
## 0.90 10 2.339031 0.7803865 1.849270
## 0.90 15 2.247236 0.7980673 1.774157
## 0.90 20 2.248629 0.7989807 1.770371
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 15, decay = 0.9 and bag = FALSE.
nnetPred <- predict(nnet, newdata = testData$x)
nnetAccuracy <- postResample(pred = nnetPred, obs = testData$y)
marsGrid <- expand.grid(.degree=1:2,
.nprune=2:20)
mars <- train(x = trainingData$x,
y = trainingData$y,
method = "earth",
tuneGrid = marsGrid,
preProc = c("center", "scale"))
## Loading required package: earth
## Warning: package 'earth' was built under R version 3.6.3
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
## Warning: package 'TeachingDemos' was built under R version 3.6.3
mars
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.416233 0.2187486 3.630918
## 1 3 3.716956 0.4495842 2.981710
## 1 4 2.829383 0.6783492 2.285456
## 1 5 2.500824 0.7495068 2.002628
## 1 6 2.377484 0.7721684 1.897799
## 1 7 1.964112 0.8402449 1.540566
## 1 8 1.849850 0.8590078 1.446960
## 1 9 1.760287 0.8734284 1.379551
## 1 10 1.745434 0.8758361 1.358502
## 1 11 1.721573 0.8784527 1.333126
## 1 12 1.741003 0.8757510 1.341457
## 1 13 1.762562 0.8729306 1.355525
## 1 14 1.779852 0.8704986 1.376884
## 1 15 1.796118 0.8682289 1.386179
## 1 16 1.801970 0.8673854 1.392551
## 1 17 1.801970 0.8673854 1.392551
## 1 18 1.801970 0.8673854 1.392551
## 1 19 1.801970 0.8673854 1.392551
## 1 20 1.801970 0.8673854 1.392551
## 2 2 4.421087 0.2136318 3.622047
## 2 3 3.738888 0.4424544 3.004778
## 2 4 2.878704 0.6647839 2.315845
## 2 5 2.556082 0.7359640 2.039618
## 2 6 2.448842 0.7590378 1.941403
## 2 7 2.076809 0.8217323 1.631062
## 2 8 1.919449 0.8477097 1.506739
## 2 9 1.750995 0.8737507 1.383005
## 2 10 1.589003 0.8961730 1.262151
## 2 11 1.503925 0.9077370 1.182204
## 2 12 1.459184 0.9131975 1.139692
## 2 13 1.465062 0.9130346 1.140011
## 2 14 1.440347 0.9155789 1.123394
## 2 15 1.462849 0.9131866 1.141915
## 2 16 1.477622 0.9110707 1.148694
## 2 17 1.479198 0.9110804 1.152683
## 2 18 1.480900 0.9108744 1.151674
## 2 19 1.475717 0.9116473 1.146616
## 2 20 1.475717 0.9116473 1.146616
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
marsPred <- predict(mars, newdata = testData$x)
marsAccuracy <- postResample(pred = marsPred, obs = testData$y)
Examining the resampling summaries above, MARS has the best accuracy on the training data. The table below summarises accuracy on the test data; based on it, MARS also has the best accuracy of the four models we created.
accuracies <- rbind(marsAccuracy, svmRadialAccuracy, knnAccuracy, nnetAccuracy)
rownames(accuracies) <- c("MARS", "SVM", "KNN", "NeuralNet")
accuracies %>%
  kable() %>%
  kable_styling()
|           |     RMSE |  Rsquared |      MAE |
|-----------|---------:|----------:|---------:|
| MARS      | 1.277999 | 0.9338365 | 1.014707 |
| SVM       | 2.086465 | 0.8236735 | 1.585465 |
| KNN       | 3.204060 | 0.6819919 | 2.568346 |
| NeuralNet | 1.894755 | 0.8561760 | 1.441820 |
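The values in this table come from caret's postResample, which reports RMSE, R-squared (the squared correlation between predictions and observations), and MAE. As a quick sanity check, the RMSE for any model can also be computed directly from its test predictions; a minimal sketch for the MARS model:

# Root mean squared error computed by hand; should match marsAccuracy["RMSE"]
sqrt(mean((as.vector(marsPred) - testData$y)^2))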
As in the previous example, we will train four models and compare their accuracy metrics to see which performs best.
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 3.6.3
data(ChemicalManufacturingProcess)
set.seed(100)
pdata <- preProcess(ChemicalManufacturingProcess[,-1], method = c("center", "scale", "knnImpute", "corr", "nzv"))
chemdata <- predict(pdata, ChemicalManufacturingProcess[,-1])
dp <- createDataPartition(ChemicalManufacturingProcess$Yield, p=0.8, list=FALSE)
xtrain <- chemdata[dp, ]
ytrain <- ChemicalManufacturingProcess$Yield[dp]
xtest <- chemdata[-dp, ]
ytest <- ChemicalManufacturingProcess$Yield[-dp]
set.seed(1)
knn <- train(x = xtrain,
y = ytrain,
method = "knn",
preProc = c("center", "scale", "knnImpute"),
tuneLength = 10)
knn
## k-Nearest Neighbors
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56), nearest neighbor imputation (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.482540 0.4100795 1.183748
## 7 1.485428 0.4049431 1.189266
## 9 1.473237 0.4184013 1.177743
## 11 1.467948 0.4248495 1.180289
## 13 1.479784 0.4165194 1.193682
## 15 1.485334 0.4192007 1.201442
## 17 1.496547 0.4160323 1.211116
## 19 1.510702 0.4102281 1.224902
## 21 1.521165 0.4035792 1.231536
## 23 1.525144 0.4097973 1.235632
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
knnpred <- predict(knn, newdata = xtest)
knnAccuracy <- postResample(pred = knnpred, obs = ytest)
nnetGrid <- expand.grid(.decay=c(0, 0.01, 0.1),
.size=c(1, 5, 10),
.bag=FALSE)
set.seed(1)
nnet <- train(x = xtrain,
y = ytrain,
method = "avNNet",
tuneGrid = nnetGrid,
preProc = c("center", "scale", "knnImpute"),
trace=FALSE,
linout=TRUE,
maxit=500)
nnet
## Model Averaged Neural Network
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56), nearest neighbor imputation (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 1.670154 0.29877873 1.350984
## 0.00 5 2.605996 0.20831484 2.035429
## 0.00 10 10.978989 0.06047725 6.867954
## 0.01 1 1.789480 0.32795087 1.410891
## 0.01 5 2.072122 0.28544188 1.543500
## 0.01 10 2.432073 0.28266858 1.879018
## 0.10 1 2.000773 0.32085344 1.486073
## 0.10 5 2.151048 0.28253172 1.528974
## 0.10 10 1.908922 0.32919978 1.469870
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1, decay = 0 and bag = FALSE.
nnetPred <- predict(nnet, newdata = xtest)
nnetAccuracy <- postResample(pred = nnetPred, obs = ytest)
marsGrid <- expand.grid(.degree=1:2,
.nprune=2:10)
set.seed(1)
mars <- train(x = xtrain,
y = ytrain,
method = "earth",
tuneGrid = marsGrid,
preProc = c("center", "scale", "knnImpute"))
mars
## Multivariate Adaptive Regression Spline
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56), nearest neighbor imputation (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 1.494264 0.4136331 1.159887
## 1 3 1.354276 0.5142937 1.071760
## 1 4 1.324060 0.5374051 1.060051
## 1 5 1.346800 0.5289868 1.081678
## 1 6 1.361229 0.5230820 1.090879
## 1 7 1.426639 0.5056107 1.103635
## 1 8 1.482692 0.4867903 1.144496
## 1 9 1.505364 0.4871104 1.146406
## 1 10 1.495891 0.4928657 1.142761
## 2 2 1.494264 0.4136331 1.159887
## 2 3 1.425216 0.4812740 1.111889
## 2 4 1.375637 0.5187868 1.084589
## 2 5 7.066235 0.5047428 1.860955
## 2 6 1.419019 0.5133965 1.102925
## 2 7 6.739013 0.4769475 1.859735
## 2 8 6.157886 0.4572140 1.803299
## 2 9 5.928832 0.4537672 1.793980
## 2 10 5.678943 0.4475402 1.764904
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
marsPred <- predict(mars, newdata = xtest)
marsAccuracy <- postResample(pred = marsPred, obs = ytest)
set.seed(1)
svm <- train(x = xtrain,
y = ytrain,
method = "svmRadial",
tuneLength=10,
preProc = c("center", "scale", "knnImpute"))
svm
## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56), nearest neighbor imputation (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.511314 0.4351025 1.215913
## 0.50 1.412499 0.4826241 1.137893
## 1.00 1.340844 0.5233655 1.080180
## 2.00 1.300915 0.5490888 1.045006
## 4.00 1.285525 0.5570409 1.026532
## 8.00 1.277907 0.5612855 1.019036
## 16.00 1.277799 0.5613581 1.018883
## 32.00 1.277799 0.5613581 1.018883
## 64.00 1.277799 0.5613581 1.018883
## 128.00 1.277799 0.5613581 1.018883
##
## Tuning parameter 'sigma' was held constant at a value of 0.01375209
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01375209 and C = 16.
svmPred <- predict(svm, newdata = xtest)
svmAccuracy <- postResample(pred = svmPred, obs = ytest)
Examining the model summary above, the SVM model achieves a resampled RMSE of 1.277799 and an Rsquared of 0.5613581, the lowest RMSE among the four models' resampling results. The table below displays the accuracy of all four models on the test set. SVM shows the highest Rsquared but also the highest RMSE; we select the SVM model on the strength of its Rsquared.
accuracies <- rbind(marsAccuracy, svmAccuracy, knnAccuracy, nnetAccuracy)
rownames(accuracies) <- c("MARS", "SVM", "KNN", "NeuralNet")
accuracies %>%
  kable() %>%
  kable_styling()
|           |     RMSE |  Rsquared |       MAE |
|-----------|---------:|----------:|----------:|
| MARS      | 1.165701 | 0.5172382 | 0.9320261 |
| SVM       | 2.086465 | 0.8236735 | 1.5854649 |
| KNN       | 1.243071 | 0.4442773 | 1.0406534 |
| NeuralNet | 1.401899 | 0.2742625 | 1.2220281 |
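Because the table above reflects a single held-out test set, it is also worth comparing the bootstrap resampling distributions of the four models directly. A minimal sketch using caret's resamples() helper (the comparison is only strictly apples-to-apples when the models were fit on identical resampling indices, e.g. via a shared trainControl or, as appears to be the case here, the same seed set before each call to train):

# Collect the 25 bootstrap resamples of each tuned model and summarise them side by side
modelResamples <- resamples(list(KNN = knn, NeuralNet = nnet, MARS = mars, SVM = svm))
summary(modelResamples)   # RMSE, Rsquared, and MAE distributions per model
bwplot(modelResamples)    # box-and-whisker comparison (lattice)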
Based on the variable importance output below, ManufacturingProcess32 and ManufacturingProcess13 dominate the list. Beyond those first two manufacturing process variables, neither variable type (manufacturing process or biological material) clearly dominates the top ten.
Top ten variables from the previous assignment:

ManufacturingProcess32  100.00
ManufacturingProcess13   97.84
BiologicalMaterial06     82.22
ManufacturingProcess17   77.27
BiologicalMaterial03     76.21
ManufacturingProcess36   70.77
BiologicalMaterial02     68.79
ManufacturingProcess09   67.86
BiologicalMaterial12     63.36
ManufacturingProcess06   55.15
Comparing the list above with the output below, we see that the linear and nonlinear models produced the same top ten predictors.
varImp(svm)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 97.84
## BiologicalMaterial06 82.22
## ManufacturingProcess17 77.27
## BiologicalMaterial03 76.21
## ManufacturingProcess36 70.77
## BiologicalMaterial02 68.79
## ManufacturingProcess09 67.86
## BiologicalMaterial12 63.36
## ManufacturingProcess06 55.15
## BiologicalMaterial04 54.31
## ManufacturingProcess33 49.26
## ManufacturingProcess31 47.73
## ManufacturingProcess11 45.72
## BiologicalMaterial11 42.44
## BiologicalMaterial08 41.89
## ManufacturingProcess29 41.28
## BiologicalMaterial01 41.19
## BiologicalMaterial09 39.70
## ManufacturingProcess02 36.69
plot(varImp(svm))
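To make the overlap with the previous assignment explicit, the top ten predictor names from both models could also be compared programmatically. A minimal sketch, assuming the linear model from the previous assignment is still available as an object named plsModel (a hypothetical name):

# Extract the ten highest-ranked predictor names from a caret train object
top10 <- function(model) {
  imp <- varImp(model)$importance
  rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:10]
}
intersect(top10(svm), top10(plsModel))   # plsModel: hypothetical model object from the prior assignment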
Based on the plot below, we see that ManufacturingProcess32 and yield are positively correlated, with a correlation coefficient of 0.6083.
plot(ChemicalManufacturingProcess$ManufacturingProcess32, ChemicalManufacturingProcess$Yield)
abline(lm(ChemicalManufacturingProcess$Yield~ChemicalManufacturingProcess$ManufacturingProcess32),col="red",lwd=1.5)
cor(ChemicalManufacturingProcess$ManufacturingProcess32, ChemicalManufacturingProcess$Yield)
## [1] 0.6083321
Based on the plot below, we see that ManufacturingProcess13 and yield are negatively correlated, with a correlation coefficient of -0.5036797.
plot(ChemicalManufacturingProcess$ManufacturingProcess13, ChemicalManufacturingProcess$Yield)
abline(lm(ChemicalManufacturingProcess$Yield~ChemicalManufacturingProcess$ManufacturingProcess13),col="red",lwd=1.5)
cor(ChemicalManufacturingProcess$ManufacturingProcess13, ChemicalManufacturingProcess$Yield)
## [1] -0.5036797
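The same pairwise check can be extended to several of the top predictors at once; a minimal sketch (use = "complete.obs" is included because the raw predictor columns contain missing values):

topVars <- c("ManufacturingProcess32", "ManufacturingProcess13", "BiologicalMaterial06",
             "ManufacturingProcess17", "BiologicalMaterial03")
# Correlation of each top predictor with Yield, ignoring rows with missing values
sapply(ChemicalManufacturingProcess[, topVars],
       function(v) cor(v, ChemicalManufacturingProcess$Yield, use = "complete.obs"))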