library(ggplot2)
library(lattice)
library(mlbench)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Simulate the Friedman 1 benchmark. mlbench.friedman1() creates a list with
# a vector 'y' and a matrix of predictors 'x'.
set.seed(564)
trainingData <- mlbench.friedman1(n = 200, sd = 1)
# Convert the 'x' data from a matrix to a data frame (or use other methods);
# one reason is that this gives the columns names.
trainingData[["x"]] <- data.frame(trainingData[["x"]])
# Also simulate a large test set to estimate the true error rate with good
# precision.
testData <- mlbench.friedman1(n = 5000, sd = 1)
testData[["x"]] <- data.frame(testData[["x"]])
library(caret)
# Tune a k-nearest-neighbours regression on the simulated training data.
# Predictors are centered and scaled; tuneLength = 10 asks caret to evaluate
# ten candidate values of k. No trControl is given, so caret's default
# resampling scheme is used.
knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10
)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.753983 0.4156708 3.017427
## 7 3.608854 0.4600422 2.939931
## 9 3.543382 0.4901142 2.911466
## 11 3.483787 0.5242548 2.859641
## 13 3.480884 0.5387280 2.852949
## 15 3.481831 0.5516791 2.862117
## 17 3.509560 0.5562605 2.883189
## 19 3.511143 0.5651415 2.879942
## 21 3.532018 0.5713067 2.900776
## 23 3.537411 0.5812718 2.894525
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
# Score the large simulated test set with the tuned k-NN model.
knnPred <- predict(knnModel, newdata = testData$x)
# postResample() returns the test-set performance values for the
# predictions against the observed response.
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.3306275 0.6530948 2.7106980
# Candidate networks: three weight-decay values crossed with 1 to 10 hidden
# units.
nn_grid <- expand.grid(
  .decay = c(0, 0.01, 0.1),
  .size = 1:10
)
# Resample with 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)
set.seed(564)
# Tune the neural network on centered and scaled predictors.
nn_model <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "nnet",
  tuneGrid = nn_grid,
  trControl = ctrl,
  preProcess = c("center", "scale"),
  linout = TRUE,
  trace = FALSE,
  # Weight limit written out for the largest grid entry (10 hidden units):
  # 10 * (number of predictors + 1) + 10 + 1.
  MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
  maxit = 500
)
nn_model
## Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 3.209455 0.5739501 2.494867
## 0.00 2 2.843544 0.6524135 2.305833
## 0.00 3 2.937609 0.6329173 2.336340
## 0.00 4 3.079256 0.6353659 2.464685
## 0.00 5 3.782597 0.5086519 2.970718
## 0.00 6 5.318072 0.4283495 3.422173
## 0.00 7 11.061392 0.3402737 4.975965
## 0.00 8 8.390782 0.3790360 4.699529
## 0.00 9 3.963084 0.4801098 3.098639
## 0.00 10 5.383602 0.4083946 3.600418
## 0.01 1 3.092624 0.6037421 2.442070
## 0.01 2 2.890515 0.6566945 2.272363
## 0.01 3 2.900007 0.6653042 2.243147
## 0.01 4 2.939201 0.6615758 2.400150
## 0.01 5 3.568883 0.5560537 2.785203
## 0.01 6 3.349658 0.6159315 2.598298
## 0.01 7 4.096551 0.4564033 3.326715
## 0.01 8 3.997268 0.5008291 3.177985
## 0.01 9 4.397100 0.4372541 3.569775
## 0.01 10 4.036540 0.5139648 3.349403
## 0.10 1 2.803459 0.6728921 2.212933
## 0.10 2 2.901842 0.6490703 2.288738
## 0.10 3 3.119948 0.6062287 2.537085
## 0.10 4 3.163230 0.6088998 2.585101
## 0.10 5 3.423899 0.5508650 2.709710
## 0.10 6 3.470856 0.5428989 2.756895
## 0.10 7 3.456895 0.6161412 2.787543
## 0.10 8 3.840668 0.5157963 3.042470
## 0.10 9 3.704540 0.5244122 3.061513
## 0.10 10 3.776894 0.5393812 3.005502
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 0.1.
# Test-set predictions and performance for the tuned neural network.
nn_pred <- predict(nn_model, newdata = testData$x)
net_pr <- postResample(pred = nn_pred, obs = testData$y)
net_pr
## RMSE Rsquared MAE
## 2.7741085 0.6909316 2.2307974
# MARS candidates: degree 1 and 2, each with 2 to 38 retained terms.
mars_grid <- expand.grid(
  .degree = 1:2,
  .nprune = 2:38
)
set.seed(564)
# Tune MARS (earth) with cross-validation; no pre-processing is requested.
mars_model <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  tuneGrid = mars_grid,
  trControl = trainControl(method = "cv")
)
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
mars_model
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 3.806467 0.4100565 3.275163
## 1 3 3.565551 0.4829453 2.936463
## 1 4 2.866326 0.6737984 2.362369
## 1 5 2.534644 0.7447474 2.064919
## 1 6 2.440279 0.7597452 1.979661
## 1 7 2.124830 0.8126478 1.685019
## 1 8 2.149145 0.8126021 1.695293
## 1 9 1.953711 0.8422061 1.570651
## 1 10 1.972856 0.8441375 1.583298
## 1 11 1.968220 0.8448106 1.589964
## 1 12 1.955917 0.8474608 1.581000
## 1 13 1.975120 0.8426232 1.601944
## 1 14 1.987977 0.8396165 1.627754
## 1 15 1.995855 0.8392852 1.627905
## 1 16 1.995855 0.8392852 1.627905
## 1 17 1.995855 0.8392852 1.627905
## 1 18 1.995855 0.8392852 1.627905
## 1 19 1.995855 0.8392852 1.627905
## 1 20 1.995855 0.8392852 1.627905
## 1 21 1.995855 0.8392852 1.627905
## 1 22 1.995855 0.8392852 1.627905
## 1 23 1.995855 0.8392852 1.627905
## 1 24 1.995855 0.8392852 1.627905
## 1 25 1.995855 0.8392852 1.627905
## 1 26 1.995855 0.8392852 1.627905
## 1 27 1.995855 0.8392852 1.627905
## 1 28 1.995855 0.8392852 1.627905
## 1 29 1.995855 0.8392852 1.627905
## 1 30 1.995855 0.8392852 1.627905
## 1 31 1.995855 0.8392852 1.627905
## 1 32 1.995855 0.8392852 1.627905
## 1 33 1.995855 0.8392852 1.627905
## 1 34 1.995855 0.8392852 1.627905
## 1 35 1.995855 0.8392852 1.627905
## 1 36 1.995855 0.8392852 1.627905
## 1 37 1.995855 0.8392852 1.627905
## 1 38 1.995855 0.8392852 1.627905
## 2 2 3.798181 0.4126880 3.262618
## 2 3 3.554347 0.4854029 2.928924
## 2 4 2.847354 0.6770950 2.346685
## 2 5 2.505045 0.7497129 2.042724
## 2 6 2.471455 0.7533618 1.965700
## 2 7 2.099971 0.8173685 1.705735
## 2 8 1.838017 0.8687349 1.492371
## 2 9 1.654053 0.8895888 1.329868
## 2 10 1.560603 0.9028938 1.243128
## 2 11 1.405231 0.9206967 1.128728
## 2 12 1.277585 0.9328374 1.041080
## 2 13 1.297521 0.9305610 1.065931
## 2 14 1.297410 0.9312126 1.058846
## 2 15 1.292350 0.9315821 1.067757
## 2 16 1.311652 0.9296514 1.071792
## 2 17 1.319811 0.9283657 1.070536
## 2 18 1.341844 0.9256347 1.087294
## 2 19 1.356038 0.9242607 1.103616
## 2 20 1.356038 0.9242607 1.103616
## 2 21 1.356038 0.9242607 1.103616
## 2 22 1.356038 0.9242607 1.103616
## 2 23 1.356038 0.9242607 1.103616
## 2 24 1.356038 0.9242607 1.103616
## 2 25 1.356038 0.9242607 1.103616
## 2 26 1.356038 0.9242607 1.103616
## 2 27 1.356038 0.9242607 1.103616
## 2 28 1.356038 0.9242607 1.103616
## 2 29 1.356038 0.9242607 1.103616
## 2 30 1.356038 0.9242607 1.103616
## 2 31 1.356038 0.9242607 1.103616
## 2 32 1.356038 0.9242607 1.103616
## 2 33 1.356038 0.9242607 1.103616
## 2 34 1.356038 0.9242607 1.103616
## 2 35 1.356038 0.9242607 1.103616
## 2 36 1.356038 0.9242607 1.103616
## 2 37 1.356038 0.9242607 1.103616
## 2 38 1.356038 0.9242607 1.103616
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 12 and degree = 2.
# Test-set predictions and performance for the tuned MARS model.
mars_pred <- predict(mars_model, newdata = testData$x)
mars_pr <- postResample(pred = mars_pred, obs = testData$y)
mars_pr
## RMSE Rsquared MAE
## 1.2944914 0.9331898 1.0356572
set.seed(564)
# Tune a radial-basis-function SVM on centered and scaled predictors,
# evaluating 14 candidate tuning values under cross-validation.
svm_model <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneLength = 14,
  trControl = trainControl(method = "cv")
)
svm_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.960577 0.6972376 2.341898
## 0.50 2.685990 0.7207369 2.087450
## 1.00 2.482804 0.7526149 1.912838
## 2.00 2.302901 0.7868266 1.786034
## 4.00 2.111209 0.8211320 1.676599
## 8.00 2.025872 0.8331595 1.631230
## 16.00 1.989766 0.8388880 1.601933
## 32.00 1.985725 0.8391965 1.597087
## 64.00 1.985725 0.8391965 1.597087
## 128.00 1.985725 0.8391965 1.597087
## 256.00 1.985725 0.8391965 1.597087
## 512.00 1.985725 0.8391965 1.597087
## 1024.00 1.985725 0.8391965 1.597087
## 2048.00 1.985725 0.8391965 1.597087
##
## Tuning parameter 'sigma' was held constant at a value of 0.0673136
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.0673136 and C = 32.
# Test-set predictions and performance for the tuned SVM.
svm_pred <- predict(svm_model, newdata = testData$x)
svm_pr <- postResample(pred = svm_pred, obs = testData$y)
svm_pr
## RMSE Rsquared MAE
## 2.0892489 0.8262222 1.6522291
The model with the best performance is the MARS model. Its R-squared value is 12.95% higher than that of the second-best model (SVM), while its RMSE is 38.04% lower than the SVM model's RMSE.
The important variables used in prediction by the MARS model do include the X1 through X5 variables. However, X3 has an importance value of 0.00, implying that its contribution to the predictions is rather weak compared with the other four variables.
# Print the variable-importance scores for the tuned MARS model.
varImp(mars_model)
## earth variable importance
##
## Overall
## X4 100.00
## X1 62.69
## X2 44.45
## X5 24.88
## X3 0.00
# AppliedPredictiveModeling is loaded for the ChemicalManufacturingProcess
# data; doParallel is loaded for the parallel backend registered further down.
library(AppliedPredictiveModeling)
# library() stops immediately if the package is missing, whereas require()
# only returns FALSE and lets the script fail later at makeCluster().
library(doParallel)
## Loading required package: doParallel
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# Load the chemical manufacturing data and separate the response (Yield)
# from the predictors; both pieces stay data frames.
data(ChemicalManufacturingProcess)
only_yield <- ChemicalManufacturingProcess["Yield"]
no_yield <- ChemicalManufacturingProcess[
  names(ChemicalManufacturingProcess) != "Yield"
]
# Fill in missing predictor values with caret's k-nearest-neighbour
# imputation.
# NOTE(review): the imputation model is estimated from every row, before the
# train/test split that follows, so test rows influence the values used for
# training rows.
imputed_data <- preProcess(no_yield, method = "knnImpute")
fixed_cmp <- predict(imputed_data, newdata = no_yield)
# Start ONE pool of worker processes and register it as the parallel backend.
# The previous code started two pools but only `cl2` is stopped later, so the
# workers behind `cl` were never shut down.
n_workers <- detectCores()
if (is.na(n_workers)) {
  # detectCores() can return NA when the core count cannot be determined.
  n_workers <- 1L
}
cl2 <- makeCluster(n_workers)
registerDoParallel(cl2)
# Alias kept so any code that still refers to `cl` sees the same pool.
cl <- cl2
# Hold out data for testing: p = 0.7 requests 70% of the rows for training,
# sampled on the Yield values.
set.seed(791)
training <- createDataPartition(only_yield[["Yield"]], p = 0.7, list = FALSE)
# Predictors (already imputed) for each partition.
X_training <- fixed_cmp[training, ]
X_testing <- fixed_cmp[-training, ]
# Matching response values for each partition.
y_training <- only_yield[["Yield"]][training]
y_testing <- only_yield[["Yield"]][-training]
Among the nonlinear models, the SVM model has the best performance on the test set, with an R-squared value of 0.603 and an RMSE of 1.153.
The MARS model did well in cross-validation on the training data, obtaining an R-squared of 0.64 and an RMSE of 1.18; however, it overfit the data and performed poorly on the test data, obtaining an R-squared of 0.37 and an RMSE of 1.49.
# Tune a k-nearest-neighbours regression on the chemical manufacturing
# training set (centered and scaled predictors, ten candidate values of k).
knn_model <- train(
  x = X_training,
  y = y_training,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10
)
# Print the model fitted above. The previous code printed `knnModel`, the
# k-NN model from the simulated Friedman data, so any captured output showing
# 200 samples and 10 predictors belongs to that other model.
knn_model
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.753983 0.4156708 3.017427
## 7 3.608854 0.4600422 2.939931
## 9 3.543382 0.4901142 2.911466
## 11 3.483787 0.5242548 2.859641
## 13 3.480884 0.5387280 2.852949
## 15 3.481831 0.5516791 2.862117
## 17 3.509560 0.5562605 2.883189
## 19 3.511143 0.5651415 2.879942
## 21 3.532018 0.5713067 2.900776
## 23 3.537411 0.5812718 2.894525
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
# Test-set predictions and performance for the k-NN model.
knn_pred <- predict(knn_model, newdata = X_testing)
postResample(pred = knn_pred, obs = y_testing)
## RMSE Rsquared MAE
## 1.2218384 0.5734323 0.9876496
# Candidate networks: three weight-decay values crossed with 1 to 10 hidden
# units.
nn_grid <- expand.grid(
  .decay = c(0, 0.01, 0.1),
  .size = 1:10
)
# Resample with 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)
# Tune the neural network on the chemical manufacturing training set.
nn_model <- train(
  x = X_training,
  y = y_training,
  method = "nnet",
  tuneGrid = nn_grid,
  trControl = ctrl,
  preProcess = c("center", "scale"),
  linout = TRUE,
  trace = FALSE,
  # Weight limit written out for the largest grid entry (10 hidden units):
  # 10 * (number of predictors + 1) + 10 + 1.
  MaxNWts = 10 * (ncol(X_training) + 1) + 10 + 1,
  maxit = 500
)
nn_model
## Neural Network
##
## 124 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 112, 111, 111, 111, 112, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 1.684756 0.20374787 1.388315
## 0.00 2 1.606284 0.26708079 1.315180
## 0.00 3 1.677571 0.34957039 1.321093
## 0.00 4 3.473764 0.15172155 2.891239
## 0.00 5 2.745048 0.20775170 2.164135
## 0.00 6 3.539054 0.09471051 2.690979
## 0.00 7 4.085245 0.24128809 3.127308
## 0.00 8 3.990213 0.17502692 3.030683
## 0.00 9 4.298156 0.24979475 3.322466
## 0.00 10 6.075611 0.11643538 4.488757
## 0.01 1 1.938444 0.32025423 1.485236
## 0.01 2 2.385887 0.22602382 1.889240
## 0.01 3 3.365075 0.15325435 2.647065
## 0.01 4 2.536147 0.25128862 2.033840
## 0.01 5 2.451065 0.25414163 1.934802
## 0.01 6 3.014200 0.22828479 2.351166
## 0.01 7 2.406997 0.30196569 1.831854
## 0.01 8 2.747339 0.23719289 2.195966
## 0.01 9 2.501105 0.19292140 2.023970
## 0.01 10 2.494137 0.22360622 1.919806
## 0.10 1 1.709336 0.36572861 1.326640
## 0.10 2 2.608109 0.28205452 1.872334
## 0.10 3 3.347874 0.17803081 2.372455
## 0.10 4 2.997621 0.17309599 2.231038
## 0.10 5 2.601451 0.27587015 1.904201
## 0.10 6 2.789187 0.20588552 2.001778
## 0.10 7 2.577580 0.33215415 1.828877
## 0.10 8 2.152113 0.42166538 1.712647
## 0.10 9 2.331242 0.36274820 1.805404
## 0.10 10 2.671610 0.20106691 2.117201
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 2 and decay = 0.
# Test-set predictions and performance for the neural network.
nn_pred <- predict(nn_model, newdata = X_testing)
net_pr <- postResample(pred = nn_pred, obs = y_testing)
net_pr
## RMSE Rsquared MAE
## 1.7390740 0.1059804 1.4351988
# MARS candidates: degree 1 and 2, each with 2 to 38 retained terms.
mars_grid <- expand.grid(
  .degree = 1:2,
  .nprune = 2:38
)
# Tune MARS (earth) on the chemical manufacturing training set with
# cross-validation.
mars_model <- train(
  x = X_training,
  y = y_training,
  method = "earth",
  tuneGrid = mars_grid,
  trControl = trainControl(method = "cv")
)
mars_model
## Multivariate Adaptive Regression Spline
##
## 124 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 111, 112, 112, 111, 112, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 1.449790 0.4341932 1.1396395
## 1 3 1.239313 0.5826783 1.0111197
## 1 4 1.238569 0.5411655 1.0054456
## 1 5 1.250439 0.5406423 1.0063980
## 1 6 1.273987 0.5368453 1.0332651
## 1 7 1.292119 0.5454769 1.0438809
## 1 8 1.318968 0.5488476 1.0612416
## 1 9 1.308737 0.5439156 1.0579458
## 1 10 1.337171 0.5308300 1.0741581
## 1 11 1.346088 0.5449427 1.0600918
## 1 12 1.381459 0.5260776 1.0916471
## 1 13 1.394066 0.5132213 1.1199043
## 1 14 1.383826 0.5229565 1.1169163
## 1 15 1.374220 0.5244693 1.1123913
## 1 16 1.376278 0.5237751 1.1150553
## 1 17 1.376278 0.5237751 1.1150553
## 1 18 1.376278 0.5237751 1.1150553
## 1 19 1.376278 0.5237751 1.1150553
## 1 20 1.376278 0.5237751 1.1150553
## 1 21 1.376278 0.5237751 1.1150553
## 1 22 1.376278 0.5237751 1.1150553
## 1 23 1.376278 0.5237751 1.1150553
## 1 24 1.376278 0.5237751 1.1150553
## 1 25 1.376278 0.5237751 1.1150553
## 1 26 1.376278 0.5237751 1.1150553
## 1 27 1.376278 0.5237751 1.1150553
## 1 28 1.376278 0.5237751 1.1150553
## 1 29 1.376278 0.5237751 1.1150553
## 1 30 1.376278 0.5237751 1.1150553
## 1 31 1.376278 0.5237751 1.1150553
## 1 32 1.376278 0.5237751 1.1150553
## 1 33 1.376278 0.5237751 1.1150553
## 1 34 1.376278 0.5237751 1.1150553
## 1 35 1.376278 0.5237751 1.1150553
## 1 36 1.376278 0.5237751 1.1150553
## 1 37 1.376278 0.5237751 1.1150553
## 1 38 1.376278 0.5237751 1.1150553
## 2 2 1.520467 0.3443414 1.2091120
## 2 3 1.232619 0.5779428 0.9581159
## 2 4 1.214196 0.5808961 0.9581951
## 2 5 1.240223 0.5546060 0.9960667
## 2 6 1.284855 0.5468697 1.0165534
## 2 7 1.229154 0.5976781 0.9510279
## 2 8 1.251809 0.5845663 0.9553101
## 2 9 1.251381 0.5913524 0.9706832
## 2 10 1.215842 0.5997618 0.9553386
## 2 11 1.198139 0.6029480 0.9457366
## 2 12 1.212182 0.6095594 0.9605302
## 2 13 1.182401 0.6411275 0.9332565
## 2 14 1.222594 0.6363033 0.9675624
## 2 15 1.286124 0.5881993 1.0096001
## 2 16 1.300251 0.5849793 1.0195918
## 2 17 1.284982 0.5990639 1.0021008
## 2 18 1.721999 0.5963197 1.1658751
## 2 19 1.724031 0.6047739 1.1589711
## 2 20 1.701572 0.6137846 1.1490731
## 2 21 1.699997 0.6135462 1.1457168
## 2 22 1.697741 0.6142816 1.1430734
## 2 23 1.697741 0.6142816 1.1430734
## 2 24 1.693861 0.6139842 1.1346854
## 2 25 1.693861 0.6139842 1.1346854
## 2 26 1.693861 0.6139842 1.1346854
## 2 27 1.693861 0.6139842 1.1346854
## 2 28 1.693861 0.6139842 1.1346854
## 2 29 1.693861 0.6139842 1.1346854
## 2 30 1.693861 0.6139842 1.1346854
## 2 31 1.693861 0.6139842 1.1346854
## 2 32 1.693861 0.6139842 1.1346854
## 2 33 1.693861 0.6139842 1.1346854
## 2 34 1.693861 0.6139842 1.1346854
## 2 35 1.693861 0.6139842 1.1346854
## 2 36 1.693861 0.6139842 1.1346854
## 2 37 1.693861 0.6139842 1.1346854
## 2 38 1.693861 0.6139842 1.1346854
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 13 and degree = 2.
# Test-set predictions and performance for the MARS model.
mars_pred <- predict(mars_model, newdata = X_testing)
mars_pr <- postResample(pred = mars_pred, obs = y_testing)
mars_pr
## RMSE Rsquared MAE
## 1.490242 0.370015 1.162677
# Tune a radial-basis-function SVM on the chemical manufacturing training
# set: centered and scaled predictors, 14 candidate tuning values,
# cross-validation for resampling.
svm_model <- train(
  x = X_training,
  y = y_training,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneLength = 14,
  trControl = trainControl(method = "cv")
)
svm_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 124 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 111, 112, 112, 111, 111, 111, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.417323 0.5006902 1.1584727
## 0.50 1.280866 0.5533098 1.0377884
## 1.00 1.190610 0.6022621 0.9572707
## 2.00 1.127675 0.6452085 0.8997119
## 4.00 1.133381 0.6471679 0.8771222
## 8.00 1.152245 0.6389754 0.8918421
## 16.00 1.164430 0.6304221 0.9006140
## 32.00 1.164430 0.6304221 0.9006140
## 64.00 1.164430 0.6304221 0.9006140
## 128.00 1.164430 0.6304221 0.9006140
## 256.00 1.164430 0.6304221 0.9006140
## 512.00 1.164430 0.6304221 0.9006140
## 1024.00 1.164430 0.6304221 0.9006140
## 2048.00 1.164430 0.6304221 0.9006140
##
## Tuning parameter 'sigma' was held constant at a value of 0.01347071
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01347071 and C = 2.
# Test-set predictions and performance for the SVM.
svm_pred <- predict(svm_model, newdata = X_testing)
svm_pr <- postResample(pred = svm_pred, obs = y_testing)
svm_pr
## RMSE Rsquared MAE
## 1.1527778 0.6030456 0.9161725
Of the top 20 predictors, 13 are manufacturing processes and 7 are biological materials. Biological materials make up 21% of the variables (12 of the 57 features), yet 3 of the 12 biological variables are in the top 10 list of most important features. In those terms you could say that the biological materials dominate the list, because more of them appear on it than their share of the predictors would suggest. However, manufacturing processes hold 7 of the top 10 places, more than twice as many as the biological materials, so in absolute terms they dominate the list.
Compared with the linear models, this list is much more diverse: 30% of the top 10 features are biological materials, whereas there were no biological materials in the top 10 list for linear regression.
# Print the variable-importance scores for the tuned SVM model.
varImp(svm_model)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 75.64
## ManufacturingProcess17 65.71
## BiologicalMaterial03 63.83
## BiologicalMaterial06 62.99
## ManufacturingProcess09 59.90
## ManufacturingProcess36 58.92
## BiologicalMaterial12 55.61
## ManufacturingProcess06 51.30
## ManufacturingProcess29 49.91
## BiologicalMaterial02 49.48
## ManufacturingProcess31 46.40
## BiologicalMaterial11 42.05
## BiologicalMaterial09 40.61
## ManufacturingProcess33 37.45
## BiologicalMaterial08 37.30
## ManufacturingProcess11 36.55
## ManufacturingProcess12 32.49
## ManufacturingProcess26 32.39
## ManufacturingProcess02 31.31
# Plot the ten highest-ranked predictors for the SVM model.
plot(varImp(svm_model), top = 10)
# Shut down the worker processes, then switch foreach back to its sequential
# backend. Without this, the registered parallel backend would still point at
# the stopped workers and any later train() call would fail.
stopCluster(cl2)
registerDoSEQ()
The three most important biological materials all have very similar relationships with yield: all three are positive, and the strength of the relationship looks very similar across them.
The relationships of the manufacturing processes to yield are quite different from those of the biological materials, and they are more varied. Three of the relationships are positive, while two are negative. One relationship is mostly level but looks slightly negative, while manufacturing process 29 is almost a vertical straight line centered around zero with one outlier.
If the goal is to improve yield, then manufacturing processes 17 and 13 should be inspected to find out why they have a negative correlation with yield. Also, all of the plots seem to contain outlying points that have strong positive relationships with yield. Those would be worth investigating to find out exactly why they are different, as they are the points that indicate how to drastically improve yield.
# Keep the ten predictors with the highest SVM importance scores; the
# predictor names are carried as the row names of the importance table.
most_important <- head(
  arrange(varImp(svm_model)$importance, desc(Overall)),
  10
)
# Plot each of those predictors against yield on the training set.
featurePlot(
  x = select(X_training, row.names(most_important)),
  y = y_training
)