library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(mlbench)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## This creates a list with a vector 'y' and a matrix of predictors 'x'.
## We convert the 'x' data from a matrix to a data frame;
## one reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using featurePlot(trainingData$x, trainingData$y) or other methods.
## Also simulate a large test set to estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
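With caret loaded, featurePlot() gives the quick visual check mentioned in the comment above, plotting each simulated predictor against the response (a minimal sketch; the scatter and smoother options are choices made here, not part of the original code):
# Quick look at the simulated predictors versus the response
featurePlot(x = trainingData$x, y = trainingData$y,
            plot = "scatter", type = c("p", "smooth"))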
knnModel <- train(x = trainingData$x, y = trainingData$y, method = "knn", preProc = c("center", "scale"), tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
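For reference, postResample() on a numeric outcome simply summarizes the predictions against the observations; a manual check of the same three statistics (a sketch, not part of the original output) would be:
# Hand-computed versions of the metrics postResample() reports
res <- testData$y - knnPred
c(RMSE = sqrt(mean(res^2)),
  Rsquared = cor(knnPred, testData$y)^2,
  MAE = mean(abs(res)))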
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
marsFit <- earth(trainingData$x, trainingData$y)
marsFit
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
summary(marsFit)
## Call: earth(x=trainingData$x, y=trainingData$y)
##
## coefficients
## (Intercept) 18.451984
## h(0.621722-X1) -11.074396
## h(0.601063-X2) -10.744225
## h(X3-0.281766) 20.607853
## h(0.447442-X3) 17.880232
## h(X3-0.447442) -23.282007
## h(X3-0.636458) 15.150350
## h(0.734892-X4) -10.027487
## h(X4-0.734892) 9.092045
## h(0.850094-X5) -4.723407
## h(X5-0.850094) 10.832932
## h(X6-0.361791) -1.956821
##
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
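The plotmo package, loaded along with earth, can draw the fitted hinge functions so the shape of each selected predictor's effect is visible (a sketch; not part of the original output):
# Plot the modeled effect of each predictor in the MARS fit
plotmo(marsFit)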
Tune the MARS model using external resampling:
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
marsFittune <- train(trainingData$x, trainingData$y, method = "earth", tuneGrid = marsGrid, trControl = trainControl(method = "cv"))
summary(marsFittune)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=2,
## nprune=13)
##
## coefficients
## (Intercept) 22.050690
## h(0.621722-X1) -15.001651
## h(X1-0.621722) 10.878737
## h(0.601063-X2) -18.830135
## h(0.447442-X3) 9.940077
## h(X3-0.606015) 12.999390
## h(0.734892-X4) -9.877554
## h(X4-0.734892) 10.414930
## h(0.850094-X5) -5.604897
## h(X1-0.621722) * h(X2-0.295997) -43.245766
## h(0.649253-X1) * h(0.601063-X2) 26.218297
##
## Selected 11 of 18 terms, and 5 of 10 predictors (nprune=13)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6-unused, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 8 2
## GCV 1.747495 RSS 264.5358 GRSq 0.929051 RSq 0.9457576
The optimal MARS model minimized the RMSE when nprune = 13 and degree = 2.
head(predict(marsFittune, testData$x))
## y
## [1,] 18.539042
## [2,] 21.224619
## [3,] 11.994784
## [4,] 8.087937
## [5,] 11.415761
## [6,] 12.293669
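To compare directly with the kNN results above, the same postResample() check can be applied to the tuned MARS model's test-set predictions (a sketch; marsPred is a name introduced here):
# Test-set performance of the tuned MARS model
marsPred <- predict(marsFittune, newdata = testData$x)
postResample(pred = marsPred, obs = testData$y)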
Look at the importance of the predictors:
varImp(marsFittune)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.33
## X2 48.88
## X5 15.63
## X3 0.00
MARS selected X1 as the most important predictor and X4 as the second most important.
library(e1071)
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
svmRTuned <- train(trainingData$x, trainingData$y, method = "svmRadial", preProc = c("center", "scale"), tuneLength = 14, trControl = trainControl(method = "cv"))
svmRTuned
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.475865 0.7988016 1.992183
## 0.50 2.214578 0.8166670 1.774752
## 1.00 2.046869 0.8398392 1.623489
## 2.00 1.953012 0.8519284 1.552263
## 4.00 1.891830 0.8587427 1.523129
## 8.00 1.875673 0.8604833 1.532307
## 16.00 1.879185 0.8595168 1.542272
## 32.00 1.879054 0.8595352 1.542202
## 64.00 1.879054 0.8595352 1.542202
## 128.00 1.879054 0.8595352 1.542202
## 256.00 1.879054 0.8595352 1.542202
## 512.00 1.879054 0.8595352 1.542202
## 1024.00 1.879054 0.8595352 1.542202
## 2048.00 1.879054 0.8595352 1.542202
##
## Tuning parameter 'sigma' was held constant at a value of 0.06437208
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06437208 and C = 8.
svmRTuned$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 8
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0643720803922843
##
## Number of Support Vectors : 152
##
## Objective Function Value : -70.5956
## Training error : 0.009066
The SVM model used 152 training set data points as support vectors. Of the models fit to the Friedman data, the MARS model fit best, with an R2 of about 0.95.
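That comparison can be double-checked on the large test set with the same postResample() call for the SVM (a sketch; svmPred is a name introduced here):
# Test-set performance of the tuned SVM for comparison
svmPred <- predict(svmRTuned, newdata = testData$x)
postResample(pred = svmPred, obs = testData$y)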
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
set.seed(56)
knnmodel2 <- preProcess(ChemicalManufacturingProcess, "knnImpute")
df <- predict(knnmodel2, ChemicalManufacturingProcess)
df <- df %>%
select_at(vars(-one_of(nearZeroVar(., names = TRUE))))
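A quick sanity check (not part of the original output) that the kNN imputation left no missing values and that the near-zero-variance filter kept most of the columns:
# Confirm the imputation and near-zero-variance filtering
sum(is.na(df))   # should be 0 after knnImpute
dim(df)          # rows and remaining columns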
in_train <- createDataPartition(df$Yield, times = 1, p = 0.8, list = FALSE)
train_df <- df[in_train, ]
test_df <- df[-in_train, ]
knn_model <- train(
  Yield ~ ., data = train_df, method = "knn",
  # Note: center/scale are not train() arguments and do not trigger any
  # preprocessing here (the output below reports "No pre-processing");
  # preProcess = c("center", "scale") is the caret way to standardize.
  center = TRUE,
  scale = TRUE,
  trControl = trainControl("cv", number = 10),
  tuneLength = 25
)
knn_model
## k-Nearest Neighbors
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 129, 131, 130, 130, 130, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.6665611 0.6064388 0.5302668
## 7 0.7087263 0.5402774 0.5678798
## 9 0.7130702 0.5331043 0.5820999
## 11 0.7161287 0.5489623 0.5796126
## 13 0.7126469 0.5579189 0.5727752
## 15 0.7244959 0.5500764 0.5840181
## 17 0.7354577 0.5430152 0.5902397
## 19 0.7423521 0.5386067 0.5961459
## 21 0.7615809 0.5095456 0.6142849
## 23 0.7699950 0.4994279 0.6197973
## 25 0.7795341 0.4884387 0.6250342
## 27 0.7825606 0.4899842 0.6296604
## 29 0.7860966 0.4941641 0.6328056
## 31 0.7939608 0.4865922 0.6400718
## 33 0.8040618 0.4650490 0.6460752
## 35 0.8070461 0.4670781 0.6454113
## 37 0.8127633 0.4634604 0.6505921
## 39 0.8176406 0.4642579 0.6554725
## 41 0.8260090 0.4508073 0.6624975
## 43 0.8236464 0.4664046 0.6614813
## 45 0.8290918 0.4530441 0.6652781
## 47 0.8344262 0.4490136 0.6709419
## 49 0.8424615 0.4305877 0.6796501
## 51 0.8481423 0.4220161 0.6835723
## 53 0.8523472 0.4237087 0.6865206
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
library(dplyr)
library(tidyr)
knn_predictions <- predict(knn_model, test_df)
results <- data.frame(t(postResample(pred = knn_predictions, obs = test_df$Yield))) %>%
mutate("Model"= "KNN")
results
## RMSE Rsquared MAE Model
## 1 0.7242108 0.3334588 0.5853915 KNN
MARS_grid <- expand.grid(.degree = 1:2, .nprune = 2:38)
MARS_model <- train(
Yield ~ ., data = train_df, method = "earth",
tuneGrid = MARS_grid,
# If the following lines are uncommented, it throws an error
#center = TRUE,
#scale = TRUE,
trControl = trainControl("cv", number = 10),
tuneLength = 25
)
MARS_model
## Multivariate Adaptive Regression Spline
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 128, 129, 131, 129, 130, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 0.7840293 0.4910640 0.6195486
## 1 3 0.6440776 0.6478745 0.5256738
## 1 4 0.6019981 0.6842362 0.4905541
## 1 5 0.6204087 0.6604728 0.5057834
## 1 6 0.6206660 0.6540201 0.5125711
## 1 7 0.6270212 0.6627605 0.5226148
## 1 8 0.6666876 0.6214691 0.5517621
## 1 9 0.6746168 0.6287750 0.5515463
## 1 10 0.6696009 0.6301541 0.5543528
## 1 11 0.6762604 0.6309171 0.5575865
## 1 12 0.6679343 0.6390047 0.5542393
## 1 13 0.6831036 0.6274958 0.5619842
## 1 14 0.6800274 0.6242549 0.5611026
## 1 15 0.6734382 0.6238292 0.5582140
## 1 16 0.6734382 0.6238292 0.5582140
## 1 17 0.6734382 0.6238292 0.5582140
## 1 18 0.6734382 0.6238292 0.5582140
## 1 19 0.6734382 0.6238292 0.5582140
## 1 20 0.6734382 0.6238292 0.5582140
## 1 21 0.6734382 0.6238292 0.5582140
## 1 22 0.6734382 0.6238292 0.5582140
## 1 23 0.6734382 0.6238292 0.5582140
## 1 24 0.6734382 0.6238292 0.5582140
## 1 25 0.6734382 0.6238292 0.5582140
## 1 26 0.6734382 0.6238292 0.5582140
## 1 27 0.6734382 0.6238292 0.5582140
## 1 28 0.6734382 0.6238292 0.5582140
## 1 29 0.6734382 0.6238292 0.5582140
## 1 30 0.6734382 0.6238292 0.5582140
## 1 31 0.6734382 0.6238292 0.5582140
## 1 32 0.6734382 0.6238292 0.5582140
## 1 33 0.6734382 0.6238292 0.5582140
## 1 34 0.6734382 0.6238292 0.5582140
## 1 35 0.6734382 0.6238292 0.5582140
## 1 36 0.6734382 0.6238292 0.5582140
## 1 37 0.6734382 0.6238292 0.5582140
## 1 38 0.6734382 0.6238292 0.5582140
## 2 2 0.7840293 0.4910640 0.6195486
## 2 3 0.6747248 0.6159095 0.5529334
## 2 4 0.6471162 0.6397600 0.5243900
## 2 5 0.6292224 0.6378840 0.5209221
## 2 6 0.6159642 0.6543995 0.5139644
## 2 7 0.6177990 0.6634808 0.5104520
## 2 8 0.6150324 0.6645327 0.4993490
## 2 9 0.6465466 0.6220248 0.5246437
## 2 10 0.6629709 0.6106218 0.5385872
## 2 11 0.6246971 0.6625639 0.5112918
## 2 12 1.9004019 0.5374352 0.9110254
## 2 13 1.9726401 0.5082489 0.9404156
## 2 14 1.9796494 0.4939617 0.9354548
## 2 15 2.0489113 0.5126789 0.9716721
## 2 16 2.0820026 0.5027595 0.9877773
## 2 17 2.0814682 0.4984682 0.9855231
## 2 18 2.0888969 0.4951452 0.9909708
## 2 19 2.1084561 0.4927966 0.9980313
## 2 20 2.1091011 0.4889495 0.9969496
## 2 21 2.1184367 0.4853959 1.0009392
## 2 22 1.6662832 0.4851343 0.8722066
## 2 23 1.6625413 0.4914966 0.8721048
## 2 24 1.7625030 0.4922014 0.8985424
## 2 25 1.7609567 0.4902867 0.8953375
## 2 26 1.7609567 0.4902867 0.8953375
## 2 27 1.7557844 0.4871957 0.8869421
## 2 28 1.7557273 0.4870203 0.8865682
## 2 29 1.7610188 0.4813811 0.8917224
## 2 30 1.7609227 0.4811695 0.8907178
## 2 31 1.7609227 0.4811695 0.8907178
## 2 32 1.7609227 0.4811695 0.8907178
## 2 33 1.7609227 0.4811695 0.8907178
## 2 34 1.7609227 0.4811695 0.8907178
## 2 35 1.7609227 0.4811695 0.8907178
## 2 36 1.7609227 0.4811695 0.8907178
## 2 37 1.7609227 0.4811695 0.8907178
## 2 38 1.7609227 0.4811695 0.8907178
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
The optimal MARS model minimized the RMSE when nprune = 4 and degree = 1.
head(predict(MARS_model, test_df))
## y
## [1,] 0.39300400
## [2,] 1.30236945
## [3,] 0.09523008
## [4,] 0.75324247
## [5,] 0.51308349
## [6,] -0.39093407
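For completeness, the MARS test-set metrics can be computed the same way as the kNN ones above (a sketch; MARS_predictions is a name introduced here):
# Test-set performance of the tuned MARS model on the chemical data
MARS_predictions <- as.vector(predict(MARS_model, test_df))
postResample(pred = MARS_predictions, obs = test_df$Yield)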
SVM_model <- train(
  Yield ~ ., data = train_df, method = "svmRadial",
  # As with the kNN fit above, center/scale are passed through and caret applies
  # no preprocessing here (the output below reports "No pre-processing").
  center = TRUE,
  scale = TRUE,
  trControl = trainControl(method = "cv"),
  tuneLength = 25
)
SVM_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 131, 128, 129, 129, 130, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.7440003 0.5285639 0.5993020
## 0.50 0.6778165 0.5910408 0.5410692
## 1.00 0.6403732 0.6261244 0.5013226
## 2.00 0.6121304 0.6557169 0.4788648
## 4.00 0.6067117 0.6649793 0.4746695
## 8.00 0.6109981 0.6578050 0.4847679
## 16.00 0.6105438 0.6566254 0.4843467
## 32.00 0.6105438 0.6566254 0.4843467
## 64.00 0.6105438 0.6566254 0.4843467
## 128.00 0.6105438 0.6566254 0.4843467
## 256.00 0.6105438 0.6566254 0.4843467
## 512.00 0.6105438 0.6566254 0.4843467
## 1024.00 0.6105438 0.6566254 0.4843467
## 2048.00 0.6105438 0.6566254 0.4843467
## 4096.00 0.6105438 0.6566254 0.4843467
## 8192.00 0.6105438 0.6566254 0.4843467
## 16384.00 0.6105438 0.6566254 0.4843467
## 32768.00 0.6105438 0.6566254 0.4843467
## 65536.00 0.6105438 0.6566254 0.4843467
## 131072.00 0.6105438 0.6566254 0.4843467
## 262144.00 0.6105438 0.6566254 0.4843467
## 524288.00 0.6105438 0.6566254 0.4843467
## 1048576.00 0.6105438 0.6566254 0.4843467
## 2097152.00 0.6105438 0.6566254 0.4843467
## 4194304.00 0.6105438 0.6566254 0.4843467
##
## Tuning parameter 'sigma' was held constant at a value of 0.01364992
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01364992 and C = 4.
SVM_predictions <- predict(SVM_model, test_df)
results <- data.frame(t(postResample(pred = SVM_predictions, obs = test_df$Yield))) %>%
mutate("Model"= "SVM")
The SVM model was the best model according to R2.
varImp(SVM_model, 10)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess13 100.00
## ManufacturingProcess32 97.81
## BiologicalMaterial06 86.14
## ManufacturingProcess17 82.38
## BiologicalMaterial03 77.56
## BiologicalMaterial12 77.23
## ManufacturingProcess09 74.35
## ManufacturingProcess36 73.84
## BiologicalMaterial02 63.58
## ManufacturingProcess31 59.17
## ManufacturingProcess06 59.09
## BiologicalMaterial11 52.43
## ManufacturingProcess29 51.35
## ManufacturingProcess12 49.40
## ManufacturingProcess11 49.21
## ManufacturingProcess02 48.41
## BiologicalMaterial08 46.71
## BiologicalMaterial04 45.78
## ManufacturingProcess33 45.16
## BiologicalMaterial09 40.05
The manufacturing process variables dominate the importance rankings, with ManufacturingProcess13 the most important predictor.
library(corrplot)
## corrplot 0.84 loaded
df %>%
  select(c('ManufacturingProcess32', 'ManufacturingProcess13', 'BiologicalMaterial06',
           'ManufacturingProcess17', 'BiologicalMaterial03', 'Yield')) %>%
  cor() %>%
  corrplot(method = 'circle')
ManufacturingProcess32 shows a strong positive correlation with Yield, while ManufacturingProcess13 shows a strong negative correlation. BiologicalMaterial03 and BiologicalMaterial06 are positively correlated with Yield.