Do problems 7.2 and 7.5 in Kuhn and Johnson. There are only two, but they have many parts. Please submit both a link to your RPubs and the .Rmd file.
library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(MASS)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(earth)
## Warning: package 'earth' was built under R version 4.3.2
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.3.2
## Loading required package: plotrix
library(AppliedPredictiveModeling)
set.seed(200)
# Simulate the Friedman (1991) benchmark data: 10 predictors, of which
# only X1-X5 inform the response
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
featurePlot(trainingData$x, trainingData$y)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
knnModel <- train(x = trainingData$x, y = trainingData$y,
                  method = "knn",
                  preProcess = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
marsFit <- earth(trainingData$x, trainingData$y)
marsFit
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
summary(marsFit)
## Call: earth(x=trainingData$x, y=trainingData$y)
##
## coefficients
## (Intercept) 18.451984
## h(0.621722-X1) -11.074396
## h(0.601063-X2) -10.744225
## h(X3-0.281766) 20.607853
## h(0.447442-X3) 17.880232
## h(X3-0.447442) -23.282007
## h(X3-0.636458) 15.150350
## h(0.734892-X4) -10.027487
## h(X4-0.734892) 9.092045
## h(0.850094-X5) -4.723407
## h(X5-0.850094) 10.832932
## h(X6-0.361791) -1.956821
##
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(100)
marsTuned <- train(trainingData$x, trainingData$y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
marsTuned
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.327937 0.2544880 3.600474
## 1 3 3.572450 0.4912720 2.895811
## 1 4 2.596841 0.7183600 2.106341
## 1 5 2.370161 0.7659777 1.918669
## 1 6 2.276141 0.7881481 1.810001
## 1 7 1.766728 0.8751831 1.390215
## 1 8 1.780946 0.8723243 1.401345
## 1 9 1.665091 0.8819775 1.325515
## 1 10 1.663804 0.8821283 1.327657
## 1 11 1.657738 0.8822967 1.331730
## 1 12 1.653784 0.8827903 1.331504
## 1 13 1.648496 0.8823663 1.316407
## 1 14 1.639073 0.8841742 1.312833
## 1 15 1.639073 0.8841742 1.312833
## 1 16 1.639073 0.8841742 1.312833
## 1 17 1.639073 0.8841742 1.312833
## 1 18 1.639073 0.8841742 1.312833
## 1 19 1.639073 0.8841742 1.312833
## 1 20 1.639073 0.8841742 1.312833
## 1 21 1.639073 0.8841742 1.312833
## 1 22 1.639073 0.8841742 1.312833
## 1 23 1.639073 0.8841742 1.312833
## 1 24 1.639073 0.8841742 1.312833
## 1 25 1.639073 0.8841742 1.312833
## 1 26 1.639073 0.8841742 1.312833
## 1 27 1.639073 0.8841742 1.312833
## 1 28 1.639073 0.8841742 1.312833
## 1 29 1.639073 0.8841742 1.312833
## 1 30 1.639073 0.8841742 1.312833
## 1 31 1.639073 0.8841742 1.312833
## 1 32 1.639073 0.8841742 1.312833
## 1 33 1.639073 0.8841742 1.312833
## 1 34 1.639073 0.8841742 1.312833
## 1 35 1.639073 0.8841742 1.312833
## 1 36 1.639073 0.8841742 1.312833
## 1 37 1.639073 0.8841742 1.312833
## 1 38 1.639073 0.8841742 1.312833
## 2 2 4.327937 0.2544880 3.600474
## 2 3 3.572450 0.4912720 2.895811
## 2 4 2.661826 0.7070510 2.173471
## 2 5 2.404015 0.7578971 1.975387
## 2 6 2.243927 0.7914805 1.783072
## 2 7 1.856336 0.8605482 1.435682
## 2 8 1.754607 0.8763186 1.396841
## 2 9 1.603578 0.8938666 1.261361
## 2 10 1.492421 0.9084998 1.168700
## 2 11 1.317350 0.9292504 1.033926
## 2 12 1.304327 0.9320133 1.019108
## 2 13 1.277510 0.9323681 1.002927
## 2 14 1.269626 0.9350024 1.003346
## 2 15 1.266217 0.9359400 1.013893
## 2 16 1.268470 0.9354868 1.011414
## 2 17 1.268470 0.9354868 1.011414
## 2 18 1.268470 0.9354868 1.011414
## 2 19 1.268470 0.9354868 1.011414
## 2 20 1.268470 0.9354868 1.011414
## 2 21 1.268470 0.9354868 1.011414
## 2 22 1.268470 0.9354868 1.011414
## 2 23 1.268470 0.9354868 1.011414
## 2 24 1.268470 0.9354868 1.011414
## 2 25 1.268470 0.9354868 1.011414
## 2 26 1.268470 0.9354868 1.011414
## 2 27 1.268470 0.9354868 1.011414
## 2 28 1.268470 0.9354868 1.011414
## 2 29 1.268470 0.9354868 1.011414
## 2 30 1.268470 0.9354868 1.011414
## 2 31 1.268470 0.9354868 1.011414
## 2 32 1.268470 0.9354868 1.011414
## 2 33 1.268470 0.9354868 1.011414
## 2 34 1.268470 0.9354868 1.011414
## 2 35 1.268470 0.9354868 1.011414
## 2 36 1.268470 0.9354868 1.011414
## 2 37 1.268470 0.9354868 1.011414
## 2 38 1.268470 0.9354868 1.011414
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 15 and degree = 2.
marsPred <- predict(marsTuned, newdata = testData$x)
postResample(pred = marsPred, obs = testData$y)
## RMSE Rsquared MAE
## 1.1589948 0.9460418 0.9250230
varImp(marsTuned)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.24
## X2 48.73
## X5 15.52
## X3 0.00
MARS does select the informative predictors: the importance ranking lists X1, X4, X2, and X5 (with X3 last), and none of the noise predictors X6-X10 appear. The tuned MARS model also beats kNN decisively on the test set (RMSE 1.16 vs. 3.20; R^2 0.95 vs. 0.68).
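As a quick visual check (a minimal sketch; caret supplies a lattice plot method for varImp objects):
# Plot the MARS importance scores; only the informative predictors appear
plot(varImp(marsTuned))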
data("ChemicalManufacturingProcess")
dim(ChemicalManufacturingProcess)
## [1] 176 58
set.seed(123)
# Impute missing values via k-nearest neighbors (knnImpute also
# triggers centering and scaling of the predictors)
chem_impute <- preProcess(ChemicalManufacturingProcess, method = c("center", "knnImpute"))
df <- predict(chem_impute, ChemicalManufacturingProcess)
dfx <- df |> select(-Yield)  # predictors
dfy <- df |> select(Yield)   # response
set.seed(123)
chem_train <- createDataPartition(dfy$Yield, p = 0.80, list = FALSE)
x_train <- dfx[chem_train, ]
x_test <- dfx[-chem_train, ]
y_train <- dfy[chem_train, ]
y_test <- dfy[-chem_train, ]
# Neural networks struggle with highly correlated inputs, so drop
# predictors with pairwise correlations above 0.75 before tuning nnet
tooHigh <- findCorrelation(cor(x_train), cutoff = 0.75)
x_train_nn <- x_train[, -tooHigh]
x_test_nn <- x_test[, -tooHigh]
nnetGrid <- expand.grid(.decay = c(0, 0.01, 0.1),
                        .size = 1:10)
set.seed(100)
# tune over weight decay and hidden-layer size with 10-fold CV
nnetTune <- train(x_train_nn, y_train,
method = "nnet",
tuneGrid = nnetGrid,
trControl = trainControl(method = "cv"),
preProc = c("center", "scale"),
linout = TRUE,
trace = FALSE,
MaxNWts = 10 * (ncol(x_train_nn) + 1) + 10 + 1,
maxit = 500)
nnetTune
## Neural Network
##
## 144 samples
## 36 predictor
##
## Pre-processing: centered (36), scaled (36)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 130, 130, 130, 130, 130, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 0.9669425 0.3189283 0.7577877
## 0.00 2 1.0602575 0.2913234 0.8799555
## 0.00 3 0.9657647 0.3758025 0.7915117
## 0.00 4 1.1384314 0.3521874 0.9390868
## 0.00 5 1.1362919 0.3552987 0.9156410
## 0.00 6 1.3571003 0.2921928 1.1042992
## 0.00 7 1.0658166 0.3221804 0.8459426
## 0.00 8 1.0590685 0.3429788 0.8595619
## 0.00 9 0.9594264 0.4128916 0.8046341
## 0.00 10 0.9615087 0.3637568 0.7658226
## 0.01 1 0.8414877 0.4135614 0.6786846
## 0.01 2 1.0100972 0.3642232 0.7962057
## 0.01 3 1.1430451 0.2888640 0.8966275
## 0.01 4 1.0811311 0.3772015 0.8294977
## 0.01 5 0.9598964 0.3894508 0.7509202
## 0.01 6 0.9267037 0.4101971 0.7411379
## 0.01 7 0.9447165 0.3849959 0.7150464
## 0.01 8 0.8541876 0.4945492 0.6715633
## 0.01 9 0.8721534 0.4531012 0.6760305
## 0.01 10 0.7976715 0.4695331 0.6328029
## 0.10 1 0.7468941 0.5008336 0.6095253
## 0.10 2 0.9492320 0.3720083 0.7447810
## 0.10 3 0.8726270 0.4127535 0.6940696
## 0.10 4 0.9274221 0.3913424 0.7395732
## 0.10 5 0.8467550 0.4678982 0.6867256
## 0.10 6 0.8335888 0.4541690 0.6699742
## 0.10 7 0.8400382 0.4236842 0.6681753
## 0.10 8 0.8121382 0.4813507 0.6565713
## 0.10 9 0.8171328 0.4866868 0.6497487
## 0.10 10 0.8021324 0.4938702 0.6283507
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 0.1.
set.seed(100)
knnModel <- train(x_train, y_train,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.7572933 0.4391988 0.5941190
## 7 0.7670484 0.4238217 0.6077865
## 9 0.7753730 0.4143294 0.6161508
## 11 0.7858045 0.4004661 0.6285027
## 13 0.7848028 0.4067367 0.6253419
## 15 0.7851887 0.4077562 0.6278887
## 17 0.7886832 0.4022573 0.6326840
## 19 0.7895189 0.4023863 0.6304479
## 21 0.7868265 0.4111895 0.6248467
## 23 0.7912004 0.4109584 0.6275228
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(100)
# tune the interaction degree and number of retained terms with 10-fold CV
marsTune <- train(x_train, y_train,
method = "earth",
tuneGrid = marsGrid,
trControl = trainControl(method = "cv"))
marsTune
## Multivariate Adaptive Regression Spline
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 130, 130, 130, 130, 130, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 0.7665502 0.4560018 0.6049068
## 1 3 0.6595082 0.5636328 0.5312946
## 1 4 0.6469939 0.5797016 0.5232003
## 1 5 0.6859773 0.5475202 0.5517808
## 1 6 0.6870737 0.5484861 0.5502748
## 1 7 0.6751229 0.5666619 0.5452871
## 1 8 0.6887718 0.5613131 0.5587141
## 1 9 0.6775788 0.5781242 0.5555918
## 1 10 0.7008878 0.5610964 0.5686670
## 1 11 0.6993383 0.5620506 0.5637977
## 1 12 0.7029419 0.5602218 0.5578991
## 1 13 0.7063407 0.5571220 0.5674656
## 1 14 0.7025333 0.5522722 0.5708651
## 1 15 0.7099672 0.5519371 0.5768780
## 1 16 0.7128965 0.5461785 0.5804514
## 1 17 0.7129084 0.5460757 0.5813701
## 1 18 0.7213367 0.5389852 0.5913144
## 1 19 0.7226246 0.5374242 0.5922039
## 1 20 0.7218098 0.5397264 0.5918110
## 1 21 0.7218098 0.5397264 0.5918110
## 1 22 0.7218098 0.5397264 0.5918110
## 1 23 0.7218098 0.5397264 0.5918110
## 1 24 0.7218098 0.5397264 0.5918110
## 1 25 0.7218098 0.5397264 0.5918110
## 1 26 0.7218098 0.5397264 0.5918110
## 1 27 0.7218098 0.5397264 0.5918110
## 1 28 0.7218098 0.5397264 0.5918110
## 1 29 0.7218098 0.5397264 0.5918110
## 1 30 0.7218098 0.5397264 0.5918110
## 1 31 0.7218098 0.5397264 0.5918110
## 1 32 0.7218098 0.5397264 0.5918110
## 1 33 0.7218098 0.5397264 0.5918110
## 1 34 0.7218098 0.5397264 0.5918110
## 1 35 0.7218098 0.5397264 0.5918110
## 1 36 0.7218098 0.5397264 0.5918110
## 1 37 0.7218098 0.5397264 0.5918110
## 1 38 0.7218098 0.5397264 0.5918110
## 2 2 0.7665502 0.4560018 0.6049068
## 2 3 0.6659027 0.5722474 0.5262879
## 2 4 0.7531164 0.4382398 0.6043627
## 2 5 0.7413390 0.4798473 0.5948203
## 2 6 0.7963805 0.4423379 0.6336873
## 2 7 0.7573740 0.4856350 0.6059476
## 2 8 0.7428328 0.5219089 0.5831882
## 2 9 0.7468738 0.5128219 0.5896118
## 2 10 0.7320803 0.5215626 0.5876731
## 2 11 0.7173064 0.5335692 0.5727659
## 2 12 0.7542704 0.5011046 0.5943471
## 2 13 0.7190265 0.5221777 0.5779314
## 2 14 0.9755283 0.4767576 0.6687385
## 2 15 1.2456002 0.4433392 0.7413483
## 2 16 1.2768103 0.4412041 0.7438363
## 2 17 1.2734632 0.4460104 0.7476065
## 2 18 1.2638965 0.4541833 0.7377004
## 2 19 1.2577137 0.4590201 0.7387957
## 2 20 1.3124681 0.4433076 0.7722179
## 2 21 1.3172513 0.4464099 0.7694537
## 2 22 1.3219836 0.4421225 0.7743332
## 2 23 1.3243505 0.4411276 0.7723694
## 2 24 1.3280243 0.4372356 0.7741620
## 2 25 1.3243600 0.4412740 0.7727479
## 2 26 1.3243600 0.4412740 0.7727479
## 2 27 1.3243600 0.4412740 0.7727479
## 2 28 1.3243600 0.4412740 0.7727479
## 2 29 1.3243600 0.4412740 0.7727479
## 2 30 1.3243600 0.4412740 0.7727479
## 2 31 1.3243600 0.4412740 0.7727479
## 2 32 1.3243600 0.4412740 0.7727479
## 2 33 1.3243600 0.4412740 0.7727479
## 2 34 1.3243600 0.4412740 0.7727479
## 2 35 1.3243600 0.4412740 0.7727479
## 2 36 1.3243600 0.4412740 0.7727479
## 2 37 1.3243600 0.4412740 0.7727479
## 2 38 1.3243600 0.4412740 0.7727479
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
set.seed(100)
svmRTuned <- train(x_train, y_train,
method = "svmRadial",
preProcess = c("center","scale"),
tuneLength = 14,
trControl = trainControl(method = "cv"))
svmRTuned$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 8
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0185661400107102
##
## Number of Support Vectors : 125
##
## Objective Function Value : -66.2017
## Training error : 0.009129
nnet <- predict(nnetTune, x_test)
mars <- predict(marsTune, x_test)
knn <- predict(knnModel, x_test)
svm <- predict(svmRTuned, x_test)
postResample(pred = nnet, obs = y_test)
## RMSE Rsquared MAE
## 0.9244674 0.2364360 0.8026363
postResample(pred = mars, obs = y_test)
## RMSE Rsquared MAE
## 0.5957603 0.6604107 0.4979724
postResample(pred = knn, obs = y_test)
## RMSE Rsquared MAE
## 0.7585275 0.4284176 0.6270432
postResample(pred = svm, obs = y_test)
## RMSE Rsquared MAE
## 0.6697466 0.5525027 0.5654417
The best-performing model, judged by test-set RMSE and R^2, is the MARS model: it has the lowest RMSE (0.596) and the highest R^2 (0.660).
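To make the ranking explicit, the four sets of test metrics can be stacked into one table (a minimal sketch that reuses the prediction vectors computed above):
# Collect test-set metrics for all four models and sort by RMSE
results <- rbind(
  NNet = postResample(pred = nnet, obs = y_test),
  MARS = postResample(pred = mars, obs = y_test),
  kNN = postResample(pred = knn, obs = y_test),
  SVM = postResample(pred = svm, obs = y_test)
)
results[order(results[, "RMSE"]), ]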
The most important predictors in the MARS model (identified above as the optimal nonlinear regression model) are ManufacturingProcess32, ManufacturingProcess17, and ManufacturingProcess13. No biological variables appear in the list; the manufacturing process variables dominate.
varImp(marsTune)
## earth variable importance
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess17 38.54
## ManufacturingProcess13 0.00
The biological variables show no importance here; the manufacturing process variables are the dominant predictors. The optimal linear model listed ManufacturingProcess32, ManufacturingProcess13, and BiologicalMaterial06 as its most important variables, whereas the MARS model's top predictors are ManufacturingProcess32, ManufacturingProcess17, and ManufacturingProcess13, with no biological material in the list, as noted above.
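The same ranking can be shown graphically (a minimal sketch using caret's plot method for varImp objects):
# Dotplot of the MARS importance scores
plot(varImp(marsTune))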
correlation <- cor(select(df, ManufacturingProcess32, ManufacturingProcess13,
                          ManufacturingProcess17, Yield))
corrplot::corrplot(correlation, method = "square", type = "upper")
ManufacturingProcess32 shows a strong positive correlation with Yield, while ManufacturingProcess13 and ManufacturingProcess17 show progressively stronger negative correlations. Taken together with the importance rankings, this indicates that the biological variables have little influence on Yield; the top three processes (32, 13, and 17) are the most influential predictors.
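The direction of each relationship can also be checked with scatter plots of the three processes against Yield (a minimal sketch with caret's featurePlot; column names as in the imputed data frame df):
# Scatter plots of the top three processes against Yield
featurePlot(x = df[, c("ManufacturingProcess32", "ManufacturingProcess13",
                       "ManufacturingProcess17")],
            y = df$Yield,
            plot = "scatter")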