# Load libraries
library(caret)
library(tidyverse)
library(earth)
library(kernlab)
library(nnet)
library(mlbench)
library(AppliedPredictiveModeling)
Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to generate data:
\[ y = 10 \sin(\pi x_1 x_2) + 20 (x_3 - 0.5)^2 + 10 x_4 + 5 x_5 + N(0, \sigma) \]
where the x values are random variables uniformly distributed on [0, 1] (the simulation also creates five additional non-informative predictors). The mlbench package contains a function, mlbench.friedman1, that simulates these data:
set.seed(200)
trainingData <- mlbench.friedman1(200,sd = 1)
## We convert the 'x' data from a matrix to a data frame.
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
corr <- cor(cbind(trainingData$x, tibble(y = trainingData$y)))
corrplot::corrplot(corr)
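For reference, the deterministic part of the generating equation can be written directly in R and used to sanity-check the simulated values (a minimal sketch; the N(0, sigma) noise term is omitted, so the residuals below should behave like standard normal noise at sd = 1):
# Deterministic part of Friedman's benchmark equation (noise omitted)
friedman1_mean <- function(x) {
  10 * sin(pi * x[, 1] * x[, 2]) + 20 * (x[, 3] - 0.5)^2 +
    10 * x[, 4] + 5 * x[, 5]
}
# Residuals against the simulated response should look like N(0, 1) noise
summary(trainingData$y - friedman1_mean(trainingData$x))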
set.seed(201)
knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.663055 0.4845252 2.973509
## 7 3.544987 0.5222393 2.888308
## 9 3.444614 0.5597118 2.813217
## 11 3.447524 0.5703913 2.808627
## 13 3.408807 0.5942498 2.780438
## 15 3.395975 0.6124595 2.773349
## 17 3.394028 0.6239658 2.763966
## 19 3.389454 0.6354978 2.759273
## 21 3.402952 0.6423762 2.778907
## 23 3.412867 0.6483598 2.792254
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 19.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
knn_metrics <- postResample(pred = knnPred, obs = testData$y)
knn_metrics
## RMSE Rsquared MAE
## 3.2286834 0.6871735 2.5939727
The best model chosen from the training data used 19 nearest neighbors. It produced a resampling RMSE of 3.389 and an R-squared of 0.6355. Predicting on the test data produced an R-squared of 0.6872, slightly better than what was achieved during resampling.
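As a quick visual check of the tuning profile, caret's plot method for train objects shows the resampled RMSE across the candidate values of k:
# RMSE (bootstrap estimate) versus the number of neighbors
plot(knnModel)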
# Fit a basic neural network with parameters for size and decay set
set.seed(202)
nnetFit <- nnet(trainingData$x, trainingData$y,
                # Number of hidden units
                size = 5,
                # Weight decay value
                decay = 0.01,
                linout = TRUE,
                # Reduce the amount of printed output
                trace = FALSE,
                # Expand the number of iterations to find
                # parameter estimates...
                maxit = 500,
                # ...and the number of parameters used by the model
                MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)
The 10-5-1 label describes the fitted network: 10 input predictors, 5 hidden units, and 1 output, for which 61 weights were estimated.
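The weight count can be verified directly: each of the 5 hidden units receives the 10 inputs plus a bias term, and the single output unit receives the 5 hidden units plus a bias:
# 5 hidden units * (10 inputs + 1 bias) + 1 output * (5 hidden units + 1 bias)
5 * (10 + 1) + 1 * (5 + 1)
## [1] 61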
summary(nnetFit)
## a 10-5-1 network with 61 weights
## options were - linear output units decay=0.01
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1 i9->h1
## 30.73 -14.22 -12.29 2.79 -10.21 -12.97 4.77 10.51 -3.66 -8.23
## i10->h1
## -0.58
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2 i9->h2
## -15.90 16.38 4.89 0.47 9.05 -3.57 3.89 -4.06 18.41 -3.45
## i10->h2
## 2.80
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3 i9->h3
## -3.87 1.58 0.57 -0.04 3.34 3.45 -2.25 1.75 -0.41 1.78
## i10->h3
## 1.25
## b->h4 i1->h4 i2->h4 i3->h4 i4->h4 i5->h4 i6->h4 i7->h4 i8->h4 i9->h4
## -2.16 2.88 22.46 0.88 3.73 -1.16 0.10 -2.25 -8.51 -4.94
## i10->h4
## -2.35
## b->h5 i1->h5 i2->h5 i3->h5 i4->h5 i5->h5 i6->h5 i7->h5 i8->h5 i9->h5
## -5.05 21.53 -17.50 -1.32 16.22 2.30 17.74 10.03 1.12 -12.62
## i10->h5
## -6.55
## b->o h1->o h2->o h3->o h4->o h5->o
## 1.27 -3.57 3.74 9.86 4.51 3.73
varImp(nnetFit)
## Overall
## X1 15.558532
## X2 17.611755
## X3 1.485776
## X4 13.867493
## X5 9.410363
## X6 8.442635
## X7 8.754029
## X8 10.570163
## X9 9.615566
## X10 4.683688
nnetPred <- predict(nnetFit, testData$x)
## performance values
nn_metrics <- postResample(pred = nnetPred, obs = testData$y)
nn_metrics
## RMSE Rsquared MAE
## 3.0020014 0.6499896 2.3696503
Another neural network was fitted using train(), with cross-validation and a grid search to identify the best values for decay and size.
set.seed(100)
nnetGrid <- expand.grid(decay = c(0, 0.0001, 0.001, 0.01, 0.1),
                        size = 1:5)
ctrl <- trainControl(method = "cv", number = 10)
nnetFit2 <- train(trainingData$x, trainingData$y,
                  method = "nnet",
                  # Automatically standardize data prior to modeling
                  # and prediction
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  metric = "Rsquared",
                  # Use cross-validation with the parameters set in ctrl
                  trControl = ctrl,
                  tuneGrid = nnetGrid,
                  MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
                  maxit = 500)
nnetFit2
## Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0e+00 1 2.607237 0.7127469 2.096428
## 0e+00 2 2.548956 0.7335340 2.035750
## 0e+00 3 2.503043 0.7579204 2.029461
## 0e+00 4 2.399673 0.7600243 1.924916
## 0e+00 5 2.684662 0.7126869 2.099095
## 1e-04 1 2.597490 0.7144538 2.107682
## 1e-04 2 2.603926 0.7200340 2.069772
## 1e-04 3 2.308446 0.7814813 1.830308
## 1e-04 4 2.259230 0.7854771 1.789155
## 1e-04 5 3.609989 0.6738037 2.588714
## 1e-03 1 2.388277 0.7597428 1.889526
## 1e-03 2 2.549721 0.7331913 2.048162
## 1e-03 3 2.119573 0.8095134 1.709940
## 1e-03 4 2.311796 0.7975198 1.857224
## 1e-03 5 2.528018 0.7591689 2.024323
## 1e-02 1 2.385599 0.7602740 1.887921
## 1e-02 2 2.616377 0.7253723 2.094091
## 1e-02 3 2.505821 0.7506812 1.998530
## 1e-02 4 2.628755 0.7287019 2.110121
## 1e-02 5 2.496664 0.7615431 1.981277
## 1e-01 1 2.393940 0.7596468 1.894140
## 1e-01 2 2.644669 0.7149218 2.110677
## 1e-01 3 2.434552 0.7643552 1.930961
## 1e-01 4 2.477422 0.7523673 1.950687
## 1e-01 5 2.504647 0.7535966 1.981100
##
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were size = 3 and decay = 0.001.
summary(nnetFit2)
## a 10-3-1 network with 37 weights
## options were - linear output units decay=0.001
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1 i9->h1
## 0.88 0.61 0.60 -0.56 0.59 0.21 -0.15 -0.01 0.03 -0.07
## i10->h1
## -0.08
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2 i9->h2
## -35.34 13.15 13.36 -13.87 21.26 11.99 3.71 -0.78 -0.80 6.63
## i10->h2
## -2.31
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3 i9->h3
## -2.62 -0.16 -0.31 2.02 0.06 0.26 0.04 0.06 -0.04 0.08
## i10->h3
## 0.17
## b->o h1->o h2->o h3->o
## -1.71 19.76 3.60 13.81
# Use the final best model
nnetPred2 <- predict(nnetFit2, testData$x)
## performance values
nn2_metrics <- postResample(pred = nnetPred2, obs = testData$y)
nn2_metrics
## RMSE Rsquared MAE
## 2.3768141 0.7784505 1.7883484
The neural network fitted via grid search and cross-validation identified a model with 10 predictors, 3 hidden units, and 37 weights. The model did not perform quite as well on the test data: R-squared fell from 0.8095 on the resampled training data to 0.7785 on the test set, which may indicate some overfitting. Even so, this cross-validated network performed better than the first neural network.
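One way to make this comparison explicit is caret's getTrainPerf(), which returns the resampled performance of the selected model so it can be placed beside the test-set metrics:
# Resampled performance of the chosen model vs. test-set performance
getTrainPerf(nnetFit2)
nn2_metrics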
set.seed(203)
# Create a parameter tuning grid
marsGrid <- expand.grid(degree = 1:4, nprune = seq(5, 50, by = 5))
marsFit <- train(trainingData$x, trainingData$y,
                 method = "earth",
                 metric = "Rsquared",
                 trControl = ctrl,
                 tuneGrid = marsGrid)
summary(marsFit)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=3,
## nprune=15)
##
## coefficients
## (Intercept) 21.218097
## h(0.621722-X1) -15.691434
## h(X1-0.621722) 9.066076
## h(0.601063-X2) -18.220627
## h(X2-0.601063) 10.428191
## h(X3-0.281766) 3.419830
## h(0.447442-X3) 12.201226
## h(X3-0.606015) 7.875117
## h(0.734892-X4) -10.033343
## h(X4-0.734892) 9.894681
## h(0.850094-X5) -5.385087
## h(0.218266-X1) * h(X2-0.601063) -58.628609
## h(X1-0.218266) * h(X2-0.601063) -29.226178
## h(X1-0.621722) * h(X2-0.295997) -25.768101
## h(0.649253-X1) * h(0.601063-X2) 26.940533
##
## Selected 15 of 18 terms, and 5 of 10 predictors (nprune=15)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6-unused, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 10 4
## GCV 1.618197 RSS 217.6151 GRSq 0.9343005 RSq 0.9553786
marsFit
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 5 2.374157 0.7744430 1.870326
## 1 10 1.655024 0.8877220 1.281697
## 1 15 1.633327 0.8904661 1.261820
## 1 20 1.633327 0.8904661 1.261820
## 1 25 1.633327 0.8904661 1.261820
## 1 30 1.633327 0.8904661 1.261820
## 1 35 1.633327 0.8904661 1.261820
## 1 40 1.633327 0.8904661 1.261820
## 1 45 1.633327 0.8904661 1.261820
## 1 50 1.633327 0.8904661 1.261820
## 2 5 2.413660 0.7682677 1.907182
## 2 10 1.395304 0.9224492 1.124020
## 2 15 1.240538 0.9400268 1.002271
## 2 20 1.255784 0.9380147 1.008117
## 2 25 1.255784 0.9380147 1.008117
## 2 30 1.255784 0.9380147 1.008117
## 2 35 1.255784 0.9380147 1.008117
## 2 40 1.255784 0.9380147 1.008117
## 2 45 1.255784 0.9380147 1.008117
## 2 50 1.255784 0.9380147 1.008117
## 3 5 2.413660 0.7682677 1.907182
## 3 10 1.394865 0.9227581 1.113340
## 3 15 1.234752 0.9411538 1.004014
## 3 20 1.248352 0.9393078 1.007118
## 3 25 1.248352 0.9393078 1.007118
## 3 30 1.248352 0.9393078 1.007118
## 3 35 1.248352 0.9393078 1.007118
## 3 40 1.248352 0.9393078 1.007118
## 3 45 1.248352 0.9393078 1.007118
## 3 50 1.248352 0.9393078 1.007118
## 4 5 2.413660 0.7682677 1.907182
## 4 10 1.394865 0.9227581 1.113340
## 4 15 1.234752 0.9411538 1.004014
## 4 20 1.248352 0.9393078 1.007118
## 4 25 1.248352 0.9393078 1.007118
## 4 30 1.248352 0.9393078 1.007118
## 4 35 1.248352 0.9393078 1.007118
## 4 40 1.248352 0.9393078 1.007118
## 4 45 1.248352 0.9393078 1.007118
## 4 50 1.248352 0.9393078 1.007118
##
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were nprune = 15 and degree = 3.
varImp(marsFit)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.24
## X2 48.73
## X5 15.52
## X3 0.00
ggplot(marsFit)
# Use the final best model
marsPred <- predict(marsFit, testData$x)
## performance values
mars_metrics <- postResample(pred = marsPred, obs = testData$y)
mars_metrics
## RMSE Rsquared MAE
## 1.1589948 0.9460418 0.9250230
A MARS model was fit using grid search and cross-validation, with R-squared being the metric used to select the best model. The final model selected 15 of 18 terms and 5 of 10 predictors (nprune = 15), with X1, X4, X2, X5, and X3 chosen in that order of importance. The fitted model explained 94.12% of the variability in the response during resampling and did slightly better on the test data, with an R-squared of 0.9460.
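As a further diagnostic, a minimal predicted-versus-observed sketch on the test set is worth checking; points falling near the dashed diagonal indicate accurate predictions:
# Predicted vs. observed Friedman responses on the test set
mars_plot_df <- tibble(observed = testData$y,
                       predicted = as.vector(marsPred))
ggplot(mars_plot_df, aes(x = observed, y = predicted)) +
  geom_point(alpha = 0.3) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed")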
set.seed(204)
svmFit <- train(trainingData$x, trainingData$y,
                method = "svmRadial",
                preProc = c("center", "scale"),
                tuneLength = 14,
                trControl = ctrl)
# View output
svmFit
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.484463 0.7995413 1.986294
## 0.50 2.212816 0.8203388 1.767826
## 1.00 2.031772 0.8405654 1.603824
## 2.00 1.921412 0.8549609 1.503827
## 4.00 1.819381 0.8688953 1.411205
## 8.00 1.782927 0.8743623 1.377246
## 16.00 1.786039 0.8744254 1.386926
## 32.00 1.786039 0.8744254 1.386926
## 64.00 1.786039 0.8744254 1.386926
## 128.00 1.786039 0.8744254 1.386926
## 256.00 1.786039 0.8744254 1.386926
## 512.00 1.786039 0.8744254 1.386926
## 1024.00 1.786039 0.8744254 1.386926
## 2048.00 1.786039 0.8744254 1.386926
##
## Tuning parameter 'sigma' was held constant at a value of 0.06127708
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06127708 and C = 8.
# View final model
svmFit$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 8
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0612770781663944
##
## Number of Support Vectors : 154
##
## Objective Function Value : -75.3495
## Training error : 0.009332
# Use the final best model
svmPred <- predict(svmFit, testData$x)
## performance values
svm_metrics <- postResample(pred = svmPred, obs = testData$y)
svm_metrics
## RMSE Rsquared MAE
## 2.0475100 0.8300953 1.5531906
models <- c("KNN", "NEURAL_NETWORK", "MARS", "SVM")
values <- c(knn_metrics["Rsquared"], nn2_metrics["Rsquared"],
            mars_metrics["Rsquared"], svm_metrics["Rsquared"])
tibble(Models = models, R_squared = values) %>% arrange(desc(R_squared))
## # A tibble: 4 × 2
## Models R_squared
## <chr> <dbl>
## 1 MARS 0.946
## 2 SVM 0.830
## 3 NEURAL_NETWORK 0.778
## 4 KNN 0.687
The model with the highest test-set R-squared was the Multivariate Adaptive Regression Splines model, at 0.946; it explained 94.6% of the variability in the response variable.
Does MARS select the informative predictors (those named X1–X5)?
Yes, the MARS model selected exactly the informative predictors (X1 through X5), judging X1 to be the most important.
# Load library
library(tidymodels)
# Load data
data("ChemicalManufacturingProcess")
chemical_df <- ChemicalManufacturingProcess
# head(chemical_df)
# Find number of missing values in each predictor
chemical_df %>%
  summarise_all(~ sum(is.na(.))) %>%
  pivot_longer(cols = colnames(chemical_df), names_to = "Variable", values_to = "Counts") %>%
  filter(Counts > 0)
## # A tibble: 28 × 2
## Variable Counts
## <chr> <int>
## 1 ManufacturingProcess01 1
## 2 ManufacturingProcess02 3
## 3 ManufacturingProcess03 15
## 4 ManufacturingProcess04 1
## 5 ManufacturingProcess05 1
## 6 ManufacturingProcess06 2
## 7 ManufacturingProcess07 1
## 8 ManufacturingProcess08 1
## 9 ManufacturingProcess10 9
## 10 ManufacturingProcess11 10
## # ℹ 18 more rows
# Find predictors whose correlation with Yield is at least 0.5 in absolute value
yield_corr <- cor(chemical_df[, 2:ncol(chemical_df)], chemical_df$Yield)
corr_tibble <- tibble(Predictor = rownames(yield_corr),
                      Corr_coef = yield_corr[, 1])
corr_tibble %>% filter(Corr_coef <= -0.5 | Corr_coef >= 0.5)
## # A tibble: 3 × 2
## Predictor Corr_coef
## <chr> <dbl>
## 1 ManufacturingProcess09 0.503
## 2 ManufacturingProcess13 -0.504
## 3 ManufacturingProcess32 0.608
# Impute the median of each predictor for its missing values
for (i in 1:ncol(chemical_df)) {
  chemical_df[, i][is.na(chemical_df[, i])] <- median(chemical_df[, i], na.rm = TRUE)
}
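An equivalent, loop-free alternative (a sketch using caret's built-in imputation; it is a no-op here since the loop above has already filled the gaps) would be:
# Alternative: let caret impute predictor medians
med_imp <- preProcess(chemical_df, method = "medianImpute")
chemical_df <- predict(med_imp, chemical_df)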
# Identify near-zero variance predictors
nzv_labels <- nearZeroVar(chemical_df, names = TRUE)
print(paste0("The predictor/s with near zero variance are: ", nzv_labels))
## [1] "The predictor/s with near zero variance are: BiologicalMaterial07"
# Filter out the near-zero variance predictors
filtered_chemical_df <- as_tibble(chemical_df) %>% select(-all_of(nzv_labels))
set.seed(300)
# Split data into train and test sets
chemical_split <- initial_split(filtered_chemical_df, prop = 0.7)
# Create the training data
chemical_train <- chemical_split %>%
training()
chemXtrain <- data.frame(chemical_train %>% select(-Yield))
chemYtrain <- chemical_train %>% select(Yield)
# Create the test data
chemical_test <- chemical_split %>%
testing()
chemXtest <- data.frame(chemical_test %>% select(-Yield))
chemYtest <- chemical_test %>% select(Yield)
Four different nonlinear regression strategies will be used to build models. Each model will be trained on the training data, and performance will be evaluated and compared using predictions on the test set.
# Will use 10 fold cross validation as in previous exercise
ctrl <- trainControl(method = "cv", number = 10)
set.seed(301)
knnChemical <- train(x = chemXtrain,
                     y = chemYtrain$Yield,
                     method = "knn",
                     preProc = c("center", "scale"),
                     trControl = ctrl,
                     metric = "Rsquared",
                     tuneLength = 10)
knnChemical
## k-Nearest Neighbors
##
## 123 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 111, 111, 111, 110, 110, 111, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.397233 0.4940931 1.131294
## 7 1.412603 0.4719315 1.123157
## 9 1.461348 0.4259835 1.188005
## 11 1.434232 0.4519084 1.170092
## 13 1.422837 0.4692733 1.168444
## 15 1.442009 0.4576442 1.172183
## 17 1.463912 0.4555665 1.185686
## 19 1.475629 0.4474194 1.199788
## 21 1.467981 0.4750504 1.195504
## 23 1.476938 0.4619071 1.204509
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
plot(knnChemical)
# Use the final best model
knnPredChemical <- predict(knnChemical, chemXtest)
## performance values
knnChemical_metrics <- postResample(pred = knnPredChemical, obs = chemYtest$Yield)
knnChemical_metrics
## RMSE Rsquared MAE
## 1.1829298 0.5965649 0.8871321
The KNN model did a poor job of capturing the variability in the response variable, Yield, with a test-set R-squared of only 0.597.
set.seed(302)
# Create a parameter tuning grid
marsGrid <- expand.grid(degree = 1:4, nprune = seq(5, 50, by = 5))
marsChemicalFit <- train(x = chemXtrain,
                         y = chemYtrain$Yield,
                         method = "earth",
                         # Use R-squared to find the best model
                         metric = "Rsquared",
                         # Use 10-fold cross-validation and grid search
                         trControl = ctrl,
                         tuneGrid = marsGrid)
summary(marsChemicalFit)
## Call: earth(x=data.frame[123,56], y=c(38.95,38.44,4...), keepxy=TRUE, degree=4,
## nprune=10)
##
## coefficients
## (Intercept) 37.930007
## h(44.09-ManufacturingProcess09) -0.742552
## h(ManufacturingProcess09-44.09) 0.439954
## h(33.3-ManufacturingProcess13) 2.213490
## h(ManufacturingProcess32-152) 0.223687
## ManufacturingProcess26 * h(61-ManufacturingProcess33) 0.002139
## h(207.3-ManufacturingProcess06) * h(ManufacturingProcess09-44.09) -0.258805
## h(ManufacturingProcess13-33.3) * h(0.7-ManufacturingProcess43) -2.472565
## ManufacturingProcess26 * h(61-ManufacturingProcess33) * ManufacturingProcess36 -0.097280
## BiologicalMaterial04 * h(10.6-ManufacturingProcess28) * h(ManufacturingProcess32-152) 0.001364
##
## Selected 10 of 46 terms, and 10 of 56 predictors (nprune=10)
## Termination condition: GRSq -10 at 46 terms
## Importance: ManufacturingProcess32, ManufacturingProcess09, ...
## Number of terms at each degree of interaction: 1 4 3 2
## GCV 1.062752 RSS 85.54071 GRSq 0.6975312 RSq 0.7988097
marsChemicalFit
## Multivariate Adaptive Regression Spline
##
## 123 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 111, 111, 110, 111, 111, 110, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 5 1.316534 0.5038886 1.095417
## 1 10 1.335827 0.5081493 1.072217
## 1 15 1.415144 0.4817017 1.147743
## 1 20 1.412757 0.4874074 1.144313
## 1 25 1.412757 0.4874074 1.144313
## 1 30 1.412757 0.4874074 1.144313
## 1 35 1.412757 0.4874074 1.144313
## 1 40 1.412757 0.4874074 1.144313
## 1 45 1.412757 0.4874074 1.144313
## 1 50 1.412757 0.4874074 1.144313
## 2 5 1.246913 0.5776938 1.047440
## 2 10 1.324101 0.5686925 1.054947
## 2 15 1.512324 0.5419897 1.122435
## 2 20 1.647405 0.4971650 1.188040
## 2 25 1.666719 0.4909862 1.215407
## 2 30 1.666719 0.4909862 1.215407
## 2 35 1.666719 0.4909862 1.215407
## 2 40 1.666719 0.4909862 1.215407
## 2 45 1.666719 0.4909862 1.215407
## 2 50 1.666719 0.4909862 1.215407
## 3 5 1.315531 0.5193487 1.047427
## 3 10 1.509136 0.5532722 1.164606
## 3 15 1.590378 0.5351941 1.189340
## 3 20 1.608669 0.5167416 1.228182
## 3 25 1.632211 0.5020129 1.213308
## 3 30 1.630696 0.4998450 1.208556
## 3 35 1.630696 0.4998450 1.208556
## 3 40 1.630696 0.4998450 1.208556
## 3 45 1.630696 0.4998450 1.208556
## 3 50 1.630696 0.4998450 1.208556
## 4 5 1.278208 0.5409620 1.056750
## 4 10 1.424388 0.5799493 1.078559
## 4 15 1.572460 0.5380332 1.178461
## 4 20 1.626354 0.4997918 1.244416
## 4 25 1.653682 0.4923258 1.219965
## 4 30 1.649475 0.4912986 1.215747
## 4 35 1.649475 0.4912986 1.215747
## 4 40 1.649475 0.4912986 1.215747
## 4 45 1.649475 0.4912986 1.215747
## 4 50 1.649475 0.4912986 1.215747
##
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were nprune = 10 and degree = 4.
# Retrieve variable importance
varImp(marsChemicalFit)
## earth variable importance
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess09 63.93
## ManufacturingProcess13 42.72
## ManufacturingProcess28 24.73
## BiologicalMaterial04 24.73
## ManufacturingProcess43 20.11
## ManufacturingProcess33 12.23
## ManufacturingProcess26 12.23
## ManufacturingProcess06 7.53
## ManufacturingProcess36 0.00
# Plot
ggplot(marsChemicalFit)
The ranking of features by importance makes sense, as the top three predictors had the highest correlation with the response variable.
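That claim can be cross-checked against the correlations computed earlier (using the corr_tibble object from above):
# Correlations with Yield for the three most important MARS predictors
corr_tibble %>%
  filter(Predictor %in% c("ManufacturingProcess32", "ManufacturingProcess09",
                          "ManufacturingProcess13"))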
# Use the final best model
marsChemicalPred <- predict(marsChemicalFit, chemXtest)
## performance values
marsChemical_metrics <- postResample(pred = marsChemicalPred, obs = chemYtest$Yield)
marsChemical_metrics
## RMSE Rsquared MAE
## 1.1131686 0.6302952 0.8278500
set.seed(303)
# Set parameters
nnetGrid <- expand.grid(decay = c(0, 0.0001, 0.001, 0.01, 0.1),
                        size = 1:5)
nnetChemicalFit <- train(chemXtrain, chemYtrain$Yield,
                         method = "nnet",
                         # Automatically standardize data prior to modeling
                         # and prediction
                         preProc = c("center", "scale"),
                         linout = TRUE,
                         trace = FALSE,
                         metric = "Rsquared",
                         # Use cross-validation with the parameters set in ctrl
                         trControl = ctrl,
                         tuneGrid = nnetGrid,
                         MaxNWts = 10 * (ncol(chemXtrain) + 1) + 10 + 1,
                         maxit = 500)
nnetChemicalFit
## Neural Network
##
## 123 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 111, 111, 110, 110, 111, 111, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0e+00 1 1.703514 0.2597174 1.380961
## 0e+00 2 1.722955 0.2193736 1.418749
## 0e+00 3 1.743161 0.2469934 1.406113
## 0e+00 4 2.886874 0.2857376 2.256411
## 0e+00 5 3.548444 0.2090359 2.813092
## 1e-04 1 1.618337 0.3864089 1.305114
## 1e-04 2 1.692357 0.3499415 1.339765
## 1e-04 3 1.613197 0.3574490 1.315076
## 1e-04 4 1.935835 0.3700008 1.519958
## 1e-04 5 3.322728 0.2785144 2.587177
## 1e-03 1 1.989049 0.2718451 1.553984
## 1e-03 2 1.750163 0.4052093 1.399875
## 1e-03 3 2.217212 0.3820242 1.678624
## 1e-03 4 2.378709 0.3192322 1.910201
## 1e-03 5 3.051805 0.3058756 2.378399
## 1e-02 1 1.679888 0.4367466 1.337996
## 1e-02 2 2.573273 0.3566771 1.980118
## 1e-02 3 2.305264 0.2926005 1.861456
## 1e-02 4 2.916294 0.2754428 2.310904
## 1e-02 5 3.229173 0.1810854 2.342558
## 1e-01 1 1.872232 0.4167089 1.424454
## 1e-01 2 2.112293 0.3176324 1.600205
## 1e-01 3 2.636771 0.2446606 2.021290
## 1e-01 4 2.561239 0.2591982 1.921910
## 1e-01 5 2.206744 0.3353673 1.730974
##
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were size = 1 and decay = 0.01.
ggplot(nnetChemicalFit)
# Use the final best model
nnetChemicalPred <- predict(nnetChemicalFit, chemXtest)
## performance values
nnChemical_metrics <- postResample(pred = nnetChemicalPred, obs = chemYtest$Yield)
nnChemical_metrics
## RMSE Rsquared MAE
## 1.6329030 0.3762549 1.3240861
set.seed(304)
svmChemicalFit <- train(chemXtrain, chemYtrain$Yield,
                        method = "svmRadial",
                        preProc = c("center", "scale"),
                        tuneLength = 14,
                        trControl = ctrl,
                        metric = "Rsquared")
# View output
svmChemicalFit
## Support Vector Machines with Radial Basis Function Kernel
##
## 123 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 111, 110, 111, 111, 111, 111, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.457806 0.5073477 1.1940027
## 0.50 1.354669 0.5443848 1.1056640
## 1.00 1.252397 0.5975845 1.0405274
## 2.00 1.179359 0.6393431 0.9792180
## 4.00 1.156255 0.6522999 0.9500740
## 8.00 1.146292 0.6636211 0.9264818
## 16.00 1.143445 0.6658972 0.9237502
## 32.00 1.143445 0.6658972 0.9237502
## 64.00 1.143445 0.6658972 0.9237502
## 128.00 1.143445 0.6658972 0.9237502
## 256.00 1.143445 0.6658972 0.9237502
## 512.00 1.143445 0.6658972 0.9237502
## 1024.00 1.143445 0.6658972 0.9237502
## 2048.00 1.143445 0.6658972 0.9237502
##
## Tuning parameter 'sigma' was held constant at a value of 0.01419071
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01419071 and C = 16.
# Use the final best model
svmChemicalPred <- predict(svmChemicalFit, chemXtest)
## performance values
svmChemical_metrics <- postResample(pred = svmChemicalPred, obs = chemYtest$Yield)
svmChemical_metrics
## RMSE Rsquared MAE
## 1.0764559 0.6378348 0.8492403
models <- c("KNN", "NEURAL_NETWORK", "MARS", "SVM")
values <- c(knnChemical_metrics["Rsquared"], nnChemical_metrics["Rsquared"],
            marsChemical_metrics["Rsquared"], svmChemical_metrics["Rsquared"])
tibble(Models = models, R_squared = values) %>% arrange(desc(R_squared))
## # A tibble: 4 × 2
## Models R_squared
## <chr> <dbl>
## 1 SVM 0.638
## 2 MARS 0.630
## 3 KNN 0.597
## 4 NEURAL_NETWORK 0.376
a. Which nonlinear regression model gives the optimal re-sampling and test set performance?
The model that gives the optimal resampling and test-set performance is the SVM. The fitted SVM explained 66.59% of the variability in Yield during resampling and performed similarly on the test data, where the R-squared of 0.6378 corresponds to 63.78% of the variability explained. The final SVM with a radial basis function kernel used sigma = 0.01419071 and cost C = 16. The cost parameter controls the regularization strength: it determines how heavily the model is penalized for errors on the training examples. Sigma controls the reach of each training example: under the Gaussian kernel exp(-sigma * ||x - x'||^2), smaller sigma values let a single example influence predictions over a wider range of distances.
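A small illustration of sigma's effect (a sketch with kernlab's rbfdot and two arbitrary points a fixed distance apart):
# Kernel similarity between two points at distance 2,
# under a small and a large sigma
x_a <- c(0, 0); x_b <- c(2, 0)
k_small <- rbfdot(sigma = 0.01)
k_large <- rbfdot(sigma = 1)
k_small(x_a, x_b)  # close to 1: distant points still look similar
k_large(x_a, x_b)  # near 0: an example's influence dies off quickly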
b. Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
# Find the importance rankings
var_imp <- varImp(svmChemicalFit)
var_imp
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 95.40
## ManufacturingProcess17 77.32
## BiologicalMaterial06 66.70
## ManufacturingProcess09 63.30
## BiologicalMaterial03 60.83
## BiologicalMaterial12 60.67
## ManufacturingProcess06 59.87
## BiologicalMaterial02 59.80
## ManufacturingProcess36 59.59
## ManufacturingProcess31 54.33
## BiologicalMaterial04 46.49
## BiologicalMaterial09 45.78
## ManufacturingProcess30 39.93
## ManufacturingProcess11 39.76
## BiologicalMaterial08 39.56
## ManufacturingProcess29 39.49
## BiologicalMaterial11 38.75
## BiologicalMaterial01 37.31
## ManufacturingProcess33 37.04
# Extract only the top 10
top10 <- var_imp$importance %>% arrange(desc(Overall)) %>% head(10)
Within the top 20 most important features, 11 of 20 are manufacturing process variables and the rest are biological materials, so the process variables slightly dominate the list. In exercise 6.3 an elastic net model was fitted; the nonlinear SVM regression model and the elastic net model shared 8 predictors in their respective top-ten lists.
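A sketch of how that overlap can be computed, assuming enet_imp holds the varImp() result for the elastic net model from exercise 6.3 (that model is not refit here, so the code is shown commented out and the object names are hypothetical):
# Hypothetical: enet_imp <- varImp(enetFit)   # elastic net from exercise 6.3
# top10_enet <- enet_imp$importance %>% arrange(desc(Overall)) %>% head(10)
# intersect(rownames(top10), rownames(top10_enet))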
c. Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
# Scatter plots
featurePlot(chemical_df[,rownames(top10)], chemical_df$Yield)
# Correlation plot
chemical_corr <- cor(chemical_df[, append(rownames(top10), "Yield")])
corrplot::corrplot(chemical_corr, method = "number")
Visualizing the relationships between the top 10 features and the response variable Yield, some patterns are easier to spot than others. In general, the biological materials have a positive relationship with Yield, which the correlation plot confirms. The predictor with the strongest positive correlation with the response was ManufacturingProcess32, and the strongest negative correlation was ManufacturingProcess36.