library(AppliedPredictiveModeling)
library(pls)
library(patchwork)
library(lubridate)
library(scales)
library(ggplot2)
library(dplyr)
library(randomForest)
library(pdp)
library(caret)
library(earth)
data(ChemicalManufacturingProcess)
options(scipen=999)
Week 11 Non-Linear Regression
7.2 mlbench simulation practice
Use mlbench.friedman1 to simulate data, and then tune models to it. The Friedman 1 benchmark generates y = 10 sin(pi x1 x2) + 20 (x3 - 0.5)^2 + 10 x4 + 5 x5 + noise from ten uniform predictors, so only X1–X5 carry signal.
library(mlbench)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
trainingData$x <- data.frame(trainingData$x)
featurePlot(trainingData$x, trainingData$y)
Tune multiple models
# KNN
knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 5)
knnPred <- predict(knnModel, newdata = testData$x)
knnMetrics <- postResample(pred = knnPred, obs = testData$y)
# MARS
marsModel <- train(x = trainingData$x,
                   y = trainingData$y,
                   method = "earth",
                   preProc = c("center", "scale"),
                   tuneLength = 5)
marsPred <- predict(marsModel, newdata = testData$x)
marsMetrics <- postResample(pred = marsPred, obs = testData$y)
# Neural network
# Note: nnet defaults to a logistic output layer (linout = FALSE), so its
# predictions are squashed into (0, 1); that is why this model's RMSE below
# lands near the mean of y and its R-squared is NA.
nnModel <- train(x = trainingData$x,
                 y = trainingData$y,
                 method = "nnet",
                 preProcess = c("center", "scale"),
                 tuneLength = 5,
                 trace = FALSE)
nnPred <- predict(nnModel, newdata = testData$x)
nnMetrics <- postResample(pred = nnPred, obs = testData$y)
# SVM
svmModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "svmRadial",
                  preProcess = c("center", "scale"),
                  tuneLength = 5)
svmPred <- predict(svmModel, newdata = testData$x)
svmMetrics <- postResample(pred = svmPred, obs = testData$y)
# RF
rfModel <- train(x = trainingData$x,
                 y = trainingData$y,
                 method = "rf",
                 preProcess = c("center", "scale"),
                 tuneLength = 5)
rfPred <- predict(rfModel, newdata = testData$x)
rfMetrics <- postResample(pred = rfPred, obs = testData$y)
modelPerformance <- data.frame(
  RMSE = c(knnMetrics[1], marsMetrics[1], nnMetrics[1], svmMetrics[1], rfMetrics[1]),
  Rsquared = c(knnMetrics[2], marsMetrics[2], nnMetrics[2], svmMetrics[2], rfMetrics[2]),
  MAE = c(knnMetrics[3], marsMetrics[3], nnMetrics[3], svmMetrics[3], rfMetrics[3]),
  row.names = c("KNN", "MARS", "NeuralNetwork", "SVM", "RF")
)
modelPerformance
RMSE Rsquared MAE
KNN 3.148156 0.6747755 2.523604
MARS 1.816313 0.8673823 1.389461
NeuralNetwork 14.276927 NA 13.386908
SVM 2.062305 0.8282918 1.558381
RF 2.400172 0.7936459 1.896889
More on MARS
Which works best?
Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?
MARS performs best by a clear margin. And yes, the informative predictors are all selected: the summary below keeps X1–X5 (plus a small term on the noise predictor X6), while X7–X10 are dropped.
summary(marsModel)
Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=1,
nprune=11)
coefficients
(Intercept) 18.7269554
h(0.507267-X1) -3.0419709
h(0.325504-X2) -2.8646644
h(X3- -0.804171) 5.9950944
h(-0.216741-X3) 5.0313219
h(X3- -0.216741) -7.0757781
h(X3-0.453446) 4.8280788
h(0.953812-X4) -2.7901249
h(X4-0.953812) 2.6992283
h(1.17878-X5) -1.5048574
h(X6- -0.47556) -0.5743446
Selected 11 of 18 terms, and 6 of 10 predictors (nprune=11)
Termination condition: Reached nk 21
Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
Number of terms at each degree of interaction: 1 10 (additive model)
GCV 2.591365 RSS 415.1497 GRSq 0.8947895 RSq 0.9148746
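As a quick cross-check on the summary, caret's varImp() gives the same picture; nothing new is assumed here beyond the fitted marsModel:
# Importance for the tuned MARS model: X1-X5 should sit at the top
# and the pure-noise predictors should fall to zero.
varImp(marsModel)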
7.5. Chemical Manufacturing
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
imputed_data <- preProcess(ChemicalManufacturingProcess, method = 'medianImpute')
processed_data <- predict(imputed_data, ChemicalManufacturingProcess)
set.seed(123)
splitIndex <- createDataPartition(processed_data$Yield, p = 0.80, list = FALSE)
train_data <- processed_data[splitIndex, ]
test_data <- processed_data[-splitIndex, ]
# KNN
knnModel <- train(Yield ~ ., data = train_data,
                  method = "knn",
                  preProcess = c("center", "scale"),
                  tuneLength = 5)
knnPred <- predict(knnModel, newdata = test_data)
knnMetrics <- postResample(pred = knnPred, obs = test_data$Yield)
# MARS
marsModel <- train(Yield ~ ., data = train_data,
                   method = "earth",
                   preProcess = c("center", "scale"),
                   tuneLength = 5)
marsPred <- predict(marsModel, newdata = test_data)
marsMetrics <- postResample(pred = marsPred, obs = test_data$Yield)
# Neural Network
# nnModel <- train(Yield ~ ., data = train_data,
# method = "nnet",
# preProcess = c("center", "scale"),
# tuneLength = 5,
# trace = FALSE)
# nnPred <- predict(nnModel, newdata = test_data)
# nnMetrics <- postResample(pred = nnPred, obs = test_data$Yield)
# NeuralNetwork [RMSE: 39.146857 RSquared:NA MAE: 39.1046875]
nnModel <- train(Yield ~ ., data = train_data,
                 method = "nnet",
                 preProcess = c("center", "scale"),
                 tuneGrid = expand.grid(.size = 10,    # a fixed number of hidden units
                                        .decay = 0.01), # a fixed weight-decay value
                 trControl = trainControl(method = "none"),
                 maxit = 1000, # maximum iterations
                 trace = FALSE)
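Note that this second network never gets scored on the hold-out set, so nnMetrics still carries the Exercise 7.2 values when the summary table is built below. Scoring it the same way as the other models would look like:
# Hold-out metrics for the tuned single-hidden-layer network
nnPred <- predict(nnModel, newdata = test_data)
nnMetrics <- postResample(pred = nnPred, obs = test_data$Yield)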
# SVM
svmModel <- train(Yield ~ ., data = train_data,
                  method = "svmRadial",
                  preProcess = c("center", "scale"),
                  tuneLength = 5)
svmPred <- predict(svmModel, newdata = test_data)
svmMetrics <- postResample(pred = svmPred, obs = test_data$Yield)
# Random Forest
rfModel <- train(Yield ~ ., data = train_data,
                 method = "rf",
                 preProcess = c("center", "scale"),
                 tuneLength = 5)
rfPred <- predict(rfModel, newdata = test_data)
rfMetrics <- postResample(pred = rfPred, obs = test_data$Yield)
# Combine model performance metrics
modelPerformance <- data.frame(
  RMSE = c(knnMetrics[1], marsMetrics[1], nnMetrics[1], svmMetrics[1], rfMetrics[1]),
  Rsquared = c(knnMetrics[2], marsMetrics[2], nnMetrics[2], svmMetrics[2], rfMetrics[2]),
  MAE = c(knnMetrics[3], marsMetrics[3], nnMetrics[3], svmMetrics[3], rfMetrics[3]),
  row.names = c("KNN", "MARS", "NeuralNetwork", "SVM", "RF")
)
modelPerformance
RMSE Rsquared MAE
KNN 1.406333 0.4224466 1.160688
MARS 1.402311 0.4244340 1.110419
NeuralNetwork 14.276927 NA 13.386908
SVM 1.220243 0.5610862 1.017916
RF 1.276474 0.5531786 1.003412
Which works best?
Which nonlinear regression model gives the optimal resampling and test set performance?
Among the nonlinear models, SVM gives the best test-set numbers (RMSE 1.22, R-squared 0.56), with the random forest close behind, though I didn't try any cross-validation. Let's try cross-validating the random forest.
rfModel <- train(Yield ~ ., data = train_data,
                 method = "rf",
                 trControl = trainControl(method = "cv"),
                 preProcess = c("center", "scale"),
                 tuneLength = 5)
rfPred <- predict(rfModel, newdata = test_data)
postResample(pred = rfPred, obs = test_data$Yield)
RMSE Rsquared MAE
1.2331581 0.5883704 0.9682290
With 10-fold cross-validation, the random forest actually does slightly better on the test set than before (RMSE 1.233 vs. 1.276, R-squared 0.588 vs. 0.553). Worth noting: caret's default resampling is the bootstrap, so the earlier fit was validated too, just with a different scheme. Cross-validated tuning estimates are generally considered more representative of how the model will perform in real-world scenarios.
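A cleaner way to compare resampling performance across models is to tune each one on the same cross-validation folds and pool the results with caret's resamples(). A minimal sketch (svmCV and rfCV are new names, and the same pattern extends to the other three models):
ctrl <- trainControl(method = "cv", number = 10)
set.seed(123)  # same seed before each fit so the folds match
svmCV <- train(Yield ~ ., data = train_data, method = "svmRadial",
               preProcess = c("center", "scale"), tuneLength = 5,
               trControl = ctrl)
set.seed(123)
rfCV <- train(Yield ~ ., data = train_data, method = "rf",
              preProcess = c("center", "scale"), tuneLength = 5,
              trControl = ctrl)
# Side-by-side resampling distributions of RMSE, R-squared, and MAE
summary(resamples(list(SVM = svmCV, RF = rfCV)))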
Important Predictors
Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
In the random forest, 7 of the top 12 predictors are manufacturing-process variables, so process variables lead the list but don't overwhelm the biological ones. What I find more interesting is that the two rankings diverge so much beyond the first variable or two, even though they share most of the same names. Wow, math is strange.
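The tables below came from caret's varImp(); the code isn't shown above, but it would look roughly like this (linearModel is a placeholder for the Exercise 6.3 fit, which isn't defined in this document):
# Scaled importance for the random forest; impRf is reused for the
# partial dependence plots further down.
impRf <- varImp(rfModel)$importance
head(impRf[order(-impRf$Overall), , drop = FALSE], 12)
# The same view for the optimal linear model from Exercise 6.3
# ("linearModel" is a placeholder name, not defined here).
impLinear <- varImp(linearModel)$importance
head(impLinear[order(-impLinear$Overall), , drop = FALSE], 12)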
Random forest, top 12:
Overall
ManufacturingProcess32 100.00000
BiologicalMaterial06 41.78610
BiologicalMaterial03 35.43998
ManufacturingProcess31 32.61260
BiologicalMaterial12 23.54109
ManufacturingProcess13 21.90736
ManufacturingProcess36 20.81679
BiologicalMaterial02 20.41648
ManufacturingProcess17 20.21639
ManufacturingProcess09 17.37263
ManufacturingProcess28 16.68615
BiologicalMaterial11 14.66766
Optimal linear model (Exercise 6.3), top 12:
Overall
ManufacturingProcess32 100.00000
BiologicalMaterial06 94.06439
ManufacturingProcess36 81.53513
BiologicalMaterial03 81.26510
ManufacturingProcess13 80.63144
ManufacturingProcess31 78.52140
BiologicalMaterial02 76.03669
ManufacturingProcess17 75.91748
ManufacturingProcess09 73.04357
BiologicalMaterial12 69.48078
ManufacturingProcess06 66.24451
BiologicalMaterial11 59.72022
Plots
Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
Partial dependence plots (PDPs) show how each predictor, taken on its own, affects the model's prediction; a flat line suggests the predictor has little or no marginal effect. Two of my three curves are flat, yet predictors like ManufacturingProcess32 definitely do have impact. Those two facts together tell me there are relationships here, likely interactions, that the single-variable view doesn't capture. Ok, I knew that going in.
No, this doesn't help my intuition. I wish we were practicing on a dataset with stronger relationships, so I could see visible patterns and get a feel for what relationships look like. This dataset doesn't have strong ones. It might be a good dataset to become an expert in if you're a chemist, but for somebody trying to learn what relationships are supposed to look and feel like, it tells me little and doesn't register on any intuition.
All good though, I learned a lot about looking up results.
top_important_vars <- rownames(impRf)[order(-impRf$Overall)][1:3]

# Use .data[[ ]] so each aes() maps the column named by the string,
# not the string itself.
pdp_plot1 <- ggplot(partial(rfModel, pred.var = top_important_vars[1]),
                    aes(x = .data[[top_important_vars[1]]], y = yhat)) +
  geom_line() +
  ggtitle(paste("Partial Dependence Plot for", top_important_vars[1])) +
  xlab(top_important_vars[1]) +
  ylab("Partial Dependence")

pdp_plot2 <- ggplot(partial(rfModel, pred.var = top_important_vars[2]),
                    aes(x = .data[[top_important_vars[2]]], y = yhat)) +
  geom_line() +
  ggtitle(paste("Partial Dependence Plot for", top_important_vars[2])) +
  xlab(top_important_vars[2]) +
  ylab("Partial Dependence")

pdp_plot3 <- ggplot(partial(rfModel, pred.var = top_important_vars[3]),
                    aes(x = .data[[top_important_vars[3]]], y = yhat)) +
  geom_line() +
  ggtitle(paste("Partial Dependence Plot for", top_important_vars[3])) +
  xlab(top_important_vars[3]) +
  ylab("Partial Dependence")

# Combine the plots side by side using patchwork
combined_plot <- pdp_plot1 + pdp_plot2 + pdp_plot3 +
  plot_layout(ncol = 3)

# Print the combined plot
combined_plot
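Since the flat one-variable curves hint at relationships the marginal view can't show, one follow-up is a two-variable partial dependence surface to look for interactions between the top predictors. A sketch using pdp's built-in plotting (computing this on a random forest can be slow):
# Joint partial dependence of yield on the top two predictors;
# a surface whose shape along one axis changes as the other axis
# varies would indicate an interaction between them.
pd2 <- partial(rfModel, pred.var = top_important_vars[1:2], chull = TRUE)
plotPartial(pd2)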