Week 11 Non-Linear Regression

Author: Tony Fraser

Published: April 7, 2024

library(AppliedPredictiveModeling)
library(pls)
library(patchwork)
library(lubridate)
library(scales)
library(ggplot2)
library(dplyr)
library(randomForest)
library(pdp)
library(caret)
library(earth) 
data(ChemicalManufacturingProcess)
options(scipen=999)

7.2 mlbench simulation practice

Use mlbench.friedman1 to simulate data, and then tune models to it.
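
For reference, mlbench.friedman1 draws from Friedman's benchmark function, y = 10*sin(pi*x1*x2) + 20*(x3 - 0.5)^2 + 10*x4 + 5*x5 + noise, plus five extra predictors that carry no signal. So only X1 through X5 are informative, which is what the MARS question further down is getting at.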

library(mlbench)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
trainingData$x <- data.frame(trainingData$x)
featurePlot(trainingData$x, trainingData$y)

Tune multiple models

#KNN
knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 5)
knnPred <- predict(knnModel, newdata = testData$x)
knnMetrics <- postResample(pred = knnPred, obs = testData$y)

#MARS
marsModel <- train(x = trainingData$x,
                   y = trainingData$y,
                   method = "earth",
                   preProc = c("center", "scale"),
                   tuneLength = 5)

marsPred <- predict(marsModel, newdata = testData$x)
marsMetrics <- postResample(pred = marsPred, obs = testData$y)

#neural net
nnModel <- train(x = trainingData$x,
                 y = trainingData$y,
                 method = "nnet",
                 preProcess = c("center", "scale"),
                 tuneLength = 5, 
                 trace = FALSE)
nnPred <- predict(nnModel, newdata = testData$x)
nnMetrics <- postResample(pred = nnPred, obs = testData$y)
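
One caveat on the neural net: caret's "nnet" method wraps nnet(), whose output unit is sigmoidal by default, so regression fits normally pass linout = TRUE. Leaving it off is the most likely reason for the enormous RMSE and NA R-squared in the table below. A minimal sketch of the adjusted call (not re-run for the results shown here):

# Sketch only: same tuning setup, but with a linear output unit so predictions
# aren't squashed into (0, 1). Not re-run for the table below.
nnModelLinout <- train(x = trainingData$x,
                       y = trainingData$y,
                       method = "nnet",
                       preProc = c("center", "scale"),
                       tuneLength = 5,
                       linout = TRUE,
                       trace = FALSE)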

## svm
svmModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "svmRadial",
                  preProcess = c("center", "scale"),
                  tuneLength = 5)
svmPred <- predict(svmModel, newdata = testData$x)
svmMetrics <- postResample(pred = svmPred, obs = testData$y)

## rf
library(randomForest)

rfModel <- train(x = trainingData$x,
                 y = trainingData$y,
                 method = "rf",
                 preProcess = c("center", "scale"),
                 tuneLength = 5)
rfPred <- predict(rfModel, newdata = testData$x)
rfMetrics <- postResample(pred = rfPred, obs = testData$y)

modelPerformance <- data.frame(
  RMSE = c(knnMetrics[1], marsMetrics[1], nnMetrics[1], svmMetrics[1], rfMetrics[1]),
  Rsquared = c(knnMetrics[2], marsMetrics[2], nnMetrics[2], svmMetrics[2], rfMetrics[2]),
  MAE = c(knnMetrics[3], marsMetrics[3], nnMetrics[3], svmMetrics[3], rfMetrics[3]),
  row.names = c("KNN", "MARS", "NeuralNetwork", "SVM", "RF")
)
modelPerformance
                   RMSE  Rsquared       MAE
KNN            3.148156 0.6747755  2.523604
MARS           1.816313 0.8673823  1.389461
NeuralNetwork 14.276927        NA 13.386908
SVM            2.062305 0.8282918  1.558381
RF             2.400172 0.7936459  1.896889

More on MARS

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?

MARS performs best. And yes, X1–X5 are all selected (X6 sneaks in as well, while X7–X10 are unused).

summary(marsModel)
Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=1,
            nprune=11)

                 coefficients
(Intercept)        18.7269554
h(0.507267-X1)     -3.0419709
h(0.325504-X2)     -2.8646644
h(X3- -0.804171)    5.9950944
h(-0.216741-X3)     5.0313219
h(X3- -0.216741)   -7.0757781
h(X3-0.453446)      4.8280788
h(0.953812-X4)     -2.7901249
h(X4-0.953812)      2.6992283
h(1.17878-X5)      -1.5048574
h(X6- -0.47556)    -0.5743446

Selected 11 of 18 terms, and 6 of 10 predictors (nprune=11)
Termination condition: Reached nk 21
Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
Number of terms at each degree of interaction: 1 10 (additive model)
GCV 2.591365    RSS 415.1497    GRSq 0.8947895    RSq 0.9148746
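
As a quick cross-check on which predictors the tuned MARS fit actually leans on, caret can rank them directly (output not shown):

# Predictor importance for the earth/MARS fit, as ranked by caret
varImp(marsModel)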

7.5 Chemical Manufacturing

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

imputed_data <- preProcess(ChemicalManufacturingProcess, method='medianImpute')
processed_data <- predict(imputed_data, ChemicalManufacturingProcess)

set.seed(123)
splitIndex <- createDataPartition(processed_data$Yield, p = 0.80, list = FALSE)
train_data <- processed_data[splitIndex, ]
test_data <- processed_data[-splitIndex, ]

# KNN
knnModel <- train(Yield ~ ., data = train_data,
                  method = "knn",
                  preProcess = c("center", "scale"),
                  tuneLength = 5)
knnPred <- predict(knnModel, newdata = test_data)
knnMetrics <- postResample(pred = knnPred, obs = test_data$Yield)

# MARS
marsModel <- train(Yield ~ ., data = train_data,
                   method = "earth",
                   preProcess = c("center", "scale"),
                   tuneLength = 5)
marsPred <- predict(marsModel, newdata = test_data)
marsMetrics <- postResample(pred = marsPred, obs = test_data$Yield)

# Neural Network
# nnModel <- train(Yield ~ ., data = train_data,
#                  method = "nnet",
#                  preProcess = c("center", "scale"),
#                  tuneLength = 5, 
#                  trace = FALSE)
# nnPred <- predict(nnModel, newdata = test_data)
# nnMetrics <- postResample(pred = nnPred, obs = test_data$Yield)
# NeuralNetwork [RMSE: 39.146857 RSquared:NA  MAE: 39.1046875]

nnModel <- train(Yield ~ ., data = train_data,
                 method = "nnet",
                 preProcess = c("center", "scale"),
                 tuneGrid = expand.grid(.size = 10,    # a specific number of hidden units
                                        .decay = 0.01),  # a specific weight-decay value
                 trControl = trainControl(method = "none"),
                 maxit = 1000,  # maximum iterations
                 trace = FALSE)
# Note: nnPred / nnMetrics are not recomputed after this refit (and, as before,
# linout = TRUE is not passed), so the NeuralNetwork row in the summary table
# below still carries the metrics from the earlier Friedman-data run.

# SVM
svmModel <- train(Yield ~ ., data = train_data,
                  method = "svmRadial",
                  preProcess = c("center", "scale"),
                  tuneLength = 5)
svmPred <- predict(svmModel, newdata = test_data)
svmMetrics <- postResample(pred = svmPred, obs = test_data$Yield)


# Random Forest
rfModel <- train(Yield ~ ., data = train_data,
                 method = "rf",
                 preProcess = c("center", "scale"),
                 tuneLength = 5)
rfPred <- predict(rfModel, newdata = test_data)
rfMetrics <- postResample(pred = rfPred, obs = test_data$Yield)

# Combine model performance metrics
modelPerformance <- data.frame(
  RMSE = c(knnMetrics[1], marsMetrics[1], nnMetrics[1], svmMetrics[1], rfMetrics[1]),
  Rsquared = c(knnMetrics[2], marsMetrics[2], nnMetrics[2], svmMetrics[2], rfMetrics[2]),
  MAE = c(knnMetrics[3], marsMetrics[3], nnMetrics[3], svmMetrics[3], rfMetrics[3]),
  row.names = c("KNN", "MARS", "NeuralNetwork", "SVM", "RF")
)

modelPerformance
                   RMSE  Rsquared       MAE
KNN            1.406333 0.4224466  1.160688
MARS           1.402311 0.4244340  1.110419
NeuralNetwork 14.276927        NA 13.386908
SVM            1.220243 0.5610862  1.017916
RF             1.276474 0.5531786  1.003412

Which works best?

Which nonlinear regression model gives the optimal resampling and test set performance?

The linear model from 6.3 still performs better than any of these, though I didn’t try any cross-validation here. Among the nonlinear models, SVM posts the best test-set RMSE and R-squared, with random forest close behind (and slightly better on MAE). Let’s try cross-validation with the random forest.

rfModel <- train(Yield ~ ., data = train_data,
                 method = "rf",
                 trControl = trainControl(method = "cv"),
                 preProcess = c("center", "scale"),
                 tuneLength = 5)
rfPred <- predict(rfModel, newdata = test_data)
postResample(pred = rfPred, obs = test_data$Yield)
     RMSE  Rsquared       MAE 
1.2331581 0.5883704 0.9682290 

The random forest refit with cross-validation actually scores slightly better on the test set than the earlier fit (RMSE 1.23 vs. 1.28), though the gap is small enough to be run-to-run noise. The more important point is that cross-validated resampling estimates are generally more representative of how the model will perform in real-world scenarios.
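
To set the resampling estimate next to these test-set numbers, caret keeps the cross-validated metrics on the fitted object; a quick way to pull them out (output not shown):

# Cross-validated performance at the selected mtry (the resampling estimate)
getTrainPerf(rfModel)
# Resampled RMSE / Rsquared / MAE across the full tuning grid
rfModel$results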

Important Predictors

Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

In the random forest (first table below), 7 of the top 12 predictors are manufacturing process variables. What I find more interesting, though, is how different the two lists are beyond the first variable or two when set next to the rankings from the linear model in 6.3 (second table). Wow, math is strange.
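
The tables below look like caret varImp() output; roughly how they can be reproduced (the linear fit from 6.3 isn't shown in this post, so the second call is only a placeholder):

# Scaled (0-100) importance for the random forest
impRf <- varImp(rfModel)$importance
head(impRf[order(-impRf$Overall), , drop = FALSE], 12)

# Same idea for the comparison table, e.g. varImp(linearModel)$importance,
# where "linearModel" is a placeholder name for the fit from exercise 6.3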

                         Overall
ManufacturingProcess32 100.00000
BiologicalMaterial06    41.78610
BiologicalMaterial03    35.43998
ManufacturingProcess31  32.61260
BiologicalMaterial12    23.54109
ManufacturingProcess13  21.90736
ManufacturingProcess36  20.81679
BiologicalMaterial02    20.41648
ManufacturingProcess17  20.21639
ManufacturingProcess09  17.37263
ManufacturingProcess28  16.68615
BiologicalMaterial11    14.66766
                         Overall
ManufacturingProcess32 100.00000
BiologicalMaterial06    94.06439
ManufacturingProcess36  81.53513
BiologicalMaterial03    81.26510
ManufacturingProcess13  80.63144
ManufacturingProcess31  78.52140
BiologicalMaterial02    76.03669
ManufacturingProcess17  75.91748
ManufacturingProcess09  73.04357
BiologicalMaterial12    69.48078
ManufacturingProcess06  66.24451
BiologicalMaterial11    59.72022
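
A quick tally of that 7-to-5 split in the random forest's top 12, using the impRf object sketched above:

# Count biological vs. manufacturing variables among the RF top 12
topRf <- rownames(impRf)[order(-impRf$Overall)][1:12]
table(ifelse(grepl("^Biological", topRf), "Biological", "Manufacturing"))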

Plots

Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

You’re supposed to be able to use partial dependence plots (PDPs) to show how each predictor affects the model’s prediction. A flat line suggests that a predictor has little or no effect. We have two flat lines, yet those predictors (37 and 32) definitely do have impact. Taken together, those two facts suggest there are relationships this data set doesn’t capture well. OK, I knew that going in.

No, this doesn’t help my intuition. I wish we were practicing on a dataset with stronger relationships so I could see more visible patterns and get a feel for what relationships look like. This dataset doesn’t have strong relationships. It might be a good dataset to become an expert in if you’re a chemist or something, but for somebody trying to learn what relationships are supposed to look and feel like, this tells me little and doesn’t register on any intuition.

All good though, I learned a lot about looking up results.

# Top three predictors by RF importance (impRf as sketched above)
top_important_vars <- rownames(impRf)[order(-impRf$Overall)][1:3]

# partial() returns a data frame whose first column is named after the
# predictor, so map that column with .data[[...]]; mapping the variable
# *name* as a constant string would just draw a flat, uninformative line.
pdp_plot1 <- ggplot(partial(rfModel, pred.var = top_important_vars[1]),
                    aes(x = .data[[top_important_vars[1]]], y = yhat)) +
  geom_line() +
  ggtitle(paste("Partial Dependence Plot for", top_important_vars[1])) +
  xlab(top_important_vars[1]) +
  ylab("Partial Dependence")

pdp_plot2 <- ggplot(partial(rfModel, pred.var = top_important_vars[2]),
                    aes(x = .data[[top_important_vars[2]]], y = yhat)) +
  geom_line() +
  ggtitle(paste("Partial Dependence Plot for", top_important_vars[2])) +
  xlab(top_important_vars[2]) +
  ylab("Partial Dependence")

pdp_plot3 <- ggplot(partial(rfModel, pred.var = top_important_vars[3]),
                    aes(x = .data[[top_important_vars[3]]], y = yhat)) +
  geom_line() +
  ggtitle(paste("Partial Dependence Plot for", top_important_vars[3])) +
  xlab(top_important_vars[3]) +
  ylab("Partial Dependence")

# Combine the plots side by side using patchwork
combined_plot <- pdp_plot1 + pdp_plot2 + pdp_plot3 +
  plot_layout(ncol = 3)

# Print the combined plot
combined_plot