library(mlbench)
library(caret)
library(earth)
set.seed(200)
## mlbench.friedman1 creates a list with a vector 'y' and a
## matrix of predictors 'x':
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame.
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## Also simulate a large test set to estimate the true
## error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
knnModel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
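The resampled RMSE profile across k can also be inspected visually; a minimal sketch using caret's plot method for train objects:
# Plot resampled RMSE against the number of neighbors k
plot(knnModel)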
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
MARS model
# Tune MARS model
marsModel <- train(x = trainingData$x,
y = trainingData$y,
method = "earth",
tuneLength = 10,
preProc = c("center", "scale"))
marsModel
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 4.383438 0.2405683 3.597961
## 3 3.645469 0.4745962 2.930453
## 4 2.727602 0.7035031 2.184240
## 6 2.331605 0.7835496 1.833420
## 7 1.976830 0.8421599 1.562591
## 9 1.804342 0.8683110 1.410395
## 10 1.787676 0.8711960 1.386944
## 12 1.821005 0.8670619 1.419893
## 13 1.858688 0.8617344 1.445459
## 15 1.871033 0.8607099 1.457618
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 10 and degree = 1.
# Check MARS variable importance
varImp(marsModel)
## earth variable importance
##
## Overall
## X1 100.00
## X4 82.78
## X2 64.18
## X5 40.21
## X3 28.14
## X6 0.00
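Note that caret's default grid held degree constant at 1, so only additive terms were considered. As a sketch, second-order interactions could be explored with an explicit grid, mirroring the approach used later for the chemical manufacturing data; marsGrid and marsModel2 are illustrative names:
# Sketch: tune both the number of retained terms and the interaction degree
marsGrid <- expand.grid(degree = 1:2, nprune = 2:20)
marsModel2 <- train(x = trainingData$x,
                    y = trainingData$y,
                    method = "earth",
                    tuneGrid = marsGrid,
                    preProc = c("center", "scale"))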
Random Forest
# Tune Random Forest
rfModel <- train(x = trainingData$x,
y = trainingData$y,
method = "rf",
tuneLength = 10,
preProc = c("center", "scale"))
## note: only 9 unique complexity parameters in default grid. Truncating the grid to 9 .
rfModel
## Random Forest
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 2.921880 0.7820552 2.427825
## 3 2.714851 0.7854612 2.252894
## 4 2.634192 0.7752214 2.184914
## 5 2.602575 0.7647143 2.149024
## 6 2.596046 0.7553125 2.139150
## 7 2.603531 0.7450543 2.144592
## 8 2.620741 0.7360489 2.152775
## 9 2.631700 0.7298234 2.157534
## 10 2.646274 0.7238240 2.168685
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 6.
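As a quick check (a sketch, not run here), the random forest importance ranking can be compared against the MARS ranking above:
# Compare the random forest importance ranking with the MARS ranking
varImp(rfModel)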
# Compare all models
results <- resamples(list(knn=knnModel, mars=marsModel, rf=rfModel))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: knn, mars, rf
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## knn 2.049281 2.507832 2.614543 2.567787 2.715750 2.855385 0
## mars 1.148826 1.285676 1.389701 1.386944 1.505222 1.607913 0
## rf 1.590189 1.961529 2.127290 2.139150 2.296508 2.504643 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## knn 2.677712 3.076907 3.205878 3.183130 3.358475 3.505082 0
## mars 1.471054 1.653905 1.813675 1.787676 1.899500 2.241137 0
## rf 2.037460 2.383600 2.603670 2.596046 2.794775 2.992339 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## knn 0.4872237 0.6024717 0.6407126 0.6425367 0.6733349 0.7549410 0
## mars 0.8179101 0.8520345 0.8715922 0.8711960 0.8971392 0.9214815 0
## rf 0.5084187 0.7439288 0.7639507 0.7553125 0.7895671 0.8627742 0
bwplot(results)
Based on the bootstrap resampling results (25 resamples), MARS (Multivariate Adaptive Regression Splines) outperforms both Random Forest (RF) and k-NN on Friedman's benchmark dataset, achieving the lowest prediction errors (mean MAE = 1.39, RMSE = 1.79) and the highest explained variance (R^2 = 0.87). MARS excels because it directly models the nonlinear, additive structure of the simulation, while RF (mean R^2 = 0.76) and k-NN (mean R^2 = 0.64) are less precise. Additionally, MARS correctly prioritizes the informative predictors (X1–X5) over the noise variables (X6–X10), as confirmed by the variable importance analysis. For this problem, MARS is the best choice for both accuracy and interpretability, followed by RF, while k-NN trails substantially.
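The resampling ranking can be confirmed on the large simulated test set, just as was done for k-NN above; a minimal sketch (marsTestPred and rfTestPred are illustrative names):
# Held-out test performance for MARS and random forest
marsTestPred <- predict(marsModel, newdata = testData$x)
rfTestPred <- predict(rfModel, newdata = testData$x)
postResample(pred = marsTestPred, obs = testData$y)
postResample(pred = rfTestPred, obs = testData$y)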
# Load libraries
library(AppliedPredictiveModeling)
library(caret)
library(gbm)
library(earth) # For MARS
library(randomForest) # For Random Forest
library(kernlab) # For SVM
library(glmnet) # For Ridge/Lasso (comparison)
# Load and impute data (same as 6.3)
data(ChemicalManufacturingProcess)
processed <- preProcess(ChemicalManufacturingProcess,
method = c("knnImpute", "center", "scale", "nzv"))
imputedData <- predict(processed, ChemicalManufacturingProcess)
# Split data (80% train, 20% test)
set.seed(1225)
trainIndex <- createDataPartition(imputedData$Yield, p = 0.8, list = FALSE)
trainData <- imputedData[trainIndex, ]
testData <- imputedData[-trainIndex, ]
# Prepare x/y for models
x_train <- model.matrix(Yield ~ ., trainData)[, -1] # Remove intercept
y_train <- trainData$Yield
x_test <- model.matrix(Yield ~ ., testData)[, -1]
y_test <- testData$Yield
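Before fitting, it is worth a quick sanity check that the imputation left no missing values and that the train and test matrices have matching columns; a minimal sketch:
# Sanity checks: matching columns, no remaining NAs
dim(x_train); dim(x_test)
stopifnot(identical(colnames(x_train), colnames(x_test)), !anyNA(x_train))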
MARS
set.seed(111)
marsModel <- train(
x = x_train, y = y_train,
method = "earth",
tuneGrid = expand.grid(degree = 1:2, nprune = 10:20), # Tune interaction depth & terms
trControl = trainControl(method = "cv", number = 10),
metric = "Rsquared"
)
# Predict and evaluate
marsPred <- predict(marsModel, x_test)
postResample(marsPred, y_test) # Test RMSE, R²
## RMSE Rsquared MAE
## 0.5904763 0.7159729 0.4774057
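A quick visual diagnostic (a sketch) is to plot observed against predicted yield on the test set:
# Observed vs. predicted yield; points near the 45-degree line
# indicate accurate predictions
plot(y_test, marsPred, xlab = "Observed Yield", ylab = "Predicted Yield")
abline(0, 1, col = "red")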
Random Forest
set.seed(222)
rfModel <- train(
x = x_train, y = y_train,
method = "rf",
tuneLength = 10, # Auto-tune mtry (number of variables per split)
trControl = trainControl(method = "cv", number = 10),
metric = "Rsquared"
)
# Predict and evaluate
rfPred <- predict(rfModel, x_test)
postResample(rfPred, y_test)
## RMSE Rsquared MAE
## 0.7390801 0.6233859 0.5623155
SVM with Radial Kernel
set.seed(333)
svmModel <- train(
x = x_train, y = y_train,
method = "svmRadial",
tuneLength = 10, # Auto-tune sigma and C
trControl = trainControl(method = "cv", number = 10),
metric = "Rsquared"
)
# Predict and evaluate
svmPred <- predict(svmModel, x_test)
postResample(svmPred, y_test)
## RMSE Rsquared MAE
## 0.7091104 0.6057279 0.5225374
Gradient Boosting Machines
set.seed(444)
gbmModel <- train(
x = x_train, y = y_train,
method = "gbm",
tuneLength = 10, # Auto-tune interaction depth, shrinkage, n.trees
trControl = trainControl(method = "cv", number = 10),
metric = "Rsquared",
verbose = FALSE
)
# Predict and evaluate
gbmPred <- predict(gbmModel, x_test)
postResample(gbmPred, y_test)
## RMSE Rsquared MAE
## 0.6402972 0.6920954 0.4432237
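The GBM grid tunes n.trees, interaction.depth, and shrinkage simultaneously; the winning combination and the tuning profile can be inspected (a sketch):
# Inspect the selected GBM hyperparameters and the tuning profile
gbmModel$bestTune
plot(gbmModel)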
# Collect all models
models <- list(
MARS = marsModel,
RandomForest = rfModel,
SVM = svmModel,
GBM = gbmModel
)
# Resample results
resamples <- resamples(models)
summary(resamples)
##
## Call:
## summary.resamples(object = resamples)
##
## Models: MARS, RandomForest, SVM, GBM
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.2865557 0.4063530 0.4771704 0.4936886 0.5350459 0.7406345 0
## RandomForest 0.3040437 0.4572309 0.4951039 0.4819018 0.5258466 0.5793859 0
## SVM 0.3374369 0.4115908 0.4810048 0.4839656 0.5375805 0.6371607 0
## GBM 0.3102461 0.3853619 0.4371421 0.4605217 0.5469722 0.6559649 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.3703093 0.4711157 0.5727605 0.6193414 0.6788472 0.9838125 0
## RandomForest 0.4029738 0.5648227 0.6137333 0.6035744 0.6684476 0.7213218 0
## SVM 0.4303769 0.5267674 0.6088384 0.5980011 0.6827474 0.7556912 0
## GBM 0.4453346 0.4846367 0.5524158 0.5715051 0.6319819 0.7702760 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.2919032 0.5935036 0.7102777 0.6445895 0.7690368 0.8657793 0
## RandomForest 0.3632040 0.5969392 0.6546133 0.6568381 0.7509035 0.8746550 0
## SVM 0.3025769 0.5803465 0.6344732 0.6283128 0.6958743 0.8206000 0
## GBM 0.5556341 0.6194403 0.6544292 0.6601056 0.6828267 0.7859701 0
# Plot performance
bwplot(resamples, metric = "Rsquared") # Compare R^2
dotplot(resamples, metric = "RMSE") # Compare RMSE
Based on the resampling results, MARS emerges as the best-performing model, achieving the highest median R^2 (0.71) and strong predictive accuracy (median RMSE = 0.573, MAE = 0.477); it also posts the best held-out test-set performance (RMSE = 0.590, R^2 = 0.716). Its ability to model nonlinear relationships and interactions while remaining interpretable makes it well suited to optimizing the manufacturing process and diagnosing yield issues. Random Forest and GBM also perform well, with GBM showing the lowest median resampled RMSE (0.552), making it a viable alternative for error-sensitive applications. SVM trails the other models and is less recommended here. For most scenarios, MARS strikes the best balance between accuracy and interpretability, though GBM may be preferred when minimizing prediction error is critical; further tuning or ensemble approaches could improve results.
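Since each model's test-set predictions were already computed above, the held-out metrics can be tabulated in one place; a minimal sketch (testPerf is an illustrative name):
# Held-out test-set performance for all four models
testPerf <- rbind(
  MARS = postResample(marsPred, y_test),
  RandomForest = postResample(rfPred, y_test),
  SVM = postResample(svmPred, y_test),
  GBM = postResample(gbmPred, y_test)
)
testPerf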
Which predictors are most important in the optimal nonlinear regression model?
# MARS variable importance
varImp_mars <- varImp(marsModel)
varImp_mars
## earth variable importance
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 58.12
## ManufacturingProcess09 58.12
## ManufacturingProcess28 37.10
## ManufacturingProcess17 20.73
## ManufacturingProcess39 14.50
## ManufacturingProcess06 10.23
## ManufacturingProcess22 6.27
## BiologicalMaterial09 0.00
plot(varImp_mars, top = 10, main = "Top 10 Predictors (MARS)")
Do either the biological or process variables dominate the list?
Process variables clearly dominate: eight of the nine predictors listed are manufacturing process variables (led by ManufacturingProcess32), while the lone biological variable, BiologicalMaterial09, sits at the bottom of the list with zero importance.
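This can be verified programmatically; a sketch counting variable types among the ranked predictors, using the name prefixes in the data:
# Count process vs. biological variables in the importance ranking
imp_names <- rownames(varImp_mars$importance)
table(ifelse(grepl("^Manufacturing", imp_names), "Process", "Biological"))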
How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
The top ten predictors from MARS and from the optimal linear model share two key variables: ManufacturingProcess32 and ManufacturingProcess17. Notably, ManufacturingProcess32 is the most important predictor in both models. Each list also includes one biological variable, though a different one in each case.
Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model.
library(ggplot2) # needed for the plots below
# Extract the top 10 predictor names, ordered by importance
# (the rownames of varImp()$importance are not guaranteed to be sorted)
imp <- varImp_mars$importance
top_mars_predictors <- rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:10]
top_mars_predictors <- top_mars_predictors[!is.na(top_mars_predictors)] # Drop NAs if fewer than 10 rows
# Compare with the linear model's top predictors
# (replace with the actual top predictors from the Exercise 6.3 linear model)
top_linear_predictors <- c("ManufacturingProcess32", "ManufacturingProcess17") # Example
unique_mars_predictors <- setdiff(top_mars_predictors, top_linear_predictors)
# Check if any predictors are left to plot
if (length(unique_mars_predictors) > 0) {
# Plot relationships
for (predictor in unique_mars_predictors) {
if (predictor %in% colnames(trainData)) { # Verify the column exists
p <- ggplot(trainData, aes(x = .data[[predictor]], y = Yield)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "loess", se = FALSE, color = "red") +
labs(title = paste("Yield vs", predictor)) +
theme_minimal()
print(p)
} else {
warning(paste("Predictor", predictor, "not found in trainData."))
}
}
} else {
message("No unique predictors found in MARS compared to the linear model.")
}
Several of the process variables exhibit nonlinear relationships with yield. For instance, some trace an inverted-U shape, indicating that yield peaks within an intermediate operating range of the process parameter. Others show threshold effects, where yield drops off beyond certain predictor values. The lone biological variable among these predictors shows a nearly saturating relationship: yield improves as the value increases up to a point, after which returns diminish.
Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
Yes. Most of the predictors unique to the nonlinear model are process variables, confirming their central role in optimizing yield. The biological variables appear to cap the potential gains: their effect on yield tends to plateau beyond a certain input level, so further improvement must come from the process settings.