library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks randomForest::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:
$$y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)$$
where the x values are random variables uniformly distributed on [0, 1] (the simulation also creates five additional non-informative predictors). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
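As a sanity check on the equation above, here is a minimal hand-rolled sketch of the same simulation (ten uniform predictors, only the first five of which enter the response); the function name and the sample size in the call are illustrative choices, not objects used elsewhere in this write-up.
# Illustrative only: hand-coded Friedman #1 simulation mirroring the equation above
simulateFriedman1 <- function(n, sigma = 1) {
  x <- matrix(runif(n * 10), ncol = 10)       # ten U(0, 1) predictors
  y <- 10 * sin(pi * x[, 1] * x[, 2]) +       # interaction term
    20 * (x[, 3] - 0.5)^2 +                   # quadratic term
    10 * x[, 4] + 5 * x[, 5] +                # linear terms
    rnorm(n, sd = sigma)                      # Gaussian noise
  list(x = data.frame(x), y = y)
}
str(simulateFriedman1(5))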
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
featurePlot(trainingData$x, trainingData$y)
set.seed(200)
knnModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10
)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.654912 0.4779838 2.958475
## 7 3.529432 0.5118581 2.861742
## 9 3.446330 0.5425096 2.780756
## 11 3.378049 0.5723793 2.719410
## 13 3.332339 0.5953773 2.692863
## 15 3.309235 0.6111389 2.663046
## 17 3.317408 0.6201421 2.678898
## 19 3.311667 0.6333800 2.682098
## 21 3.316340 0.6407537 2.688887
## 23 3.326040 0.6491480 2.705915
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
set.seed(200)
marsModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "earth",
tuneLength = 10
)
marsModel
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 4.447045 0.2249607 3.650128
## 3 3.744821 0.4546610 3.019175
## 4 2.828643 0.6892908 2.244131
## 6 2.406670 0.7747079 1.906733
## 7 2.027113 0.8375721 1.594956
## 9 1.800794 0.8728377 1.411703
## 10 1.810047 0.8721377 1.412023
## 12 1.831608 0.8700790 1.430044
## 13 1.839717 0.8686550 1.440537
## 15 1.856211 0.8663787 1.452430
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 9 and degree = 1.
set.seed(200)
rfModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "rf",
tuneLength = 5,
importance = TRUE
)
rfModel
## Random Forest
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 3.025902 0.7806925 2.486498
## 4 2.718930 0.7764228 2.237932
## 6 2.657900 0.7609547 2.186957
## 8 2.666144 0.7458877 2.190858
## 10 2.686343 0.7349354 2.201335
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 6.
set.seed(200)
svmModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10
)
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.635010 0.7685188 2.074977
## 0.50 2.423373 0.7839086 1.902162
## 1.00 2.284133 0.8001542 1.791776
## 2.00 2.196624 0.8126474 1.713560
## 4.00 2.143035 0.8209820 1.668024
## 8.00 2.119159 0.8246308 1.649388
## 16.00 2.117441 0.8248674 1.648573
## 32.00 2.117441 0.8248674 1.648573
## 64.00 2.117441 0.8248674 1.648573
## 128.00 2.117441 0.8248674 1.648573
##
## Tuning parameter 'sigma' was held constant at a value of 0.06299324
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06299324 and C = 16.
models <- list(
kNN = knnModel,
MARS = marsModel,
RandomForest = rfModel,
SVM = svmModel
)
testResults <- lapply(models, function(model) {
preds <- predict(model, newdata = testData$x)
postResample(pred = preds, obs = testData$y)
})
results <- do.call(rbind, testResults)
results
## RMSE Rsquared MAE
## kNN 3.175066 0.6785946 2.544317
## MARS 1.790176 0.8705315 1.371254
## RandomForest 2.432656 0.7982063 1.925944
## SVM 2.073700 0.8256573 1.575197
evimp(marsModel$finalModel)
## nsubsets gcv rss
## X1 8 100.0 100.0
## X4 7 84.0 83.4
## X2 6 66.8 65.9
## X5 5 44.6 44.2
## X3 4 33.3 33.1
## X6 1 6.6 7.7
MARS assigns high importance to X1-X5, while X6-X10 receive negligible or zero importance. On the test set, MARS achieved the lowest RMSE and MAE and the highest R^2 of the four models, and it successfully identifies the informative predictors. The optimal model used nprune = 9 with degree = 1.
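Because the default tuneLength grid held degree constant at 1, interaction terms such as sin(πx1x2) can only be approximated additively. A hedged sketch of how one could also tune the interaction degree with an explicit grid (the grid values here are illustrative, not part of the analysis above):
# Illustrative only: tune both nprune and degree for caret's "earth" method
marsGrid <- expand.grid(degree = 1:2, nprune = 2:15)
set.seed(200)
marsModel2 <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  tuneGrid = marsGrid
)
marsModel2$bestTune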
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
processPredictors <- subset(ChemicalManufacturingProcess, select = -Yield)
yield <- subset(ChemicalManufacturingProcess, select = "Yield")
set.seed(624)
trainingRows <- createDataPartition(yield$Yield, p = 0.8, list = FALSE)
trainPredictors <- processPredictors[trainingRows,]
testPredictors <- processPredictors[-trainingRows,]
trainYield <- yield[trainingRows,]
testYield <- yield[-trainingRows,]
preprocess <- preProcess(trainPredictors, method = c("medianImpute", "center", "scale"))
preprocessTrainPredictors <- predict(preprocess, trainPredictors)
preprocessTestPredictors <- predict(preprocess, testPredictors)
ctrl_rf <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
rfModel <- train(
  x = preprocessTrainPredictors,
  y = trainYield,
  method = "rf",
  trControl = ctrl_rf,
  tuneLength = 5,
  metric = "RMSE"
)
rfModel
## Random Forest
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 129, 129, 130, 130, 129, 128, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.239907 0.6103719 1.0161675
## 15 1.129110 0.6467689 0.9083443
## 29 1.124617 0.6408498 0.8996999
## 43 1.141014 0.6230428 0.9100439
## 57 1.147395 0.6132943 0.9106176
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 29.
rfModel$results[which.min(rfModel$results$RMSE), ]
## mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 3 29 1.124617 0.6408498 0.8996999 0.2772045 0.1685199 0.1805578
rfPred <- predict(rfModel, newdata = preprocessTestPredictors)
testResults <- postResample(pred = rfPred, obs = testYield)
testResults
## RMSE Rsquared MAE
## 1.2479677 0.7218810 0.8963812
resampled <- rfModel$results[which.min(rfModel$results$RMSE), ]
comparison <- data.frame(
  Data = c("Training (CV)", "Test"),
  RMSE = c(resampled$RMSE, testResults["RMSE"]),
  Rsquared = c(resampled$Rsquared, testResults["Rsquared"])
)
comparison
## Data RMSE Rsquared
## Training (CV) 1.124617 0.6408498
## RMSE Test 1.247968 0.7218810
varImpRF <- varImp(rfModel)
plot(varImpRF, top = 15)
topVars <- varImpRF$importance %>%
  arrange(desc(Overall)) %>%
  head(10) %>%
  rownames()
topVars
## [1] "ManufacturingProcess32" "BiologicalMaterial06" "BiologicalMaterial12"
## [4] "BiologicalMaterial03" "ManufacturingProcess31" "ManufacturingProcess13"
## [7] "ManufacturingProcess09" "ManufacturingProcess06" "BiologicalMaterial02"
## [10] "BiologicalMaterial11"
bio_predictors <- colnames(processPredictors)[1:12]
process_predictors <- colnames(processPredictors)[13:57]
data.frame(
  Predictor = topVars,
  Type = ifelse(topVars %in% bio_predictors, "Biological", "Process")
)
## Predictor Type
## 1 ManufacturingProcess32 Process
## 2 BiologicalMaterial06 Biological
## 3 BiologicalMaterial12 Biological
## 4 BiologicalMaterial03 Biological
## 5 ManufacturingProcess31 Process
## 6 ManufacturingProcess13 Process
## 7 ManufacturingProcess09 Process
## 8 ManufacturingProcess06 Process
## 9 BiologicalMaterial02 Biological
## 10 BiologicalMaterial11 Biological
library(tidyr)
topData <- processPredictors[, topVars]
yield <- unlist(yield)
topData$Yield <- as.numeric(yield)
top6 <- topVars[1:6]
topData_long <- topData %>%
  pivot_longer(
    cols = all_of(top6),
    names_to = "Predictor",
    values_to = "Value"
  )
ggplot(topData_long, aes(x = Value, y = Yield)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_smooth(method = "loess", color = "red", se = FALSE) +
  facet_wrap(~ Predictor, scales = "free", ncol = 3) +
  labs(
    title = "Relationships Between Top Predictors and Product Yield",
    x = "Predictor Value",
    y = "Percent Yield"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
ctrl <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 3
)
## kNN
knnModel <- train(
x = preprocessTrainPredictors,
y = trainYield,
method = "knn",
trControl = ctrl,
tuneLength = 10,
metric = "RMSE"
)
## MARS
marsModel <- train(
x = preprocessTrainPredictors,
y = trainYield,
method = "earth",
trControl = ctrl,
tuneLength = 10,
metric = "RMSE"
)
## Random Forest
rfModel <- train(
x = preprocessTrainPredictors,
y = trainYield,
method = "rf",
trControl = ctrl,
tuneLength = 5,
metric = "RMSE"
)
## SVM (Radial)
svmModel <- train(
x = preprocessTrainPredictors,
y = trainYield,
method = "svmRadial",
trControl = ctrl,
tuneLength = 10,
metric = "RMSE"
)
models <- list(
kNN = knnModel,
MARS = marsModel,
RandomForest = rfModel,
SVM = svmModel
)
testResults <- lapply(models, function(model) {
preds <- predict(model, preprocessTestPredictors)
postResample(pred = preds, obs = testYield)
})
results <- do.call(rbind, testResults)
results
## RMSE Rsquared MAE
## kNN 1.465860 0.5706660 1.1121875
## MARS 1.282667 0.6705045 1.0064775
## RandomForest 1.246560 0.7289408 0.8991363
## SVM 1.175228 0.7410426 0.8197911
varImpRF <- varImp(rfModel)
plot(varImpRF, top = 15)
top10_rf <- varImpRF$importance %>%
arrange(desc(Overall)) %>%
head(10) %>%
rownames()
top10_rf
## [1] "ManufacturingProcess32" "BiologicalMaterial06" "BiologicalMaterial12"
## [4] "BiologicalMaterial03" "ManufacturingProcess31" "ManufacturingProcess13"
## [7] "ManufacturingProcess09" "ManufacturingProcess36" "BiologicalMaterial02"
## [10] "ManufacturingProcess06"
bio_predictors <- colnames(processPredictors)[1:12]
process_predictors <- colnames(processPredictors)[13:57]
data.frame(
Predictor = top10_rf,
Type = ifelse(top10_rf %in% bio_predictors, "Biological", "Process")
)
## Predictor Type
## 1 ManufacturingProcess32 Process
## 2 BiologicalMaterial06 Biological
## 3 BiologicalMaterial12 Biological
## 4 BiologicalMaterial03 Biological
## 5 ManufacturingProcess31 Process
## 6 ManufacturingProcess13 Process
## 7 ManufacturingProcess09 Process
## 8 ManufacturingProcess36 Process
## 9 BiologicalMaterial02 Biological
## 10 ManufacturingProcess06 Process
library(tidyr)
library(ggplot2)
topData <- processPredictors[, top10_rf]
topData$Yield <- yield  # yield was converted to a numeric vector earlier
rf_unique <- setdiff(top10_rf, topVars) # topVars from Exercise 6.3
rf_unique
## [1] "ManufacturingProcess36"
topData_long <- topData %>%
pivot_longer(
cols = all_of(rf_unique),
names_to = "Predictor",
values_to = "Value"
)
ggplot(topData_long, aes(x = Value, y = Yield)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess", se = FALSE, color = "red") +
facet_wrap(~ Predictor, scales = "free", ncol = 3) +
labs(
title = "Nonlinear Relationships Between Unique RF Predictors and Yield",
x = "Predictor Value",
y = "Percent Yield"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 0.019
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 0.001
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 1e-06
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
Part a: Among the nonlinear regression models, the SVM with a radial basis function kernel gives the best performance on the test set, achieving the lowest RMSE, the highest R^2, and the lowest MAE.
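Beyond the single hold-out comparison above, caret's resamples() can also summarize the cross-validation distributions of all four models. A minimal sketch (ideally each model would be trained with the same seed so the resampling indices match; these calls are illustrative and not part of the results reported above):
# Illustrative only: compare resampling performance across the fitted models
resamps <- resamples(models)
summary(resamps)
bwplot(resamps, metric = "RMSE")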
Part b:
Top 10 Predictors (from the random forest importance above):
ManufacturingProcess32
BiologicalMaterial06
BiologicalMaterial12
BiologicalMaterial03
ManufacturingProcess31
ManufacturingProcess13
ManufacturingProcess09
ManufacturingProcess36
BiologicalMaterial02
ManufacturingProcess06
Biological vs. Process Predictors
Among the top ten predictors, 6 are manufacturing process variables and 4 are biological material variables (a quick tally is sketched below). This indicates that both controllable manufacturing conditions and raw material quality contribute substantially to final product yield, with process variables slightly dominating.
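A minimal sketch of that tally, reusing the top10_rf and bio_predictors objects defined above:
# Count biological vs. process variables among the RF top ten
table(ifelse(top10_rf %in% bio_predictors, "Biological", "Process"))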
The importance rankings suggest that yield is jointly influenced by biological material properties and process control variables. While process variables remain critical because they can be directly manipulated, the nonlinear model reveals that biological material characteristics play a more complex and previously underappreciated role in determining yield.
Compared to the optimal linear model, the nonlinear approach uncovers additional influential biological predictors, indicating nonlinear relationships between raw material properties and product yield that are not captured by linear regression.
Part c:
The nonlinear relationship plots provide meaningful insight into the manufacturing process. Predictors unique to the nonlinear model show curved, threshold-based relationships with yield, which explains why they were not selected by linear regression. This emphasizes the value of nonlinear modeling for identifying operational regimes that maximize yield.
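One way to probe such threshold effects further would be a partial dependence plot for the top process variable. A minimal sketch, assuming the pdp package is installed and can recover the training data from the caret object (pdp is not used elsewhere in this write-up):
# Illustrative only: partial dependence of predicted yield on ManufacturingProcess32
library(pdp)
pdp::partial(rfModel, pred.var = "ManufacturingProcess32", plot = TRUE)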