# Load necessary libraries
library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(earth)
## Warning: package 'earth' was built under R version 4.3.3
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.3.3
## Loading required package: plotrix
library(e1071)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
7.2. Friedman (1991) introduced several benchmark data sets create by simulation. One of these simulations used the following nonlinear equation to create data: y = 10 sin(πx1x2) + 20(x3 − 0.5)2 + 10x4 + 5x5 + N(0, σ2) where the x values are random variables uniformly distributed between [0, 1] (there are also 5 other non-informative variables also created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Tune several models on these data. For example:
library(caret)
set.seed(921)
# Train the KNN model
knnModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10
)
200 samples 10 predictors Pre-processing: centered, scaled Resampling: Bootstrap (25 reps) Summary of sample sizes: 200, 200, 200, 200, 200, 200, … Resampling results across tuning parameters: k RMSE Rsquared RMSE SD Rsquared SD 5 3.51 0.496 0.238 0.0641 7 3.36 0.536 0.24 0.0617 9 3.3 0.559 0.251 0.0546 11 3.24 0.586 0.252 0.0501 13 3.2 0.61 0.234 0.0465 15 3.19 0.623 0.264 0.0496 17 3.19 0.63 0.286 0.0528 19 3.18 0.643 0.274 0.048 21 3.2 0.646 0.269 0.0464 23 3.2 0.652 0.267 0.0465 RMSE was used to select the optimal model using the smallest value. The final value used for the model was k = 19.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## perforamnce values
postResample
## function (pred, obs)
## {
## isNA <- is.na(pred)
## pred <- pred[!isNA]
## obs <- obs[!isNA]
## if (!is.factor(obs) && is.numeric(obs)) {
## if (length(obs) + length(pred) == 0) {
## out <- rep(NA, 3)
## }
## else {
## if (length(unique(pred)) < 2 || length(unique(obs)) <
## 2) {
## resamplCor <- NA
## }
## else {
## resamplCor <- try(cor(pred, obs, use = "pairwise.complete.obs"),
## silent = TRUE)
## if (inherits(resamplCor, "try-error"))
## resamplCor <- NA
## }
## mse <- mean((pred - obs)^2)
## mae <- mean(abs(pred - obs))
## out <- c(sqrt(mse), resamplCor^2, mae)
## }
## names(out) <- c("RMSE", "Rsquared", "MAE")
## }
## else {
## if (length(obs) + length(pred) == 0) {
## out <- rep(NA, 2)
## }
## else {
## pred <- factor(pred, levels = levels(obs))
## requireNamespaceQuietStop("e1071")
## out <- unlist(e1071::classAgreement(table(obs, pred)))[c("diag",
## "kappa")]
## }
## names(out) <- c("Accuracy", "Kappa")
## }
## if (any(is.nan(out)))
## out[is.nan(out)] <- NA
## out
## }
## <bytecode: 0x7f9b5cdc4388>
## <environment: namespace:caret>
knnPred <- predict(knnModel, newdata = testData\(x) ## The function 'postResample' can be used to get the test set ## perforamnce values postResample(pred = knnPred, obs = testData\)y) RMSE Rsquared 3.2286834 0.6871735 Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?
# Set seed for reproducibility
set.seed(1234)
# Generate training data
trainingdata <- mlbench.friedman1(200, sd = 1)
trainingdata$x <- data.frame(trainingdata$x)
# Generate test data for true error estimation
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
# Model 1: k-Nearest Neighbors
set.seed(921)
knnModel <- train(
x = trainingdata$x,
y = trainingdata$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10
)
knnPred <- predict(knnModel, newdata = testData$x)
knnPerformance <- postResample(pred = knnPred, obs = testData$y)
# Model 2: MARS (Multivariate Adaptive Regression Splines)
marsGrid <- expand.grid(degree = 1:2, nprune = seq(2, 14, by = 2))
set.seed(921)
marsModel <- train(
x = trainingdata$x,
y = trainingdata$y,
method = "earth",
preProc = c("center", "scale"),
tuneGrid = marsGrid
)
marsPred <- predict(marsModel, newdata = testData$x)
marsPerformance <- postResample(pred = marsPred, obs = testData$y)
# Model 3: SVM with Radial Basis Function
set.seed(921)
svmRModel <- train(
x = trainingdata$x,
y = trainingdata$y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 8
)
svmRPred <- predict(svmRModel, newdata = testData$x)
svmRPerformance <- postResample(pred = svmRPred, obs = testData$y)
# Compare model performances
model_performances <- data.frame(
Model = c("k-Nearest Neighbors", "MARS", "SVM Radial"),
RMSE = c(knnPerformance["RMSE"], marsPerformance["RMSE"], svmRPerformance["RMSE"]),
Rsquared = c(knnPerformance["Rsquared"], marsPerformance["Rsquared"], svmRPerformance["Rsquared"])
)
print(model_performances)
## Model RMSE Rsquared
## 1 k-Nearest Neighbors 3.190236 0.6569874
## 2 MARS 1.311784 0.9310091
## 3 SVM Radial 1.946502 0.8499443
# Variable Importance for MARS Model
mars_importance <- varImp(marsModel)
print(mars_importance)
## earth variable importance
##
## Overall
## X4 100.00
## X1 71.79
## X2 48.01
## X5 30.55
## X3 0.00
# Set seed for reproducibility
set.seed(1234)
# Generate training data
trainingdata <- mlbench.friedman1(200, sd = 1)
trainingdata$x <- data.frame(trainingdata$x)
# Generate test data for true error estimation
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
# Fit the MARS model using training data
marsFit <- earth(
x = trainingdata$x, # Use the correct dataset
y = trainingdata$y, # Use the corresponding response variable
nprune = 12,
degree = 2
)
# Print model summary
summary(marsFit)
## Call: earth(x=trainingdata$x, y=trainingdata$y, degree=2, nprune=12)
##
## coefficients
## (Intercept) 12.876509
## h(0.431016-X1) -21.579356
## h(X1-0.431016) 5.174728
## h(0.407251-X2) -20.621542
## h(X2-0.407251) 6.206719
## h(0.572205-X3) 7.572037
## h(X3-0.572205) 10.965334
## h(0.540035-X4) -9.582660
## h(X4-0.540035) 9.760638
## h(X5-0.176527) 5.055725
## h(0.318446-X1) * h(0.407251-X2) 101.701936
## h(X1-0.431016) * h(X2-0.407251) -26.002933
##
## Selected 12 of 20 terms, and 5 of 10 predictors (nprune=12)
## Termination condition: Reached nk 21
## Importance: X4, X1, X2, X5, X3, X6-unused, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 9 2
## GCV 1.850052 RSS 272.071 GRSq 0.931694 RSq 0.9492681
The MARS model appears to give the best performance since it has the lowest RMSE and the highest 𝑅2. The second best would be SVM radial.
# Simulate training and test datasets
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
# Feature visualization (optional)
featurePlot(trainingData$x, trainingData$y)
# Train KNN model
set.seed(200)
knnModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10
)
# Display model results
print(knnModel)
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.654912 0.4779838 2.958475
## 7 3.529432 0.5118581 2.861742
## 9 3.446330 0.5425096 2.780756
## 11 3.378049 0.5723793 2.719410
## 13 3.332339 0.5953773 2.692863
## 15 3.309235 0.6111389 2.663046
## 17 3.317408 0.6201421 2.678898
## 19 3.311667 0.6333800 2.682098
## 21 3.316340 0.6407537 2.688887
## 23 3.326040 0.6491480 2.705915
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
# Predict on test data
knnPred <- predict(knnModel, newdata = testData$x)
# Evaluate test set performance
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.1750657 0.6785946 2.5443169
# Optional: Train other models such as MARS to check performance
marsModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "earth",
preProc = c("center", "scale"),
tuneLength = 10
)
print(marsModel)
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 4.444975 0.2202864 3.661237
## 3 3.683032 0.4666704 2.968899
## 4 2.751215 0.6962266 2.197465
## 6 2.346335 0.7775830 1.863515
## 7 1.995584 0.8400185 1.571659
## 9 1.842941 0.8653498 1.431858
## 10 1.854786 0.8647576 1.442683
## 12 1.850893 0.8649865 1.438274
## 13 1.856070 0.8643466 1.441230
## 15 1.848205 0.8653756 1.432817
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 9 and degree = 1.
marsPred <- predict(marsModel, newdata = testData$x)
postResample(pred = marsPred, obs = testData$y)
## RMSE Rsquared MAE
## 1.7901760 0.8705315 1.3712537
7.5. Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models. (a) Which nonlinear regression model gives the optimal resampling and test set performance?
If we look at the RMSE values from the code below, we can see that the SVM model gives the optimal test performance.
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
# Data imputation and splitting
set.seed(100)
preProcValues <- preProcess(ChemicalManufacturingProcess, method = c("knnImpute"))
processedData <- predict(preProcValues, ChemicalManufacturingProcess)
# Split data
set.seed(200)
trainIndex <- createDataPartition(processedData$Yield, p = 0.8, list = FALSE)
trainData <- processedData[trainIndex, ]
testData <- processedData[-trainIndex, ]
# Train nonlinear regression models
set.seed(200)
nonlinearModel <- train(
Yield ~ .,
data = trainData,
method = "svmRadial", # Example: SVM with radial kernel
preProc = c("center", "scale"),
tuneLength = 10
)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
# Evaluate model performance
nonlinearPred <- predict(nonlinearModel, testData)
postResample(pred = nonlinearPred, obs = testData$Yield)
## RMSE Rsquared MAE
## 0.7275070 0.6138613 0.5442017
# Variable importance
varImp(nonlinearModel)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 87.81
## ManufacturingProcess13 78.23
## BiologicalMaterial03 76.45
## BiologicalMaterial12 69.16
## ManufacturingProcess17 68.44
## ManufacturingProcess36 67.83
## ManufacturingProcess31 67.82
## ManufacturingProcess09 64.12
## ManufacturingProcess06 60.80
## ManufacturingProcess29 53.56
## BiologicalMaterial02 53.50
## ManufacturingProcess11 50.24
## BiologicalMaterial11 48.66
## ManufacturingProcess33 46.88
## ManufacturingProcess30 45.42
## BiologicalMaterial09 38.34
## BiologicalMaterial04 37.89
## BiologicalMaterial08 36.89
## ManufacturingProcess12 36.65
# Compare top 10 predictors to linear models
set.seed(200)
linearModel <- train(
Yield ~ .,
data = trainData,
method = "lm",
preProc = c("center", "scale")
)
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
linearVarImp <- varImp(linearModel)
linearVarImp$importance %>%
arrange(desc(Overall)) %>%
head(10)
## Overall
## ManufacturingProcess32 100.00000
## ManufacturingProcess28 73.80025
## ManufacturingProcess33 68.88580
## ManufacturingProcess04 52.23883
## BiologicalMaterial07 43.31018
## ManufacturingProcess45 40.99224
## ManufacturingProcess37 39.86323
## BiologicalMaterial02 35.86519
## ManufacturingProcess12 35.68715
## BiologicalMaterial11 34.54960
# Evaluate performance on the test set for both models
svmPred <- predict(nonlinearModel, newdata = testData)
svmPerformance <- postResample(pred = svmPred, obs = testData$Yield)
linearPred <- predict(linearModel, newdata = testData)
linearPerformance <- postResample(pred = linearPred, obs = testData$Yield)
# Combine performance metrics into a data frame for plotting
performanceDF <- data.frame(
Model = c("SVM", "Linear"),
RMSE = c(svmPerformance["RMSE"], linearPerformance["RMSE"]),
Rsquared = c(svmPerformance["Rsquared"], linearPerformance["Rsquared"])
)
# Plot RMSE and Rsquared for comparison
ggplot(performanceDF, aes(x = Model, y = RMSE, fill = Model)) +
geom_bar(stat = "identity") +
labs(title = "Model Performance Comparison", y = "RMSE") +
theme_minimal()
ggplot(performanceDF, aes(x = Model, y = Rsquared, fill = Model)) +
geom_bar(stat = "identity") +
labs(title = "Model Performance Comparison (R-squared)", y = "R-squared") +
theme_minimal()
# Extract and arrange top 10 predictors for the nonlinear model (SVM)
svmVarImp <- varImp(nonlinearModel)
svmVarImp <- cbind(rownames(svmVarImp$importance), data.frame(svmVarImp$importance, row.names = NULL))
colnames(svmVarImp) <- c("Predictor", "Overall")
svmVarImp <- svmVarImp %>% arrange(desc(Overall)) %>% head(10)
# Extract and arrange top 10 predictors for the linear model
linearVarImp <- varImp(linearModel)
linearVarImp <- cbind(rownames(linearVarImp$importance), data.frame(linearVarImp$importance, row.names = NULL))
colnames(linearVarImp) <- c("Predictor", "Overall")
linearVarImp <- linearVarImp %>% arrange(desc(Overall)) %>% head(10)
# Combine both models' variable importance for comparison
combinedVarImp <- rbind(
data.frame(Model = "SVM", Predictor = svmVarImp$Predictor, Importance = svmVarImp$Overall),
data.frame(Model = "Linear", Predictor = linearVarImp$Predictor, Importance = linearVarImp$Overall)
)
# Plot variable importance comparison
ggplot(combinedVarImp, aes(x = reorder(Predictor, Importance), y = Importance, fill = Model)) +
geom_bar(stat = "identity", position = "dodge") +
coord_flip() +
labs(title = "Top 10 Predictors - SVM vs Linear Model", x = "Predictor", y = "Importance") +
theme_minimal()
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
# Preprocess the data with multiple methods including BoxCox transformation
preProcess <- preProcess(ChemicalManufacturingProcess,
method = c("BoxCox", "knnImpute", "center", "scale"))
predPreProcess <- predict(preProcess, ChemicalManufacturingProcess)
predPreProcess$Yield = ChemicalManufacturingProcess$Yield
# Split the data into training and testing sets
set.seed(200)
ind <- sample(seq_len(nrow(predPreProcess)), size = floor(0.85 * nrow(predPreProcess)))
train <- predPreProcess[ind, ]
test <- predPreProcess[-ind, ]
# Train a nonlinear regression model (SVM as an example)
svmModel <- train(Yield ~ ., data = train,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 15)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
# Evaluate the model's performance on the test data
svmPred <- predict(svmModel, newdata = test)
postResample(pred = svmPred, obs = test$Yield)
## RMSE Rsquared MAE
## 1.4152575 0.6243845 1.2208405
# Extract and sort the top 10 predictors from the SVM model
topPredictors <- names(sort(varImp(svmModel)$importance[, 1], decreasing = TRUE)[1:10])
# Ensure that topPredictors are valid columns in the training data
validPredictors <- topPredictors[topPredictors %in% colnames(train)]
# Feature plot for visualizing relationships between the top predictors and Yield
varI <- varImp(svmModel)$importance
varI <- cbind(rownames(varI), data.frame(varI, row.names = NULL))
colnames(varI) <- c("Predictor", "Overall")
varI <- varI %>% arrange(Overall) %>% tail(10) %>% select(Predictor)
variables <- as.vector(varI$Predictor)
# Top predictors and Yield Visulaiztion
featurePlot(train[, variables], train$Yield)