DATA624-HW8

# Load necessary libraries
library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(earth)
## Warning: package 'earth' was built under R version 4.3.3
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.3.3
## Loading required package: plotrix
library(e1071)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)

7.2. Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:

y = 10 sin(π x1 x2) + 20 (x3 − 0.5)^2 + 10 x4 + 5 x5 + N(0, σ^2)

where the x values are random variables uniformly distributed on [0, 1] (the simulation also creates five additional non-informative variables). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
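Written out directly in R, the generating equation looks like the short sketch below (a hand-rolled illustration for reference only; friedman1_manual is a made-up helper name, not part of mlbench, which is what the assignment actually uses):

# Hand-rolled version of the Friedman (1991) simulation, for reference only
# (friedman1_manual is an illustrative name; mlbench.friedman1 is used below)
friedman1_manual <- function(n, sd = 1) {
  x <- matrix(runif(n * 10), ncol = 10)   # ten Uniform(0, 1) predictors; X6-X10 are non-informative
  y <- 10 * sin(pi * x[, 1] * x[, 2]) +
       20 * (x[, 3] - 0.5)^2 +
       10 * x[, 4] + 5 * x[, 5] +
       rnorm(n, sd = sd)                  # the N(0, sigma^2) noise term
  list(x = data.frame(x), y = y)
}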

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)

## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

Tune several models on these data. For example:

library(caret)
set.seed(921)

# Train the KNN model
knnModel <- train(
  x = trainingData$x, 
  y = trainingData$y, 
  method = "knn", 
  preProcess = c("center", "scale"), 
  tuneLength = 10
)

200 samples
10 predictors

Pre-processing: centered, scaled
Resampling: Bootstrap (25 reps)
Summary of sample sizes: 200, 200, 200, 200, 200, 200, …
Resampling results across tuning parameters:

  k   RMSE  Rsquared  RMSE SD  Rsquared SD
   5  3.51  0.496     0.238    0.0641
   7  3.36  0.536     0.24     0.0617
   9  3.3   0.559     0.251    0.0546
  11  3.24  0.586     0.252    0.0501
  13  3.2   0.61      0.234    0.0465
  15  3.19  0.623     0.264    0.0496
  17  3.19  0.63      0.286    0.0528
  19  3.18  0.643     0.274    0.048
  21  3.2   0.646     0.269    0.0464
  23  3.2   0.652     0.267    0.0465

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was k = 19.
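The selected k and the full tuning grid can also be pulled straight from the fitted train object rather than read off the printed summary (standard caret accessors):

knnModel$bestTune    # the k chosen by the smallest bootstrap RMSE
knnModel$results     # resampled metrics for every candidate k
plot(knnModel)       # RMSE profile across the values of k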

knnPred <- predict(knnModel, newdata = testData$x)

## The function 'postResample' can be used to get the test set
## performance values; entering the name alone, as below, just prints
## the function's source
postResample
## function (pred, obs) 
## {
##     isNA <- is.na(pred)
##     pred <- pred[!isNA]
##     obs <- obs[!isNA]
##     if (!is.factor(obs) && is.numeric(obs)) {
##         if (length(obs) + length(pred) == 0) {
##             out <- rep(NA, 3)
##         }
##         else {
##             if (length(unique(pred)) < 2 || length(unique(obs)) < 
##                 2) {
##                 resamplCor <- NA
##             }
##             else {
##                 resamplCor <- try(cor(pred, obs, use = "pairwise.complete.obs"), 
##                   silent = TRUE)
##                 if (inherits(resamplCor, "try-error")) 
##                   resamplCor <- NA
##             }
##             mse <- mean((pred - obs)^2)
##             mae <- mean(abs(pred - obs))
##             out <- c(sqrt(mse), resamplCor^2, mae)
##         }
##         names(out) <- c("RMSE", "Rsquared", "MAE")
##     }
##     else {
##         if (length(obs) + length(pred) == 0) {
##             out <- rep(NA, 2)
##         }
##         else {
##             pred <- factor(pred, levels = levels(obs))
##             requireNamespaceQuietStop("e1071")
##             out <- unlist(e1071::classAgreement(table(obs, pred)))[c("diag", 
##                 "kappa")]
##         }
##         names(out) <- c("Accuracy", "Kappa")
##     }
##     if (any(is.nan(out))) 
##         out[is.nan(out)] <- NA
##     out
## }
## <bytecode: 0x7f9b5cdc4388>
## <environment: namespace:caret>

knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred, obs = testData$y)
     RMSE  Rsquared 
3.2286834 0.6871735 

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?

# Set seed for reproducibility
set.seed(1234)

# Generate training data
trainingdata <- mlbench.friedman1(200, sd = 1)
trainingdata$x <- data.frame(trainingdata$x)

# Generate test data for true error estimation
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

# Model 1: k-Nearest Neighbors
set.seed(921)
knnModel <- train(
  x = trainingdata$x,
  y = trainingdata$y,
  method = "knn",
  preProc = c("center", "scale"),
  tuneLength = 10
)
knnPred <- predict(knnModel, newdata = testData$x)
knnPerformance <- postResample(pred = knnPred, obs = testData$y)

# Model 2: MARS (Multivariate Adaptive Regression Splines)
marsGrid <- expand.grid(degree = 1:2, nprune = seq(2, 14, by = 2))
set.seed(921)
marsModel <- train(
  x = trainingdata$x,
  y = trainingdata$y,
  method = "earth",
  preProc = c("center", "scale"),
  tuneGrid = marsGrid
)
marsPred <- predict(marsModel, newdata = testData$x)
marsPerformance <- postResample(pred = marsPred, obs = testData$y)

# Model 3: SVM with Radial Basis Function
set.seed(921)
svmRModel <- train(
  x = trainingdata$x,
  y = trainingdata$y,
  method = "svmRadial",
  preProc = c("center", "scale"),
  tuneLength = 8
)
svmRPred <- predict(svmRModel, newdata = testData$x)
svmRPerformance <- postResample(pred = svmRPred, obs = testData$y)

# Compare model performances
model_performances <- data.frame(
  Model = c("k-Nearest Neighbors", "MARS", "SVM Radial"),
  RMSE = c(knnPerformance["RMSE"], marsPerformance["RMSE"], svmRPerformance["RMSE"]),
  Rsquared = c(knnPerformance["Rsquared"], marsPerformance["Rsquared"], svmRPerformance["Rsquared"])
)

print(model_performances)
##                 Model     RMSE  Rsquared
## 1 k-Nearest Neighbors 3.190236 0.6569874
## 2                MARS 1.311784 0.9310091
## 3          SVM Radial 1.946502 0.8499443
# Variable Importance for MARS Model
mars_importance <- varImp(marsModel)
print(mars_importance)
## earth variable importance
## 
##    Overall
## X4  100.00
## X1   71.79
## X2   48.01
## X5   30.55
## X3    0.00
# Set seed for reproducibility
set.seed(1234)

# Generate training data
trainingdata <- mlbench.friedman1(200, sd = 1)
trainingdata$x <- data.frame(trainingdata$x)

# Generate test data for true error estimation
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

# Fit the MARS model using training data
marsFit <- earth(
  x = trainingdata$x,  # Use the correct dataset 
  y = trainingdata$y,  # Use the corresponding response variable
  nprune = 12,
  degree = 2
)

# Print model summary
summary(marsFit)
## Call: earth(x=trainingdata$x, y=trainingdata$y, degree=2, nprune=12)
## 
##                                 coefficients
## (Intercept)                        12.876509
## h(0.431016-X1)                    -21.579356
## h(X1-0.431016)                      5.174728
## h(0.407251-X2)                    -20.621542
## h(X2-0.407251)                      6.206719
## h(0.572205-X3)                      7.572037
## h(X3-0.572205)                     10.965334
## h(0.540035-X4)                     -9.582660
## h(X4-0.540035)                      9.760638
## h(X5-0.176527)                      5.055725
## h(0.318446-X1) * h(0.407251-X2)   101.701936
## h(X1-0.431016) * h(X2-0.407251)   -26.002933
## 
## Selected 12 of 20 terms, and 5 of 10 predictors (nprune=12)
## Termination condition: Reached nk 21
## Importance: X4, X1, X2, X5, X3, X6-unused, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 9 2
## GCV 1.850052    RSS 272.071    GRSq 0.931694    RSq 0.9492681
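The fitted surface can also be inspected visually with plotmo, which is loaded as a dependency of earth (a quick sketch; the plot itself is not shown here):

# Sketch: plot each selected basis function and the fitted interaction surfaces
plotmo(marsFit)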

The MARS model gives the best performance: it has the lowest RMSE and the highest R² on the test set, with the radial-kernel SVM second best. MARS also selects the informative predictors: the variable importance ranking and the model summary above use only X1–X5, while X6–X10 are left unused.
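Beyond the single test-set numbers, the bootstrap resampling distributions of the three tuned models can be compared directly with caret's resamples() (a sketch; it assumes knnModel, marsModel, and svmRModel from the chunk above are still in scope):

# Sketch: collect and compare resampling results for the three tuned models
resamps <- resamples(list(KNN = knnModel, MARS = marsModel, SVM = svmRModel))
summary(resamps)                  # resampled RMSE / R-squared for each model
bwplot(resamps, metric = "RMSE")  # side-by-side boxplots of resampled RMSE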

# Simulate training and test datasets
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)

testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

# Feature visualization (optional)
featurePlot(trainingData$x, trainingData$y)

# Train KNN model
set.seed(200)
knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  preProc = c("center", "scale"),
  tuneLength = 10
)

# Display model results
print(knnModel)
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.654912  0.4779838  2.958475
##    7  3.529432  0.5118581  2.861742
##    9  3.446330  0.5425096  2.780756
##   11  3.378049  0.5723793  2.719410
##   13  3.332339  0.5953773  2.692863
##   15  3.309235  0.6111389  2.663046
##   17  3.317408  0.6201421  2.678898
##   19  3.311667  0.6333800  2.682098
##   21  3.316340  0.6407537  2.688887
##   23  3.326040  0.6491480  2.705915
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
# Predict on test data
knnPred <- predict(knnModel, newdata = testData$x)

# Evaluate test set performance
postResample(pred = knnPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 3.1750657 0.6785946 2.5443169
# Optional: Train other models such as MARS to check performance
marsModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  preProc = c("center", "scale"),
  tuneLength = 10
)

print(marsModel)
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE      Rsquared   MAE     
##    2      4.444975  0.2202864  3.661237
##    3      3.683032  0.4666704  2.968899
##    4      2.751215  0.6962266  2.197465
##    6      2.346335  0.7775830  1.863515
##    7      1.995584  0.8400185  1.571659
##    9      1.842941  0.8653498  1.431858
##   10      1.854786  0.8647576  1.442683
##   12      1.850893  0.8649865  1.438274
##   13      1.856070  0.8643466  1.441230
##   15      1.848205  0.8653756  1.432817
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 9 and degree = 1.
marsPred <- predict(marsModel, newdata = testData$x)
postResample(pred = marsPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 1.7901760 0.8705315 1.3712537

7.5. Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models. (a) Which nonlinear regression model gives the optimal resampling and test set performance?

If we look at the RMSE values from the code below, the radial-kernel SVM gives the best test set performance among the models fit here.

(b) Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
(c) Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)

# Data imputation and splitting
set.seed(100)
preProcValues <- preProcess(ChemicalManufacturingProcess, method = c("knnImpute"))
processedData <- predict(preProcValues, ChemicalManufacturingProcess)

# Split data
set.seed(200)
trainIndex <- createDataPartition(processedData$Yield, p = 0.8, list = FALSE)
trainData <- processedData[trainIndex, ]
testData <- processedData[-trainIndex, ]

# Train nonlinear regression models
set.seed(200)
nonlinearModel <- train(
  Yield ~ .,
  data = trainData,
  method = "svmRadial",  # Example: SVM with radial kernel
  preProc = c("center", "scale"),
  tuneLength = 10
)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## (These two warnings repeat once per bootstrap resample; only the first occurrence is shown.)
# Evaluate model performance
nonlinearPred <- predict(nonlinearModel, testData)
postResample(pred = nonlinearPred, obs = testData$Yield)
##      RMSE  Rsquared       MAE 
## 0.7275070 0.6138613 0.5442017
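The exercise asks for several nonlinear models; below is a sketch of how MARS and KNN could be tuned on the same split for comparison (illustrative only, with made-up object names marsChem and knnChem; results are not reported here):

# Sketch: tune two more nonlinear models on the same training data
set.seed(200)
marsChem <- train(Yield ~ ., data = trainData, method = "earth",
                  preProc = c("center", "scale"),
                  tuneGrid = expand.grid(degree = 1:2, nprune = seq(2, 20, by = 2)))
set.seed(200)
knnChem <- train(Yield ~ ., data = trainData, method = "knn",
                 preProc = c("center", "scale"), tuneLength = 10)
postResample(predict(marsChem, testData), testData$Yield)
postResample(predict(knnChem, testData), testData$Yield)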
# Variable importance
varImp(nonlinearModel)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## BiologicalMaterial06     87.81
## ManufacturingProcess13   78.23
## BiologicalMaterial03     76.45
## BiologicalMaterial12     69.16
## ManufacturingProcess17   68.44
## ManufacturingProcess36   67.83
## ManufacturingProcess31   67.82
## ManufacturingProcess09   64.12
## ManufacturingProcess06   60.80
## ManufacturingProcess29   53.56
## BiologicalMaterial02     53.50
## ManufacturingProcess11   50.24
## BiologicalMaterial11     48.66
## ManufacturingProcess33   46.88
## ManufacturingProcess30   45.42
## BiologicalMaterial09     38.34
## BiologicalMaterial04     37.89
## BiologicalMaterial08     36.89
## ManufacturingProcess12   36.65
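As a quick check on whether biological or process variables dominate, the top ten can be tallied from the importance table (a small sketch using the same varImp output):

# Sketch: count biological vs. process predictors among the SVM model's top 10
svmImp <- varImp(nonlinearModel)$importance
svmTop10 <- rownames(svmImp)[order(svmImp$Overall, decreasing = TRUE)][1:10]
table(ifelse(grepl("^Biological", svmTop10), "Biological", "Process"))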
# Compare the top 10 predictors to those from the optimal linear model
set.seed(200)
linearModel <- train(
  Yield ~ .,
  data = trainData,
  method = "lm",
  preProc = c("center", "scale")
)
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## (These warnings repeat once per bootstrap resample; only the first occurrence of each is shown.)
linearVarImp <- varImp(linearModel)
linearVarImp$importance %>%
  arrange(desc(Overall)) %>%
  head(10)
##                          Overall
## ManufacturingProcess32 100.00000
## ManufacturingProcess28  73.80025
## ManufacturingProcess33  68.88580
## ManufacturingProcess04  52.23883
## BiologicalMaterial07    43.31018
## ManufacturingProcess45  40.99224
## ManufacturingProcess37  39.86323
## BiologicalMaterial02    35.86519
## ManufacturingProcess12  35.68715
## BiologicalMaterial11    34.54960
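The overlap between the two top-ten lists can also be computed directly (a sketch that rebuilds both lists from the varImp objects above):

# Sketch: predictors that appear in the top 10 of both the SVM and linear models
svmImpDF <- varImp(nonlinearModel)$importance
lmImpDF  <- varImp(linearModel)$importance
svmTopTen <- rownames(svmImpDF)[order(svmImpDF$Overall, decreasing = TRUE)][1:10]
lmTopTen  <- rownames(lmImpDF)[order(lmImpDF$Overall, decreasing = TRUE)][1:10]
intersect(svmTopTen, lmTopTen)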
# Evaluate performance on the test set for both models
svmPred <- predict(nonlinearModel, newdata = testData)
svmPerformance <- postResample(pred = svmPred, obs = testData$Yield)

linearPred <- predict(linearModel, newdata = testData)
linearPerformance <- postResample(pred = linearPred, obs = testData$Yield)

# Combine performance metrics into a data frame for plotting
performanceDF <- data.frame(
  Model = c("SVM", "Linear"),
  RMSE = c(svmPerformance["RMSE"], linearPerformance["RMSE"]),
  Rsquared = c(svmPerformance["Rsquared"], linearPerformance["Rsquared"])
)

# Plot RMSE and Rsquared for comparison
ggplot(performanceDF, aes(x = Model, y = RMSE, fill = Model)) +
  geom_bar(stat = "identity") +
  labs(title = "Model Performance Comparison", y = "RMSE") +
  theme_minimal()

ggplot(performanceDF, aes(x = Model, y = Rsquared, fill = Model)) +
  geom_bar(stat = "identity") +
  labs(title = "Model Performance Comparison (R-squared)", y = "R-squared") +
  theme_minimal()

# Extract and arrange top 10 predictors for the nonlinear model (SVM)
svmVarImp <- varImp(nonlinearModel)
svmVarImp <- cbind(rownames(svmVarImp$importance), data.frame(svmVarImp$importance, row.names = NULL))
colnames(svmVarImp) <- c("Predictor", "Overall")
svmVarImp <- svmVarImp %>% arrange(desc(Overall)) %>% head(10)

# Extract and arrange top 10 predictors for the linear model
linearVarImp <- varImp(linearModel)
linearVarImp <- cbind(rownames(linearVarImp$importance), data.frame(linearVarImp$importance, row.names = NULL))
colnames(linearVarImp) <- c("Predictor", "Overall")
linearVarImp <- linearVarImp %>% arrange(desc(Overall)) %>% head(10)

# Combine both models' variable importance for comparison
combinedVarImp <- rbind(
  data.frame(Model = "SVM", Predictor = svmVarImp$Predictor, Importance = svmVarImp$Overall),
  data.frame(Model = "Linear", Predictor = linearVarImp$Predictor, Importance = linearVarImp$Overall)
)

# Plot variable importance comparison
ggplot(combinedVarImp, aes(x = reorder(Predictor, Importance), y = Importance, fill = Model)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  labs(title = "Top 10 Predictors - SVM vs Linear Model", x = "Predictor", y = "Importance") +
  theme_minimal()

library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")

# Preprocess the data with multiple methods including a Box-Cox transformation
chemPreProc <- preProcess(ChemicalManufacturingProcess,
                          method = c("BoxCox", "knnImpute", "center", "scale"))
predPreProcess <- predict(chemPreProc, ChemicalManufacturingProcess)
predPreProcess$Yield <- ChemicalManufacturingProcess$Yield  # keep Yield on its original scale

# Split the data into training and testing sets
set.seed(200)
ind <- sample(seq_len(nrow(predPreProcess)), size = floor(0.85 * nrow(predPreProcess)))
train <- predPreProcess[ind, ]
test <- predPreProcess[-ind, ]

# Train a nonlinear regression model (SVM as an example)
svmModel <- train(Yield ~ ., data = train,  
                  method = "svmRadial",  
                  preProcess = c("center", "scale"),
                  tuneLength = 15)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## (These two warnings repeat once per bootstrap resample; only the first occurrence is shown.)
# Evaluate the model's performance on the test data
svmPred <- predict(svmModel, newdata = test)
postResample(pred = svmPred, obs = test$Yield)
##      RMSE  Rsquared       MAE 
## 1.4152575 0.6243845 1.2208405
# Extract and sort the top 10 predictors from the SVM model
svmImpTab <- varImp(svmModel)$importance
topPredictors <- rownames(svmImpTab)[order(svmImpTab$Overall, decreasing = TRUE)][1:10]

# Ensure that topPredictors are valid columns in the training data
validPredictors <- topPredictors[topPredictors %in% colnames(train)]

# Feature plot for visualizing relationships between the top predictors and Yield
varI <- varImp(svmModel)$importance
varI <- cbind(rownames(varI), data.frame(varI, row.names = NULL))
colnames(varI) <- c("Predictor", "Overall")
varI <- varI %>% arrange(Overall) %>% tail(10) %>% select(Predictor)
variables <- as.vector(varI$Predictor)

# Top predictors and Yield visualization
featurePlot(train[, variables], train$Yield)
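As a numeric companion to the feature plot, the marginal correlation of each of these top predictors with Yield can also be computed (a quick sketch using the variables vector built above):

# Sketch: correlation of each top predictor with Yield in the training set
round(sapply(train[, variables], function(p) cor(p, train$Yield)), 2)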