Homework 8

Do problems 7.2 and 7.5 in Kuhn and Johnson. There are only two but they have many parts. Please submit both a link to your Rpubs and the .rmd file.

library(mlbench)
library(caret)
library(earth)
library(ggplot2)
library(magrittr)
library(tidyverse)

Question 7.2

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)

## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

Tune several models on these data.

Train Several Models

KNN Model

knnModel <- train(x = trainingData$x,
 y = trainingData$y,
 method = "knn",
 preProc = c("center", "scale"),
 tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
knnResults <- postResample(pred = knnPred, obs = testData$y)
knnResults
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461
# Create a data frame with actual vs predicted values
results <- data.frame(Actual = testData$y, Predicted = knnPred)

# Create the ggplot
ggplot(results, aes(x = Actual, y = Predicted)) +
  geom_point(alpha = 0.4) +                                                    # scatter plot
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +  # ideal prediction line
  labs(title = "KNN Predictions vs Actual Values",
       x = "Actual Values",
       y = "Predicted Values") +
  theme_minimal() +
  xlim(c(min(results$Actual), max(results$Actual))) +   # set x-axis limits
  ylim(c(min(results$Actual), max(results$Actual))) +   # set y-axis limits
  theme(plot.title = element_text(hjust = 0.5))         # center title

Random Forest Model

rfModel <- train(x = trainingData$x,
                 y = trainingData$y,
                 method = "rf",
                 preProc = c("center", "scale"),
                 tuneLength = 10)
## note: only 9 unique complexity parameters in default grid. Truncating the grid to 9 .
# Make predictions
rfPred <- predict(rfModel, newdata = testData$x)
# Evaluate
rfResults <- postResample(pred = rfPred, obs = testData$y)
rfResults
##      RMSE  Rsquared       MAE 
## 2.4485896 0.7971454 1.9384524
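The note printed above reflects that, with only 10 predictors, the default tuneLength = 10 grid cannot contain 10 distinct mtry values, so caret truncates it to 9. To see which mtry resampling selected, the fitted train object can be inspected; a minimal sketch using standard caret accessors (output not shown):

# Inspect the tuning parameter chosen by resampling
rfModel$bestTune    # selected mtry
plot(rfModel)       # resampled RMSE across the mtry grid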
# Prepare data frame for ggplot
results_df <- data.frame(Actual = testData$y, Predicted = rfPred)

# Create the plot
ggplot(results_df, aes(x = Actual, y = Predicted)) +
  geom_point(alpha = 0.5, color = 'blue') +                # Scatter plot of actual vs predicted
  geom_abline(slope = 1, intercept = 0, color = 'red') +   # Reference line for perfect predictions
  labs(title = "Random Forest Predictions vs Actual Values",
       x = "Actual Values",
       y = "Predicted Values") +
  theme_minimal() +                                        # Clean theme
  xlim(c(min(results_df$Actual), max(results_df$Actual))) +
  ylim(c(min(results_df$Predicted), max(results_df$Predicted)))

MARS Model

marsModel <- train(x = trainingData$x,
                   y = trainingData$y,
                   method = "earth",
                   preProc = c("center", "scale"),
                   tuneGrid = expand.grid(degree = 1:2,
                                          nprune = seq(2, 20, by = 2)))
# Make predictions
marsPred <- predict(marsModel, newdata = testData$x)
# Evaluate performance
marsResults <- postResample(pred = marsPred, obs = testData$y)
marsResults
##      RMSE  Rsquared       MAE 
## 1.2779993 0.9338365 1.0147070
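Before examining variable importance, it is worth checking which tuning values resampling selected and what the final basis functions look like. A brief sketch using standard caret and earth accessors (output not shown):

# Selected MARS tuning parameters (degree and number of retained terms)
marsModel$bestTune

# Final earth model: hinge functions and their coefficients
summary(marsModel$finalModel)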

MARS Feature Selection

# Check MARS variable importance

marsImp <- varImp(marsModel)
print(marsImp)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.40
## X2   49.00
## X5   15.72
## X3    0.00

The MARS model correctly identifies the informative predictors: only X1-X5 appear in the importance ranking, while the uninformative predictors X6-X10 are excluded.

Best performing models based on test-set RMSE and R²:

MARS: RMSE = 1.2779993, R² = 0.9338365
Random Forest: RMSE = 2.4485896, R² = 0.7971454
KNN: RMSE = 3.2040595, R² = 0.6819919

MARS gives the best performance on the test set by a wide margin, followed by the random forest and then KNN.
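The same comparison can be assembled directly from the postResample objects computed above; a small sketch (the table name is illustrative):

# Combine the test-set metrics into one comparison table
modelComparison <- rbind(KNN = knnResults,
                         RandomForest = rfResults,
                         MARS = marsResults)
round(modelComparison, 4)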

Question 7.5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
dim(ChemicalManufacturingProcess)
## [1] 176  58

Impute missing values

# Impute missing values using k-Nearest Neighbors imputation

set.seed(123) # for reproducibility

chem_impute <- preProcess(ChemicalManufacturingProcess, method = c("knnImpute"))

df <- predict(chem_impute, ChemicalManufacturingProcess) # Apply imputation
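As a quick sanity check, we can confirm that imputation removed all missing values; note that caret's knnImpute also centers and scales the data as part of the distance calculation, so df is already on a standardized scale. A minimal sketch:

# Missing values before and after k-NN imputation
sum(is.na(ChemicalManufacturingProcess))   # several predictors contain NAs
sum(is.na(df))                             # should be 0 after imputation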

Prepare data

# Prepare data for modeling
dfx <- df %>% select(-Yield) # Predictor variables
dfy <- df %>% select(Yield) # Target variable

Split Data

# Split data into training and testing sets

set.seed(123) # Ensure consistent data partitioning
chem_train <- createDataPartition(dfy$Yield, p = 0.80, list = FALSE)

x_train <- dfx[chem_train, ]
x_test <- dfx[-chem_train, ]
y_train <- dfy[chem_train, ]
y_test <- dfy[-chem_train, ]
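A quick check (sketch) that the 80/20 partition produced the expected dimensions:

# Confirm the split sizes (about 80% of the 176 samples in training)
dim(x_train)
dim(x_test)
length(y_train)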

Ridge Regression Model

set.seed(123)
ridgeGrid <- data.frame(.lambda = seq(0, 0.1, length = 15))

ridgeFit <- train(x = x_train, y = y_train,
                  method = 'ridge',
                  tuneGrid = ridgeGrid,
                  trControl = trainControl(method = "cv", number = 10),
                  preProc = c('center', 'scale'))

ridgeFit
## Ridge Regression 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ... 
## Resampling results across tuning parameters:
## 
##   lambda       RMSE      Rsquared   MAE      
##   0.000000000  6.559444  0.3705740  2.1257894
##   0.007142857  2.905917  0.4076010  1.1706542
##   0.014285714  2.356944  0.4321264  1.0217333
##   0.021428571  2.061485  0.4500623  0.9420376
##   0.028571429  1.868746  0.4640859  0.8889647
##   0.035714286  1.730620  0.4755309  0.8500597
##   0.042857143  1.625723  0.4851112  0.8199353
##   0.050000000  1.542823  0.4932645  0.7957412
##   0.057142857  1.475365  0.5002888  0.7761056
##   0.064285714  1.419231  0.5064010  0.7598050
##   0.071428571  1.371687  0.5117652  0.7460655
##   0.078571429  1.330839  0.5165093  0.7341978
##   0.085714286  1.295329  0.5207341  0.7238684
##   0.092857143  1.264153  0.5245201  0.7148742
##   0.100000000  1.236554  0.5279324  0.7068879
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.

Cross-validation selects lambda = 0.1, which is the largest value in the searched grid; the resampled RMSE is still decreasing at that point, so the true optimum may lie beyond the grid.
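Because the selected lambda sits at the edge of the grid, a wider grid could be searched to see whether the cross-validated RMSE keeps improving. A hedged sketch (the object names are illustrative and the results are not shown here):

# Extend the lambda grid beyond 0.1
ridgeGridWide <- data.frame(.lambda = seq(0, 0.3, length = 16))

set.seed(123)
ridgeFitWide <- train(x = x_train, y = y_train,
                      method = 'ridge',
                      tuneGrid = ridgeGridWide,
                      trControl = trainControl(method = "cv", number = 10),
                      preProc = c('center', 'scale'))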

predict_ridge <- predict(ridgeFit, x_test)

defaultSummary(data.frame(pred=predict_ridge,obs=y_test))
##      RMSE  Rsquared       MAE 
## 0.7428885 0.4953262 0.6433366
# Visualize variable importance

plot(varImp(ridgeFit, scale = FALSE),
     top = 20,
     scales = list(y = list(cex = 0.8)),
     main = "Top 20 Predictors of Yield")

The most important predictors are ManufacturingProcess32, ManufacturingProcess06, and BiologicalMaterial03.

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
correlation <- cor(select(
  df, "ManufacturingProcess32", "ManufacturingProcess06",
  "BiologicalMaterial03", "Yield"
))

# Plot correlation matrix
corrplot(correlation,
         method = "square", type = "upper",
         title = "Correlation Matrix of Key Variables",
         mar = c(0, 0, 1, 0))

The correlation plot shows that the top predictors are correlated with the response variable, Yield. Predictors with stronger correlations are likely to contribute more to the model's fit and to be more statistically important. The plot also shows the direction of each relationship, i.e., whether increasing a given process or material measurement is associated with higher or lower yield.
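Finally, since the exercise asks for nonlinear regression models, the same split can be reused with a nonlinear learner. A minimal sketch mirroring the MARS setup from Question 7.2 (the object name is illustrative, and the metrics would need to be obtained by running the code):

# Nonlinear (MARS) model on the chemical manufacturing data
set.seed(123)
marsChem <- train(x = x_train, y = y_train,
                  method = "earth",
                  tuneGrid = expand.grid(degree = 1:2, nprune = seq(2, 20, by = 2)),
                  trControl = trainControl(method = "cv", number = 10),
                  preProc = c("center", "scale"))

# Test-set performance, comparable to the ridge results above
postResample(pred = predict(marsChem, x_test), obs = y_test)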