Homework 8
Do problems 7.2 and 7.5 in Kuhn and Johnson. There are only two but they have many parts. Please submit both a link to your Rpubs and the .rmd file.
library(mlbench)
library(caret)
library(earth)
library(ggplot2)
library(magrittr)
library(caret)
library(tidyverse)
Question 7.2
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Tune several models on these data.
Train Several Models
knnModel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
knnResults <- postResample(pred = knnPred, obs = testData$y)
knnResults
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
# Create a data frame with actual vs predicted values
results <- data.frame(Actual = testData$y, Predicted = knnPred)
# Create the ggplot
ggplot(results, aes(x = Actual, y = Predicted)) +
geom_point(alpha = 0.4) + # scatter plot
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") + # ideal prediction line
labs(title = "KNN Predictions vs Actual Values",
x = "Actual Values",
y = "Predicted Values") +
theme_minimal() +
xlim(c(min(results$Actual), max(results$Actual))) + # set x-axis limits
ylim(c(min(results$Actual), max(results$Actual))) + # set y-axis limits
theme(plot.title = element_text(hjust = 0.5)) # center title
rfModel <- train(x = trainingData$x,
y = trainingData$y,
method = "rf",
preProc = c("center", "scale"),
tuneLength = 10)
## note: only 9 unique complexity parameters in default grid. Truncating the grid to 9 .
# Make predictions
rfPred <- predict(rfModel, newdata = testData$x)
# Evaluate
rfResults <- postResample(pred = rfPred, obs = testData$y)
rfResults
## RMSE Rsquared MAE
## 2.4485896 0.7971454 1.9384524
# Prepare data frame for ggplot
results_df <- data.frame(Actual = testData$y, Predicted = rfPred)
# Create the plot
ggplot(results_df, aes(x = Actual, y = Predicted)) +
geom_point(alpha = 0.5, color = 'blue') + # Scatter plot of actual vs predicted
geom_abline(slope = 1, intercept = 0, color = 'red') + # Reference line for perfect predictions
labs(title = "Random Forest Predictions vs Actual Values",
x = "Actual Values",
y = "Predicted Values") +
theme_minimal() + # Clean theme
xlim(c(min(results_df$Actual), max(results_df$Actual))) +
ylim(c(min(results_df$Predicted), max(results_df$Predicted)))
marsModel <- train(x = trainingData$x,
y = trainingData$y,
method = "earth",
preProc = c("center", "scale"),
tuneGrid = expand.grid(degree = 1:2,
nprune = seq(2, 20, by = 2)))
# Make predictions
marsPred <- predict(marsModel, newdata = testData$x)
# Evaluate performance
marsResults <- postResample(pred = marsPred, obs = testData$y)
marsResults
## RMSE Rsquared MAE
## 1.2779993 0.9338365 1.0147070
MARS Feature Selection
# Check MARS variable importance
marsImp <- varImp(marsModel)
print(marsImp)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.40
## X2 49.00
## X5 15.72
## X3 0.00
MARS model correctly identified informative predictors X1-X5.
Best performing models based on test RMSE and R²
MARS: RMSE = 1.2793868 , R² = 0.9343367
Random Forest: RMSE = 2.4070460, R² = 0.79314252
KNN: RMSE = 3.2040595, R² = 0.6819919
Question 7.5
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
dim(ChemicalManufacturingProcess)
## [1] 176 58
Impute missing values
# Impute missing values using k-Nearest Neighbors imputation
set.seed(123) # for reproducibility
chem_impute <- preProcess(ChemicalManufacturingProcess, method = c("knnImpute"))
df <- predict(chem_impute, ChemicalManufacturingProcess) # Apply imputation
Prepare data
# Prepare data for modeling
dfx <- df %>% select(-Yield) # Predictor variables
dfy <- df %>% select(Yield) # Target variable
Split Data
# Split data into training and testing sets
set.seed(123) # Ensure consistent data partitioning
chem_train <- createDataPartition(dfy$Yield, p = 0.80, list = FALSE)
x_train <- dfx[chem_train, ]
x_test <- dfx[-chem_train, ]
y_train <- dfy[chem_train, ]
y_test <- dfy[-chem_train, ]
set.seed(123)
ridgeGrid <- data.frame(.lambda = seq(0,0.1,length=15))
ridgeFit <- train(x=x_train , y=y_train,
method='ridge',
tuneGrid=ridgeGrid,
trControl=trainControl(method = "cv", number = 10),
preProc = c('center','scale'))
ridgeFit
## Ridge Regression
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000000000 6.559444 0.3705740 2.1257894
## 0.007142857 2.905917 0.4076010 1.1706542
## 0.014285714 2.356944 0.4321264 1.0217333
## 0.021428571 2.061485 0.4500623 0.9420376
## 0.028571429 1.868746 0.4640859 0.8889647
## 0.035714286 1.730620 0.4755309 0.8500597
## 0.042857143 1.625723 0.4851112 0.8199353
## 0.050000000 1.542823 0.4932645 0.7957412
## 0.057142857 1.475365 0.5002888 0.7761056
## 0.064285714 1.419231 0.5064010 0.7598050
## 0.071428571 1.371687 0.5117652 0.7460655
## 0.078571429 1.330839 0.5165093 0.7341978
## 0.085714286 1.295329 0.5207341 0.7238684
## 0.092857143 1.264153 0.5245201 0.7148742
## 0.100000000 1.236554 0.5279324 0.7068879
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.
The ridge method indicates that the optimal value is 0.1.
predict_ridge <- predict(ridgeFit, x_test)
defaultSummary(data.frame(pred=predict_ridge,obs=y_test))
## RMSE Rsquared MAE
## 0.7428885 0.4953262 0.6433366
# Visualize variable importance
plot(varImp(ridgeFit, scale = FALSE),
top = 20,
scales = list(y = list(cex = 0.8)),
main = "Top 20 Predictors of Yield"
)
The most important Predictors are Manifacturing Process 32, 06, and the Biological Material 03.
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
correlation <- cor(select(
df, "ManufacturingProcess32", "ManufacturingProcess06",
"BiologicalMaterial03", "Yield"
))
# Plot correlation matrix
corrplot(correlation,
method = "square", type = "upper",
title = "Correlation Matrix of Key Variables",
mar = c(0, 0, 1, 0)
)
There is a correlations between the primary predictors and the response variable. These predictors can enhance the model’s fit and are likely to hold greater statistical significance. Additionally, this illustrates the direct relationship between the processes that will yield either negative or positive outcomes.