Exercises: 7.2 and 7.5 from Applied Predictive Modeling
## Load required libraries
# Set seed for reproducibility
set.seed(200)
# Generate training data with Friedman simulation
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData <- data.frame(trainingData$x, y = trainingData$y)
# Generate test data to estimate true error rate
testData <- mlbench.friedman1(5000, sd = 1)
testData <- data.frame(testData$x, y = testData$y)
# Visualize the data
featurePlot(trainingData[ , 1:10], trainingData$y)
# Train a k-Nearest Neighbors (k-NN) model using caret::train
knnModel <- caret::train(
x = trainingData[, 1:10], # Select the first 10 columns as predictors
y = trainingData$y, # Use 'y' as the outcome variable
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10
)
print(knnModel)
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
# Predict on the test data and evaluate performance
knnPred <- predict(knnModel, newdata = testData[, 1:10])
knnPerf <- postResample(pred = knnPred, obs = testData$y)
print(knnPerf)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
# Predict on the test data and evaluate performance
knnPred <- predict(knnModel, newdata = testData[ , 1:10])
knnPerf <- postResample(pred = knnPred, obs = testData$y)
print(knnPerf)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
# Set seed for reproducibility
set.seed(200)
# Generate training data using the Friedman simulation
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData <- data.frame(trainingData$x, y = trainingData$y)
# Train a Multivariate Adaptive Regression Splines (MARS) model
marsModel <- caret::train(
x = trainingData[, 1:10], # Select the first 10 columns as predictors
y = trainingData$y, # Use 'y' as the outcome variable
method = "earth", # MARS model specified with "earth" method
tuneGrid = expand.grid(degree = 1:2, nprune = 2:10) # Grid of tuning parameters
)
print(marsModel)
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.527853 0.1962737 3.739316
## 1 3 3.775045 0.4322080 3.073585
## 1 4 2.848117 0.6796346 2.255598
## 1 5 2.597902 0.7322758 2.059575
## 1 6 2.437309 0.7645889 1.920825
## 1 7 2.035238 0.8324435 1.612510
## 1 8 1.912989 0.8530069 1.506779
## 1 9 1.833834 0.8640569 1.439198
## 1 10 1.764012 0.8747017 1.384686
## 2 2 4.531752 0.1949534 3.744716
## 2 3 3.738643 0.4452131 3.047947
## 2 4 2.904854 0.6633109 2.306559
## 2 5 2.611278 0.7278648 2.046642
## 2 6 2.491133 0.7518706 1.964614
## 2 7 2.177271 0.8077048 1.721813
## 2 8 1.986455 0.8388571 1.543917
## 2 9 1.819198 0.8646747 1.425725
## 2 10 1.662222 0.8882739 1.305445
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 10 and degree = 2.
# Predict on the test data and evaluate performance for MARS
marsPred <- predict(marsModel, newdata = testData[ , 1:10])
marsPerf <- postResample(pred = marsPred, obs = testData$y)
print(marsPerf)
## RMSE Rsquared MAE
## 1.4064480 0.9197345 1.1230234
# Check informative predictors selected by MARS
marsVarImp <- varImp(marsModel)
print(marsVarImp)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.57
## X2 49.29
## X5 15.94
## X3 0.00
# Set seed for reproducibility
set.seed(200)
# Generate training data using the Friedman simulation
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData <- data.frame(trainingData$x, y = trainingData$y)
# Generate test data for evaluating the model
testData <- mlbench.friedman1(5000, sd = 1)
testData <- data.frame(testData$x, y = testData$y)
# Train a k-Nearest Neighbors (k-NN) model using caret::train
knnModel <- caret::train(
x = trainingData[, 1:10], # Select the first 10 columns as predictors
y = trainingData$y, # Use 'y' as the outcome variable
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10
)
print(knnModel)
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
# Predict on the test data and evaluate performance
knnPred <- predict(knnModel, newdata = testData[, 1:10])
knnPerf <- postResample(pred = knnPred, obs = testData$y)
print(knnPerf)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
# Load the chemicalManufacturing dataset
data("ChemicalManufacturingProcess")
# Extract predictors and response
ChemicalManufacturingProcess <- as.data.frame(ChemicalManufacturingProcess) # Ensure it's a data frame
predictors <- ChemicalManufacturingProcess[, -1] # Exclude 'Yield'
response <- ChemicalManufacturingProcess$Yield # Extract 'Yield' as the response
# Check if there are any remaining missing values
sum(is.na(predictors_imputed))
## [1] 0
# Data Splitting
set.seed(13)
train_index <- createDataPartition(response, p = 0.8, list = FALSE)
train_predictors <- predictors_imputed[train_index, ]
test_predictors <- predictors_imputed[-train_index, ]
train_response <- response[train_index]
test_response <- response[-train_index]
# Identify skewed predictors (for demonstration, assuming all as numeric)
skewed_predictors <- sapply(train_predictors, function(x) abs(mean(x)) > 1) # Replace with specific skewness check
train_predictors[, skewed_predictors] <- log1p(train_predictors[, skewed_predictors])
test_predictors[, skewed_predictors] <- log1p(test_predictors[, skewed_predictors])
preprocess_model <- preProcess(train_predictors, method = c("center", "scale"))
train_predictors <- predict(preprocess_model, newdata = train_predictors)
test_predictors <- predict(preprocess_model, newdata = test_predictors)
We can now train the following models and compare their performance: Neural Networks (NN), Support Vector Machines (SVM), K-Nearest Neighbors (KNN), and Multivariate Adaptive Regression Splines (MARS).
train_predictors_nnet <- train_predictors
train_response <- as.numeric(train_response)
train_predictors_nnet <- as.data.frame(train_predictors_nnet)
# Ensure the response is numeric for regression tasks
train_response <- as.numeric(train_response)
# Train the neural network using the `nnet` function directly
nn_model <- nnet(
x = train_predictors_nnet, # Predictor variables
y = train_response, # Response variable
size = 5, # Number of hidden units (neurons)
decay = 0.01, # Weight decay (L2 regularization)
linout = TRUE, # Linear output for regression
trace = FALSE, # Suppress output during training
maxit = 500, # Maximum number of iterations
MaxNWts = 10 * (ncol(train_predictors_nnet) + 1) + 5 + 1 # Max number of weights
)
# Print model summary to check details
(head(summary(nn_model)))
## $n
## [1] 57 5 1
##
## $nunits
## [1] 64
##
## $nconn
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [20] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [58] 0 0 58 116 174 232 290 296
##
## $conn
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## [26] 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## [51] 50 51 52 53 54 55 56 57 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## [76] 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
## [101] 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 0 1 2 3 4 5 6 7 8
## [126] 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
## [151] 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 0
## [176] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [201] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## [226] 51 52 53 54 55 56 57 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## [251] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
## [276] 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 0 58 59 60 61 62
##
## $nsunits
## [1] 63
##
## $decay
## [1] 0.01
# Predict using the trained model on the test dataset
test_predictors_nnet <- test_predictors
nn_predictions <- predict(nn_model, newdata = test_predictors_nnet)
head(nn_predictions)
## [,1]
## 6 48.08048
## 9 42.17955
## 17 43.47570
## 20 39.75457
## 32 45.76906
## 34 40.39286
# Evaluate predictions:
# RMSE (Root Mean Squared Error)
rmse <- sqrt(mean((nn_predictions - test_response)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 2.08042618156675"
# MAE (Mean Absolute Error)
mae <- mean(abs(nn_predictions - test_response))
print(paste("MAE:", mae))
## [1] "MAE: 1.69701579545323"
# Train Support Vector Machine model
svm_model <- svm(Yield ~ ., data = cbind(train_predictors, Yield = train_response))
svm_model
##
## Call:
## svm(formula = Yield ~ ., data = cbind(train_predictors, Yield = train_response))
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.01754386
## epsilon: 0.1
##
##
## Number of Support Vectors: 128
svm_predictions <- predict(svm_model, newdata = test_predictors)
svm_predictions
## 6 9 17 20 32 34 41 46
## 41.88124 42.22955 40.75449 40.04563 42.56969 40.49870 42.49397 42.32216
## 48 54 63 70 81 82 83 85
## 42.48441 40.71442 38.91372 39.06585 40.55556 40.52789 40.79082 40.73112
## 90 105 117 123 124 138 139 142
## 38.50990 39.07127 40.47198 39.92339 41.54043 38.65861 39.86662 40.12252
## 143 145 146 153 163 166 171 176
## 40.46115 39.07678 38.69582 38.55096 39.59045 38.90806 40.68020 39.91157
# Evaluate predictions:
# Evaluate predictions (RMSE)
rmse <- sqrt(mean((svm_predictions - test_response)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 1.17824090056816"
# Evaluate predictions (MAE)
mae <- mean(abs(svm_predictions - test_response))
print(paste("MAE:", mae))
## [1] "MAE: 0.880340803879416"
# Train the K-Nearest Neighbors model using kknn
knn_model <- kknn(Yield ~ ., train = cbind(train_predictors, Yield = train_response),
test = test_predictors, k = 5)
# Print the model summary
summary(knn_model)
##
## Call:
## kknn(formula = Yield ~ ., train = cbind(train_predictors, Yield = train_response), test = test_predictors, k = 5)
##
## Response: "continuous"
# Evaluate predictions:
# Calculate MAE (Mean Absolute Error)
# Generate predictions using the KNN model
knn_predictions <- predict(knn_model, newdata = test_predictors)
mae <- mean(abs(knn_predictions - test_response))
print(paste("MAE:", mae))
## [1] "MAE: 0.973210632409784"
# Calculate RMSE (Root Mean Squared Error)
rmse <- sqrt(mean((knn_predictions - test_response)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 1.37249147634138"
# Make predictions using the trained model
knn_predictions <- predict(knn_model, newdata = test_predictors)
knn_predictions
## [1] 42.59476 42.29578 41.18866 39.83274 39.86502 39.94568 41.84468 41.30619
## [9] 43.61979 40.02760 38.99271 39.71970 39.60673 40.17630 39.74956 40.03373
## [17] 39.96940 39.79556 41.72038 38.46873 39.57060 38.60848 39.40518 40.29814
## [25] 41.14075 38.77247 39.67823 38.86178 39.15881 38.95846 39.48231 39.76740
# Train MARS model
mars_model <- earth(Yield ~ ., data = cbind(train_predictors, Yield = train_response))
mars_model
## Selected 12 of 21 terms, and 8 of 57 predictors
## Termination condition: RSq changed by less than 0.001 at 21 terms
## Importance: ManufacturingProcess32, ManufacturingProcess09, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 1.168236 RSS 118.7788 GRSq 0.6619742 RSq 0.7579815
# Calculate RMSE and MAE for evaluation
rmse_value <- rmse(test_response, mars_predictions)
mae_value <- mae(test_response, mars_predictions)
# Print RMSE and MAE values
cat("RMSE: ", rmse_value, "\n")
## RMSE: 1.068764
cat("MAE: ", mae_value, "\n")
## MAE: 0.6772846
# Plot variable importance
var_imp <- evimp(mars_model) # Extract variable importance
print(var_imp) # Display variable importance
## nsubsets gcv rss
## ManufacturingProcess32 11 100.0 100.0
## ManufacturingProcess09 10 66.4 69.9
## ManufacturingProcess13 9 42.3 50.0
## ManufacturingProcess39 7 22.9 34.5
## ManufacturingProcess01 6 21.6 31.9
## BiologicalMaterial03 5 20.4 29.3
## BiologicalMaterial02 3 11.5 20.4
## ManufacturingProcess33 1 4.8 11.0
# Tuning MARS model using external resampling manually (example grid)
# Define candidate models to test
marsGrid <- expand.grid(.degree = 1:2, .nprune = seq(2, 38, by = 2))
# Tune model by iterating through the grid and calculating performance
best_rmse <- Inf
best_model <- NULL
best_params <- NULL
for (i in 1:nrow(marsGrid)) {
params <- marsGrid[i, ]
model <- earth(Yield ~ ., data = cbind(train_predictors, Yield = train_response),
degree = params$.degree, nprune = params$.nprune)
# Predict on the test set
predictions <- predict(model, newdata = test_predictors)
# Calculate RMSE for this model
rmse_value <- rmse(test_response, predictions)
# If this model has the best RMSE, store it
if (rmse_value < best_rmse) {
best_rmse <- rmse_value
best_model <- model
best_params <- params
}
}
# Print the best model and parameters
cat("Best RMSE: ", best_rmse, "\n")
## Best RMSE: 1.060588
# Use the best model for predictions
final_predictions <- predict(best_model, newdata = test_predictors)
# Evaluate the final model
final_rmse <- rmse(test_response, final_predictions)
final_mae <- mae(test_response, final_predictions)
# Print final evaluation results
cat("Final RMSE: ", final_rmse, "\n")
## Final RMSE: 1.060588
cat("Final MAE: ", final_mae, "\n")
## Final MAE: 0.7917836
# Plot the final model's results using plotmo for interpretation
plotmo(best_model, type = "response", main = "MARS Model: Response vs. Predictors")
## plotmo grid: BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## -0.07401572 -0.08247092 -0.05311066
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## -0.04061519 -0.0283942 -0.1029063
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## -0.1182654 0.009356733 0.02376966
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## -0.1932263 -0.1595675 -0.007984947
## ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## 0.1652337 0.5146324 0.06520146
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## 0.3465247 -0.07395531 -0.229795
## ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## -0.9426152 0.9426152 0.07777282
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## -0.06026455 -0.04588157 -0.4895948
## ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## 0.07560224 0.04007555 -0.09750878
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 0.08249729 0.0128037 0.08298485
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## -0.1131955 0.08300123 -0.2073696
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 0.1853467 0.1798176 0.1159048
## ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## 0.08326936 0.08220305 0.08239999
## ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## 0.7823448 0.05816823 0.04771302
## ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## 0.09070835 -0.04532923 0.2654151
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 0.1103482 0.03569133 0.4424826
## ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 0.05853275 0.5812483 0.2142516
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## -0.4677701 -0.4469901 0.1810788
## ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## -0.1221465 0.2475303 0.1630509
Once we have the optimal model, we can extract feature importance. For example, using the varImp function for KNN, SVM, and MARS.
KNN and SVM:
# Train the KNN model using kknn
knn_model <- kknn(Yield ~ ., train = cbind(train_predictors, Yield = train_response),
test = test_predictors, k = 5)
# Function to compute RMSE or MAE
compute_performance <- function(model, test_data, test_response) {
predictions <- predict(model, newdata = test_data)
rmse <- sqrt(mean((predictions - test_response)^2))
return(rmse)}
# Baseline performance
baseline_rmse <- compute_performance(knn_model, test_predictors, test_response)
cat("Baseline RMSE: ", baseline_rmse, "\n")
## Baseline RMSE: 1.372491
MARS:
# Variable importance for MARS model
mars_importance <- varImp(mars_model)
print(mars_importance)
## Overall
## ManufacturingProcess32 100.000000
## ManufacturingProcess09 66.390853
## ManufacturingProcess13 42.288682
## ManufacturingProcess39 22.876918
## ManufacturingProcess01 21.608293
## BiologicalMaterial03 20.433283
## BiologicalMaterial02 11.505212
## ManufacturingProcess33 4.819014
# Create the data frame with the importance values
importance_df <- data.frame(
Predictor = c("ManufacturingProcess32", "ManufacturingProcess09", "ManufacturingProcess13",
"ManufacturingProcess39", "ManufacturingProcess01", "BiologicalMaterial03",
"BiologicalMaterial02", "ManufacturingProcess33"),
Importance = c(100.000000, 66.390853, 42.288682, 22.876918, 21.608293,
20.433283, 11.505212, 4.819014)
)
# Create a barplot to visualize the importance of predictors
barplot(importance_df$Importance,
names.arg = importance_df$Predictor,
main = "Variable Importance for MARS Model",
col = "skyblue",
las = 2, # Rotate the x-axis labels for better readability
cex.names = 0.8, # Adjust text size of labels
border = "white",
horiz = TRUE, # Horizontal bars for better label readability
xlab = "Importance"
)
top_predictors <- head(importance_df, 5)$predictor
# Create a new data frame containing only the top predictors and the response variable
top_predictors_df <- cbind(test_predictors[, top_predictors], Yield = test_response)
top_predictors_df <- ChemicalManufacturingProcess[1:32, c("ManufacturingProcess32", "ManufacturingProcess09", "BiologicalMaterial03")] # Adjust columns as needed
top_predictors_df$Yield <- ChemicalManufacturingProcess$Yield[1:32]
# Fit the linear regression model
linear_model <- lm(Yield ~ ., data = top_predictors_df)
# Print the model summary
summary(linear_model)
##
## Call:
## lm(formula = Yield ~ ., data = top_predictors_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.98492 -0.39036 0.06245 0.37593 0.85332
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -19.87456 3.25861 -6.099 1.40e-06 ***
## ManufacturingProcess32 0.18342 0.01839 9.972 1.02e-10 ***
## ManufacturingProcess09 0.70436 0.04647 15.156 5.04e-15 ***
## BiologicalMaterial03 -0.01619 0.02497 -0.649 0.522
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5285 on 28 degrees of freedom
## Multiple R-squared: 0.9456, Adjusted R-squared: 0.9398
## F-statistic: 162.2 on 3 and 28 DF, p-value: < 2.2e-16
# Plot relationships between each top predictor and the response (Yield)
ggplot(top_predictors_df, aes(x = top_predictors_df[, 1], y = Yield)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Adding regression line
labs(title = paste("Relationship between top predictors", top_predictors[1], "and Yield"),
x = top_predictors[1], y = "Yield") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Use ggpairs to create a matrix of scatter plots for top predictors and the response variable
top_predictors_df$Yield <- as.numeric(top_predictors_df$Yield) # Make sure Yield is numeric
ggpairs(top_predictors_df,
aes(color = Yield),
upper = list(continuous = "points"),
lower = list(continuous = "smooth"),
progress = FALSE) # Suppress progress bar
cor_matrix <- cor(top_predictors_df)
# Melt the correlation matrix for ggplot2 compatibility
melted_cor_matrix <- melt(cor_matrix)
# Plot the heat map
ggplot(data = melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name = "Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
labs(title = "Heat Map of Correlations between Top Predictors and Yield")
# Residuals vs Fitted plot for model diagnostics
plot(linear_model, which = 1)
# Make predictions
predictions <- predict(linear_model, newdata = top_predictors_df)
# Calculate RMSE
rmse <- sqrt(mean((predictions - top_predictors_df$Yield)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 0.494341167163"
# Calculate MAE
mae <- mean(abs(predictions - top_predictors_df$Yield))
print(paste("MAE:", mae))
## [1] "MAE: 0.419288337899643"
# R-squared
rsq <- summary(linear_model)$r.squared
print(paste("R-squared:", rsq))
## [1] "R-squared: 0.945596181994789"
# Ensure that rf_importance contains the correct importance values
train_data <- cbind(train_predictors, Yield = train_response)
rf_importance <- randomForest::randomForest(Yield ~ ., data = train_data) # For example, train a model if you haven't
# Extract the top 5 most important predictors
top_predictors <- names(sort(rf_importance$importance[, 1], decreasing = TRUE))[1:4]
top_predictors
## [1] "ManufacturingProcess32" "ManufacturingProcess13" "BiologicalMaterial03"
## [4] "BiologicalMaterial12"
# Create the pairs plot for the top 4 predictors vs the response
pairs(cbind(response = train_response, train_predictors[, top_predictors]),
main = "Top Predictors vs Yield")
We will use Mean Absolute Error (MAE) to evaluate model performance.
# Evaluate model performance (using MAE)
mae_nn <- mean(abs(nn_predictions - test_response))
mae_svm <- mean(abs(svm_predictions - test_response))
mae_knn <- mean(abs(knn_predictions - test_response))
mae_mars <- mean(abs(mars_predictions - test_response))
cat("MAE (NN):", mae_nn, "\n")
## MAE (NN): 1.697016
cat("MAE (SVM):", mae_svm, "\n")
## MAE (SVM): 0.8803408
cat("MAE (KNN):", mae_knn, "\n")
## MAE (KNN): 0.9732106
cat("MAE (MARS):", mae_mars, "\n")
## MAE (MARS): 0.6772846
Based on the MAE values, SVM emerges as the best-performing model, with the lowest error. MARS also performs well, while KNN and NN have higher errors, with the neural network model being the least effective in this case. But, based on RMSE, MARS (1.07 RMSE) has the best performance, followed closely by SVM (1.18 RMSE). KNN(1.37 RMSE) comes next, and NN (3.08 RMSE)performs the worst, with the highest RMSE, which indicates the largest error. My personal preference goes to SVM for its simple implementation steps and its good generalization performance.
# Convert the predictors to a matrix (if required)
train_predictors_matrix <- as.matrix(train_predictors)
rf_model <- caret::train(
x = train_predictors_matrix, # Ensure it's a data.frame or matrix with column names
y = train_response, # Vector of responses
method = "rf", # Random Forest method
trControl = fitControl, # Cross-validation settings
tuneLength = 10 # Hyperparameter tuning
)
# Print the results of the model training
print(rf_model)
## Random Forest
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 129, 129, 128, 130, 129, 129, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.229303 0.6472954 0.9975732
## 8 1.108002 0.6967114 0.8886329
## 14 1.084762 0.7050603 0.8650151
## 20 1.076142 0.7082278 0.8510208
## 26 1.076357 0.7033922 0.8513888
## 32 1.070691 0.7044220 0.8410492
## 38 1.066271 0.7054733 0.8375578
## 44 1.074495 0.6995948 0.8406339
## 50 1.079506 0.6943438 0.8428872
## 57 1.079663 0.6938599 0.8418698
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 38.
# Variable importance for the Random Forest model
rf_importance <- varImp(rf_model, scale = FALSE)
print(rf_importance)
## rf variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 129.718
## ManufacturingProcess13 51.501
## BiologicalMaterial12 31.397
## BiologicalMaterial03 29.546
## ManufacturingProcess17 21.497
## BiologicalMaterial11 16.513
## ManufacturingProcess31 12.791
## BiologicalMaterial06 12.707
## ManufacturingProcess09 11.381
## BiologicalMaterial05 10.325
## ManufacturingProcess06 9.787
## BiologicalMaterial02 7.989
## BiologicalMaterial08 7.523
## ManufacturingProcess18 6.965
## ManufacturingProcess30 6.662
## BiologicalMaterial09 6.501
## ManufacturingProcess28 6.327
## ManufacturingProcess34 6.210
## BiologicalMaterial04 5.342
## ManufacturingProcess39 4.366
# Plot top predictors
plot(rf_importance, top = 10)