Exercises: 7.2 and 7.5 from Applied Predictive Modeling

## Load required libraries
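The packages below cover every function used in this document (a reconstruction, since the loading chunk itself is not shown; the Metrics package is assumed as the source of the rmse() and mae() helpers):

library(AppliedPredictiveModeling)  # ChemicalManufacturingProcess data
library(mlbench)       # mlbench.friedman1 simulation
library(caret)         # train, preProcess, postResample, varImp, createDataPartition
library(earth)         # MARS models, evimp
library(nnet)          # neural networks
library(e1071)         # svm, tune, skewness
library(kknn)          # k-nearest neighbors
library(randomForest)  # random forests
library(Metrics)       # rmse, mae (assumed)
library(plotmo)        # plotmo plots
library(ggplot2)       # plotting
library(GGally)        # ggpairs
library(reshape2)      # melt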

## Exercise 7.2

# Set seed for reproducibility
set.seed(200)

# Generate training data with Friedman simulation
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData <- data.frame(trainingData$x, y = trainingData$y)

# Generate test data to estimate true error rate
testData <- mlbench.friedman1(5000, sd = 1)
testData <- data.frame(testData$x, y = testData$y)
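For reference, the Friedman (1991) benchmark behind mlbench.friedman1 draws ten uniform predictors, of which only the first five influence the response:

$$y = 10 \sin(\pi x_1 x_2) + 20\,(x_3 - 0.5)^2 + 10\,x_4 + 5\,x_5 + \varepsilon, \qquad \varepsilon \sim N(0, \sigma^2)$$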
# Visualize the data
featurePlot(trainingData[ , 1:10], trainingData$y)

# Train a k-Nearest Neighbors (k-NN) model using caret::train
knnModel <- caret::train(
  x = trainingData[, 1:10],   # Select the first 10 columns as predictors
  y = trainingData$y,         # Use 'y' as the outcome variable
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10
)
print(knnModel)
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
# Predict on the test data and evaluate performance
knnPred <- predict(knnModel, newdata = testData[, 1:10])
knnPerf <- postResample(pred = knnPred, obs = testData$y)
print(knnPerf)
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461
# Set seed for reproducibility
set.seed(200)

# Generate training data using the Friedman simulation
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData <- data.frame(trainingData$x, y = trainingData$y)

# Train a Multivariate Adaptive Regression Splines (MARS) model
marsModel <- caret::train(
  x = trainingData[, 1:10],           # Select the first 10 columns as predictors
  y = trainingData$y,                 # Use 'y' as the outcome variable
  method = "earth",                   # MARS model specified with "earth" method
  tuneGrid = expand.grid(degree = 1:2, nprune = 2:10)  # Grid of tuning parameters
)
print(marsModel)
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        2      4.527853  0.1962737  3.739316
##   1        3      3.775045  0.4322080  3.073585
##   1        4      2.848117  0.6796346  2.255598
##   1        5      2.597902  0.7322758  2.059575
##   1        6      2.437309  0.7645889  1.920825
##   1        7      2.035238  0.8324435  1.612510
##   1        8      1.912989  0.8530069  1.506779
##   1        9      1.833834  0.8640569  1.439198
##   1       10      1.764012  0.8747017  1.384686
##   2        2      4.531752  0.1949534  3.744716
##   2        3      3.738643  0.4452131  3.047947
##   2        4      2.904854  0.6633109  2.306559
##   2        5      2.611278  0.7278648  2.046642
##   2        6      2.491133  0.7518706  1.964614
##   2        7      2.177271  0.8077048  1.721813
##   2        8      1.986455  0.8388571  1.543917
##   2        9      1.819198  0.8646747  1.425725
##   2       10      1.662222  0.8882739  1.305445
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 10 and degree = 2.
# Predict on the test data and evaluate performance for MARS
marsPred <- predict(marsModel, newdata = testData[ , 1:10])
marsPerf <- postResample(pred = marsPred, obs = testData$y)
print(marsPerf)
##      RMSE  Rsquared       MAE 
## 1.4064480 0.9197345 1.1230234
# Check informative predictors selected by MARS
marsVarImp <- varImp(marsModel)
print(marsVarImp)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.57
## X2   49.29
## X5   15.94
## X3    0.00
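Answering the exercise: MARS clearly gives the best performance here (test RMSE 1.41 versus 3.20 for k-NN), and its importance ranking uses only informative predictors: X1, X4, X2, and X5 carry the weight (X3 contributes nothing in this fit), while the noise variables X6 through X10 are dropped entirely.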

## Exercise 7.5

Load the data set

# Load the ChemicalManufacturingProcess dataset (from AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
# Extract predictors and response
ChemicalManufacturingProcess <- as.data.frame(ChemicalManufacturingProcess)  # Ensure it's a data frame
predictors <- ChemicalManufacturingProcess[, -1]  # Exclude 'Yield'
response <- ChemicalManufacturingProcess$Yield    # Extract 'Yield' as the response

Impute Missing Values

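The imputation chunk itself is not shown above; here is a minimal reconstruction with caret's preProcess, assuming median imputation (any method that fills every missing value would satisfy the check below):

# Impute missing predictor values (median imputation assumed; the original method was not shown)
impute_model <- preProcess(predictors, method = "medianImpute")
predictors_imputed <- predict(impute_model, newdata = predictors)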
# Check if there are any remaining missing values
sum(is.na(predictors_imputed))
## [1] 0

Data Splitting

#  Data Splitting
set.seed(13)
train_index <- createDataPartition(response, p = 0.8, list = FALSE)
train_predictors <- predictors_imputed[train_index, ]
test_predictors <- predictors_imputed[-train_index, ]
train_response <- response[train_index]
test_response <- response[-train_index]

Data preprocessing

# Identify skewed predictors (|skewness| > 1, via e1071::skewness)
skewed_predictors <- sapply(train_predictors, function(x) abs(e1071::skewness(x)) > 1)
train_predictors[, skewed_predictors] <- log1p(train_predictors[, skewed_predictors])
test_predictors[, skewed_predictors] <- log1p(test_predictors[, skewed_predictors])
preprocess_model <- preProcess(train_predictors, method = c("center", "scale"))
train_predictors <- predict(preprocess_model, newdata = train_predictors)
test_predictors <- predict(preprocess_model, newdata = test_predictors)
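As a hedged alternative (not what was run above), caret can fold the skewness correction into the same preProcess call via a Yeo-Johnson transform, which also tolerates zero and negative values:

# Alternative sketch: one preProcess call handles skewness, centering, and scaling
preprocess_alt <- preProcess(predictors_imputed[train_index, ],
                             method = c("YeoJohnson", "center", "scale"))
train_predictors_alt <- predict(preprocess_alt, newdata = predictors_imputed[train_index, ])
test_predictors_alt <- predict(preprocess_alt, newdata = predictors_imputed[-train_index, ])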

Training Nonlinear Regression Models

We can now train the following models and compare their performance: Neural Networks (NN), Support Vector Machines (SVM), K-Nearest Neighbors (KNN), and Multivariate Adaptive Regression Splines (MARS).

Neural Networks (NN, using the nnet package)

# Make sure the predictors are a data frame
train_predictors_nnet <- as.data.frame(train_predictors)

# Ensure the response is numeric for regression tasks
train_response <- as.numeric(train_response)
# Train the neural network using the `nnet` function directly
nn_model <- nnet(
  x = train_predictors_nnet,               # Predictor variables
  y = train_response,                      # Response variable
  size = 5,                                # Number of hidden units (neurons)
  decay = 0.01,                            # Weight decay (L2 regularization)
  linout = TRUE,                           # Linear output for regression
  trace = FALSE,                           # Suppress output during training
  maxit = 500,                             # Maximum number of iterations
  MaxNWts = 10 * (ncol(train_predictors_nnet) + 1) + 5 + 1  # Max number of weights
)

# Print the first components of the model summary
head(summary(nn_model))
## $n
## [1] 57  5  1
## 
## $nunits
## [1] 64
## 
## $nconn
##  [1]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## [20]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## [39]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## [58]   0   0  58 116 174 232 290 296
## 
## $conn
##   [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
##  [26] 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
##  [51] 50 51 52 53 54 55 56 57  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
##  [76] 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
## [101] 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57  0  1  2  3  4  5  6  7  8
## [126]  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
## [151] 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57  0
## [176]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [201] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## [226] 51 52 53 54 55 56 57  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
## [251] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
## [276] 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57  0 58 59 60 61 62
## 
## $nsunits
## [1] 63
## 
## $decay
## [1] 0.01
# Predict using the trained model on the test dataset

test_predictors_nnet <- test_predictors

nn_predictions <- predict(nn_model, newdata = test_predictors_nnet)
head(nn_predictions)
##        [,1]
## 6  48.08048
## 9  42.17955
## 17 43.47570
## 20 39.75457
## 32 45.76906
## 34 40.39286
# Evaluate predictions: 
# RMSE (Root Mean Squared Error)
rmse <- sqrt(mean((nn_predictions - test_response)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 2.08042618156675"
# MAE (Mean Absolute Error)
mae <- mean(abs(nn_predictions - test_response))
print(paste("MAE:", mae))
## [1] "MAE: 1.69701579545323"

Support Vector Machines (SVM)

# Train Support Vector Machine model
svm_model <- svm(Yield ~ ., data = cbind(train_predictors, Yield = train_response))
svm_model
## 
## Call:
## svm(formula = Yield ~ ., data = cbind(train_predictors, Yield = train_response))
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.01754386 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  128
svm_predictions <- predict(svm_model, newdata = test_predictors)
svm_predictions 
##        6        9       17       20       32       34       41       46 
## 41.88124 42.22955 40.75449 40.04563 42.56969 40.49870 42.49397 42.32216 
##       48       54       63       70       81       82       83       85 
## 42.48441 40.71442 38.91372 39.06585 40.55556 40.52789 40.79082 40.73112 
##       90      105      117      123      124      138      139      142 
## 38.50990 39.07127 40.47198 39.92339 41.54043 38.65861 39.86662 40.12252 
##      143      145      146      153      163      166      171      176 
## 40.46115 39.07678 38.69582 38.55096 39.59045 38.90806 40.68020 39.91157
# Evaluate predictions:
# RMSE (Root Mean Squared Error)
rmse <- sqrt(mean((svm_predictions - test_response)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 1.17824090056816"
# Evaluate predictions (MAE)
mae <- mean(abs(svm_predictions - test_response))
print(paste("MAE:", mae))
## [1] "MAE: 0.880340803879416"

K-Nearest Neighbors (KNN)

# Train the K-Nearest Neighbors model using kknn
knn_model <- kknn(Yield ~ ., train = cbind(train_predictors, Yield = train_response), 
                  test = test_predictors, k = 5)

# Print the model summary
summary(knn_model)
## 
## Call:
## kknn(formula = Yield ~ ., train = cbind(train_predictors, Yield = train_response),     test = test_predictors, k = 5)
## 
## Response: "continuous"
# Generate predictions using the KNN model
knn_predictions <- predict(knn_model, newdata = test_predictors)

# Evaluate predictions:
# Calculate MAE (Mean Absolute Error)
mae <- mean(abs(knn_predictions - test_response))
print(paste("MAE:", mae))
## [1] "MAE: 0.973210632409784"
# Calculate RMSE (Root Mean Squared Error)
rmse <- sqrt(mean((knn_predictions - test_response)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 1.37249147634138"
# Display the test-set predictions
knn_predictions
##  [1] 42.59476 42.29578 41.18866 39.83274 39.86502 39.94568 41.84468 41.30619
##  [9] 43.61979 40.02760 38.99271 39.71970 39.60673 40.17630 39.74956 40.03373
## [17] 39.96940 39.79556 41.72038 38.46873 39.57060 38.60848 39.40518 40.29814
## [25] 41.14075 38.77247 39.67823 38.86178 39.15881 38.95846 39.48231 39.76740

Multivariate Adaptive Regression Splines (MARS)

# Train MARS model
mars_model <- earth(Yield ~ ., data = cbind(train_predictors, Yield = train_response))
mars_model
## Selected 12 of 21 terms, and 8 of 57 predictors
## Termination condition: RSq changed by less than 0.001 at 21 terms
## Importance: ManufacturingProcess32, ManufacturingProcess09, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 1.168236    RSS 118.7788    GRSq 0.6619742    RSq 0.7579815
# Predict on the test set with the MARS model
mars_predictions <- predict(mars_model, newdata = test_predictors)

# Calculate RMSE and MAE for evaluation (rmse() and mae() helpers, e.g. from the Metrics package)
rmse_value <- rmse(test_response, mars_predictions)
mae_value <- mae(test_response, mars_predictions)

# Print RMSE and MAE values
cat("RMSE: ", rmse_value, "\n")
## RMSE:  1.068764
cat("MAE: ", mae_value, "\n")
## MAE:  0.6772846
# Plot variable importance
var_imp <- evimp(mars_model)  # Extract variable importance
print(var_imp)  # Display variable importance
##                        nsubsets   gcv    rss
## ManufacturingProcess32       11 100.0  100.0
## ManufacturingProcess09       10  66.4   69.9
## ManufacturingProcess13        9  42.3   50.0
## ManufacturingProcess39        7  22.9   34.5
## ManufacturingProcess01        6  21.6   31.9
## BiologicalMaterial03          5  20.4   29.3
## BiologicalMaterial02          3  11.5   20.4
## ManufacturingProcess33        1   4.8   11.0
# Tune the MARS model manually over a grid, selecting by test-set RMSE
# (note: this selects on the test set rather than on resamples of the training set)
# Define candidate models to test
marsGrid <- expand.grid(.degree = 1:2, .nprune = seq(2, 38, by = 2))

# Tune model by iterating through the grid and calculating performance
best_rmse <- Inf
best_model <- NULL
best_params <- NULL

for (i in 1:nrow(marsGrid)) {
  params <- marsGrid[i, ]
  model <- earth(Yield ~ ., data = cbind(train_predictors, Yield = train_response), 
                 degree = params$.degree, nprune = params$.nprune)
  
  # Predict on the test set
  predictions <- predict(model, newdata = test_predictors)
  
  # Calculate RMSE for this model
  rmse_value <- rmse(test_response, predictions)
  
  # If this model has the best RMSE, store it
  if (rmse_value < best_rmse) {
    best_rmse <- rmse_value
    best_model <- model
    best_params <- params
  }
}

# Print the best model and parameters
cat("Best RMSE: ", best_rmse, "\n")
## Best RMSE:  1.060588
# Use the best model for predictions
final_predictions <- predict(best_model, newdata = test_predictors)

# Evaluate the final model
final_rmse <- rmse(test_response, final_predictions)
final_mae <- mae(test_response, final_predictions)

# Print final evaluation results
cat("Final RMSE: ", final_rmse, "\n")
## Final RMSE:  1.060588
cat("Final MAE: ", final_mae, "\n")
## Final MAE:  0.7917836
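For comparison, a sketch of the resampling-based equivalent in caret, which keeps the test set out of the selection entirely (bootstrap resampling by default; mars_tuned is an illustrative name):

# Sketch: tune degree and nprune on resamples of the training set only
mars_tuned <- caret::train(
  x = train_predictors,
  y = train_response,
  method = "earth",
  tuneGrid = expand.grid(degree = 1:2, nprune = seq(2, 38, by = 2))
)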
# Plot the final model's results using plotmo for interpretation

plotmo(best_model, type = "response", main = "MARS Model: Response vs. Predictors")
##  plotmo grid: background values (per-predictor medians of the 57 centered and scaled predictors); full numeric grid omitted

Predictor Importance

Once we have the optimal models, we can examine predictor importance. MARS exposes importance directly through the varImp function; KNN and SVM have no built-in measure, so we start from a baseline test RMSE that a permutation-style comparison can build on (see the sketch after the baseline).

KNN and SVM:

# Function to compute test-set RMSE for a fitted model
compute_performance <- function(model, test_data, test_response) {
  predictions <- predict(model, newdata = test_data)
  rmse <- sqrt(mean((predictions - test_response)^2))
  return(rmse)
}
# Baseline performance
baseline_rmse <- compute_performance(knn_model, test_predictors, test_response)
cat("Baseline RMSE: ", baseline_rmse, "\n")
## Baseline RMSE:  1.372491
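That baseline supports a simple permutation-importance check (a hedged sketch, not part of the original analysis): shuffle one predictor at a time, refit, and record how much the RMSE degrades.

# Permutation-importance sketch: the RMSE increase when a predictor is shuffled
set.seed(13)
perm_importance <- sapply(colnames(test_predictors), function(col) {
  shuffled <- test_predictors
  shuffled[[col]] <- sample(shuffled[[col]])
  perm_model <- kknn(Yield ~ ., train = cbind(train_predictors, Yield = train_response),
                     test = shuffled, k = 5)
  compute_performance(perm_model, shuffled, test_response) - baseline_rmse
})

# Predictors whose shuffling hurts the most matter the most
head(sort(perm_importance, decreasing = TRUE))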

MARS:

# Variable importance for MARS model
mars_importance <- varImp(mars_model)
print(mars_importance)
##                           Overall
## ManufacturingProcess32 100.000000
## ManufacturingProcess09  66.390853
## ManufacturingProcess13  42.288682
## ManufacturingProcess39  22.876918
## ManufacturingProcess01  21.608293
## BiologicalMaterial03    20.433283
## BiologicalMaterial02    11.505212
## ManufacturingProcess33   4.819014
# Build the importance data frame directly from the varImp output
importance_df <- data.frame(
  Predictor = rownames(mars_importance),
  Importance = mars_importance$Overall
)

# Create a barplot to visualize the importance of predictors
barplot(importance_df$Importance,
        names.arg = importance_df$Predictor,
        main = "Variable Importance for MARS Model",
        col = "skyblue",
        las = 2, # Rotate the x-axis labels for better readability
        cex.names = 0.8, # Adjust text size of labels
        border = "white",
        horiz = TRUE, # Horizontal bars for better label readability
        xlab = "Importance"
)

Relationships Between Top Predictors and Response

top_predictors <- head(importance_df, 5)$Predictor

# Create a new data frame containing a subset of the top predictors and the response
top_predictors_df <- ChemicalManufacturingProcess[1:32, c("ManufacturingProcess32", "ManufacturingProcess09", "BiologicalMaterial03")]
top_predictors_df$Yield <- ChemicalManufacturingProcess$Yield[1:32]

# Fit the linear regression model
linear_model <- lm(Yield ~ ., data = top_predictors_df)

# Print the model summary
summary(linear_model)
## 
## Call:
## lm(formula = Yield ~ ., data = top_predictors_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.98492 -0.39036  0.06245  0.37593  0.85332 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -19.87456    3.25861  -6.099 1.40e-06 ***
## ManufacturingProcess32   0.18342    0.01839   9.972 1.02e-10 ***
## ManufacturingProcess09   0.70436    0.04647  15.156 5.04e-15 ***
## BiologicalMaterial03    -0.01619    0.02497  -0.649    0.522    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5285 on 28 degrees of freedom
## Multiple R-squared:  0.9456, Adjusted R-squared:  0.9398 
## F-statistic: 162.2 on 3 and 28 DF,  p-value: < 2.2e-16
# Plot relationships between each top predictor and the response (Yield)

ggplot(top_predictors_df, aes(x = top_predictors_df[, 1], y = Yield)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Adding regression line
  labs(title = paste("Relationship between top predictor", top_predictors[1], "and Yield"), 
       x = top_predictors[1], y = "Yield") + 
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Use ggpairs to create a matrix of scatter plots for top predictors and the response variable
top_predictors_df$Yield <- as.numeric(top_predictors_df$Yield)  # Make sure Yield is numeric
ggpairs(top_predictors_df, 
        aes(color = Yield), 
        upper = list(continuous = "points"), 
        lower = list(continuous = "smooth"),
        progress = FALSE)  # Suppress progress bar

cor_matrix <- cor(top_predictors_df)

# Melt the correlation matrix for ggplot2 compatibility
melted_cor_matrix <- melt(cor_matrix)

# Plot the heat map
ggplot(data = melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  labs(title = "Heat Map of Correlations between Top Predictors and Yield")

# Residuals vs Fitted plot for model diagnostics
plot(linear_model, which = 1)

# Make predictions
predictions <- predict(linear_model, newdata = top_predictors_df)

# Calculate RMSE
rmse <- sqrt(mean((predictions - top_predictors_df$Yield)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 0.494341167163"
# Calculate MAE
mae <- mean(abs(predictions - top_predictors_df$Yield))
print(paste("MAE:", mae))
## [1] "MAE: 0.419288337899643"
# R-squared
rsq <- summary(linear_model)$r.squared
print(paste("R-squared:", rsq))
## [1] "R-squared: 0.945596181994789"

Cross-Check Top Predictors with a Random Forest

# Train a random forest; its built-in importance scores identify the top predictors
train_data <- cbind(train_predictors, Yield = train_response)

rf_importance <- randomForest::randomForest(Yield ~ ., data = train_data)

# Extract the four most important predictors
top_predictors <- names(sort(rf_importance$importance[, 1], decreasing = TRUE))[1:4]
top_predictors
## [1] "ManufacturingProcess32" "ManufacturingProcess13" "BiologicalMaterial03"  
## [4] "BiologicalMaterial12"
# Create the pairs plot for the top 4 predictors vs the response
pairs(cbind(response = train_response, train_predictors[, top_predictors]),
      main = "Top Predictors vs Yield")

Evaluate Model Performance

We will use Mean Absolute Error (MAE) to evaluate model performance.
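For $n$ test samples with predictions $\hat{y}_i$ and observed values $y_i$:

$$\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\lvert \hat{y}_i - y_i \rvert, \qquad \mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\hat{y}_i - y_i\right)^2}$$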

# Evaluate model performance (using MAE)
mae_nn <- mean(abs(nn_predictions - test_response))
mae_svm <- mean(abs(svm_predictions - test_response))
mae_knn <- mean(abs(knn_predictions - test_response))
mae_mars <- mean(abs(mars_predictions - test_response))

cat("MAE (NN):", mae_nn, "\n")
## MAE (NN): 1.697016
cat("MAE (SVM):", mae_svm, "\n")
## MAE (SVM): 0.8803408
cat("MAE (KNN):", mae_knn, "\n")
## MAE (KNN): 0.9732106
cat("MAE (MARS):", mae_mars, "\n")
## MAE (MARS): 0.6772846

Results and findings

Based on the MAE values, MARS gives the lowest error (0.68), followed by SVM (0.88) and KNN (0.97), with the neural network (1.70) the least effective in this case. The RMSE ranking agrees: MARS (1.07) performs best, followed closely by SVM (1.18), then KNN (1.37), while NN (2.08) has the largest error. My personal preference still goes to SVM for its simple implementation and good generalization performance.

ADDENDUM: Random Forest model

# Convert the predictors to a matrix (if required)
train_predictors_matrix <- as.matrix(train_predictors)

# Repeated 10-fold cross-validation (matches the resampling summary below)
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

rf_model <- caret::train(
  x = train_predictors_matrix,    # Ensure it's a data.frame or matrix with column names
  y = train_response,             # Vector of responses
  method = "rf",                  # Random Forest method
  trControl = fitControl,         # Cross-validation settings
  tuneLength = 10                 # Hyperparameter tuning
)

# Print the results of the model training
print(rf_model)
## Random Forest 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 129, 129, 128, 130, 129, 129, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE      
##    2    1.229303  0.6472954  0.9975732
##    8    1.108002  0.6967114  0.8886329
##   14    1.084762  0.7050603  0.8650151
##   20    1.076142  0.7082278  0.8510208
##   26    1.076357  0.7033922  0.8513888
##   32    1.070691  0.7044220  0.8410492
##   38    1.066271  0.7054733  0.8375578
##   44    1.074495  0.6995948  0.8406339
##   50    1.079506  0.6943438  0.8428872
##   57    1.079663  0.6938599  0.8418698
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 38.

Predictor Importance and Analysis

# Variable importance for the Random Forest model
rf_importance <- varImp(rf_model, scale = FALSE)
print(rf_importance)
## rf variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32 129.718
## ManufacturingProcess13  51.501
## BiologicalMaterial12    31.397
## BiologicalMaterial03    29.546
## ManufacturingProcess17  21.497
## BiologicalMaterial11    16.513
## ManufacturingProcess31  12.791
## BiologicalMaterial06    12.707
## ManufacturingProcess09  11.381
## BiologicalMaterial05    10.325
## ManufacturingProcess06   9.787
## BiologicalMaterial02     7.989
## BiologicalMaterial08     7.523
## ManufacturingProcess18   6.965
## ManufacturingProcess30   6.662
## BiologicalMaterial09     6.501
## ManufacturingProcess28   6.327
## ManufacturingProcess34   6.210
## BiologicalMaterial04     5.342
## ManufacturingProcess39   4.366
# Plot top predictors
plot(rf_importance, top = 10)