Exercise 7.2

Data preparation

First, I examined the training data and simulated the test dataset following the exercise instructions.
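
These data follow Friedman's benchmark simulation, \(y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10 x_4 + 5 x_5 + \epsilon\) with \(\epsilon \sim N(0, \sigma^2)\), so only X1–X5 are informative; X6–X10 are noise predictors.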

# Load required packages
library(mlbench)    # mlbench.friedman1()
library(caret)      # featurePlot(), train(), postResample(), varImp()
library(tidyverse)  # select(), arrange(), drop_na(), ggplot()

# I used the same randomization seed provided in the exercise instructions
set.seed(200) 

# Simulate training data
trainingData <- mlbench.friedman1(200, sd = 1)

# Convert 'x' from matrix to dataframe
trainingData$x <- data.frame(trainingData$x)
# Examine data
featurePlot(trainingData$x, trainingData$y)

set.seed(200) 

# Simulate a test set to estimate the true error rate
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

Model development

I trained and compared the performance of four nonlinear models: a neural network, multivariate adaptive regression splines (MARS), a support vector machine (SVM), and k-nearest neighbors (kNN).

# Define cross-validation method
ctrl <- trainControl(method = "cv", number = 10)

Neural network

set.seed(200) 

# Register parallel backend
library(doParallel)
n_cores <- max(1, parallel::detectCores() - 1)  # leave one core free
registerDoParallel(cores = n_cores)

# Define candidate models to test
nnetGrid <- expand.grid(.decay = c(0, 0.01, 0.1), .size = 1:5, .bag = FALSE)

# Fit model using training data
# The 'avNNet' method is used to perform model averaging
# as described in the textbook
nnetModel <- train(x = trainingData$x, y = trainingData$y, 
                  method = 'avNNet',
                  preProc = c('center', 'scale'),
                  tuneGrid = nnetGrid,
                  trControl = ctrl,
                  # number of models to average
                  repeats = 5,
                  # maximum iterations
                  maxit = 500,
                  # maximum number of weights
                  MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1,
                  # linear output units
                  linout = TRUE,
                  # suppress tracing optimization
                  trace = FALSE)

nnetModel
## Model Averaged Neural Network 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   decay  size  RMSE      Rsquared   MAE     
##   0.00   1     2.388511  0.7708442  1.880680
##   0.00   2     2.390423  0.7649245  1.929286
##   0.00   3     1.974630  0.8307087  1.564585
##   0.00   4     2.020615  0.8317097  1.636117
##   0.00   5     3.451602  0.6980520  2.246110
##   0.01   1     2.380845  0.7641822  1.871241
##   0.01   2     2.428199  0.7573996  1.885830
##   0.01   3     2.102588  0.8033818  1.659007
##   0.01   4     1.999703  0.8284804  1.563979
##   0.01   5     2.042573  0.8210607  1.628343
##   0.10   1     2.392324  0.7614517  1.873878
##   0.10   2     2.496895  0.7450697  2.000428
##   0.10   3     2.061674  0.8143677  1.640823
##   0.10   4     2.110922  0.8169582  1.719886
##   0.10   5     2.064228  0.8281129  1.684613
## 
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 3, decay = 0 and bag = FALSE.
set.seed(200) 

# Make predictions using test data
nnetPred <- predict(nnetModel, newdata = testData$x)
nnet_performance <- postResample(pred = nnetPred, obs = testData$y)

MARS

set.seed(200) 

# Define candidate models to test
# Parameters from example in Chapter 7
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)

# Fit model using training data
# Note that in MARS, the selection of cut points does not depend on the 
# scale of the predictor variables; however, to ensure that all models
# are compared relative to the same training data, I included pre-processing
marsModel <- train(x = trainingData$x, y = trainingData$y, 
                  method = 'earth',
                  preProc = c('center', 'scale'),
                  tuneGrid = marsGrid,
                  trControl = ctrl)

marsModel
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE      
##   1        2      4.188280  0.3042527  3.4606894
##   1        3      3.551182  0.4999832  2.8371162
##   1        4      2.653143  0.7167280  2.1282215
##   1        5      2.405769  0.7562160  1.9481607
##   1        6      2.295006  0.7754603  1.8531995
##   1        7      1.771950  0.8611767  1.3913569
##   1        8      1.647182  0.8774867  1.2995643
##   1        9      1.609816  0.8837307  1.2997049
##   1       10      1.635035  0.8798236  1.3094359
##   1       11      1.571915  0.8896147  1.2607106
##   1       12      1.571561  0.8898750  1.2530769
##   1       13      1.567577  0.8906927  1.2507948
##   1       14      1.571673  0.8909652  1.2455080
##   1       15      1.571673  0.8909652  1.2455080
##   1       16      1.571673  0.8909652  1.2455080
##   1       17      1.571673  0.8909652  1.2455080
##   1       18      1.571673  0.8909652  1.2455080
##   1       19      1.571673  0.8909652  1.2455080
##   1       20      1.571673  0.8909652  1.2455080
##   1       21      1.571673  0.8909652  1.2455080
##   1       22      1.571673  0.8909652  1.2455080
##   1       23      1.571673  0.8909652  1.2455080
##   1       24      1.571673  0.8909652  1.2455080
##   1       25      1.571673  0.8909652  1.2455080
##   1       26      1.571673  0.8909652  1.2455080
##   1       27      1.571673  0.8909652  1.2455080
##   1       28      1.571673  0.8909652  1.2455080
##   1       29      1.571673  0.8909652  1.2455080
##   1       30      1.571673  0.8909652  1.2455080
##   1       31      1.571673  0.8909652  1.2455080
##   1       32      1.571673  0.8909652  1.2455080
##   1       33      1.571673  0.8909652  1.2455080
##   1       34      1.571673  0.8909652  1.2455080
##   1       35      1.571673  0.8909652  1.2455080
##   1       36      1.571673  0.8909652  1.2455080
##   1       37      1.571673  0.8909652  1.2455080
##   1       38      1.571673  0.8909652  1.2455080
##   2        2      4.308195  0.2578397  3.6207010
##   2        3      3.706763  0.4604448  2.9757957
##   2        4      2.735073  0.6839796  2.2293125
##   2        5      2.492961  0.7285910  1.9701005
##   2        6      2.376226  0.7515868  1.8509493
##   2        7      2.056766  0.7880683  1.6268337
##   2        8      1.777202  0.8531962  1.3935838
##   2        9      1.689355  0.8660853  1.3332428
##   2       10      1.644239  0.8756652  1.3024389
##   2       11      1.515215  0.8908390  1.1976900
##   2       12      1.384404  0.9129573  1.1044863
##   2       13      1.315403  0.9237719  1.0512295
##   2       14      1.293663  0.9269840  1.0376277
##   2       15      1.244900  0.9320943  0.9875052
##   2       16      1.238511  0.9327014  0.9875981
##   2       17      1.244898  0.9321291  0.9943780
##   2       18      1.244898  0.9321291  0.9943780
##   2       19      1.244898  0.9321291  0.9943780
##   2       20      1.244898  0.9321291  0.9943780
##   2       21      1.244898  0.9321291  0.9943780
##   2       22      1.244898  0.9321291  0.9943780
##   2       23      1.244898  0.9321291  0.9943780
##   2       24      1.244898  0.9321291  0.9943780
##   2       25      1.244898  0.9321291  0.9943780
##   2       26      1.244898  0.9321291  0.9943780
##   2       27      1.244898  0.9321291  0.9943780
##   2       28      1.244898  0.9321291  0.9943780
##   2       29      1.244898  0.9321291  0.9943780
##   2       30      1.244898  0.9321291  0.9943780
##   2       31      1.244898  0.9321291  0.9943780
##   2       32      1.244898  0.9321291  0.9943780
##   2       33      1.244898  0.9321291  0.9943780
##   2       34      1.244898  0.9321291  0.9943780
##   2       35      1.244898  0.9321291  0.9943780
##   2       36      1.244898  0.9321291  0.9943780
##   2       37      1.244898  0.9321291  0.9943780
##   2       38      1.244898  0.9321291  0.9943780
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 16 and degree = 2.
set.seed(200) 

# Make predictions using test data
marsPred <- predict(marsModel, newdata = testData$x)
mars_performance <- postResample(pred = marsPred, obs = testData$y)

SVM

set.seed(200) 

# Fit model using training data
svmModel <- train(x = trainingData$x, y = trainingData$y, 
                  method = 'svmRadial',
                  preProc = c('center', 'scale'),
                  tuneLength = 10,
                  trControl = ctrl)

svmModel
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  2.525164  0.7810576  2.010680
##     0.50  2.270567  0.7944850  1.794902
##     1.00  2.099319  0.8155594  1.659342
##     2.00  2.005858  0.8302852  1.578799
##     4.00  1.934650  0.8435677  1.528373
##     8.00  1.915653  0.8475592  1.528614
##    16.00  1.923884  0.8463090  1.535976
##    32.00  1.923884  0.8463090  1.535976
##    64.00  1.923884  0.8463090  1.535976
##   128.00  1.923884  0.8463090  1.535976
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06299324
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06299324 and C = 8.
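
The tuning output above holds sigma constant because, for method = 'svmRadial', caret estimates the kernel width analytically from the training data rather than tuning it over a grid. A minimal sketch of that estimate, assuming caret's internal use of kernlab's sigest(), which returns low/median/high quantile estimates:

# Analytic estimate of the RBF kernel width, as computed by kernlab
library(kernlab)
sigmas <- sigest(as.matrix(trainingData$x), scaled = TRUE)
# caret averages the two outer quantile estimates (sigest subsamples the
# data, so the value varies slightly from run to run)
mean(sigmas[c(1, 3)])
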
set.seed(200) 

# Make predictions using test data
svmPred <- predict(svmModel, newdata = testData$x)
svm_performance <- postResample(pred = svmPred, obs = testData$y)

kNN

set.seed(200) 

# Fit model using training data
# I used the code provided in the exercise but added cross-validation
knnModel <- train(x = trainingData$x, y = trainingData$y, 
                  method = 'knn',
                  preProc = c('center', 'scale'),
                  tuneLength = 10,
                  trControl = ctrl)

knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.238598  0.5836232  2.705822
##    7  3.117335  0.6295372  2.561052
##    9  3.100423  0.6590940  2.524483
##   11  3.086639  0.6822198  2.506584
##   13  3.094904  0.6902613  2.504433
##   15  3.116059  0.7045172  2.516131
##   17  3.129874  0.7133067  2.529370
##   19  3.151840  0.7183283  2.546422
##   21  3.175787  0.7209301  2.574113
##   23  3.208213  0.7146199  2.611285
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
set.seed(200) 

# Make predictions using test data
# Code provided in exercise
knnPred <- predict(knnModel, newdata = testData$x)
knn_performance <- postResample(pred = knnPred, obs = testData$y)

Performance comparison

Which models appear to give the best performance?

Answer: Of the four models, MARS had the lowest RMSE and MAE and the highest \(R^2\), which indicates that it is the best fit for the data.

# Combine performance metrics from all models into dataframe
model_performance_df <- as.data.frame(rbind(nnet_performance,
                                            mars_performance,
                                            svm_performance,
                                            knn_performance))

model_performance_df$model <- c('NN', 'MARS', 'SVM', 'kNN')
rownames(model_performance_df) <- NULL

model_performance_df %>%
  select(model, RMSE, Rsquared, MAE) %>%
  arrange(RMSE)
##   model     RMSE  Rsquared      MAE
## 1  MARS 1.281766 0.9345701 1.016075
## 2    NN 1.884349 0.8607047 1.420308
## 3   SVM 2.087047 0.8253586 1.588687
## 4   kNN 3.147014 0.6670418 2.510034

Does MARS select the informative predictors (those named X1–X5)?

Answer: Yes. The most important predictor in the MARS model was X1, followed by X4, X2, X5, and X3.

# Scale parameter set to FALSE to show raw importance scores.
# With scaling, the importance score of the least important predictor becomes zero, 
# which is potentially misleading
varImp(marsModel, scale=FALSE)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   85.12
## X2   69.20
## X5   49.23
## X3   39.89
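
The hinge functions actually selected by the final MARS fit can be inspected directly on the underlying earth object (output omitted here for brevity):

# Inspect the hinge functions and coefficients of the final MARS model
summary(marsModel$finalModel)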

Exercise 7.5

The data for this exercise were pre-processed and modeled using partial least squares in Exercise 6.3 (Homework 7). Of note, missing data were imputed using multivariate imputation by chained equations (MICE) with predictive mean matching. This method generates multiple imputed datasets, each of which is used for modeling, followed by integration of performance metrics using Rubin's rules.1
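
For context, a minimal sketch of that setup is shown below; the objects CMP_imputations, n_impute_datasets, train_idx, train_df, and test_df are carried over from Exercise 6.3, and the exact calls here are assumptions rather than the original code:

# Sketch of the Exercise 6.3 setup (not re-run here; details are assumptions)
# library(mice)
# n_impute_datasets <- 5
# CMP_imputations <- mice(ChemicalManufacturingProcess, m = n_impute_datasets,
#                         method = 'pmm', seed = 200, printFlag = FALSE)
# train_idx <- caret::createDataPartition(ChemicalManufacturingProcess$Yield,
#                                         p = 0.75, list = FALSE)
# train_df <- ChemicalManufacturingProcess[train_idx, ]
# test_df  <- ChemicalManufacturingProcess[-train_idx, ]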

(a) Optimal nonlinear model

Which nonlinear regression model gives the optimal resampling and test set performance?

Answer: The best model was SVM, which had the lowest \(RMSE = 1.03\) and the highest \(R^2 = 0.71\).

Details:

I trained and compared the performance of four nonlinear models (neural network, MARS, SVM, and kNN).

# Neural network
# --------------
# Note: This block may take a few minutes to run

# Create matrix to store predictions from the model fit to each imputed dataset
predictions_mat <- matrix(NA, nrow = nrow(test_df), ncol = n_impute_datasets)

# Define candidate models to test
nnetGrid <- expand.grid(.decay = c(0, 1e-6, 1e-5, 1e-4, 1e-3, 0.01, 0.1),
                        .size = c(3, 5, 10),
                        .bag = FALSE)

# Loop through the imputed datasets, fit model and make predictions for each
for (i in seq_len(n_impute_datasets)) {
  # Extract the imputed dataset
  full_imputed <- mice::complete(CMP_imputations, i)
  
  # Subset training and test rows
  train_imputed <- full_imputed[train_idx, ]
  test_imputed <- full_imputed[-train_idx, ]  
  
  # Fit model using training data
  # Note: The MaxNWts formula from the textbook (used in Exercise 7.2) would
  # set MaxNWts = 291 here, but the largest network in nnetGrid (size = 10)
  # requires 581 weights and triggered "too many weights" warnings. As a
  # result, I reverted to the default value (1000; not explicitly set below).
  model <- train(
    Yield ~ ., data = train_imputed,
    method = 'avNNet',
    preProcess = c('center', 'scale'),
    tuneGrid = nnetGrid,
    trControl = ctrl,
    repeats = 5,
    maxit = 500,
    linout = TRUE,
    trace = FALSE
  )  
  
  # Make predictions using test data
  predictions_mat[, i] <- predict(model, newdata = test_imputed)
}

# Average the predictions and then calculate performance metrics
mean_predictions <- rowMeans(predictions_mat)
nn_performance <- caret::postResample(pred = mean_predictions, obs = test_imputed$Yield)
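
For reference, a single-hidden-layer network with p predictors, H hidden units, and one linear output has \(H(p + 1) + H + 1\) weights. A quick check of the numbers in the note above, using the 56 predictors in this data:

# Weight count for a single-hidden-layer regression network
n_weights <- function(size, p) size * (p + 1) + size + 1
n_weights(5, 56)   # 291: the textbook cap, sized for at most 5 hidden units
n_weights(10, 56)  # 581: the largest network in nnetGrid exceeds that cap
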
# MARS
# ----
# Create matrix to store predictions from the model fit to each imputed dataset
predictions_mat <- matrix(NA, nrow = nrow(test_df), ncol = n_impute_datasets)

# Define candidate models to test
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)

# Loop through the imputed datasets, fit model and make predictions for each
for (i in seq_len(n_impute_datasets)) {
  # Extract the imputed dataset
  full_imputed <- mice::complete(CMP_imputations, i)
  
  # Subset training and test rows
  train_imputed <- full_imputed[train_idx, ]
  test_imputed <- full_imputed[-train_idx, ]  
  
  # Fit model using training data
  model <- train(
    Yield ~ ., data = train_imputed,
    method = 'earth',
    preProcess = c('center', 'scale'),
    tuneGrid = marsGrid,
    trControl = ctrl
  )
  
  # Make predictions using test data
  predictions_mat[, i] <- predict(model, newdata = test_imputed)
}

# Average the predictions and then calculate performance metrics
mean_predictions <- rowMeans(predictions_mat)
mars_performance <- caret::postResample(pred = mean_predictions, obs = test_imputed$Yield)

# SVM
# ----

# Create list to store models
svm_models <- list()

# Create matrix to store predictions from the model fit to each imputed dataset
predictions_mat <- matrix(NA, nrow = nrow(test_df), ncol = n_impute_datasets)

# Loop through the imputed datasets, fit model and make predictions for each
for (i in seq_len(n_impute_datasets)) {
  # Extract the imputed dataset
  full_imputed <- mice::complete(CMP_imputations, i)
  
  # Subset training and test rows
  train_imputed <- full_imputed[train_idx, ]
  test_imputed <- full_imputed[-train_idx, ]  
  
  # Fit model using training data
  model <- train(
    Yield ~ ., data = train_imputed,
    method = 'svmRadial',
    preProcess = c('center', 'scale'),
    tuneLength = 10,
    trControl = ctrl
  )
  
  # Make predictions using test data
  predictions_mat[, i] <- predict(model, newdata = test_imputed)
  
  # Store the model
  svm_models[[i]] <- model
}

# Average the predictions and then calculate performance metrics
mean_predictions <- rowMeans(predictions_mat)
svm_performance <- caret::postResample(pred = mean_predictions, obs = test_imputed$Yield)

# kNN
# ----

# Create matrix to store predictions from the model fit to each imputed dataset
predictions_mat <- matrix(NA, nrow = nrow(test_df), ncol = n_impute_datasets)

# Loop through the imputed datasets, fit model and make predictions for each
for (i in seq_len(n_impute_datasets)) {
  # Extract the imputed dataset
  full_imputed <- mice::complete(CMP_imputations, i)
  
  # Subset training and test rows
  train_imputed <- full_imputed[train_idx, ]
  test_imputed <- full_imputed[-train_idx, ]  
  
  # Fit model using training data
  model <- train(
    Yield ~ ., data = train_imputed,
    method = 'knn',
    preProcess = c('center', 'scale'),
    tuneLength = 10,
    trControl = ctrl
  )
  
  # Make predictions using test data
  predictions_mat[, i] <- predict(model, newdata = test_imputed)
}

# Average the predictions and then calculate performance metrics
mean_predictions <- rowMeans(predictions_mat)
knn_performance <- caret::postResample(pred = mean_predictions, obs = test_imputed$Yield)

Performance comparison

Of the four models, SVM had the lowest RMSE and MAE and the highest \(R^2\), which indicates that it is the best fit for the data.

# Combine performance metrics from all models into dataframe
cmp_model_performance_df <- as.data.frame(rbind(nn_performance,
                                                mars_performance,
                                                svm_performance,
                                                knn_performance))

cmp_model_performance_df$model <- c('NN', 'MARS', 'SVM', 'kNN')
rownames(cmp_model_performance_df) <- NULL

cmp_model_performance_df %>%
  select(model, RMSE, Rsquared, MAE) %>%
  arrange(RMSE)
##   model     RMSE  Rsquared       MAE
## 1   SVM 1.031397 0.7116228 0.8447902
## 2  MARS 1.229586 0.5786710 0.9725749
## 3   kNN 1.421966 0.4492763 1.1222138
## 4    NN 1.457817 0.4700351 1.1585141

(b) Predictor importance in optimal nonlinear model

Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

Answer:

Because the MICE imputation process generates multiple imputed datasets, each of which has an associated model, I compared variable importance for each model.

The top 10 important variables in all models included an equal number of biological material and manufacturing process predictors. Notably, the top 4 important variables were Process32, Material06, Material02, and Material03 in all models, suggesting that these are the most important predictors of product yield. In addition, Process32 was the most important predictor in both the SVM model and the previous PLS model.

In general, most of the top 10 important predictors in the SVM models were also among the top 10 in the PLS models. Across the PLS models, the union of top-10 process variables comprised 09, 13, 17, 33, and 36, and the union of top-10 material variables comprised 01, 02, 03, 04, 06, and 08. Across the SVM models, the union of top-10 process variables comprised 09, 13, 17, 29, 31, and 36, and the union of top-10 material variables comprised 02, 03, 04, 06, and 12. Collectively, this suggests that the overlapping predictors are truly important determinants of product yield, because the two model families capture the response in different ways (linear and nonlinear).
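
To summarize importance across the five fits in one place, the scores can also be averaged; a small sketch using the svm_models list saved above:

# Average varImp scores across the SVM fits from the five imputed datasets
imp_mat <- sapply(svm_models, function(m) {
  imp <- caret::varImp(m)$importance
  imp[order(rownames(imp)), 'Overall']
})
rownames(imp_mat) <- sort(rownames(caret::varImp(svm_models[[1]])$importance))
head(sort(rowMeans(imp_mat), decreasing = TRUE), 10)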

Variable importance in SVM model using imputed dataset 1

varImp(svm_models[[1]])
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##            Overall
## Process32   100.00
## Material06   74.71
## Material02   59.86
## Material03   57.89
## Process36    57.30
## Process13    55.23
## Material04   54.66
## Material12   48.65
## Process31    43.29
## Process17    43.29
## Process33    41.47
## Process09    41.11
## Process29    40.89
## Material01   35.33
## Material11   33.52
## Material08   32.76
## Process06    32.22
## Process02    29.34
## Process27    26.83
## Material09   25.95

Variable importance in SVM model using imputed dataset 2

varImp(svm_models[[2]])
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##            Overall
## Process32   100.00
## Material06   74.71
## Material02   59.85
## Material03   57.89
## Process36    56.66
## Process13    55.22
## Material04   54.65
## Material12   48.64
## Process31    44.45
## Process17    43.28
## Process09    41.10
## Material01   35.32
## Process33    35.18
## Process29    34.06
## Material11   33.50
## Material08   32.75
## Process06    32.51
## Process04    29.20
## Process02    28.45
## Material09   25.94

Variable importance in SVM model using imputed dataset 3

varImp(svm_models[[3]])
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##            Overall
## Process32   100.00
## Material06   74.71
## Material02   59.86
## Material03   57.89
## Process13    55.23
## Material04   54.66
## Process36    53.57
## Material12   48.65
## Process17    43.29
## Process29    41.29
## Process09    41.11
## Process33    40.17
## Process31    35.93
## Material01   35.33
## Material11   33.52
## Process06    33.15
## Material08   32.76
## Process02    31.03
## Process27    26.97
## Process04    26.24

Variable importance in SVM model using imputed dataset 4

varImp(svm_models[[4]])
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##            Overall
## Process32   100.00
## Material06   74.71
## Material02   59.85
## Material03   57.89
## Process36    56.03
## Process13    55.22
## Material04   54.65
## Material12   48.64
## Process31    44.96
## Process17    43.28
## Process29    41.20
## Process09    41.10
## Material01   35.32
## Process33    35.18
## Material11   33.50
## Process06    33.06
## Material08   32.75
## Process02    30.71
## Process27    26.85
## Process04    26.23

Variable importance in SVM model using imputed dataset 5

varImp(svm_models[[5]])
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##            Overall
## Process32   100.00
## Material06   74.71
## Material02   59.85
## Material03   57.89
## Process13    55.22
## Material04   54.65
## Process36    52.28
## Material12   48.64
## Process17    43.28
## Process09    41.10
## Process31    38.97
## Process33    36.51
## Material01   35.32
## Process29    34.87
## Material11   33.50
## Process06    33.18
## Material08   32.75
## Process02    31.02
## Process27    26.76
## Process04    26.52

(c) Relationships between top predictors and response variable

Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

Answer:

Compared with the PLS models developed previously, few of the top 10 most important predictors were unique to the SVM models. Among manufacturing process variables, Process31 was a top-10 predictor in 3 of 5 SVM models but in no PLS models, and Process29 was a top-10 predictor in 1 of 5 SVM models but in no PLS models. Among biological material variables, Material12 was a top-10 predictor in all 5 SVM models but in no PLS models.

Scatterplots, Pearson correlation coefficients, and linear regression all indicate little or no linear relationship between these three variables and Yield. This is consistent with the better fit and predictive performance of the nonlinear SVM model (\(RMSE=1.03\), \(R^2=0.71\)) compared with the linear PLS model (\(RMSE=1.71\), \(R^2=0.56\)).

The predictor importance analysis in part (b) suggests that optimization (maximization) of Process32 is critical to increase product yield and, in turn, revenue. Although biological predictors cannot be changed (per exercise instructions), Material06, Material02, and Material03 may be useful for quality control of the raw material.

Relationship between Process31 and Yield

As shown in the scatterplot below, Process31 and Yield are not linearly correlated (Pearson \(r=-0.05\)). Regressing the data with a univariate linear model yields \(R^2 \approx 0\), which indicates that Process31 explains essentially none of the variance in Yield. This poor fit appears to be driven by an extreme outlier, which acts as a bad leverage point.

pearson_r <- round(cor(train_df$Process31, train_df$Yield, 
                       method = c("pearson"), use = 'complete.obs'), 2)

ggplot(drop_na(train_df, Process31), aes(x = Process31, y = Yield)) +
  geom_point() +
  labs(
    x = 'Process31',
    y = 'Yield',
    title = 'Relationship between Manufacturing Process31 and Yield'
  ) +
  geom_smooth(method = 'lm', formula = y ~ x, se = FALSE) +
  geom_text(x = 40, y = 42,
            label = paste0('r = ', pearson_r),
            color = 'blue') +
  theme_classic() +
  theme(
    plot.title = element_text(face = 'bold'),
    axis.title = element_text(face = 'bold')
  )

process31_yield_lm <- lm(Yield ~ Process31, data = train_df)
summary(process31_yield_lm)
## 
## Call:
## lm(formula = Yield ~ Process31, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9244 -1.4493 -0.1622  1.2483  6.1343 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41.19757    1.80364  22.841   <2e-16 ***
## Process31   -0.01423    0.02566  -0.555     0.58    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.848 on 118 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.002601,   Adjusted R-squared:  -0.005852 
## F-statistic: 0.3077 on 1 and 118 DF,  p-value: 0.5802
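
As a quick, informal check on that leverage point, values beyond the boxplot whiskers can be flagged (a sketch, not part of the original analysis):

# Flag Process31 values beyond the boxplot whiskers (1.5 x IQR rule)
boxplot.stats(na.omit(train_df$Process31))$out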

Relationship between Process29 and Yield

Similarly, Process29 and Yield are only weakly correlated (\(r=0.15\)). Regressing the data with a univariate linear model yields \(R^2 \approx 0\), which indicates that Process29 explains essentially none of the variance in Yield. Here too, the poor fit appears to be driven by an extreme outlier acting as a bad leverage point.

pearson_r <- round(cor(train_df$Process29, train_df$Yield, 
                       method = c("pearson"), use = 'complete.obs'), 2)

ggplot(drop_na(train_df, Process29), aes(x = Process29, y = Yield)) +
  geom_point() +
  labs(
    x = 'Process29',
    y = 'Yield',
    title = 'Relationship between Manufacturing Process29 and Yield'
  ) +
  geom_smooth(method = 'lm', formula = y ~ x, se = FALSE) +
  geom_text(x = 10, y = 40,
            label = paste0('r = ', pearson_r),
            color = 'blue') +
  theme_classic() +
  theme(
    plot.title = element_text(face = 'bold'),
    axis.title = element_text(face = 'bold')
  )

process29_yield_lm <- lm(Yield ~ Process29, data = train_df)
summary(process29_yield_lm)
## 
## Call:
## lm(formula = Yield ~ Process29, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0395 -1.4151 -0.1838  1.2686  6.0366 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.42090    1.72605  21.680   <2e-16 ***
## Process29    0.13925    0.08603   1.619    0.108    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.83 on 118 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.02172,    Adjusted R-squared:  0.01343 
## F-statistic:  2.62 on 1 and 118 DF,  p-value: 0.1082

Relationship between Material12 and Yield

Material12 and Yield are weakly to moderately correlated (\(r=0.36\)); however, the scatterplot does not show a clear linear relationship. Regressing the data with a univariate linear model yields \(R^2 = 0.13\), which indicates that Material12 explains only ~13% of the variance in Yield.

pearson_r <- round(cor(train_df$Material12, train_df$Yield, 
                       method = c("pearson"), use = 'complete.obs'), 2)

ggplot(drop_na(train_df, Material12), aes(x = Material12, y = Yield)) +
  geom_point() +
  labs(
    x = 'Material12',
    y = 'Yield',
    title = 'Relationship between Biological Material12 and Yield'
  ) +
  geom_smooth(method = 'lm', formula = y ~ x, se = FALSE) +
  geom_text(x = 22, y = 42.5,
            label = paste0('r = ', pearson_r),
            color = 'blue') +
  theme_classic() +
  theme(
    plot.title = element_text(face = 'bold'),
    axis.title = element_text(face = 'bold')
  )

material12_yield_lm <- lm(Yield ~ Material12, data = train_df)
summary(material12_yield_lm)
## 
## Call:
## lm(formula = Yield ~ Material12, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5886 -1.3237 -0.0237  1.1208  5.1771 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.8686     4.0605   5.632 1.16e-07 ***
## Material12    0.8577     0.2009   4.270 3.89e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.714 on 122 degrees of freedom
## Multiple R-squared:   0.13,  Adjusted R-squared:  0.1229 
## F-statistic: 18.23 on 1 and 122 DF,  p-value: 3.894e-05

Session Details

pander::pander(sessionInfo())

R version 4.5.2 (2025-10-31)

Platform: aarch64-apple-darwin20

locale: en_US.UTF-8||en_US.UTF-8||en_US.UTF-8||C||en_US.UTF-8||en_US.UTF-8

attached base packages: parallel, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: pander(v.0.6.6), doParallel(v.1.0.17), iterators(v.1.0.14), foreach(v.1.5.2), earth(v.5.3.5), plotmo(v.3.7.0), plotrix(v.3.8-14), Formula(v.1.2-5), caret(v.7.0-1), lattice(v.0.22-7), mlbench(v.2.1-7), mice(v.3.19.0), lubridate(v.1.9.5), forcats(v.1.0.1), stringr(v.1.6.0), dplyr(v.1.2.0), purrr(v.1.2.1), readr(v.2.1.6), tidyr(v.1.3.2), tibble(v.3.3.1), ggplot2(v.4.0.2), tidyverse(v.2.0.0) and AppliedPredictiveModeling(v.1.1-7)

loaded via a namespace (and not attached): Rdpack(v.2.6.5), pROC(v.1.19.0.1), rlang(v.1.1.7), magrittr(v.2.0.4), rpart.plot(v.3.1.4), otel(v.0.2.0), compiler(v.4.5.2), mgcv(v.1.9-4), vctrs(v.0.7.2), reshape2(v.1.4.5), pkgconfig(v.2.0.3), shape(v.1.4.6.1), fastmap(v.1.2.0), backports(v.1.5.0), labeling(v.0.4.3), rmarkdown(v.2.30), prodlim(v.2025.04.28), tzdb(v.0.5.0), nloptr(v.2.2.1), xfun(v.0.56), glmnet(v.4.1-10), jomo(v.2.7-6), cachem(v.1.1.0), jsonlite(v.2.0.0), recipes(v.1.3.1), pan(v.1.9), broom(v.1.0.12), cluster(v.2.1.8.1), R6(v.2.6.1), CORElearn(v.1.57.3.1), bslib(v.0.10.0), stringi(v.1.8.7), RColorBrewer(v.1.1-3), parallelly(v.1.46.1), boot(v.1.3-32), rpart(v.4.1.24), jquerylib(v.0.1.4), Rcpp(v.1.1.1), knitr(v.1.51), future.apply(v.1.20.1), Matrix(v.1.7-4), splines(v.4.5.2), nnet(v.7.3-20), timechange(v.0.4.0), tidyselect(v.1.2.1), rstudioapi(v.0.18.0), yaml(v.2.3.12), timeDate(v.4052.112), codetools(v.0.2-20), listenv(v.0.10.0), plyr(v.1.8.9), withr(v.3.0.2), S7(v.0.2.1), evaluate(v.1.0.5), future(v.1.69.0), survival(v.3.8-6), kernlab(v.0.9-33), pillar(v.1.11.1), stats4(v.4.5.2), reformulas(v.0.4.3.1), ellipse(v.0.5.0), generics(v.0.1.4), hms(v.1.1.4), scales(v.1.4.0), minqa(v.1.2.8), globals(v.0.18.0), class(v.7.3-23), glue(v.1.8.0), tools(v.4.5.2), data.table(v.1.18.2.1), lme4(v.1.1-38), ModelMetrics(v.1.2.2.2), gower(v.1.0.2), grid(v.4.5.2), rbibutils(v.2.4.1), ipred(v.0.9-15), nlme(v.3.1-168), cli(v.3.6.5), lava(v.1.8.2), gtable(v.0.3.6), sass(v.0.4.10), digest(v.0.6.39), farver(v.2.1.2), htmltools(v.0.5.9), lifecycle(v.1.0.5), hardhat(v.1.4.2), mitml(v.0.4-5) and MASS(v.7.3-65)