First, I simulated and examined the training data, then simulated a larger test set, following the exercise instructions.
# I used the same randomization seed provided in the exercise instructions
set.seed(200)
# Load data
trainingData <- mlbench.friedman1(200, sd = 1)
# Convert 'x' from matrix to dataframe
trainingData$x <- data.frame(trainingData$x)
# Examine data
featurePlot(trainingData$x, trainingData$y)
set.seed(200)
# Simulate a test set to estimate the true error rate
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
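For reference, mlbench.friedman1 simulates the benchmark from Friedman (1991), in which only the first five of the ten uniform predictors are informative:
\[y = 10 \sin(\pi x_1 x_2) + 20 (x_3 - 0.5)^2 + 10 x_4 + 5 x_5 + \epsilon, \quad \epsilon \sim N(0, \sigma^2)\]
Predictors X6 through X10 are pure noise, which is relevant to the later question of whether MARS selects only the informative predictors.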
I trained and compared the performance of four nonlinear models (neural network, multivariate adaptive regression splines (MARS), support vector machine (SVM), and k-nearest neighbors (kNN)).
# Define cross-validation method
ctrl <- trainControl(method = "cv", number = 10)
set.seed(200)
# Register parallel backend
n_cores <- parallel::detectCores() - 1  # defined here in case it was not set earlier
registerDoParallel(cores = n_cores)
# Define candidate models to test
nnetGrid <- expand.grid(.decay = c(0, 0.01, .1), .size = c(1:5), .bag = FALSE)
# Fit model using training data
# The 'avNNet' method is used to perform model averaging
# as described in the textbook
nnetModel <- train(x = trainingData$x, y = trainingData$y,
                   method = 'avNNet',
                   preProc = c('center', 'scale'),
                   tuneGrid = nnetGrid,
                   trControl = ctrl,
                   # number of models to average
                   repeats = 5,
                   # maximum iterations
                   maxit = 500,
                   # maximum number of weights
                   MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1,
                   # linear output units
                   linout = TRUE,
                   # suppress tracing optimization
                   trace = FALSE)
nnetModel
## Model Averaged Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 2.388511 0.7708442 1.880680
## 0.00 2 2.390423 0.7649245 1.929286
## 0.00 3 1.974630 0.8307087 1.564585
## 0.00 4 2.020615 0.8317097 1.636117
## 0.00 5 3.451602 0.6980520 2.246110
## 0.01 1 2.380845 0.7641822 1.871241
## 0.01 2 2.428199 0.7573996 1.885830
## 0.01 3 2.102588 0.8033818 1.659007
## 0.01 4 1.999703 0.8284804 1.563979
## 0.01 5 2.042573 0.8210607 1.628343
## 0.10 1 2.392324 0.7614517 1.873878
## 0.10 2 2.496895 0.7450697 2.000428
## 0.10 3 2.061674 0.8143677 1.640823
## 0.10 4 2.110922 0.8169582 1.719886
## 0.10 5 2.064228 0.8281129 1.684613
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 3, decay = 0 and bag = FALSE.
set.seed(200)
# Make predictions using test data
nnetPred <- predict(nnetModel, newdata = testData$x)
nnet_performance <- postResample(pred = nnetPred, obs = testData$y)
set.seed(200)
# Define candidate models to test
# Parameters from example in Chapter 7
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
# Fit model using training data
# Note that in MARS, the selection of cut points does not depend on the
# scale of the predictor variables; however, to keep the pre-processing
# consistent across all models, I centered and scaled here as well
marsModel <- train(x = trainingData$x, y = trainingData$y,
                   method = 'earth',
                   preProc = c('center', 'scale'),
                   tuneGrid = marsGrid,
                   trControl = ctrl)
marsModel
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.188280 0.3042527 3.4606894
## 1 3 3.551182 0.4999832 2.8371162
## 1 4 2.653143 0.7167280 2.1282215
## 1 5 2.405769 0.7562160 1.9481607
## 1 6 2.295006 0.7754603 1.8531995
## 1 7 1.771950 0.8611767 1.3913569
## 1 8 1.647182 0.8774867 1.2995643
## 1 9 1.609816 0.8837307 1.2997049
## 1 10 1.635035 0.8798236 1.3094359
## 1 11 1.571915 0.8896147 1.2607106
## 1 12 1.571561 0.8898750 1.2530769
## 1 13 1.567577 0.8906927 1.2507948
## 1 14 1.571673 0.8909652 1.2455080
## 1 15 1.571673 0.8909652 1.2455080
## 1 16 1.571673 0.8909652 1.2455080
## 1 17 1.571673 0.8909652 1.2455080
## 1 18 1.571673 0.8909652 1.2455080
## 1 19 1.571673 0.8909652 1.2455080
## 1 20 1.571673 0.8909652 1.2455080
## 1 21 1.571673 0.8909652 1.2455080
## 1 22 1.571673 0.8909652 1.2455080
## 1 23 1.571673 0.8909652 1.2455080
## 1 24 1.571673 0.8909652 1.2455080
## 1 25 1.571673 0.8909652 1.2455080
## 1 26 1.571673 0.8909652 1.2455080
## 1 27 1.571673 0.8909652 1.2455080
## 1 28 1.571673 0.8909652 1.2455080
## 1 29 1.571673 0.8909652 1.2455080
## 1 30 1.571673 0.8909652 1.2455080
## 1 31 1.571673 0.8909652 1.2455080
## 1 32 1.571673 0.8909652 1.2455080
## 1 33 1.571673 0.8909652 1.2455080
## 1 34 1.571673 0.8909652 1.2455080
## 1 35 1.571673 0.8909652 1.2455080
## 1 36 1.571673 0.8909652 1.2455080
## 1 37 1.571673 0.8909652 1.2455080
## 1 38 1.571673 0.8909652 1.2455080
## 2 2 4.308195 0.2578397 3.6207010
## 2 3 3.706763 0.4604448 2.9757957
## 2 4 2.735073 0.6839796 2.2293125
## 2 5 2.492961 0.7285910 1.9701005
## 2 6 2.376226 0.7515868 1.8509493
## 2 7 2.056766 0.7880683 1.6268337
## 2 8 1.777202 0.8531962 1.3935838
## 2 9 1.689355 0.8660853 1.3332428
## 2 10 1.644239 0.8756652 1.3024389
## 2 11 1.515215 0.8908390 1.1976900
## 2 12 1.384404 0.9129573 1.1044863
## 2 13 1.315403 0.9237719 1.0512295
## 2 14 1.293663 0.9269840 1.0376277
## 2 15 1.244900 0.9320943 0.9875052
## 2 16 1.238511 0.9327014 0.9875981
## 2 17 1.244898 0.9321291 0.9943780
## 2 18 1.244898 0.9321291 0.9943780
## 2 19 1.244898 0.9321291 0.9943780
## 2 20 1.244898 0.9321291 0.9943780
## 2 21 1.244898 0.9321291 0.9943780
## 2 22 1.244898 0.9321291 0.9943780
## 2 23 1.244898 0.9321291 0.9943780
## 2 24 1.244898 0.9321291 0.9943780
## 2 25 1.244898 0.9321291 0.9943780
## 2 26 1.244898 0.9321291 0.9943780
## 2 27 1.244898 0.9321291 0.9943780
## 2 28 1.244898 0.9321291 0.9943780
## 2 29 1.244898 0.9321291 0.9943780
## 2 30 1.244898 0.9321291 0.9943780
## 2 31 1.244898 0.9321291 0.9943780
## 2 32 1.244898 0.9321291 0.9943780
## 2 33 1.244898 0.9321291 0.9943780
## 2 34 1.244898 0.9321291 0.9943780
## 2 35 1.244898 0.9321291 0.9943780
## 2 36 1.244898 0.9321291 0.9943780
## 2 37 1.244898 0.9321291 0.9943780
## 2 38 1.244898 0.9321291 0.9943780
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 16 and degree = 2.
set.seed(200)
# Make predictions using test data
marsPred <- predict(marsModel, newdata = testData$x)
mars_performance <- postResample(pred = marsPred, obs = testData$y)
set.seed(200)
# Fit model using training data
svmModel <- train(x = trainingData$x, y = trainingData$y,
                  method = 'svmRadial',
                  preProc = c('center', 'scale'),
                  tuneLength = 10,
                  trControl = ctrl)
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.525164 0.7810576 2.010680
## 0.50 2.270567 0.7944850 1.794902
## 1.00 2.099319 0.8155594 1.659342
## 2.00 2.005858 0.8302852 1.578799
## 4.00 1.934650 0.8435677 1.528373
## 8.00 1.915653 0.8475592 1.528614
## 16.00 1.923884 0.8463090 1.535976
## 32.00 1.923884 0.8463090 1.535976
## 64.00 1.923884 0.8463090 1.535976
## 128.00 1.923884 0.8463090 1.535976
##
## Tuning parameter 'sigma' was held constant at a value of 0.06299324
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06299324 and C = 8.
set.seed(200)
# Make predictions using test data
svmPred <- predict(svmModel, newdata = testData$x)
svm_performance <- postResample(pred = svmPred, obs = testData$y)
set.seed(200)
# Fit model using training data
# I used the code provided in the exercise but added cross-validation
knnModel <- train(x = trainingData$x, y = trainingData$y,
                  method = 'knn',
                  preProc = c('center', 'scale'),
                  tuneLength = 10,
                  trControl = ctrl)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.238598 0.5836232 2.705822
## 7 3.117335 0.6295372 2.561052
## 9 3.100423 0.6590940 2.524483
## 11 3.086639 0.6822198 2.506584
## 13 3.094904 0.6902613 2.504433
## 15 3.116059 0.7045172 2.516131
## 17 3.129874 0.7133067 2.529370
## 19 3.151840 0.7183283 2.546422
## 21 3.175787 0.7209301 2.574113
## 23 3.208213 0.7146199 2.611285
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
set.seed(200)
# Make predictions using test data
# Code provided in the exercise
knnPred <- predict(knnModel, newdata = testData$x)
knn_performance <- postResample(pred = knnPred, obs = testData$y)
Which models appear to give the best performance?
Answer: Of the four models, MARS had the lowest RMSE and MAE and the highest \(R^2\), which indicates that it is the best fit for the data. This is consistent with the structure of the simulated data: with degree = 2, MARS can model the \(x_1 x_2\) interaction and approximate the quadratic term in \(x_3\) with hinge functions.
# Combine performance metrics from all models into dataframe
model_performance_df <- as.data.frame(rbind(nnet_performance,
                                            mars_performance,
                                            svm_performance,
                                            knn_performance))
model_performance_df$model <- c('NN', 'MARS', 'SVM', 'kNN')
rownames(model_performance_df) <- NULL
model_performance_df %>%
  select(model, RMSE, Rsquared, MAE) %>%
  arrange(RMSE)
## model RMSE Rsquared MAE
## 1 MARS 1.281766 0.9345701 1.016075
## 2 NN 1.884349 0.8607047 1.420308
## 3 SVM 2.087047 0.8253586 1.588687
## 4 kNN 3.147014 0.6670418 2.510034
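As an additional check on the cross-validated (rather than test set) performance, caret's resamples() function can summarize the resampling distributions of all four models side by side. A sketch (output not shown; for a strict comparison, the models should share resampling indices via the index argument of trainControl):
# Compare cross-validated performance distributions across the tuned models
cvResults <- resamples(list(NN = nnetModel, MARS = marsModel,
                            SVM = svmModel, kNN = knnModel))
summary(cvResults)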
Does MARS select the informative predictors (those named X1–X5)?
Answer: Yes. The most important predictor in the MARS model was X1, followed by X4, X2, X5, and X3.
# Scale parameter set to FALSE to show raw importance scores.
# With scaling, the importance score of the least important predictor becomes zero,
# which is potentially misleading
varImp(marsModel, scale=FALSE)
## earth variable importance
##
## Overall
## X1 100.00
## X4 85.12
## X2 69.20
## X5 49.23
## X3 39.89
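To see which basis functions the final pruned MARS model actually retained (output not shown), the underlying earth fit can be inspected directly:
# Inspect the hinge functions and coefficients of the final MARS model
summary(marsModel$finalModel)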
The data for this exercise were pre-processed and modeled using partial least squares in Exercise 6.3 (Homework 7). Of note, missing data were imputed using multivariate imputation by chained equations (MICE) with predictive mean matching. This method generates multiple imputed datasets, each of which is used for modeling, followed by integration of performance metrics using Rubin’s rules.1
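As a minimal illustration of Rubin’s rules (using hypothetical per-imputation estimates, not values from this analysis), point estimates are pooled as their mean, and the total variance combines within- and between-imputation components:
# Hypothetical per-imputation estimates (q) and within-imputation variances (u)
q <- c(1.10, 1.05, 1.12, 1.08, 1.09)
u <- c(0.020, 0.022, 0.019, 0.021, 0.020)
m <- length(q)
q_bar <- mean(q)                   # pooled point estimate
b <- var(q)                        # between-imputation variance
t_var <- mean(u) + (1 + 1/m) * b   # total variance per Rubin's rules
c(estimate = q_bar, se = sqrt(t_var))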
Which nonlinear regression model gives the optimal resampling and test set performance?
Answer: The best model was SVM, which had the lowest \(RMSE = 1.03\) and the highest \(R^2 = 0.71\).
Details:
I trained and compared the performance of four nonlinear models (neural network, MARS, SVM, and kNN).
# Neural network
# --------------
# Note: This block may take a few minutes to run
# Create matrix to store predictions for all models
predictions_mat <- matrix(NA, nrow = nrow(test_df), ncol = n_impute_datasets)
# Define candidate models to test
nnetGrid <- expand.grid(.decay = c(0, 1e-6, 1e-5, 1e-4, 1e-3, 0.01, 0.1),
                        .size = c(3, 5, 10),
                        .bag = FALSE)
# Loop through the imputed datasets, fit model and make predictions for each
for (i in 1:n_impute_datasets) {
  # Extract the imputed dataset
  full_imputed <- mice::complete(CMP_imputations, i)
  # Subset training and test rows
  train_imputed <- full_imputed[train_idx, ]
  test_imputed <- full_imputed[-train_idx, ]
  # Fit model using training data
  # Note: The formula for the MaxNWts parameter provided in the textbook and
  # used in Exercise 7.2 sets MaxNWts = 291. However, this resulted in warnings
  # about there being too many weights in the model (581). As a result,
  # I reverted to the default value (1000; not explicitly defined below).
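  # For context, a single-hidden-layer network with H hidden units and P
  # predictors has H*(P + 1) + H + 1 weights; with P = 56 predictors here,
  # the largest grid value H = 10 gives 10*57 + 10 + 1 = 581, exceeding 291.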
  model <- train(
    Yield ~ ., data = train_imputed,
    method = 'avNNet',
    preProcess = c('center', 'scale'),
    tuneGrid = nnetGrid,
    trControl = ctrl,
    repeats = 5,
    maxit = 500,
    linout = TRUE,
    trace = FALSE
  )
  # Make predictions using test data
  predictions_mat[, i] <- predict(model, newdata = test_imputed)
}
# Average the predictions and then calculate performance metrics
mean_predictions <- rowMeans(predictions_mat)
nn_performance <- caret::postResample(pred = mean_predictions, obs = test_imputed$Yield)
# MARS
# ----
# Create matrix to store predictions for all models
predictions_mat <- matrix(NA, nrow = nrow(test_df), ncol = n_impute_datasets)
# Define candidate models to test
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
# Loop through the imputed datasets, fit model and make predictions for each
for (i in 1:n_impute_datasets) {
  # Extract the imputed dataset
  full_imputed <- mice::complete(CMP_imputations, i)
  # Subset training and test rows
  train_imputed <- full_imputed[train_idx, ]
  test_imputed <- full_imputed[-train_idx, ]
  # Fit model using training data
  model <- train(
    Yield ~ ., data = train_imputed,
    method = 'earth',
    preProcess = c('center', 'scale'),
    tuneGrid = marsGrid,
    trControl = ctrl
  )
  # Make predictions using test data
  predictions_mat[, i] <- predict(model, newdata = test_imputed)
}
# Average the predictions and then calculate performance metrics
mean_predictions <- rowMeans(predictions_mat)
mars_performance <- caret::postResample(pred = mean_predictions, obs = test_imputed$Yield)
# SVM
# ----
# Create list to store models
svm_models <- list()
# Create matrix to store predictions for all models
predictions_mat <- matrix(NA, nrow = nrow(test_df), ncol = n_impute_datasets)
# Loop through the imputed datasets, fit model and make predictions for each
for (i in 1:n_impute_datasets) {
  # Extract the imputed dataset
  full_imputed <- mice::complete(CMP_imputations, i)
  # Subset training and test rows
  train_imputed <- full_imputed[train_idx, ]
  test_imputed <- full_imputed[-train_idx, ]
  # Fit model using training data
  model <- train(
    Yield ~ ., data = train_imputed,
    method = 'svmRadial',
    preProcess = c('center', 'scale'),
    tuneLength = 10,
    trControl = ctrl
  )
  # Make predictions using test data
  predictions_mat[, i] <- predict(model, newdata = test_imputed)
  # Store the model
  svm_models[[i]] <- model
}
# Average the predictions and then calculate performance metrics
mean_predictions <- rowMeans(predictions_mat)
svm_performance <- caret::postResample(pred = mean_predictions, obs = test_imputed$Yield)
# kNN
# ----
# Create matrix to store predictions for all models
predictions_mat <- matrix(NA, nrow = nrow(test_df), ncol = n_impute_datasets)
# Loop through the imputed datasets, fit model and make predictions for each
for (i in 1:n_impute_datasets) {
  # Extract the imputed dataset
  full_imputed <- mice::complete(CMP_imputations, i)
  # Subset training and test rows
  train_imputed <- full_imputed[train_idx, ]
  test_imputed <- full_imputed[-train_idx, ]
  # Fit model using training data
  model <- train(
    Yield ~ ., data = train_imputed,
    method = 'knn',
    preProcess = c('center', 'scale'),
    tuneLength = 10,
    trControl = ctrl
  )
  # Make predictions using test data
  predictions_mat[, i] <- predict(model, newdata = test_imputed)
}
# Average the predictions and then calculate performance metrics
mean_predictions <- rowMeans(predictions_mat)
knn_performance <- caret::postResample(pred = mean_predictions, obs = test_imputed$Yield)
Of the four models, SVM had the lowest RMSE and MAE and the highest \(R^2\), which indicates that it is the best fit for the data.
# Combine performance metrics from all models into dataframe
cmp_model_performance_df <- as.data.frame(rbind(nn_performance,
                                                mars_performance,
                                                svm_performance,
                                                knn_performance))
cmp_model_performance_df$model <- c('NN', 'MARS', 'SVM', 'kNN')
rownames(cmp_model_performance_df) <- NULL
cmp_model_performance_df %>%
  select(model, RMSE, Rsquared, MAE) %>%
  arrange(RMSE)
## model RMSE Rsquared MAE
## svm_performance SVM 1.031397 0.7116228 0.8447902
## mars_performance MARS 1.229586 0.5786710 0.9725749
## knn_performance kNN 1.421966 0.4492763 1.1222138
## nn_performance NN 1.457817 0.4700351 1.1585141
Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
Answer:
Because the MICE imputation process generates multiple imputed datasets, each of which has an associated model, I compared variable importance for each model.
The top 10 important variables in all models included an equal number of biological material and manufacturing process predictors. Notably, the top 4 important variables were Process32, Material06, Material02, and Material03 in all models, suggesting that these are the most important predictors of product yield. In addition, Process32 was the most important predictor in both the SVM models and the previous PLS model.
In general, most of the top 10 important predictors in the SVM models were also among the top 10 in the PLS models. In the PLS models, the union of top-10 process variables across models included 09, 13, 17, 33, and 36, and the union of top-10 material variables included 01, 02, 03, 04, 06, and 08. In the SVM models, the union of top-10 process variables included 09, 13, 17, 29, 31, and 36, and the union of top-10 material variables included 02, 03, 04, 06, and 12. Because the linear (PLS) and nonlinear (SVM) models capture the response in different ways, this overlap suggests that the shared predictors are truly important determinants of product yield.
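To make this comparison concrete, the overlap can be computed directly from the fitted models. A sketch using a hypothetical helper function, top10 (not part of caret):
# Hypothetical helper: top-10 predictor names from a fitted caret model
top10 <- function(fit) {
  imp <- caret::varImp(fit)$importance
  rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:10]
}
# Predictors ranked in the top 10 by every one of the five SVM models
Reduce(intersect, lapply(svm_models, top10))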
varImp(svm_models[[1]])
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## Process32 100.00
## Material06 74.71
## Material02 59.86
## Material03 57.89
## Process36 57.30
## Process13 55.23
## Material04 54.66
## Material12 48.65
## Process31 43.29
## Process17 43.29
## Process33 41.47
## Process09 41.11
## Process29 40.89
## Material01 35.33
## Material11 33.52
## Material08 32.76
## Process06 32.22
## Process02 29.34
## Process27 26.83
## Material09 25.95
varImp(svm_models[[2]])
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## Process32 100.00
## Material06 74.71
## Material02 59.85
## Material03 57.89
## Process36 56.66
## Process13 55.22
## Material04 54.65
## Material12 48.64
## Process31 44.45
## Process17 43.28
## Process09 41.10
## Material01 35.32
## Process33 35.18
## Process29 34.06
## Material11 33.50
## Material08 32.75
## Process06 32.51
## Process04 29.20
## Process02 28.45
## Material09 25.94
varImp(svm_models[[3]])
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## Process32 100.00
## Material06 74.71
## Material02 59.86
## Material03 57.89
## Process13 55.23
## Material04 54.66
## Process36 53.57
## Material12 48.65
## Process17 43.29
## Process29 41.29
## Process09 41.11
## Process33 40.17
## Process31 35.93
## Material01 35.33
## Material11 33.52
## Process06 33.15
## Material08 32.76
## Process02 31.03
## Process27 26.97
## Process04 26.24
varImp(svm_models[[4]])
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## Process32 100.00
## Material06 74.71
## Material02 59.85
## Material03 57.89
## Process36 56.03
## Process13 55.22
## Material04 54.65
## Material12 48.64
## Process31 44.96
## Process17 43.28
## Process29 41.20
## Process09 41.10
## Material01 35.32
## Process33 35.18
## Material11 33.50
## Process06 33.06
## Material08 32.75
## Process02 30.71
## Process27 26.85
## Process04 26.23
varImp(svm_models[[5]])
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## Process32 100.00
## Material06 74.71
## Material02 59.85
## Material03 57.89
## Process13 55.22
## Material04 54.65
## Process36 52.28
## Material12 48.64
## Process17 43.28
## Process09 41.10
## Process31 38.97
## Process33 36.51
## Material01 35.32
## Process29 34.87
## Material11 33.50
## Process06 33.18
## Material08 32.75
## Process02 31.02
## Process27 26.76
## Process04 26.52
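Averaging the importance scores across the five imputation-specific models condenses this agreement into a single ranking. A sketch (output not shown):
# Mean importance of each predictor across the five SVM models
imp_mat <- sapply(svm_models, function(fit) {
  imp <- caret::varImp(fit)$importance
  imp[order(rownames(imp)), 'Overall']
})
rownames(imp_mat) <- sort(rownames(caret::varImp(svm_models[[1]])$importance))
head(sort(rowMeans(imp_mat), decreasing = TRUE), 10)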
Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
Answer:
Compared with the PLS models developed previously, few of the top 10 most important predictors were unique to the SVM models. Among the manufacturing process variables, Process31 was a top-10 predictor in 3 of 5 SVM models but no PLS models, and Process29 was a top-10 predictor in 1 of 5 SVM models but no PLS models. Among the biological material variables, Material12 was a top-10 predictor in all 5 SVM models but no PLS models.
Scatterplots, Pearson correlation coefficients, and linear regression all support the hypothesis that there is little or no linear relationship between these three variables and Yield. This is in agreement with the better fit and predictive performance of the nonlinear SVM model (\(RMSE = 1.03\), \(R^2 = 0.71\)) compared with the linear PLS model (\(RMSE = 1.71\), \(R^2 = 0.56\)).
The predictor importance analysis in part (b) suggests that optimization (maximization) of Process32 is critical to increasing product yield and, in turn, revenue. Although the biological predictors cannot be changed (per the exercise instructions), Material06, Material02, and Material03 may be useful for quality control of the raw material.
Process31 and Yield

As shown in the scatterplot below, Process31 and Yield are not linearly correlated (Pearson \(r = -0.05\)). Attempting to regress the data using a univariate linear model results in \(R^2 \approx 0\), which indicates that Process31 explains essentially none of the variance in Yield. This poor fit appears to be partly the result of an extreme outlier, which acts as a bad leverage point.
pearson_r <- round(cor(train_df$Process31, train_df$Yield,
                       method = "pearson", use = 'complete.obs'), 2)
ggplot(drop_na(train_df, Process31), aes(x = Process31, y = Yield)) +
  geom_point() +
  labs(
    x = 'Process31',
    y = 'Yield',
    title = 'Relationship between Manufacturing Process31 and Yield'
  ) +
  geom_smooth(method = 'lm', formula = y ~ x, se = FALSE) +
  geom_text(x = 40, y = 42,
            label = paste0('r = ', pearson_r),
            color = 'blue') +
  theme_classic() +
  theme(
    plot.title = element_text(face = 'bold'),
    axis.title = element_text(face = 'bold')
  )
process31_yield_lm <- lm(Yield ~ Process31, data = train_df)
summary(process31_yield_lm)
##
## Call:
## lm(formula = Yield ~ Process31, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9244 -1.4493 -0.1622 1.2483 6.1343
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.19757 1.80364 22.841 <2e-16 ***
## Process31 -0.01423 0.02566 -0.555 0.58
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.848 on 118 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.002601, Adjusted R-squared: -0.005852
## F-statistic: 0.3077 on 1 and 118 DF, p-value: 0.5802
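To support the leverage-point interpretation, standard influence diagnostics can be applied to the fitted model. A quick check (output not shown):
# Flag high-leverage and influential observations using common rules of thumb
lev <- hatvalues(process31_yield_lm)
cd <- cooks.distance(process31_yield_lm)
which(lev > 2 * mean(lev) | cd > 4 / length(cd))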
Process29 and Yield

Similarly, Process29 and Yield are only weakly linearly correlated (\(r = 0.15\)). Attempting to regress the data using a univariate linear model results in \(R^2 \approx 0\), which indicates that Process29 explains essentially none of the variance in Yield. This poor fit appears to be partly the result of an extreme outlier, which acts as a bad leverage point.
pearson_r <- round(cor(train_df$Process29, train_df$Yield,
                       method = "pearson", use = 'complete.obs'), 2)
ggplot(drop_na(train_df, Process29), aes(x = Process29, y = Yield)) +
  geom_point() +
  labs(
    x = 'Process29',
    y = 'Yield',
    title = 'Relationship between Manufacturing Process29 and Yield'
  ) +
  geom_smooth(method = 'lm', formula = y ~ x, se = FALSE) +
  geom_text(x = 10, y = 40,
            label = paste0('r = ', pearson_r),
            color = 'blue') +
  theme_classic() +
  theme(
    plot.title = element_text(face = 'bold'),
    axis.title = element_text(face = 'bold')
  )
process29_yield_lm <- lm(Yield ~ Process29, data = train_df)
summary(process29_yield_lm)
##
## Call:
## lm(formula = Yield ~ Process29, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0395 -1.4151 -0.1838 1.2686 6.0366
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.42090 1.72605 21.680 <2e-16 ***
## Process29 0.13925 0.08603 1.619 0.108
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.83 on 118 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.02172, Adjusted R-squared: 0.01343
## F-statistic: 2.62 on 1 and 118 DF, p-value: 0.1082
Material12 and Yield

Material12 and Yield are weakly to moderately correlated (\(r = 0.36\)); however, the scatterplot does not show a clear linear relationship. Regressing the data using a univariate linear model results in \(R^2 \approx 0.13\), which indicates that Material12 explains very little (~13%) of the variance in Yield.
pearson_r <- round(cor(train_df$Material12, train_df$Yield,
                       method = "pearson", use = 'complete.obs'), 2)
ggplot(drop_na(train_df, Material12), aes(x = Material12, y = Yield)) +
  geom_point() +
  labs(
    x = 'Material12',
    y = 'Yield',
    title = 'Relationship between Biological Material12 and Yield'
  ) +
  geom_smooth(method = 'lm', formula = y ~ x, se = FALSE) +
  geom_text(x = 22, y = 42.5,
            label = paste0('r = ', pearson_r),
            color = 'blue') +
  theme_classic() +
  theme(
    plot.title = element_text(face = 'bold'),
    axis.title = element_text(face = 'bold')
  )
material12_yield_lm <- lm(Yield ~ Material12, data = train_df)
summary(material12_yield_lm)
##
## Call:
## lm(formula = Yield ~ Material12, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5886 -1.3237 -0.0237 1.1208 5.1771
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.8686 4.0605 5.632 1.16e-07 ***
## Material12 0.8577 0.2009 4.270 3.89e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.714 on 122 degrees of freedom
## Multiple R-squared: 0.13, Adjusted R-squared: 0.1229
## F-statistic: 18.23 on 1 and 122 DF, p-value: 3.894e-05
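Given the outliers noted above, rank-based correlations provide a useful robustness check because they are less sensitive to extreme leverage points. A sketch (output not shown):
# Spearman correlations between each predictor of interest and Yield
sapply(c('Process31', 'Process29', 'Material12'), function(v) {
  cor(train_df[[v]], train_df$Yield, method = 'spearman', use = 'complete.obs')
})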
pander::pander(sessionInfo())
R version 4.5.2 (2025-10-31)
Platform: aarch64-apple-darwin20
locale: en_US.UTF-8||en_US.UTF-8||en_US.UTF-8||C||en_US.UTF-8||en_US.UTF-8
attached base packages: parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: pander(v.0.6.6), doParallel(v.1.0.17), iterators(v.1.0.14), foreach(v.1.5.2), earth(v.5.3.5), plotmo(v.3.7.0), plotrix(v.3.8-14), Formula(v.1.2-5), caret(v.7.0-1), lattice(v.0.22-7), mlbench(v.2.1-7), mice(v.3.19.0), lubridate(v.1.9.5), forcats(v.1.0.1), stringr(v.1.6.0), dplyr(v.1.2.0), purrr(v.1.2.1), readr(v.2.1.6), tidyr(v.1.3.2), tibble(v.3.3.1), ggplot2(v.4.0.2), tidyverse(v.2.0.0) and AppliedPredictiveModeling(v.1.1-7)
loaded via a namespace (and not attached): Rdpack(v.2.6.5), pROC(v.1.19.0.1), rlang(v.1.1.7), magrittr(v.2.0.4), rpart.plot(v.3.1.4), otel(v.0.2.0), compiler(v.4.5.2), mgcv(v.1.9-4), vctrs(v.0.7.2), reshape2(v.1.4.5), pkgconfig(v.2.0.3), shape(v.1.4.6.1), fastmap(v.1.2.0), backports(v.1.5.0), labeling(v.0.4.3), rmarkdown(v.2.30), prodlim(v.2025.04.28), tzdb(v.0.5.0), nloptr(v.2.2.1), xfun(v.0.56), glmnet(v.4.1-10), jomo(v.2.7-6), cachem(v.1.1.0), jsonlite(v.2.0.0), recipes(v.1.3.1), pan(v.1.9), broom(v.1.0.12), cluster(v.2.1.8.1), R6(v.2.6.1), CORElearn(v.1.57.3.1), bslib(v.0.10.0), stringi(v.1.8.7), RColorBrewer(v.1.1-3), parallelly(v.1.46.1), boot(v.1.3-32), rpart(v.4.1.24), jquerylib(v.0.1.4), Rcpp(v.1.1.1), knitr(v.1.51), future.apply(v.1.20.1), Matrix(v.1.7-4), splines(v.4.5.2), nnet(v.7.3-20), timechange(v.0.4.0), tidyselect(v.1.2.1), rstudioapi(v.0.18.0), yaml(v.2.3.12), timeDate(v.4052.112), codetools(v.0.2-20), listenv(v.0.10.0), plyr(v.1.8.9), withr(v.3.0.2), S7(v.0.2.1), evaluate(v.1.0.5), future(v.1.69.0), survival(v.3.8-6), kernlab(v.0.9-33), pillar(v.1.11.1), stats4(v.4.5.2), reformulas(v.0.4.3.1), ellipse(v.0.5.0), generics(v.0.1.4), hms(v.1.1.4), scales(v.1.4.0), minqa(v.1.2.8), globals(v.0.18.0), class(v.7.3-23), glue(v.1.8.0), tools(v.4.5.2), data.table(v.1.18.2.1), lme4(v.1.1-38), ModelMetrics(v.1.2.2.2), gower(v.1.0.2), grid(v.4.5.2), rbibutils(v.2.4.1), ipred(v.0.9-15), nlme(v.3.1-168), cli(v.3.6.5), lava(v.1.8.2), gtable(v.0.3.6), sass(v.0.4.10), digest(v.0.6.39), farver(v.2.1.2), htmltools(v.0.5.9), lifecycle(v.1.0.5), hardhat(v.1.4.2), mitml(v.0.4-5) and MASS(v.7.3-65)