Do problems 7.2 and 7.5 in Kuhn and Johnson. There are only two but they have many parts. Please submit both a link to your Rpubs and the .rmd file.
y = 10sin(πx1x2)+20(x3−0.5)2 +10x4 +5x5 +N(0,σ2)
where the x values are random variables uniformly distributed between [0, 1] (there are also 5 other non-informative variables also created in the simula tion). The package mlbench contains a function called mlbench.
postResample(pred = knnPred, obs = testData$y) RMSE Rsquared 3.2286834 0.6871735 Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?
# Set seed for reproducibility
set.seed(200)
# Generate training data
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
# Generate test data
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
# Define train control (use repeated CV for stability)
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# 1. kNN (already shown, but rerun for consistency)
knnModel <- train(x = trainingData$x, y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl)
# 2. MARS
marsModel <- train(x = trainingData$x, y = trainingData$y,
method = "earth",
tuneLength = 10,
trControl = ctrl)
# 3. Random Forest
rfModel <- train(x = trainingData$x, y = trainingData$y,
method = "rf",
tuneLength = 5,
trControl = ctrl)
# 4. SVM Radial
svmModel <- train(x = trainingData$x, y = trainingData$y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl)
# 5. Neural Network
nnetModel <- train(x = trainingData$x, y = trainingData$y,
method = "nnet",
preProc = c("center", "scale"),
tuneLength = 5,
trace = FALSE,
trControl = ctrl)
# Collect models
models <- list(kNN = knnModel, MARS = marsModel, RF = rfModel,
SVM = svmModel, NNET = nnetModel)
# Compare resampling results
resamp_results <- resamples(models)
summary(resamp_results)
##
## Call:
## summary.resamples(object = resamp_results)
##
## Models: kNN, MARS, RF, SVM, NNET
## Number of resamples: 30
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## kNN 1.6948227 2.126767 2.479266 2.463204 2.791239 3.332986 0
## MARS 0.9080905 1.135586 1.223749 1.286705 1.373974 1.955600 0
## RF 1.4331692 1.809617 1.935672 1.972550 2.091619 2.540529 0
## SVM 1.1413868 1.355576 1.460862 1.493441 1.632186 2.104775 0
## NNET 12.8914968 13.131096 13.301336 13.416183 13.607133 14.248141 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## kNN 2.157744 2.765565 2.976898 3.041161 3.371740 3.885103 0
## MARS 1.119523 1.465947 1.560528 1.636287 1.767332 2.418587 0
## RF 1.770782 2.160481 2.355934 2.386435 2.579787 3.157105 0
## SVM 1.354465 1.712269 1.818063 1.876507 2.069615 2.474454 0
## NNET 13.668889 14.007535 14.168806 14.289053 14.529932 15.191264 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## kNN 0.4963614 0.6241825 0.7459069 0.7079347 0.7978241 0.8684934 0
## MARS 0.7190395 0.8807695 0.9015405 0.8918032 0.9234754 0.9672971 0
## RF 0.6567706 0.7521090 0.8213958 0.8049434 0.8573331 0.9060294 0
## SVM 0.7204388 0.8460680 0.8769719 0.8668740 0.9013229 0.9318848 0
## NNET NA NA NA NaN NA NA 30
# Test set evaluation
test_results <- data.frame(Model = character(),
RMSE = numeric(),
Rsquared = numeric())
for (name in names(models)) {
pred <- predict(models[[name]], newdata = testData$x)
perf <- postResample(pred = pred, obs = testData$y)
test_results <- rbind(test_results, data.frame(Model = name,
RMSE = perf[1],
Rsquared = perf[2]))
}
print(test_results)
## Model RMSE Rsquared
## RMSE kNN 3.148156 0.6747755
## RMSE1 MARS 1.813647 0.8677298
## RMSE2 RF 2.408286 0.7917156
## RMSE3 SVM 2.077211 0.8251031
## RMSE4 NNET 14.276927 NA
Best model: MARS
RMSE = 1.81 (much lower than others)
R² = 0.87 (explains 87% of variance)
Why MARS Wins?
MARS has the lowest RMSE in both resampling and test.
It also has the highest R² (0.892 average in resampling, 0.868 on test).
MARS handles interactions naturally
Check variable importance in the MARS model:
varImp(marsModel)
## earth variable importance
##
## Overall
## X1 100.00
## X4 82.08
## X2 62.79
## X5 38.07
## X3 25.80
## X6 0.00
# or
summary(marsModel$finalModel)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=1,
## nprune=12)
##
## coefficients
## (Intercept) 18.451984
## h(0.621722-X1) -11.074396
## h(0.601063-X2) -10.744225
## h(X3-0.281766) 20.607853
## h(0.447442-X3) 17.880232
## h(X3-0.447442) -23.282007
## h(X3-0.636458) 15.150350
## h(0.734892-X4) -10.027487
## h(X4-0.734892) 9.092045
## h(0.850094-X5) -4.723407
## h(X5-0.850094) 10.832932
## h(X6-0.361791) -1.956821
##
## Selected 12 of 18 terms, and 6 of 10 predictors (nprune=12)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
1. Does MARS select the informative predictors (X1–X5)?
Yes — strongly.
From the Importance section:
Importance: X1, X4, X2, X5, X3, all have nonzero importance (X1 = 100, X4 = 82, X2 = 63, X5 = 38, X3 = 25.8) X6, X7-unused, X8-unused, X9-unused, X10-unused
MARS successfully identified all 5 informative predictors and correctly ignored (or nearly ignored) the 5 non-informative ones.
# ============================================================
# Exercise 7.5 - Nonlinear Regression Models
# Chemical Manufacturing Process Dataset
# Based on imputation, splitting, and pre-processing from 6.3
# ============================================================
# ============================================================
# Step 1: Load Data (as in 6.3)
# ============================================================
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
# The data: processPredictors (176x57), yield (176x1)
# 57 predictors: 12 biological + 45 manufacturing process
# Combine into one data frame for easier handling
chem_data <- ChemicalManufacturingProcess
head(chem_data, 1)
## Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 1 38 6.25 49.58 56.97
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 1 12.74 19.51 43.73
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 1 100 16.66 11.44
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 1 3.46 138.09 18.83
## ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## 1 NA NA NA
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## 1 NA NA NA
## ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## 1 NA NA 43
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## 1 NA NA NA
## ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## 1 35.5 4898 6108
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 1 4682 35.5 4865
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 1 6049 4665 0
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 1 NA NA NA
## ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## 1 4873 6074 4685
## ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## 1 10.7 21 9.9
## ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## 1 69.1 156 66
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 1 2.4 486 0.019
## ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 1 0.5 3 7.2
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## 1 NA NA 11.6
## ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## 1 3 1.8 2.4
cat("Dataset dimensions:", dim(chem_data), "\n")
## Dataset dimensions: 176 58
cat("Missing values:", sum(is.na(chem_data)), "\n")
## Missing values: 106
# ============================================================
# Step 2: Impute Missing Values (as in 6.3)
# ============================================================
# Using k-nearest neighbors imputation (common approach from 6.3)
# KNN Imputation (most common in solutions)
set.seed(123)
impute_preproc <- preProcess(chem_data,
method = c("knnImpute"),
k = 5)
chem_imputed <- predict(impute_preproc, chem_data)
# Verify no missing values remain
cat("Missing values after imputation:", sum(is.na(chem_imputed)), "\n")
## Missing values after imputation: 0
# ============================================================
# Step 3: Split Data into Training and Test Sets
# ============================================================
set.seed(517) # Using seed from exercise 6.3 solutions
train_index <- createDataPartition(chem_imputed$Yield,
p = 0.8,
list = FALSE)
train_data <- chem_imputed[train_index, ]
test_data <- chem_imputed[-train_index, ]
cat("Training set size:", nrow(train_data), "\n")
## Training set size: 144
cat("Test set size:", nrow(test_data), "\n")
## Test set size: 32
# Separate predictors and response
train_x <- train_data %>% select(-Yield)
train_y <- train_data$Yield
test_x <- test_data %>% select(-Yield)
test_y <- test_data$Yield
# ============================================================
# Step 4: Define Training Control
# ============================================================
# Using repeated cross-validation for more stable estimates
ctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3,
verboseIter = FALSE)
# Pre-processing options (center + scale for all models)
preProc_options <- c("center", "scale")
# ============================================================
# Step 5: Train Multiple Nonlinear Regression Models
# ============================================================
# ---------- 5.1 k-Nearest Neighbors (KNN) ----------
set.seed(456)
knn_model <- train(x = train_x,
y = train_y,
method = "knn",
preProc = preProc_options,
tuneLength = 10,
trControl = ctrl)
cat("\n=== KNN Best Tuning Parameters ===\n")
##
## === KNN Best Tuning Parameters ===
print(knn_model$bestTune)
## k
## 1 5
# ---------- 5.2 Neural Network (NNET) ----------
set.seed(456)
nnet_grid <- expand.grid(decay = c(0, 0.01, 0.1),
size = c(1, 3, 5, 7))
nnet_model <- train(x = train_x,
y = train_y,
method = "nnet",
preProc = preProc_options,
tuneGrid = nnet_grid,
trControl = ctrl,
linout = TRUE,
trace = FALSE,
maxit = 500)
cat("\n=== Neural Network Best Tuning Parameters ===\n")
##
## === Neural Network Best Tuning Parameters ===
print(nnet_model$bestTune)
## size decay
## 9 1 0.1
# ---------- 5.3 Random Forest (RF) ----------
set.seed(456)
rf_model <- train(x = train_x,
y = train_y,
method = "rf",
preProc = preProc_options,
tuneLength = 10,
trControl = ctrl,
importance = TRUE)
cat("\n=== Random Forest Best Tuning Parameters ===\n")
##
## === Random Forest Best Tuning Parameters ===
print(rf_model$bestTune)
## mtry
## 4 20
# ---------- 5.4 MARS (Earth) ----------
set.seed(456)
mars_grid <- expand.grid(degree = 1:2,
nprune = 2:30)
mars_model <- train(x = train_x,
y = train_y,
method = "earth",
preProc = preProc_options,
tuneGrid = mars_grid,
trControl = ctrl)
cat("\n=== MARS Best Tuning Parameters ===\n")
##
## === MARS Best Tuning Parameters ===
print(mars_model$bestTune)
## nprune degree
## 7 8 1
# ---------- 5.5 Support Vector Machine (SVM) ----------
set.seed(456)
svm_model <- train(x = train_x,
y = train_y,
method = "svmRadial",
preProc = preProc_options,
tuneLength = 15,
trControl = ctrl)
cat("\n=== SVM Best Tuning Parameters ===\n")
##
## === SVM Best Tuning Parameters ===
print(svm_model$bestTune)
## sigma C
## 6 0.01612431 8
# ============================================================
# Step 6: Evaluate Models on Training (Resampling) and Test Sets
# ============================================================
# Function to get training performance from train object
get_training_performance <- function(model) {
results <- model$results
best_row <- which.min(results$RMSE)
data.frame(RMSE = results$RMSE[best_row],
Rsquared = results$Rsquared[best_row])
}
# Function to evaluate on test set
evaluate_test <- function(model, test_x, test_y) {
predictions <- predict(model, newdata = test_x)
metrics <- postResample(pred = predictions, obs = test_y)
data.frame(RMSE = metrics[1], Rsquared = metrics[2])
}
# Collect results
models_list <- list(KNN = knn_model,
NeuralNet = nnet_model,
RandomForest = rf_model,
MARS = mars_model,
SVM = svm_model)
# Store results
training_results <- data.frame()
test_results <- data.frame()
for (name in names(models_list)) {
train_perf <- get_training_performance(models_list[[name]])
train_perf$Model <- name
training_results <- rbind(training_results, train_perf)
test_perf <- evaluate_test(models_list[[name]], test_x, test_y)
test_perf$Model <- name
test_results <- rbind(test_results, test_perf)
}
# Reorder columns
training_results <- training_results[, c("Model", "RMSE", "Rsquared")]
test_results <- test_results[, c("Model", "RMSE", "Rsquared")]
# Sort by test RMSE (lower is better)
test_results <- test_results[order(test_results$RMSE), ]
cat("\n==================================================\n")
##
## ==================================================
cat("=== TRAINING (Resampling) Performance ===\n")
## === TRAINING (Resampling) Performance ===
cat("==================================================\n")
## ==================================================
print(training_results[order(training_results$RMSE), ])
## Model RMSE Rsquared
## 3 RandomForest 0.6274546 0.6453596
## 5 SVM 0.6316745 0.6144536
## 4 MARS 0.6639217 0.5618524
## 1 KNN 0.6913311 0.5305066
## 2 NeuralNet 0.7213128 0.5599081
cat("\n==================================================\n")
##
## ==================================================
cat("=== TEST SET Performance ===\n")
## === TEST SET Performance ===
cat("==================================================\n")
## ==================================================
print(test_results)
## Model RMSE Rsquared
## RMSE4 SVM 0.5149061 0.7497862
## RMSE2 RandomForest 0.5422061 0.7527479
## RMSE3 MARS 0.6150528 0.6481346
## RMSE KNN 0.6575686 0.6460840
## RMSE1 NeuralNet 0.7419546 0.5946228
# ============================================================
# Step 7: Answer to Question (a)
# ============================================================
cat("\n==================================================\n")
##
## ==================================================
cat("=== ANSWER TO QUESTION 7.5(a) ===\n")
## === ANSWER TO QUESTION 7.5(a) ===
cat("==================================================\n")
## ==================================================
best_train_model <- training_results[which.min(training_results$RMSE), "Model"]
best_test_model <- test_results[which.min(test_results$RMSE), "Model"]
cat("Optimal resampling (training) performance:", best_train_model, "\n")
## Optimal resampling (training) performance: RandomForest
cat("Optimal test set performance:", best_test_model, "\n")
## Optimal test set performance: SVM
Optimal Performance:
1. SVM performed best on test data
2. Random Forest was most consistent
3. MARS, KNN, and Neural Network underperformed
MARS: Test RMSE 0.615, R² 0.648 KNN: Test RMSE 0.658, R² 0.646 NeuralNet: Test RMSE 0.742, R² 0.595 (worst by far)
# ============================================================
# Visual Comparison
# ============================================================
# Create comparison plot
comparison_df <- rbind(
data.frame(Model = test_results$Model, Metric = "RMSE", Value = test_results$RMSE, Type = "Test"),
data.frame(Model = training_results$Model, Metric = "RMSE", Value = training_results$RMSE, Type = "Training")
)
library(ggplot2)
ggplot(comparison_df, aes(x = reorder(Model, -Value), y = Value, fill = Type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Model Performance Comparison - Chemical Manufacturing Process",
x = "Model", y = "RMSE (lower is better)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# ============================================================
# Top 5 Predictors for All Nonlinear Models
# ============================================================
library(tidyverse)
library(caret)
# Function to classify predictors
classify_predictor <- function(predictor) {
ifelse(grepl("BiologicalMaterial", predictor), "Biological", "Process")
}
# Extract top 5 predictors for each model
get_top5 <- function(model, model_name) {
importance <- varImp(model, scale = TRUE)$importance
top5 <- importance %>%
rownames_to_column("Predictor") %>%
arrange(desc(Overall)) %>%
slice_head(n = 5) %>%
mutate(Model = model_name,
Type = classify_predictor(Predictor))
return(top5)
}
# Get top 5 for all nonlinear models
all_top5 <- bind_rows(
get_top5(knn_model, "KNN"),
get_top5(nnet_model, "NeuralNet"),
get_top5(rf_model, "RandomForest"),
get_top5(mars_model, "MARS"),
get_top5(svm_model, "SVM")
)
# Display results
print("=== Top 5 Predictors for All Nonlinear Models ===")
## [1] "=== Top 5 Predictors for All Nonlinear Models ==="
print(all_top5)
## Predictor Overall Model Type
## 1 ManufacturingProcess32 100.00000 KNN Process
## 2 ManufacturingProcess13 83.81915 KNN Process
## 3 BiologicalMaterial06 78.47463 KNN Biological
## 4 ManufacturingProcess36 76.47953 KNN Process
## 5 ManufacturingProcess17 69.03277 KNN Process
## 6 ManufacturingProcess32 100.00000 NeuralNet Process
## 7 BiologicalMaterial12 84.99691 NeuralNet Biological
## 8 BiologicalMaterial04 76.78742 NeuralNet Biological
## 9 BiologicalMaterial06 69.76208 NeuralNet Biological
## 10 ManufacturingProcess29 68.21831 NeuralNet Process
## 11 ManufacturingProcess32 100.00000 RandomForest Process
## 12 BiologicalMaterial12 45.92849 RandomForest Biological
## 13 BiologicalMaterial03 43.10222 RandomForest Biological
## 14 ManufacturingProcess13 42.78401 RandomForest Process
## 15 BiologicalMaterial06 40.76003 RandomForest Biological
## 16 ManufacturingProcess32 100.00000 MARS Process
## 17 ManufacturingProcess09 58.01505 MARS Process
## 18 ManufacturingProcess13 29.82021 MARS Process
## 19 ManufacturingProcess01 17.40827 MARS Process
## 20 ManufacturingProcess22 17.40827 MARS Process
## 21 ManufacturingProcess32 100.00000 SVM Process
## 22 ManufacturingProcess13 83.81915 SVM Process
## 23 BiologicalMaterial06 78.47463 SVM Biological
## 24 ManufacturingProcess36 76.47953 SVM Process
## 25 ManufacturingProcess17 69.03277 SVM Process
# Summary: Which variables appear most frequently?
frequent_predictors <- all_top5 %>%
group_by(Predictor, Type) %>%
summarise(Appearances = n(), .groups = "drop") %>%
arrange(desc(Appearances))
print("=== Most Frequently Important Predictors Across Models ===")
## [1] "=== Most Frequently Important Predictors Across Models ==="
print(frequent_predictors)
## # A tibble: 12 × 3
## Predictor Type Appearances
## <chr> <chr> <int>
## 1 ManufacturingProcess32 Process 5
## 2 BiologicalMaterial06 Biological 4
## 3 ManufacturingProcess13 Process 4
## 4 BiologicalMaterial12 Biological 2
## 5 ManufacturingProcess17 Process 2
## 6 ManufacturingProcess36 Process 2
## 7 BiologicalMaterial03 Biological 1
## 8 BiologicalMaterial04 Biological 1
## 9 ManufacturingProcess01 Process 1
## 10 ManufacturingProcess09 Process 1
## 11 ManufacturingProcess22 Process 1
## 12 ManufacturingProcess29 Process 1
# Dominance summary by model
dominance <- all_top5 %>%
group_by(Model, Type) %>%
summarise(Count = n(), .groups = "drop") %>%
pivot_wider(names_from = Type, values_from = Count, values_fill = 0) %>%
mutate(Process_Pct = round(100 * Process / (Process + Biological), 1),
Biological_Pct = round(100 * Biological / (Process + Biological), 1))
print("=== Biological vs Process Dominance by Model ===")
## [1] "=== Biological vs Process Dominance by Model ==="
print(dominance)
## # A tibble: 5 × 5
## Model Biological Process Process_Pct Biological_Pct
## <chr> <int> <int> <dbl> <dbl>
## 1 KNN 1 4 80 20
## 2 MARS 0 5 100 0
## 3 NeuralNet 3 2 40 60
## 4 RandomForest 3 2 40 60
## 5 SVM 1 4 80 20
# Overall dominance across all models
total_dominance <- all_top5 %>%
group_by(Type) %>%
summarise(Total_Appearances = n()) %>%
mutate(Percentage = round(100 * Total_Appearances / sum(Total_Appearances), 1))
print("=== Overall Dominance (All Nonlinear Models Combined) ===")
## [1] "=== Overall Dominance (All Nonlinear Models Combined) ==="
print(total_dominance)
## # A tibble: 2 × 3
## Type Total_Appearances Percentage
## <chr> <int> <dbl>
## 1 Biological 8 32
## 2 Process 17 68
ManufacturingProcess32 is universally important — robust finding across all modeling approaches
BiologicalMaterial06 is the most important biological variable (appears in 4 of 5 models)
MARS stands out as the only model that completely excludes biological variables from its top 5
Model choice matters — NeuralNet and RandomForest assign more weight to biological variables than KNN, SVM, or MARS
# ============================================================
# 7.5(c) - Explore Relationships: Top 5 MARS Predictors vs Yield
# ============================================================
library(ggplot2)
library(gridExtra)
library(dplyr)
# Extract top 5 predictors from MARS
mars_importance <- varImp(mars_model, scale = TRUE)
top_predictors <- rownames(mars_importance$importance)[
order(mars_importance$importance$Overall, decreasing = TRUE)[1:5]
]
# Create plot data
plot_data <- train_data %>%
select(Yield, all_of(top_predictors))
names(plot_data) <- c("Yield", "Top1", "Top2", "Top3", "Top4", "Top5")
# Plot 1: Top predictor vs Yield
p1 <- ggplot(plot_data, aes(x = Top1, y = Yield)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "loess", se = TRUE, color = "red") +
labs(title = paste(top_predictors[1], "vs Yield"),
subtitle = paste("MARS Importance =", round(mars_importance$importance[top_predictors[1], "Overall"], 1)),
x = top_predictors[1], y = "Yield (%)") +
theme_minimal()
# Plot 2: Second predictor vs Yield
p2 <- ggplot(plot_data, aes(x = Top2, y = Yield)) +
geom_point(alpha = 0.6, color = "darkgreen") +
geom_smooth(method = "loess", se = TRUE, color = "red") +
labs(title = paste(top_predictors[2], "vs Yield"),
subtitle = paste("MARS Importance =", round(mars_importance$importance[top_predictors[2], "Overall"], 1)),
x = top_predictors[2], y = "Yield (%)") +
theme_minimal()
# Plot 3: Third predictor vs Yield
p3 <- ggplot(plot_data, aes(x = Top3, y = Yield)) +
geom_point(alpha = 0.6, color = "purple") +
geom_smooth(method = "loess", se = TRUE, color = "red") +
labs(title = paste(top_predictors[3], "vs Yield"),
subtitle = paste("MARS Importance =", round(mars_importance$importance[top_predictors[3], "Overall"], 1)),
x = top_predictors[3], y = "Yield (%)") +
theme_minimal()
# Plot 4: Third predictor vs Yield
p4 <- ggplot(plot_data, aes(x = Top4, y = Yield)) +
geom_point(alpha = 0.6, color = "purple") +
geom_smooth(method = "loess", se = TRUE, color = "red") +
labs(title = paste(top_predictors[4], "vs Yield"),
subtitle = paste("MARS Importance =", round(mars_importance$importance[top_predictors[4], "Overall"], 1)),
x = top_predictors[4], y = "Yield (%)") +
theme_minimal()
# Plot 5: Third predictor vs Yield
p5 <- ggplot(plot_data, aes(x = Top5, y = Yield)) +
geom_point(alpha = 0.6, color = "purple") +
geom_smooth(method = "loess", se = TRUE, color = "red") +
labs(title = paste(top_predictors[5], "vs Yield"),
subtitle = paste("MARS Importance =", round(mars_importance$importance[top_predictors[5], "Overall"], 1)),
x = top_predictors[5], y = "Yield (%)") +
theme_minimal()
grid.arrange(p1, p2, p3, ncol = 3, widths = c(1, 1, 1))
# For 2 plots side by side
grid.arrange(p4, p5, ncol = 2, widths = c(1, 1))
# Correlation summary
cat("\nCorrelation with Yield:\n")
##
## Correlation with Yield:
cat(paste(top_predictors[1], ":", round(cor(plot_data$Top1, plot_data$Yield), 3), "\n"))
## ManufacturingProcess32 : 0.605
cat(paste(top_predictors[2], ":", round(cor(plot_data$Top2, plot_data$Yield), 3), "\n"))
## ManufacturingProcess09 : 0.482
cat(paste(top_predictors[3], ":", round(cor(plot_data$Top3, plot_data$Yield), 3), "\n"))
## ManufacturingProcess13 : -0.478
cat(paste(top_predictors[4], ":", round(cor(plot_data$Top2, plot_data$Yield), 4), "\n"))
## ManufacturingProcess01 : 0.4825
cat(paste(top_predictors[5], ":", round(cor(plot_data$Top3, plot_data$Yield), 5), "\n"))
## ManufacturingProcess22 : -0.47826
Relationship Findings - Top 5 MARS Predictors vs Yield
Intuition Revealed
All five top predictors are process variables (not biological). To maximize yield:
Increase ManufacturingProcess32, ManufacturingProcess09 and ManufacturingProcess01
Decrease ManufacturingProcess13 and ManufacturingProcess22
These are actionable levers that can be controlled in real-time during manufacturing, unlike biological variables which are fixed raw material properties