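The package-startup messages in the original output imply the libraries used throughout; to make the script self-contained, the corresponding library() calls are listed here. The exact set is reconstructed from the functions called below, so treat it as an assumption rather than the original header.
# Packages assumed by the code below (startup/masking messages suppressed)
library(mlbench)    # mlbench.friedman1() simulated benchmark data
library(caret)      # train(), featurePlot(), postResample(), avNNet(), preProcess(), varImp()
library(nnet)       # single-layer neural networks
library(earth)      # MARS models
library(kernlab)    # ksvm() support vector machines
library(dplyr)      # data manipulation (masks stats::filter and stats::lag)
library(ggplot2)    # plotting
library(AppliedPredictiveModeling)  # ChemicalManufacturingProcess data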
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame.
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## mlbench.friedman1 creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
## Example of tuning a model:
knnmodel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnmodel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnpred <- predict(knnmodel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values.
postResample(pred = knnpred, obs = testData$y)
##      RMSE  Rsquared       MAE
## 3.2040595 0.6819919 2.5683461
# Single-Layer Neural Network with 5 Hidden Neurons
# Train a single-layer neural network model
single_layer_nn <- nnet(
x = trainingData$x, # Input features from training dataset
y = trainingData$y, # Target output values from training dataset
size = 5, # Number of neurons in the hidden layer
decay = 0.01, # Regularization parameter to reduce overfitting
linout = TRUE, # Use linear output for regression
trace = FALSE, # Suppress training output details
maxit = 500, # Maximum number of iterations for convergence
MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1 # Max weights based on inputs and hidden layer size
)
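The MaxNWts bound can be verified with a line of arithmetic: a network with one hidden layer of `size` units and p inputs has size*(p + 1) hidden-layer weights and biases plus size + 1 output weights and bias.
# Sanity check of the MaxNWts value used above
p <- ncol(trainingData$x)  # 10 predictors
5 * (p + 1) + 5 + 1        # 61 weights for size = 5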
# Predict using the single-layer model on test dataset
single_layer_predictions <- predict(single_layer_nn, newdata = testData$x)
# Evaluate model performance on the test set: RMSE, R-squared, and MAE
single_layer_performance <- postResample(pred = single_layer_predictions, obs = testData$y)
# Averaged Neural Network Model with Multiple Repeats
# Train an ensemble neural network model with averaged predictions
averaged_nn_model <- avNNet(
x = trainingData$x, # Input features from training dataset
y = trainingData$y, # Target output values from training dataset
size = 5, # Number of neurons in the hidden layer of each network
decay = 0.01, # Regularization parameter to reduce overfitting
repeats = 5, # Number of model repeats for ensemble averaging
linout = TRUE, # Use linear output for regression
trace = FALSE, # Suppress training output details
maxit = 500, # Maximum number of iterations for convergence
MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1 # Max weights based on inputs and hidden layer size
)
## Warning: executing %dopar% sequentially: no parallel backend registered
# Predict using the averaged model on test dataset
averaged_nn_predictions <- predict(averaged_nn_model, newdata = testData$x)
# Evaluate performance of the averaged neural network model
averaged_nn_performance <- postResample(pred = averaged_nn_predictions, obs = testData$y)
# Output the performance metrics for both models
list(Single_Layer_NN_Performance = single_layer_performance,
Averaged_NN_Performance = averaged_nn_performance)
## $Single_Layer_NN_Performance
## RMSE Rsquared MAE
## 2.5244877 0.7621849 1.8722728
##
## $Averaged_NN_Performance
## RMSE Rsquared MAE
## 2.0404770 0.8331947 1.5329094
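To make the averaging in avNNet concrete, here is a minimal hand-rolled sketch of the same idea. This is an illustration, not caret's exact implementation; the individual fits differ only in their random starting weights, so results will not match the numbers above exactly.
# Fit several networks and average their predictions (what avNNet automates)
manual_fits <- lapply(1:5, function(i) {
  nnet(x = trainingData$x, y = trainingData$y, size = 5, decay = 0.01,
       linout = TRUE, trace = FALSE, maxit = 500,
       MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)
})
manual_preds <- sapply(manual_fits, predict, newdata = testData$x)
manual_averaged <- rowMeans(manual_preds)  # ensemble prediction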
# Regular MARS (Multivariate Adaptive Regression Splines) Model
# Train a basic MARS model on the training data
basic_mars_model <- earth(
x = trainingData$x, # Input features from training dataset
y = trainingData$y # Target output values from training dataset
)
# Display model summary for interpretation
print(basic_mars_model)
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
# Full summary shows the call and the hinge-function coefficients
summary(basic_mars_model)
## Call: earth(x=trainingData$x, y=trainingData$y)
##
## coefficients
## (Intercept) 18.451984
## h(0.621722-X1) -11.074396
## h(0.601063-X2) -10.744225
## h(X3-0.281766) 20.607853
## h(0.447442-X3) 17.880232
## h(X3-0.447442) -23.282007
## h(X3-0.636458) 15.150350
## h(0.734892-X4) -10.027487
## h(X4-0.734892) 9.092045
## h(0.850094-X5) -4.723407
## h(X5-0.850094) 10.832932
## h(X6-0.361791) -1.956821
##
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
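Each h(.) term in the coefficients table is a hinge function, h(u) = max(0, u), so each coefficient acts only on one side of its knot. A small worked example:
# Hinge function underlying the MARS basis terms
h <- function(u) pmax(0, u)
h(0.621722 - 0.3)  # 0.321722: the h(0.621722 - X1) term is active at X1 = 0.3
h(0.621722 - 0.9)  # 0: the term switches off once X1 exceeds the knot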
# Make predictions with the basic MARS model on the test dataset
basic_mars_predictions <- predict(basic_mars_model, newdata = testData$x)
# Evaluate performance of the basic MARS model: RMSE, R-squared, and MAE
basic_mars_performance <- postResample(pred = basic_mars_predictions, obs = testData$y)
# Setting up a grid for hyperparameter tuning
mars_tuning_grid <- expand.grid(.degree = 1:2, .nprune = 2:16) # Degrees of interaction and number of terms
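Crossing two interaction degrees with fifteen pruning sizes means train() below evaluates 30 candidate models:
nrow(mars_tuning_grid)  # 30 parameter combinations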
# Set seed for reproducibility in cross-validation
set.seed(1234)
# Tuning the MARS model with cross-validation
# Train a MARS model with cross-validation to find optimal hyperparameters
tuned_mars_model <- train(
x = trainingData$x, # Input features from training dataset
y = trainingData$y, # Target output values from training dataset
method = "earth", # Specifies MARS model
tuneGrid = mars_tuning_grid, # Grid of tuning parameters
trControl = trainControl(method = "cv") # Cross-validation control
)
# Display the tuned model details
print(tuned_mars_model)
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.312373 0.2464574 3.547735
## 1 3 3.707278 0.4683798 3.005934
## 1 4 2.663179 0.7126715 2.137120
## 1 5 2.261502 0.7904454 1.824339
## 1 6 2.145367 0.8119436 1.680939
## 1 7 1.797464 0.8635322 1.419835
## 1 8 1.720995 0.8768729 1.389307
## 1 9 1.653852 0.8890984 1.326900
## 1 10 1.674588 0.8827378 1.320200
## 1 11 1.650636 0.8864692 1.284660
## 1 12 1.646699 0.8855752 1.288449
## 1 13 1.643453 0.8866051 1.288624
## 1 14 1.646405 0.8856334 1.291766
## 1 15 1.637579 0.8876260 1.283147
## 1 16 1.637579 0.8876260 1.283147
## 2 2 4.312373 0.2464574 3.547735
## 2 3 3.721908 0.4678488 3.016069
## 2 4 2.674839 0.7105853 2.140857
## 2 5 2.267824 0.7888276 1.832740
## 2 6 2.216987 0.8005498 1.778172
## 2 7 1.844249 0.8605997 1.431686
## 2 8 1.749633 0.8774372 1.360255
## 2 9 1.560708 0.9024603 1.231461
## 2 10 1.529405 0.9066803 1.222445
## 2 11 1.467736 0.9105211 1.153231
## 2 12 1.360016 0.9236123 1.084557
## 2 13 1.304167 0.9332150 1.048825
## 2 14 1.378462 0.9261006 1.101009
## 2 15 1.395984 0.9244996 1.114657
## 2 16 1.424798 0.9209343 1.131557
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 13 and degree = 2.
# Make predictions with the tuned MARS model on the test dataset
tuned_mars_predictions <- predict(tuned_mars_model, newdata = testData$x)
# Evaluate performance of the tuned MARS model
tuned_mars_performance <- postResample(pred = tuned_mars_predictions, obs = testData$y)
# Output performance metrics for both the basic and tuned MARS models
list(Basic_MARS_Performance = basic_mars_performance,
Tuned_MARS_Performance = tuned_mars_performance)
## $Basic_MARS_Performance
## RMSE Rsquared MAE
## 1.8136467 0.8677298 1.3911836
##
## $Tuned_MARS_Performance
## RMSE Rsquared MAE
## 1.2803060 0.9335241 1.0168673
# Training SVM with Radial Basis Function (RBF) Kernel
# Train an SVM model with an RBF kernel on the training data
svm_rbf_model <- ksvm(
x = as.matrix(trainingData$x), # Input features matrix from training dataset
y = trainingData$y, # Target output values from training dataset
kernel = "rbfdot", # Use radial basis function kernel
kpar = "automatic", # Automatically select kernel parameters
C = 1, # Regularization parameter
epsilon = 0.1 # Epsilon parameter for regression tolerance
)
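With kpar = "automatic", ksvm estimates the RBF kernel width internally using kernlab's sigest() heuristic; as an illustration, the range it draws from can be inspected directly:
# Sigma quantile estimates kernlab's heuristic proposes for these predictors
kernlab::sigest(as.matrix(trainingData$x))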
# Make predictions with the RBF SVM model on the test dataset
svm_rbf_predictions <- predict(svm_rbf_model, newdata = testData$x)
# Evaluate performance of the RBF SVM model: RMSE, R-squared, and MAE
svm_rbf_performance <- postResample(pred = svm_rbf_predictions, obs = testData$y)
# Tuning SVM with Radial Kernel, Centering, and Scaling
# Set up an SVM model with cross-validation, centering, and scaling for tuning
tuned_svm_model <- train(
x = trainingData$x, # Input features from training dataset
y = trainingData$y, # Target output values from training dataset
method = "svmRadial", # SVM with radial basis kernel
preProc = c("center", "scale"), # Preprocess data by centering and scaling
tuneLength = 14, # Try 14 values of the cost C (sigma is estimated analytically)
trControl = trainControl(method = "cv") # Cross-validation control
)
# Display the details of the final tuned model
print(tuned_svm_model$finalModel)
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 8
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0602551759408374
##
## Number of Support Vectors : 153
##
## Objective Function Value : -77.0787
## Training error : 0.009431
# Make predictions with the tuned SVM model on the test dataset
tuned_svm_predictions <- predict(tuned_svm_model, newdata = testData$x)
# Evaluate performance of the tuned SVM model
tuned_svm_performance <- postResample(pred = tuned_svm_predictions, obs = testData$y)
# Output performance metrics for both the basic and tuned SVM models
list(RBF_SVM_Performance = svm_rbf_performance,
Tuned_SVM_Performance = tuned_svm_performance)
## $RBF_SVM_Performance
## RMSE Rsquared MAE
## 2.2550806 0.7996157 1.7224336
##
## $Tuned_SVM_Performance
## RMSE Rsquared MAE
## 2.0437049 0.8307058 1.5500663
Comparing the base models of the four non-linear methods considered here, multivariate adaptive regression splines (MARS), support vector machines (SVM), neural networks (NN), and k-nearest neighbors (KNN), MARS achieves the strongest baseline performance, with a test RMSE of 1.81 and an R-squared of 0.87. This indicates that MARS captures the structure of the simulated Friedman data better than the other models even before tuning.
The tuned models tell the same story: MARS again outperforms the rest, with an RMSE of 1.28 and an R-squared of 0.93, confirming that it captures the non-linear relationships effectively after hyperparameter tuning as well. Among the remaining three models the ranking shifts slightly. For the base models, SVM performs better than the single-layer NN, which in turn beats KNN; after tuning, however, the averaged neural network edges past the SVM (RMSE 2.040 versus 2.044), suggesting that the neural network benefits more from averaging and tuning than the SVM does in this case.
In terms of feature selection, MARS naturally identifies and prioritizes the informative predictors (X1 through X5) while leaving the irrelevant ones (X7 through X10) unused. This built-in variable selection is advantageous: it improves accuracy and makes the model easier to interpret. The five top-ranked predictors, X1 through X5, stand out with high importance scores; X6 enters the model but is noticeably less influential, and the remaining predictors contribute essentially nothing to model performance.
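To put the comparison on one screen, the test-set metrics computed above can be collected into a single table. This is a small sketch using the performance objects already in the workspace; knn_performance is recomputed from the earlier knnpred predictions.
# Gather test-set results for all four models, sorted by RMSE
knn_performance <- postResample(pred = knnpred, obs = testData$y)
model_comparison <- rbind(
  MARS = tuned_mars_performance,
  NNet = averaged_nn_performance,
  SVM  = tuned_svm_performance,
  KNN  = knn_performance
)
model_comparison[order(model_comparison[, "RMSE"]), ]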
# Load dataset
data("ChemicalManufacturingProcess")
# Step 1: Handle Missing Values Using KNN Imputation
# The 'preProcess' function is set up to fill in missing data using KNN imputation.
# Note: caret applies centering and scaling as part of knnImpute, so the imputed
# data (including Yield) is on a standardized scale.
imputation_model <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
data_imputed <- predict(imputation_model, ChemicalManufacturingProcess)
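A quick illustrative check that the imputation filled every missing cell:
# Count missing values before and after KNN imputation
sum(is.na(ChemicalManufacturingProcess))  # > 0: raw data contains missing values
sum(is.na(data_imputed))                  # 0 after imputation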
# Separate predictors (X) and response variable (y)
predictors <- dplyr::select(data_imputed, -Yield) # All variables except 'Yield'
response <- data_imputed$Yield # 'Yield' is the target variable
# Part (a): Train-Test Split for Model Evaluation
# Set random seed for reproducibility and create an 80-20 train-test split
set.seed(1234)
train_indices <- createDataPartition(response, p = 0.8, list = FALSE)
train_X <- predictors[train_indices, ] %>% as.matrix() # Training predictors matrix
test_X <- predictors[-train_indices, ] %>% as.matrix() # Testing predictors matrix
train_y <- response[train_indices] # Training target values
test_y <- response[-train_indices] # Testing target values
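A sanity check on the split sizes (createDataPartition stratifies on quantiles of the response, so the 80/20 proportions are approximate):
# Confirm the train/test split sizes (the data has 176 rows)
length(train_y)  # roughly 80% of the observations
length(test_y)   # held-out remainder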
# 1. K-Nearest Neighbors (KNN) Model
# This model centers and scales the data and tries 10 values of k.
knn_model <- train(
x = train_X, y = train_y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10
)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## (Warning emitted once per bootstrap resample; shown once here.)
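The warning flags BiologicalMaterial07 as having zero variance. One way to inspect such columns up front is caret's nearZeroVar; shown here as a check only, not applied, so the later results still match the outputs above.
# List zero-/near-zero-variance predictors flagged by the warning
nearZeroVar(train_X, names = TRUE)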
knn_predictions <- predict(knn_model, newdata = test_X)
postResample(pred = knn_predictions, obs = test_y)
##      RMSE  Rsquared       MAE
## 0.7059851 0.3434676 0.5682675
# Output: Performance metrics (e.g., RMSE, R-squared) to assess KNN model's accuracy.
# 2. Multivariate Adaptive Regression Splines (MARS) Model with Cross-Validation Tuning
# Tuning grid over degree (interaction level) and nprune (retained terms) for MARS.
mars_params <- expand.grid(.degree = 1:2, .nprune = 2:58)
mars_tuned_model <- train(
x = train_X, y = train_y,
method = "earth",
tuneGrid = mars_params,
trControl = trainControl(method = "cv")
)
mars_predictions <- predict(mars_tuned_model, newdata = test_X)
postResample(pred = mars_predictions, obs = test_y)
##      RMSE  Rsquared       MAE
## 0.5891524 0.5427526 0.4669305
# Output: Performance metrics for MARS model on test data.
# 3. Support Vector Machine (SVM) with Radial Basis Function Kernel
# Tuning SVM model using cross-validation with radial kernel, centering, and scaling.
svm_model <- train(
x = train_X, y = train_y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 14,
trControl = trainControl(method = "cv")
)
svm_predictions <- predict(svm_model, newdata = test_X)
postResample(pred = svm_predictions, obs = test_y)
##      RMSE  Rsquared       MAE
## 0.6095296 0.5236468 0.4804063
# Output: Accuracy metrics for tuned SVM model.
# 4. Neural Network Model
# Training an averaged neural network with 5 hidden nodes and weight decay.
nn_model <- avNNet(
x = train_X, y = train_y,
size = 5,
decay = 0.01,
repeats = 5,
linout = TRUE,
trace = FALSE,
maxit = 500,
MaxNWts = 5 * (ncol(train_X) + 1) + 5 + 1
)
nn_predictions <- predict(nn_model, newdata = test_X)
postResample(pred = nn_predictions, obs = test_y)
##      RMSE  Rsquared       MAE
## 0.7016608 0.4516523 0.5697754
# Part (b): Variable Importance from MARS Model
# Variable importance to determine which features are most informative.
importance_mars <- varImp(mars_tuned_model)
print(importance_mars)
## earth variable importance
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess09 59.823
## ManufacturingProcess13 17.294
## ManufacturingProcess01 2.199
## ManufacturingProcess42 0.000
# Fitting MARS model directly for comparison of feature importance
mars_direct <- earth(x = train_X, y = train_y)
importance_direct <- varImp(mars_direct)
print(importance_direct)
##                          Overall
## ManufacturingProcess32 100.00000
## ManufacturingProcess09 67.41221
## ManufacturingProcess13 35.46817
## ManufacturingProcess01 26.58793
## ManufacturingProcess42 25.56440
## ManufacturingProcess33 21.70760
## BiologicalMaterial03 15.11625
## ManufacturingProcess22 13.16129
## ManufacturingProcess28 10.32610
## ManufacturingProcess04 0.00000
# Part (c): Plotting Relationships Between Top Predictors and Yield
# List of most important predictors for analysis
top_features <- c("ManufacturingProcess32", "ManufacturingProcess09", "ManufacturingProcess13",
"ManufacturingProcess39", "ManufacturingProcess22", "ManufacturingProcess28",
"BiologicalMaterial12", "BiologicalMaterial03", "ManufacturingProcess01",
"ManufacturingProcess33")
# Generate scatter plots with linear fit for each selected predictor
for (feature in top_features) {
  plot_data <- data.frame(X = predictors[[feature]], Y = response)
  plot <- ggplot(plot_data, aes(x = X, y = Y)) +  # aes() replaces the deprecated aes_string()
    geom_point(alpha = 0.5) +
    geom_smooth(method = "lm", color = "blue", se = FALSE) +
    labs(
      title = paste("Relationship between", feature, "and Yield"),
      x = feature, y = "Yield"
    ) +
    theme_minimal()
  print(plot)
}
## `geom_smooth()` using formula = 'y ~ x'
## (Message repeated once for each of the ten plots.)
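As a numeric complement to the scatter plots, a small sketch summarizing each plotted predictor's linear association with Yield on the imputed (standardized) data:
# Correlation of each plotted predictor with Yield, strongest magnitude first
cors <- sapply(top_features, function(f) cor(predictors[[f]], response))
cors[order(abs(cors), decreasing = TRUE)]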