7.2
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
#Look at the data using feature
featurePlot(trainingData$x, trainingData$y)
#Estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
ctrl <- trainControl(
method = "boot",
number = 25
)
#Linear regression
set.seed(200)
lmModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "lm",
trControl = ctrl
)
#KNN
knnModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl
)
#MARS
marsModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "earth",
tuneLength = 10,
trControl = ctrl
)
## Loading required package: earth
## Warning: package 'earth' was built under R version 4.5.2
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.5.2
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 4.5.2
#Random forest
rfModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "rf",
tuneLength = 5,
trControl = ctrl
)
#Support vector machine
svmModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl
)
#Neural network
nnetModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "nnet",
preProc = c("center", "scale"),
tuneLength = 10,
trace = FALSE,
linout = TRUE,
trControl = ctrl
)
model_list <- list(
Linear_Regression = lmModel,
KNN = knnModel,
MARS = marsModel,
Random_Forest = rfModel,
SVM_Radial = svmModel,
Neural_Network = nnetModel
)
test_results <- lapply(model_list, function(model) {
pred <- predict(model, newdata = testData$x)
postResample(pred = pred, obs = testData$y)
})
test_results <- do.call(rbind, test_results)
test_results <- data.frame(
Model = rownames(test_results),
test_results,
row.names = NULL
)
test_results
## Model RMSE Rsquared MAE
## 1 Linear_Regression 2.697068 0.7084666 2.060054
## 2 KNN 3.228683 0.6871735 2.593973
## 3 MARS 1.776575 0.8727000 1.358367
## 4 Random_Forest 2.413570 0.7930923 1.907306
## 5 SVM_Radial 2.076083 0.8252816 1.577022
## 6 Neural_Network 2.645210 0.7188479 2.025391
#Mars results on importance of informative predictors
marsModel$finalModel
## Selected 10 of 18 terms, and 6 of 10 predictors (nprune=10)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 9 (additive model)
## GCV 2.731203 RSS 447.3848 GRSq 0.889112 RSq 0.9082649
marsImp <- varImp(marsModel)
marsImp
## earth variable importance
##
## Overall
## X1 100.00
## X4 82.78
## X2 64.18
## X5 40.21
## X3 28.14
## X6 0.00
plot(marsImp, top = 10)
The results show that the MARS model achieved the best predictive
performance, with the lowest RMSE (1.78) and highest R^2 (0.873).The SVM
and Random Forest models also did well but not as strongly as MARS.
Linear regression and neural networks produced moderately, while KNN
performed the worst, probably due to the presence of irrelevant
predictors and the high dimensionality of the data as shown in the
textbook example
The MARS informative predictors importance results indicate that X1–X5 were identified as the most important predictors, while X6 and lower had little or no visible importance. This agrees with the true underlying function used to generate this data which shown only depends on X1–X5.
7.5
#A
#Testing different models from chapter and using same seed as hw for repetition
set.seed(62462)
data(ChemicalManufacturingProcess)
chem <- ChemicalManufacturingProcess
chem <- ChemicalManufacturingProcess
ProcessPredictors <- chem[, names(chem) != "Yield"]
yield <- chem$Yield
Training_split <- createDataPartition(yield, p = 0.80, list = FALSE)
trainsplit_x <- ProcessPredictors[Training_split, ]
testsplit_x <- ProcessPredictors[-Training_split, ]
trainsplit_y <- yield[Training_split]
testsplit_y <- yield[-Training_split]
medprocess <- preProcess(
trainsplit_x,
method = c("medianImpute", "center", "scale")
)
training_x_f <- predict(medprocess, trainsplit_x)
test_x_f <- predict(medprocess, testsplit_x)
ctrl <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 5
)
mars_fit <- train(
x = training_x_f,
y = trainsplit_y,
method = "earth",
trControl = ctrl,
tuneLength = 10,
metric = "RMSE"
)
svm_fit <- train(
x = training_x_f,
y = trainsplit_y,
method = "svmRadial",
trControl = ctrl,
tuneLength = 10,
metric = "RMSE"
)
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
rf_fit <- train(
x = training_x_f,
y = trainsplit_y,
method = "rf",
trControl = ctrl,
tuneLength = 10,
metric = "RMSE"
)
gbm_fit <- train(
x = training_x_f,
y = trainsplit_y,
method = "gbm",
trControl = ctrl,
tuneLength = 10,
metric = "RMSE",
verbose = FALSE
)
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
nonlinear_resamples <- resamples(list(
MARS = mars_fit,
SVM = svm_fit,
RandomForest = rf_fit,
GBM = gbm_fit
))
summary(nonlinear_resamples)
##
## Call:
## summary.resamples(object = nonlinear_resamples)
##
## Models: MARS, SVM, RandomForest, GBM
## Number of resamples: 50
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.4841509 0.8062760 0.9294589 0.9465545 1.0441384 1.803642 0
## SVM 0.5215231 0.7027138 0.7939892 0.8113800 0.9065167 1.178549 0
## RandomForest 0.5057704 0.7250368 0.8399093 0.8386686 0.9361618 1.240473 0
## GBM 0.5267122 0.7160739 0.8153555 0.8374412 0.9377986 1.179194 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.6084239 1.0047145 1.1503193 1.193842 1.273466 3.088726 0
## SVM 0.6475951 0.8547185 0.9979235 1.017687 1.125255 1.564169 0
## RandomForest 0.6620880 0.9175951 1.0492676 1.085483 1.260341 1.638098 0
## GBM 0.6332132 0.9260535 1.0152287 1.062458 1.233116 1.579060 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.1065626 0.5511950 0.6400761 0.6252242 0.7429767 0.8401023 0
## SVM 0.3740499 0.6540322 0.7264424 0.7128082 0.7952394 0.8963321 0
## RandomForest 0.3799379 0.6118480 0.6794435 0.6791131 0.7639873 0.8841133 0
## GBM 0.3294154 0.5712696 0.7224334 0.6786335 0.8003342 0.9192390 0
dotplot(nonlinear_resamples, metric = "RMSE")
nonlinear_test_results <- data.frame(
Model = c("MARS", "SVM", "Random Forest", "GBM"),
RMSE = c(
caret::RMSE(as.vector(predict(mars_fit, test_x_f)), testsplit_y),
caret::RMSE(as.vector(predict(svm_fit, test_x_f)), testsplit_y),
caret::RMSE(as.vector(predict(rf_fit, test_x_f)), testsplit_y),
caret::RMSE(as.vector(predict(gbm_fit, test_x_f)), testsplit_y)
),
Rsquared = c(
caret::R2(as.vector(predict(mars_fit, test_x_f)), testsplit_y),
caret::R2(as.vector(predict(svm_fit, test_x_f)), testsplit_y),
caret::R2(as.vector(predict(rf_fit, test_x_f)), testsplit_y),
caret::R2(as.vector(predict(gbm_fit, test_x_f)), testsplit_y)
)
)
nonlinear_test_results
## Model RMSE Rsquared
## 1 MARS 1.164975 0.5940395
## 2 SVM 1.283718 0.5269012
## 3 Random Forest 1.266489 0.5143470
## 4 GBM 1.158034 0.5929873
#B
#Best model
best_nonlinear <- svm_fit
nonlinear_imp <- varImp(best_nonlinear)
plot(nonlinear_imp, top = 20)
nonlinear_imp_df <- nonlinear_imp$importance |>
as.data.frame() |>
tibble::rownames_to_column("Predictor") |>
arrange(desc(Overall))
head(nonlinear_imp_df, 10)
## Predictor Overall
## 1 ManufacturingProcess32 100.00000
## 2 ManufacturingProcess13 90.39975
## 3 BiologicalMaterial06 89.74252
## 4 ManufacturingProcess36 81.53643
## 5 BiologicalMaterial12 78.41002
## 6 BiologicalMaterial02 78.16844
## 7 ManufacturingProcess09 76.38185
## 8 ManufacturingProcess17 74.88325
## 9 BiologicalMaterial03 73.17861
## 10 ManufacturingProcess31 63.57876
Among top ten predictors 6 are manufacturing process variables and 4 are biological material variables suggesting that process variables slightly hold more influence. Biological variables still play a significant role suggesting that both process conditions and raw material characteristics hold a critical importance for predicting yield. Comparing these results to the optimal linear model there is a strong overlap. 8 of the top 10 predictors appear in both models. This indicates that many predictors have strong main effects that are consistently identified in every model.
The nonlinear model identifies two additional predictors ManufacturingProcess31 and BiologicalMaterial12 that were not among the top predictors in the linear model. This suggests that these variables may have some additional influence or collinearity being applied. It also may suggest there is a nonlinear relationship that isn’t captured by a linear model.
#C
#extract nonlinear top 10
nonlinear_top10 <- head(nonlinear_imp_df, 10)$Predictor
#Top10 from 6.3 results hw
linear_top10 <- c(
"ManufacturingProcess32",
"ManufacturingProcess36",
"ManufacturingProcess13",
"ManufacturingProcess09",
"ManufacturingProcess17",
"ManufacturingProcess33",
"BiologicalMaterial02",
"BiologicalMaterial03",
"BiologicalMaterial08",
"BiologicalMaterial06"
)
#find unique nonlinear predictors
unique_nonlinear <- setdiff(nonlinear_top10, linear_top10)
unique_nonlinear
## [1] "BiologicalMaterial12" "ManufacturingProcess31"
#Make Plotting data
plot_data <- training_x_f |>
as.data.frame() |>
mutate(Yield = trainsplit_y)
#Plot unique nonlinear predictors
for (var in unique_nonlinear) {
p <- ggplot(plot_data, aes(x = .data[[var]], y = Yield)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = FALSE) +
labs(
title = paste("Yield vs", var),
x = var,
y = "Yield"
)
print(p)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
The predictors that were unique to the nonlinear model,
ManufacturingProcess31 and BiologicalMaterial12, were plotted against
yield to better understand their relationships. The plot for
ManufacturingProcess31 shows a nonlinear downward trend, where yield
decreases as the predictor increases, but the rate of decrease is not
constant. In particular, the relationship becomes steeper in certain
regions and then levels off, indicating that an optimal range exists and
around -0.5 we see a decrease.
The plot for BiologicalMaterial12 shows yield initially increases as the predictor increases then reaches a peak and then decreases at higher values. This also suggests the presence of an optimal range for this biological variable where yield is maximized with a penalty for too little or too much. The nonlinear patterns help explain why these predictors were not identified as highly important in the linear model that assumes a constant linear relationship. The SVM model is able to capture these more complex relationships, indicating that both process and biological variables can influence yield in a nonlinear way.