suppressPackageStartupMessages({
library(caret)
library(mlbench)
library(ggplot2)
library(earth)
library(kernlab)
library(randomForest)
library(gbm)
library(nnet)
library(AppliedPredictiveModeling)
library(tibble)
library(dplyr)
})
theme_set(theme_minimal())
library(doParallel)
registerDoParallel(cores = parallel::detectCores() - 1)  # leave one core free
# 3-fold CV keeps this fast version quick; hold-out predictions are saved for later inspection.
ctrl <- trainControl(method = "cv", number = 3, summaryFunction = defaultSummary, savePredictions = "final")
metric <- "RMSE"
set.seed(200)
trainingData <- mlbench.friedman1(100, sd = 1)
trainingData$x <- as.data.frame(trainingData$x); names(trainingData$x) <- paste0("X", 1:10)
testData <- mlbench.friedman1(1000, sd = 1)
testData$x <- as.data.frame(testData$x); names(testData$x) <- paste0("X", 1:10)
pairs(trainingData$x[,1:5])
Interpretation:
The pairs plot shows the predictors are roughly independent uniforms. By construction the
response depends on X1–X5 through nonlinear and interaction terms
(10*sin(pi*X1*X2) + 20*(X3 - 0.5)^2 + 10*X4 + 5*X5 plus noise), while X6–X10 are pure noise.
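As a rough check of that claim (not part of the original exercise), the marginal correlation of
each predictor with the response flags the linear terms X4 and X5, understates the X1:X2
interaction, misses the symmetric quadratic in X3 almost entirely, and should hover near zero
for the noise columns X6–X10:
# Marginal correlation of each simulated predictor with the response y.
round(cor(trainingData$x, trainingData$y), 2)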
set.seed(200)
pre <- c("center","scale")  # needed for distance/kernel methods (KNN, SVM); trees and MARS are scale-invariant
knnModel <- train(x=trainingData$x, y=trainingData$y, method="knn", preProcess=pre, tuneLength=5, trControl=ctrl, metric=metric)
marsModel <- train(x=trainingData$x, y=trainingData$y, method="earth", tuneLength=5, trControl=ctrl, metric=metric)
svmModel <- train(x=trainingData$x, y=trainingData$y, method="svmRadial", preProcess=pre, tuneLength=5, trControl=ctrl, metric=metric)
rfModel <- train(x=trainingData$x, y=trainingData$y, method="rf", tuneLength=3, trControl=ctrl, metric=metric, importance=TRUE)
gbmModel <- train(x=trainingData$x, y=trainingData$y, method="gbm", verbose=FALSE, tuneLength=3, trControl=ctrl, metric=metric)
resamp <- resamples(list(KNN=knnModel, MARS=marsModel, SVM=svmModel, RF=rfModel, GBM=gbmModel))
dotplot(resamp, metric="RMSE")
summary(resamp)
##
## Call:
## summary.resamples(object = resamp)
##
## Models: KNN, MARS, SVM, RF, GBM
## Number of resamples: 3
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## KNN 2.645805 2.962941 3.280077 3.102446 3.330766 3.381455 0
## MARS 1.550732 1.629542 1.708351 1.664458 1.721320 1.734290 0
## SVM 1.875930 1.904770 1.933609 1.932866 1.961335 1.989060 0
## RF 2.010920 2.100923 2.190926 2.187631 2.275987 2.361048 0
## GBM 1.571274 1.614983 1.658693 1.708539 1.777172 1.895651 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## KNN 3.215916 3.674246 4.132576 3.828073 4.134151 4.135727 0
## MARS 1.933126 2.048968 2.164809 2.098230 2.180782 2.196755 0
## SVM 2.278147 2.319452 2.360756 2.419144 2.489643 2.618530 0
## RF 2.535351 2.618155 2.700959 2.675254 2.745206 2.789454 0
## GBM 1.886296 1.928171 1.970046 2.089988 2.191834 2.413622 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## KNN 0.4520668 0.5253695 0.5986721 0.5538527 0.6047457 0.6108192 0
## MARS 0.8472413 0.8479259 0.8486105 0.8545421 0.8581926 0.8677746 0
## SVM 0.7649261 0.7861198 0.8073136 0.7943175 0.8090131 0.8107127 0
## RF 0.6575727 0.7329948 0.8084169 0.7861653 0.8504617 0.8925064 0
## GBM 0.7940180 0.8261251 0.8582323 0.8463024 0.8724447 0.8866570 0
Interpretation:
MARS and GBM give the lowest cross-validated RMSE, with SVM-RBF close behind. MARS's hinge
functions are well suited to the additive nonlinear structure of the Friedman 1 function,
while KNN lags because the noise predictors X6–X10 dilute its distance calculations.
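One way to see why MARS does so well (an optional follow-up, not shown in the original output)
is to list the hinge functions it retained; on Friedman 1 data these typically involve only X1–X5.
# Basis functions kept by the tuned MARS fit; expect terms built from X1-X5 only.
summary(marsModel$finalModel)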
preds <- lapply(list(KNN=knnModel,MARS=marsModel,SVM=svmModel,RF=rfModel,GBM=gbmModel), predict, newdata=testData$x)
perf <- sapply(preds, function(p) postResample(p, testData$y))
t(perf)
## RMSE Rsquared MAE
## KNN 3.439214 0.5910585 2.773497
## MARS 1.982857 0.8464011 1.521008
## SVM 2.517090 0.7517280 1.930327
## RF 2.869054 0.6955246 2.258341
## GBM 2.216743 0.8079666 1.721698
varImp(marsModel)$importance %>%
tibble::rownames_to_column("Feature") %>% arrange(desc(Overall)) %>% head(10) %>%
ggplot(aes(x=reorder(Feature, Overall), y=Overall)) +
geom_col() + coord_flip() +
labs(title="MARS Variable Importance — Friedman 1 (Fast Version)", x=NULL, y="Importance")
data("ChemicalManufacturingProcess")
cmp <- ChemicalManufacturingProcess
set.seed(200)
inTrain <- createDataPartition(cmp$Yield, p=0.8, list=FALSE)
trainX <- cmp[inTrain, -1]; trainY <- cmp$Yield[inTrain]
testX <- cmp[-inTrain, -1]; testY <- cmp$Yield[-inTrain]
# Impute missing values with medians, drop zero-variance columns, then center/scale;
# the transformations are estimated on the training set only and applied to both sets.
pp <- preProcess(trainX, method=c("medianImpute","zv","center","scale"))
trainX <- predict(pp, trainX); testX <- predict(pp, testX)
set.seed(200)
svm_cmp <- train(trainX, trainY, method="svmRadial", tuneLength=5, trControl=ctrl, metric=metric)
mars_cmp <- train(trainX, trainY, method="earth", tuneLength=5, trControl=ctrl, metric=metric)
rf_cmp <- train(trainX, trainY, method="rf", tuneLength=3, trControl=ctrl, metric=metric, importance=TRUE)
gbm_cmp <- train(trainX, trainY, method="gbm", tuneLength=3, trControl=ctrl, metric=metric, verbose=FALSE)
nnet_cmp <- train(trainX, trainY, method="nnet", linout=TRUE, trace=FALSE, tuneLength=3, trControl=ctrl, metric=metric)
res_cmp <- resamples(list(SVM=svm_cmp,MARS=mars_cmp,RF=rf_cmp,GBM=gbm_cmp,NNET=nnet_cmp))
dotplot(res_cmp, metric="RMSE")
summary(res_cmp)
##
## Call:
## summary.resamples(object = res_cmp)
##
## Models: SVM, MARS, RF, GBM, NNET
## Number of resamples: 3
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM 0.8523109 0.8554566 0.8586023 0.8823431 0.8973592 0.9361160 0
## MARS 0.7965739 0.9728732 1.1491725 1.0642395 1.1980724 1.2469722 0
## RF 0.8356929 0.8552098 0.8747267 0.8955150 0.9254260 0.9761254 0
## GBM 0.8181439 0.8409093 0.8636748 0.8833286 0.9159210 0.9681672 0
## NNET 1.1651261 1.1866438 1.2081615 1.1984109 1.2150533 1.2219450 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM 1.0485325 1.090085 1.131638 1.121799 1.158432 1.185225 0
## MARS 0.9498128 1.191531 1.433249 1.328771 1.518250 1.603251 0
## RF 1.0493208 1.077820 1.106320 1.129179 1.169108 1.231896 0
## GBM 1.0264474 1.082407 1.138366 1.110495 1.152519 1.166672 0
## NNET 1.4504581 1.465980 1.481502 1.476003 1.488775 1.496048 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM 0.5691201 0.5912098 0.6132995 0.6216060 0.6478490 0.6823985 0
## MARS 0.3431908 0.3691018 0.3950128 0.4554276 0.5115460 0.6280792 0
## RF 0.6143960 0.6209804 0.6275649 0.6446071 0.6597127 0.6918606 0
## GBM 0.6027174 0.6106278 0.6185382 0.6207064 0.6297008 0.6408634 0
## NNET 0.3384389 0.3516045 0.3647701 0.3898919 0.4156185 0.4664668 0
preds_cmp <- lapply(list(SVM=svm_cmp,MARS=mars_cmp,RF=rf_cmp,GBM=gbm_cmp,NNET=nnet_cmp), predict, newdata=testX)
perf_cmp <- sapply(preds_cmp, function(p) postResample(p, testY))
t(perf_cmp)
## RMSE Rsquared MAE
## SVM 1.330960 0.61945385 1.0017224
## MARS 1.654897 0.32853139 1.2847871
## RF 1.320086 0.60641257 0.9244915
## GBM 1.240818 0.64554151 0.9251764
## NNET 6.546625 0.04155309 4.5325295
Interpretation:
SVM-RBF and GBM remain the strongest performers even with only 3-fold CV, with random forest
close behind; the single-hidden-layer neural network overfits badly on this small training
set (test RMSE above 6).
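For reference, the hyperparameters chosen under the 3-fold scheme can be read straight off each
caret fit (a quick look, not shown in the original output):
# Tuning parameters selected by 3-fold CV for the two strongest models.
svm_cmp$bestTune
gbm_cmp$bestTune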
set.seed(200)
glmnet_cmp <- train(trainX, trainY, method="glmnet", tuneLength=10, trControl=ctrl, metric=metric)  # penalized linear baseline for comparison
postResample(predict(glmnet_cmp, testX), testY)
## RMSE Rsquared MAE
## 3.1904518 0.1328109 1.4627602
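Putting the penalized linear baseline next to the nonlinear test-set results in a single table
(an extra convenience step, assuming the objects above are still in memory) makes the gap explicit:
# Test-set metrics for the nonlinear models alongside the GLMNET baseline.
rbind(t(perf_cmp), GLMNET = postResample(predict(glmnet_cmp, testX), testY))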
bestModel <- svm_cmp  # treat SVM-RBF as the optimal nonlinear model; its resampled RMSE is within noise of GBM and RF
vi_nonlin <- varImp(bestModel)$importance %>% tibble::rownames_to_column("Feature") %>% arrange(desc(Overall)) %>% head(10)
ggplot(vi_nonlin, aes(x=reorder(Feature, Overall), y=Overall)) + geom_col() + coord_flip() +
labs(title="Top 10 Variables — Optimal Nonlinear Model", x=NULL, y="Importance")
vi_linear <- varImp(glmnet_cmp)$importance %>% tibble::rownames_to_column("Feature") %>% arrange(desc(Overall)) %>% head(10)
ggplot(vi_linear, aes(x=reorder(Feature, Overall), y=Overall)) + geom_col() + coord_flip() +
labs(title="Top 10 Variables — Linear Model (GLMNET)", x=NULL, y="Importance")
unique_vars <- setdiff(vi_nonlin$Feature, vi_linear$Feature)
if (length(unique_vars) == 0) {
  cat("All top predictors overlap between models.")
} else {
  df_plot <- data.frame(Yield = trainY, trainX[, unique_vars, drop = FALSE])
  for (v in unique_vars) {
    print(
      ggplot(df_plot, aes(x = .data[[v]], y = Yield)) +  # .data[[v]] replaces the deprecated aes_string()
        geom_point(alpha = 0.6) +
        geom_smooth(method = "loess", se = TRUE, col = "steelblue") +
        labs(title = paste("Yield vs", v), subtitle = "Unique nonlinear predictor")
    )
  }
}
Interpretation:
Curved or threshold-like trends in these scatterplots mark predictors whose effect on Yield
is nonlinear, which is why the nonlinear models rank them highly while the penalized linear
model does not.
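A numeric companion to those plots (a sketch, assuming unique_vars is non-empty): comparing
Pearson and Spearman correlations with Yield; a rank correlation clearly stronger than the
linear one is consistent with a curved but monotone trend.
# Pearson vs. Spearman correlation with Yield for the predictors that only the
# nonlinear model ranks highly; a gap suggests a curved, monotone relationship.
if (length(unique_vars) > 0) {
  sapply(unique_vars, function(v)
    round(c(pearson  = cor(trainX[[v]], trainY),
            spearman = cor(trainX[[v]], trainY, method = "spearman")), 2))
}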