# detect, use multicores
numCores <- parallel::detectCores()
# create a simple cluster on the local machine using all available cores
cl <- parallel::makeCluster(numCores, methods = FALSE)
# register our cluster
doParallel::registerDoParallel(cl)
MARS
# model
set.seed(1234)
marsModel <- caret::train(x = bev_model_train %>% dplyr::select(-pH),
                          y = bev_model_train %>% dplyr::pull(pH),
                          method = "earth",
                          preProcess = c("center", "scale"),
                          tuneGrid = expand.grid(degree = 1:3, nprune = 1:30),
                          trControl = trainControl(method = "cv"))
# ggplot
ggplot(marsModel) + labs(title = "MARS Cross-Validated RMSE Profile") + theme_classic()

# importance (top 5)
marsPerformance <- caret::varImp(marsModel)
marsPerformance$importance %>%
  as.data.frame() %>%
  tibble::rownames_to_column("name") %>%
  dplyr::arrange(dplyr::desc(Overall)) %>%
  head(5) %>%
  dplyr::mutate(name = forcats::fct_reorder(name, Overall)) %>%
  ggplot(aes(x = name, y = Overall)) +
  geom_point() +
  geom_segment(aes(x = name, xend = name, y = 0, yend = Overall)) +
  ggtitle("MARS: Top 5 Variables") +
  labs(x = "Variable", y = "Importance") +
  coord_flip() +
  theme_minimal()

# Validation on the hold-out set
marsPred <- predict(marsModel, newdata = bev_model_test)
marsKPI <- postResample(pred = marsPred, obs = Y_test)
marsMAPE <- MLmetrics::MAPE(predict(marsModel, X_test), Y_test)
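For reference, MLmetrics::MAPE reports the mean absolute percentage error; the sketch below simply re-applies that definition to the hold-out predictions as a manual check of the value computed above.
# manual MAPE check: mean absolute percentage error, computed directly from its definition
mean(abs((Y_test - predict(marsModel, X_test)) / Y_test))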
SVM
# model
set.seed(1234)
svmModel <- caret::train(pH ~ ., data = bev_model_train,
                         method = "svmRadial",
                         preProcess = c("center", "scale"),
                         tuneLength = 14,
                         trControl = trainControl(method = "cv"))
# ggplot
ggplot(svmModel) + labs(title = "SVM Cross-Validated RMSE Profile") + theme_gray()

# importance (top 5)
svmPerformance <- caret::varImp(svmModel)
svmPerformance$importance %>%
  as.data.frame() %>%
  tibble::rownames_to_column("name") %>%
  dplyr::arrange(dplyr::desc(Overall)) %>%
  head(5) %>%
  dplyr::mutate(name = forcats::fct_reorder(name, Overall)) %>%
  ggplot(aes(x = name, y = Overall)) +
  geom_point() +
  geom_segment(aes(x = name, xend = name, y = 0, yend = Overall)) +
  ggtitle("SVM: Top 5 Variables") +
  labs(x = "Variable", y = "Importance") +
  coord_flip() +
  theme_minimal()

# Validation on the hold-out set
svmPred <- predict(svmModel, newdata = bev_model_test)
svmKPI <- postResample(pred = svmPred, obs = Y_test)
svmMAPE <- MLmetrics::MAPE(predict(svmModel, X_test), Y_test)
KNN
# model
set.seed(1234)
knnModel <- caret::train(pH ~ ., data = bev_model_train,
                         method = "knn",
                         preProcess = c("center", "scale"),
                         tuneLength = 10,
                         trControl = trainControl(method = "cv"))
# ggplot
ggplot(knnModel) + labs(title = "KNN Cross-Validated RMSE Profile") + theme_bw()

# importance (top 5)
knnPerformance <- caret::varImp(knnModel)
knnPerformance$importance %>%
  as.data.frame() %>%
  tibble::rownames_to_column("name") %>%
  dplyr::arrange(dplyr::desc(Overall)) %>%
  head(5) %>%
  dplyr::mutate(name = forcats::fct_reorder(name, Overall)) %>%
  ggplot(aes(x = name, y = Overall)) +
  geom_point() +
  geom_segment(aes(x = name, xend = name, y = 0, yend = Overall)) +
  ggtitle("KNN: Top 5 Variables") +
  labs(x = "Variable", y = "Importance") +
  coord_flip() +
  theme_minimal()

# Validation on the hold-out set
knnPred <- predict(knnModel, newdata = bev_model_test)
knnKPI <- postResample(pred = knnPred, obs = Y_test)
knnMAPE <- MLmetrics::MAPE(predict(knnModel, X_test), Y_test)
MARS model summary
| KPI      | MARS.Model | MAPE      |
|----------|------------|-----------|
| RMSE     | 0.1177403  | 0.0102855 |
| Rsquared | 0.5554237  | 0.0102855 |
| MAE      | 0.0875297  | 0.0102855 |
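A table like this can be assembled directly from the objects computed above; the sketch below uses knitr::kable, which is an assumption about how the original tables were rendered. The SVM and KNN summaries that follow can be built the same way from svmKPI/svmMAPE and knnKPI/knnMAPE.
# combine the hold-out KPIs and the MAPE into a small summary table (sketch)
data.frame(KPI = names(marsKPI),
           MARS.Model = as.numeric(marsKPI),
           MAPE = marsMAPE) %>%
  knitr::kable(caption = "MARS model summary")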
SVM model summary
| KPI      | SVM.Model | MAPE      |
|----------|-----------|-----------|
| RMSE     | 0.1200991 | 0.0100186 |
| Rsquared | 0.5402065 | 0.0100186 |
| MAE      | 0.0853700 | 0.0100186 |
KNN model summary
| KPI      | KNN.Model | MAPE      |
|----------|-----------|-----------|
| RMSE     | 0.1255216 | 0.0105526 |
| Rsquared | 0.5003098 | 0.0105526 |
| MAE      | 0.0898410 | 0.0105526 |
The performance of these three models is very similar: SVM edges out MARS and KNN on MAE and MAPE, while MARS posts the slightly better RMSE and R², and KNN trails on every metric. We can apply the model to the evaluation set.
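For a side-by-side view, the three sets of hold-out metrics can be combined into one table, and the chosen model can then be used to score the evaluation data. The sketch below assumes an evaluation data frame called bev_eval, a placeholder name that does not appear in the code above.
# side-by-side comparison of the three candidate models on the hold-out set
data.frame(KPI  = names(marsKPI),
           MARS = as.numeric(marsKPI),
           SVM  = as.numeric(svmKPI),
           KNN  = as.numeric(knnKPI))

# score the evaluation set with the selected model
# (bev_eval is a placeholder name for the evaluation data frame)
evalPred <- predict(svmModel, newdata = bev_eval)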

# stop the cluster
parallel::stopCluster(cl)