# Parallel backend ----
# Count the available cores once and reuse that number as the cluster size.
numCores <- parallel::detectCores()

# detectCores() is documented to return NA when the count cannot be
# determined; fall back to a single worker instead of passing NA on.
if (is.na(numCores)) {
  numCores <- 1L
}

# create a simple cluster on the local machine using all available threads
# (methods = FALSE is forwarded to the worker start-up options)
cl <- parallel::makeCluster(numCores, methods = FALSE)

# register our cluster so that later training calls can run on it
doParallel::registerDoParallel(cl)

MARS

# MARS model ----
# Tune a MARS ("earth") regression of pH on every other column of
# bev_model_train. Predictors are centred and scaled, and every
# (degree, nprune) pair in the grid is scored by cross-validation.
# ggplot2 functions and the %>% pipe are used unqualified below, so those
# packages are expected to be attached already.
set.seed(1234) # fixed seed ahead of the resampling
marsModel <- caret::train(
  x = dplyr::select(bev_model_train, -pH),
  y = bev_model_train[["pH"]],
  method = "earth",
  preProcess = c("center", "scale"),
  tuneGrid = expand.grid(degree = 1:3, nprune = 1:30),
  trControl = caret::trainControl(method = "cv")
)

# Resampling profile across the tuning grid
ggplot(marsModel) +
  labs(title = "MARS Cross-Validated RMSE Profile") +
  theme_classic()

# Variable importance (top 5), drawn as a lollipop chart with the most
# important variable at the top after the coordinate flip.
marsPerformance <- caret::varImp(marsModel)
marsPerformance$importance %>%
  as.data.frame() %>%
  tibble::rownames_to_column() %>%
  dplyr::mutate(name = forcats::fct_inorder(rowname)) %>%
  dplyr::arrange(dplyr::desc(Overall)) %>%
  head(5) %>%
  ggplot(aes(x = reorder(name, Overall), y = Overall)) +
  geom_point() +
  geom_segment(aes(x = name, xend = name, y = 0, yend = Overall)) +
  ggtitle("MARS: Top 5 Variables") +
  labs(x = "Variable", y = "Importance") +
  coord_flip() +
  theme_minimal()

# Validation on the hold-out set.
# Predict once and reuse the result so that RMSE / Rsquared / MAE and MAPE
# are all computed from the same set of predictions.
marsPred <- predict(marsModel, newdata = bev_model_test)
marsKPI <- caret::postResample(pred = marsPred, obs = Y_test)
marsMAPE <- MLmetrics::MAPE(y_pred = marsPred, y_true = Y_test)

SVM

# SVM model ----
# Tune a radial-kernel support vector regression of pH on every other
# column of bev_model_train. Predictors are centred and scaled, and
# tuneLength = 14 asks caret for 14 candidate tuning values, each scored by
# cross-validation.
# ggplot2 functions and the %>% pipe are used unqualified below, so those
# packages are expected to be attached already.
set.seed(1234) # fixed seed ahead of the resampling
svmModel <- caret::train(
  pH ~ .,
  data = bev_model_train,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneLength = 14,
  trControl = caret::trainControl(method = "cv")
)

# Resampling profile across the candidate tuning values
ggplot(svmModel) +
  labs(title = "SVM Cross-Validated RMSE Profile") +
  theme_gray()

# Variable importance (top 5), drawn as a lollipop chart with the most
# important variable at the top after the coordinate flip.
svmPerformance <- caret::varImp(svmModel)
svmPerformance$importance %>%
  as.data.frame() %>%
  tibble::rownames_to_column() %>%
  dplyr::mutate(name = forcats::fct_inorder(rowname)) %>%
  dplyr::arrange(dplyr::desc(Overall)) %>%
  head(5) %>%
  ggplot(aes(x = reorder(name, Overall), y = Overall)) +
  geom_point() +
  geom_segment(aes(x = name, xend = name, y = 0, yend = Overall)) +
  ggtitle("SVM: Top 5 Variables") +
  labs(x = "Variable", y = "Importance") +
  coord_flip() +
  theme_minimal()

# Validation on the hold-out set.
# Predict once and reuse the result so that RMSE / Rsquared / MAE and MAPE
# are all computed from the same set of predictions.
svmPred <- predict(svmModel, newdata = bev_model_test)
svmKPI <- caret::postResample(pred = svmPred, obs = Y_test)
svmMAPE <- MLmetrics::MAPE(y_pred = svmPred, y_true = Y_test)

KNN

# KNN model ----
# Tune a k-nearest-neighbours regression of pH on every other column of
# bev_model_train. Predictors are centred and scaled, and tuneLength = 10
# asks caret for 10 candidate tuning values, each scored by
# cross-validation.
# ggplot2 functions and the %>% pipe are used unqualified below, so those
# packages are expected to be attached already.
set.seed(1234) # fixed seed ahead of the resampling
knnModel <- caret::train(
  pH ~ .,
  data = bev_model_train,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trControl = caret::trainControl(method = "cv")
)

# Resampling profile across the candidate tuning values
ggplot(knnModel) +
  labs(title = "KNN Cross-Validated RMSE Profile") +
  theme_bw()

# Variable importance (top 5), drawn as a lollipop chart with the most
# important variable at the top after the coordinate flip.
knnPerformance <- caret::varImp(knnModel)
knnPerformance$importance %>%
  as.data.frame() %>%
  tibble::rownames_to_column() %>%
  dplyr::mutate(name = forcats::fct_inorder(rowname)) %>%
  dplyr::arrange(dplyr::desc(Overall)) %>%
  head(5) %>%
  ggplot(aes(x = reorder(name, Overall), y = Overall)) +
  geom_point() +
  geom_segment(aes(x = name, xend = name, y = 0, yend = Overall)) +
  ggtitle("KNN: Top 5 Variables") +
  labs(x = "Variable", y = "Importance") +
  coord_flip() +
  theme_minimal()

# Validation on the hold-out set.
# Predict once and reuse the result so that RMSE / Rsquared / MAE and MAPE
# are all computed from the same set of predictions.
knnPred <- predict(knnModel, newdata = bev_model_test)
knnKPI <- caret::postResample(pred = knnPred, obs = Y_test)
knnMAPE <- MLmetrics::MAPE(y_pred = knnPred, y_true = Y_test)

MARS model summary

KPI MARS.Model MAPE
RMSE 0.1177403 0.0102855
Rsquared 0.5554237 0.0102855
MAE 0.0875297 0.0102855

SVM model summary

KPI SVM.Model MAPE
RMSE 0.1200991 0.0100186
Rsquared 0.5402065 0.0100186
MAE 0.0853700 0.0100186

KNN model summary

KPI KNN.Model MAPE
RMSE 0.1255216 0.0105526
Rsquared 0.5003098 0.0105526
MAE 0.0898410 0.0105526

The performance of these models is very similar. SVM does slightly better than MARS and KNN on MAE and MAPE, although MARS has the lowest RMSE and the highest R-squared. We can apply the SVM model to the evaluation set.

# Teardown: shut down the worker processes of the cluster created above.
parallel::stopCluster(cl = cl)