Install & Load package

train_raw <- read_excel("StudentData.xlsx")
test_raw  <- read_excel("StudentEvaluation.xlsx")
 
cat("Training set dimensions:", dim(train_raw), "\n")
## Training set dimensions: 2571 33
cat("Test set dimensions:    ", dim(test_raw),  "\n")
## Test set dimensions:     267 33
cat("Training columns:", names(train_raw), "\n")
## Training columns: Brand Code Carb Volume Fill Ounces PC Volume Carb Pressure Carb Temp PSC PSC Fill PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure Hyd Pressure1 Hyd Pressure2 Hyd Pressure3 Hyd Pressure4 Filler Level Filler Speed Temperature Usage cont Carb Flow Density MFR Balling Pressure Vacuum PH Oxygen Filler Bowl Setpoint Pressure Setpoint Air Pressurer Alch Rel Carb Rel Balling Lvl

Exploratory Analysis

ggplot(train_raw, aes(x = PH)) +
  geom_histogram(binwidth = 0.05, fill = "#2E75B6", color = "white", alpha = 0.85) +
  geom_vline(aes(xintercept = mean(PH, na.rm = TRUE)),
             color = "red", linetype = "dashed", linewidth = 1) +
  labs(title = "Distribution of pH — Training Data",
       subtitle = paste0("Mean = ", round(mean(train_raw$PH, na.rm=TRUE), 3),
                         " | SD = ", round(sd(train_raw$PH, na.rm=TRUE), 3)),
       x = "pH", y = "Count") +
  theme_minimal(base_size = 13)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggplot(train_raw %>% filter(!is.na(`Brand Code`)),
       aes(x = `Brand Code`, y = PH, fill = `Brand Code`)) +
  geom_boxplot(alpha = 0.7, outlier.colour = "red", outlier.size = 1.5) +
  labs(title = "pH Distribution by Brand Code",
       x = "Brand Code", y = "pH") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

plot_missing(train_raw)

aggr(train_raw, col = c("#2E75B6", "#C9A84C"),
     numbers = TRUE, sortVars = TRUE,
     labels = names(train_raw),
     cex.axis = 0.6, gap = 3,
     ylab = c("Missing Data Pattern", "Proportion Missing"))
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)

## 
##  Variables sorted by number of missings: 
##           Variable        Count
##                MFR 0.0824581875
##         Brand Code 0.0466744457
##       Filler Speed 0.0221703617
##          PC Volume 0.0151691949
##            PSC CO2 0.0151691949
##        Fill Ounces 0.0147802412
##                PSC 0.0128354726
##     Carb Pressure1 0.0124465189
##      Hyd Pressure4 0.0116686114
##      Carb Pressure 0.0105017503
##          Carb Temp 0.0101127966
##           PSC Fill 0.0089459354
##      Fill Pressure 0.0085569817
##       Filler Level 0.0077790743
##      Hyd Pressure2 0.0058343057
##      Hyd Pressure3 0.0058343057
##        Temperature 0.0054453520
##      Oxygen Filler 0.0046674446
##  Pressure Setpoint 0.0046674446
##      Hyd Pressure1 0.0042784909
##        Carb Volume 0.0038895371
##           Carb Rel 0.0038895371
##           Alch Rel 0.0035005834
##         Usage cont 0.0019447686
##                 PH 0.0015558149
##           Mnf Flow 0.0007779074
##          Carb Flow 0.0007779074
##      Bowl Setpoint 0.0007779074
##            Density 0.0003889537
##            Balling 0.0003889537
##        Balling Lvl 0.0003889537
##    Pressure Vacuum 0.0000000000
##      Air Pressurer 0.0000000000
numeric_vars <- train_raw %>% select(where(is.numeric))
cor_matrix   <- cor(numeric_vars, use = "pairwise.complete.obs")
 
corrplot(cor_matrix,
         method  = "color",
         type    = "upper",
         tl.cex  = 0.50,
         tl.col  = "black",
         tl.srt       = 45,
         cl.cex       = 0.50,
         addCoef.col = NULL,
         col     = colorRampPalette(c("#D73027","white","#2E75B6"))(200),
         title   = "Correlation Matrix — All Numeric Variables",
         mar     = c(0,0,2,0))

ph_cors <- cor_matrix["PH", ] %>%
  sort(decreasing = TRUE) %>%
  as.data.frame() %>%
  tibble::rownames_to_column("Variable") %>%
  rename(Correlation = ".") %>%
  filter(Variable != "PH")
 
ggplot(ph_cors %>% slice_max(abs(Correlation), n = 15),
       aes(x = reorder(Variable, Correlation), y = Correlation, fill = Correlation > 0)) +
  geom_col(alpha = 0.85) +
  scale_fill_manual(values = c("#C9A84C", "#2E75B6"), labels = c("Negative","Positive")) +
  coord_flip() +
  labs(title = "Top 15 Variable Correlations with pH",
       x = NULL, y = "Pearson Correlation", fill = "Direction") +
  theme_minimal(base_size = 13)

top_preds <- ph_cors %>% slice_max(abs(Correlation), n = 6) %>% pull(Variable)

scatter_plots <- lapply(top_preds, function(var) {
  ggplot(train_raw, aes(x = .data[[var]], y = PH)) +
    geom_point(alpha = 0.3, color = "#2E75B6", size = 0.8) +
    geom_smooth(method = "loess", formula = y ~ x, color = "red", se = FALSE) +
    labs(title = var, x = var, y = "pH") +
    theme_minimal(base_size = 10)
})

do.call(grid.arrange, c(scatter_plots, ncol = 3))

Data Preprocessing

train_raw$`Brand Code` <- as.factor(
  ifelse(is.na(train_raw$`Brand Code`), "Unknown", train_raw$`Brand Code`))
test_raw$`Brand Code`  <- as.factor(
  ifelse(is.na(test_raw$`Brand Code`),  "Unknown", test_raw$`Brand Code`))
# Replace spaces with _ 
train_raw <- train_raw %>% rename_with(~ gsub(" ", "_", .x))
test_raw  <- test_raw  %>% rename_with(~ gsub(" ", "_", .x))

# Remove rows missing PH from training
train_clean <- train_raw %>% filter(!is.na(PH))
cat("Training rows after removing missing PH:", nrow(train_clean), "\n")
## Training rows after removing missing PH: 2567
# Cross validation
cv_folds <- vfold_cv(train_clean, v = 5)

Model instruction

# impute missing values & dummy encode
recipe_base <- recipe(PH ~ ., data = train_clean) %>%
  step_impute_median(all_numeric_predictors()) %>%  # median impute missing values
  step_dummy(Brand_Code, one_hot = TRUE) %>%         # one-hot encode Brand Code
  step_zv(all_predictors())                           # remove zero-variance columns

# adds normalization for linear, SVM, KNN & Correlation Filtering
recipe_scaled <- recipe_base %>%
  step_corr(all_numeric_predictors(), threshold = 0.95) %>%  # remove near-perfect correlations
  step_normalize(all_numeric_predictors())

Model Training

lm_spec <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")
ridge_spec <- linear_reg(penalty = tune(), mixture = 0) %>%
  set_engine("glmnet") %>%
  set_mode("regression")
lasso_spec <- linear_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet") %>%
  set_mode("regression")
mars_spec <- mars(num_terms = tune(), prod_degree = tune()) %>%
  set_engine("earth") %>%
  set_mode("regression")
svm_spec <- svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
  set_engine("kernlab") %>%
  set_mode("regression")
knn_spec <- nearest_neighbor(neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("regression")
rf_spec <- rand_forest(mtry = tune(), trees = 200, min_n = tune()) %>%
  set_engine("randomForest", importance = TRUE) %>%
  set_mode("regression")
gbm_spec <- boost_tree(
  trees      = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  min_n      = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")
xgb_spec <- boost_tree(
  trees            = tune(),
  tree_depth       = tune(),
  learn_rate       = tune(),
  mtry             = tune(),
  sample_size      = tune(),
  loss_reduction   = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

TUNING GRIDS

ridge_grid <- grid_regular(penalty(range = c(-4, 1)), levels = 30)
lasso_grid <- grid_regular(penalty(range = c(-4, 1)), levels = 30)
 
mars_grid <- grid_regular(
  num_terms(range = c(5, 30)),
  prod_degree(range = c(1, 2)),
  levels = c(5, 2)
)
 
svm_grid <- grid_regular(
  cost(range = c(-2, 2)),
  rbf_sigma(range = c(-3, 0)),
  levels = 4
)
 
knn_grid <- tibble(neighbors = c(3, 5, 7, 9, 11, 15))
 
rf_grid <- grid_regular(
  mtry(range = c(5, 20)),
  min_n(range = c(2, 10)),
  levels = 4
)
 
gbm_grid <- grid_space_filling(
  trees(range = c(100, 200)),
  tree_depth(range = c(3, 6)),
  learn_rate(range = c(-2, -1)),
  min_n(range = c(5, 20)),
  size = 10
)
 
xgb_grid <- grid_space_filling(
  trees(range = c(100, 300)),
  tree_depth(range = c(3, 8)),
  learn_rate(range = c(-2, -1)),
  mtry(range = c(5, 20)),
  sample_size = sample_prop(range = c(0.6, 1.0)),
  loss_reduction(),
  size = 15
)

Recipe and Model tuning

metrics_set <- metric_set(rmse, rsq, mae)

# Helper function to tune a workflow and return results cleanly
tune_wf <- function(spec, recipe, grid, label) {
  cat("\nTraining:", label, "...\n")
  wf <- workflow() %>% add_recipe(recipe) %>% add_model(spec)
  tune_grid(wf, resamples = cv_folds, grid = grid, metrics = metrics_set)
}

# Linear regresion doesn't need tuning
lm_wf <- workflow() %>% add_recipe(recipe_scaled) %>% add_model(lm_spec)
lm_res <- fit_resamples(lm_wf, resamples = cv_folds, metrics = metrics_set)
## → A | warning: prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
## There were issues with some computations   A: x1There were issues with some computations   A: x2There were issues with some computations   A: x3There were issues with some computations   A: x4There were issues with some computations   A: x5There were issues with some computations   A: x5
cat("Linear Regression done.\n")
## Linear Regression done.
collect_metrics(lm_res)
## # A tibble: 3 × 6
##   .metric .estimator  mean     n std_err .config        
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>          
## 1 mae     standard   0.104     5 0.00181 pre0_mod0_post0
## 2 rmse    standard   0.134     5 0.00226 pre0_mod0_post0
## 3 rsq     standard   0.397     5 0.0109  pre0_mod0_post0
# Every other model
ridge_res <- tune_wf(ridge_spec, recipe_scaled, ridge_grid, "Ridge")
## 
## Training: Ridge ...
lasso_res <- tune_wf(lasso_spec, recipe_scaled, lasso_grid, "Lasso")
## 
## Training: Lasso ...
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0
##                standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## There were issues with some computations   A: x13There were issues with some computations   A: x26There were issues with some computations   A: x39There were issues with some computations   A: x52There were issues with some computations   A: x65There were issues with some computations   A: x65
mars_res  <- tune_wf(mars_spec,  recipe_base,   mars_grid,  "MARS")
## 
## Training: MARS ...
svm_res   <- tune_wf(svm_spec,   recipe_scaled, svm_grid,   "SVM")
## 
## Training: SVM ...
knn_res   <- tune_wf(knn_spec,   recipe_scaled, knn_grid,   "KNN")
## 
## Training: KNN ...
rf_res    <- tune_wf(rf_spec,    recipe_base,   rf_grid,    "Random Forest")
## 
## Training: Random Forest ...
gbm_res   <- tune_wf(gbm_spec,   recipe_base,   gbm_grid,   "Gradient Boosting")
## 
## Training: Gradient Boosting ...
xgb_res   <- tune_wf(xgb_spec,   recipe_base,   xgb_grid,   "XGBoost")
## 
## Training: XGBoost ...
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0
##                standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## There were issues with some computations   A: x1There were issues with some computations   A: x2There were issues with some computations   A: x3There were issues with some computations   A: x4There were issues with some computations   A: x5There were issues with some computations   A: x5

Graphs

autoplot(ridge_res) + labs(title = "Ridge: RMSE vs Penalty")

autoplot(lasso_res) + labs(title = "Lasso: RMSE vs Penalty")

autoplot(mars_res)  + labs(title = "MARS: RMSE vs Terms / Degree")

autoplot(svm_res)   + labs(title = "SVM: RMSE vs Cost / Sigma")

autoplot(knn_res)   + labs(title = "KNN: RMSE vs Neighbors")

autoplot(rf_res)    + labs(title = "Random Forest: RMSE vs mtry / min_n")

autoplot(gbm_res)   + labs(title = "GBM: RMSE vs Tuning Parameters")

autoplot(xgb_res)   + labs(title = "XGBoost: RMSE vs Tuning Parameters")

Model Comparison

# Pull best RMSE and R2 for each model
extract_best_metrics <- function(res, label, tuned = TRUE) {
  all_metrics <- collect_metrics(res)
  
  if (tuned) {
    rmse_val <- show_best(res, metric = "rmse", n = 1) %>% pull(mean)
    r2_val   <- show_best(res, metric = "rsq",  n = 1) %>% pull(mean)
  } else {
    rmse_val <- all_metrics %>% filter(.metric == "rmse") %>% pull(mean) %>% min()
    r2_val   <- all_metrics %>% filter(.metric == "rsq")  %>% pull(mean) %>% max()
  }
  
  tibble(Model = label, CV_RMSE = round(rmse_val, 4), CV_R2 = round(r2_val, 4))
}
 
comparison_df <- bind_rows(
  extract_best_metrics(lm_res,    "Linear Regression", tuned = FALSE),
  extract_best_metrics(ridge_res, "Ridge"),
  extract_best_metrics(lasso_res, "Lasso"),
  extract_best_metrics(mars_res,  "MARS"),
  extract_best_metrics(svm_res,   "SVM"),
  extract_best_metrics(knn_res,   "KNN"),
  extract_best_metrics(rf_res,    "Random Forest"),
  extract_best_metrics(gbm_res,   "Gradient Boosting"),
  extract_best_metrics(xgb_res,   "XGBoost")
) %>% arrange(CV_RMSE)
 
print(comparison_df)
## # A tibble: 9 × 3
##   Model             CV_RMSE CV_R2
##   <chr>               <dbl> <dbl>
## 1 XGBoost            0.0946 0.702
## 2 Random Forest      0.0957 0.704
## 3 Gradient Boosting  0.0985 0.676
## 4 SVM                0.118  0.540
## 5 KNN                0.119  0.532
## 6 MARS               0.124  0.485
## 7 Lasso              0.134  0.398
## 8 Linear Regression  0.134  0.397
## 9 Ridge              0.134  0.394
cat("\n>>> Selected Model:", comparison_df$Model[1],
    "| CV RMSE:", comparison_df$CV_RMSE[1],
    "| CV R2:", comparison_df$CV_R2[1], "\n")
## 
## >>> Selected Model: XGBoost | CV RMSE: 0.0946 | CV R2: 0.702

Visualization of model results

ggplot(comparison_df,
       aes(x = reorder(Model, CV_RMSE), y = CV_RMSE, fill = CV_RMSE)) +
  geom_col(alpha = 0.85) +
  scale_fill_gradient(low = "#2E75B6", high = "#C9A84C") +
  coord_flip() +
  labs(title    = "Model Comparison — 5-Fold CV RMSE (Lower = Better)",
       x = NULL, y = "CV RMSE") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

ggplot(comparison_df,
       aes(x = reorder(Model, CV_R2), y = CV_R2, fill = CV_R2)) +
  geom_col(alpha = 0.85) +
  scale_fill_gradient(low = "#C9A84C", high = "#2E75B6") +
  coord_flip() +
  labs(title    = "Model Comparison — 5-Fold CV R-squared (Higher = Better)",
       x = NULL, y = "CV R^2") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

Best Model

best_res    <- xgb_res     # XGBoost won on RMSE
best_spec   <- xgb_spec
best_recipe <- recipe_base  # stays the same since XGBoost is tree-based

best_params <- select_best(best_res, metric = "rmse")
cat("\nBest hyperparameters:\n")
## 
## Best hyperparameters:
print(best_params)
## # A tibble: 1 × 7
##    mtry trees tree_depth learn_rate loss_reduction sample_size .config         
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>           
## 1    15   257          8     0.0720     0.00000128       0.829 pre0_mod11_post0
final_wf <- workflow() %>%
  add_recipe(best_recipe) %>%
  add_model(best_spec) %>%
  finalize_workflow(best_params)

final_fit <- fit(final_wf, data = train_clean)
test_pred_check <- predict(final_fit, new_data = head(train_clean, 5))

print(test_pred_check)
## # A tibble: 5 × 1
##   .pred
##   <dbl>
## 1  8.36
## 2  8.27
## 3  8.93
## 4  8.24
## 5  8.26

Final Model Diagonistic

train_preds_in <- predict(final_fit, new_data = train_clean) %>% pull(.pred)
residuals_in   <- train_clean$PH - train_preds_in

par(mfrow = c(2, 2))
 
plot(train_preds_in, residuals_in,
     xlab = "Fitted Values", ylab = "Residuals",
     main = "Residuals vs Fitted",
     pch = 16, col = adjustcolor("#2E75B6", 0.4), cex = 0.6)
abline(h = 0, col = "red", lty = 2)
 
hist(residuals_in, breaks = 40, col = "#2E75B6",
     main = "Histogram of Residuals", xlab = "Residual")

qqnorm(residuals_in, main = "Normal Q-Q Plot",
       pch = 16, col = adjustcolor("#2E75B6", 0.4), cex = 0.6)
qqline(residuals_in, col = "red")
 
plot(train_clean$PH, train_preds_in,
     xlab = "Actual pH", ylab = "Predicted pH",
     main = "Actual vs Predicted pH",
     pch = 16, col = adjustcolor("#2E75B6", 0.4), cex = 0.6)
abline(a = 0, b = 1, col = "red", lwd = 2)

par(mfrow = c(1, 1))
train_results <- tibble(
  truth    = train_clean$PH,
  estimate = train_preds_in
)
 
train_metrics <- bind_rows(
  rmse(train_results, truth, estimate),
  rsq(train_results,  truth, estimate),
  mae(train_results,  truth, estimate)
)
print(train_metrics)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard     0.00585
## 2 rsq     standard     0.999  
## 3 mae     standard     0.00410
final_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 20,
      aesthetics = list(fill = "#2E75B6", alpha = 0.85)) +
  labs(title    = "Top 20 Feature Importances — Final Model",
       subtitle = "Higher bars = stronger predictor of pH",
       x = NULL) +
  theme_minimal(base_size = 13)

Model Prediction

ph_predictions <- predict(final_fit, new_data = test_raw) %>% pull(.pred)
 
results_out <- test_raw %>%
  select(Brand_Code) %>%
  mutate(
    Row          = row_number(),
    Brand_Code   = as.character(Brand_Code),
    Predicted_PH = round(ph_predictions, 4)
  ) %>%
  select(Row, Brand_Code, Predicted_PH)
 
cat("\nPrediction Summary:\n")
## 
## Prediction Summary:
cat("  N predictions:", nrow(results_out), "\n")
##   N predictions: 267
cat("  Mean pH:      ", round(mean(ph_predictions), 4), "\n")
##   Mean pH:       8.5465
cat("  Std Dev:      ", round(sd(ph_predictions),   4), "\n")
##   Std Dev:       0.1363
cat("  Min pH:       ", round(min(ph_predictions),  4), "\n")
##   Min pH:        8.0925
cat("  Max pH:       ", round(max(ph_predictions),  4), "\n")
##   Max pH:        8.8352
# Distribution of predicted pH
ggplot(results_out, aes(x = Predicted_PH)) +
  geom_histogram(binwidth = 0.05, fill = "#2E75B6", color = "white", alpha = 0.85) +
  geom_vline(aes(xintercept = mean(Predicted_PH)),
             color = "red", linetype = "dashed", linewidth = 1) +
  labs(title    = "Distribution of Predicted pH — Test Set",
       subtitle = paste0("N = ", nrow(results_out),
                         " | Mean = ", round(mean(ph_predictions), 4)),
       x = "Predicted pH", y = "Count") +
  theme_minimal(base_size = 13)

# Predicted pH by Brand Code
ggplot(results_out, aes(x = Brand_Code, y = Predicted_PH, fill = Brand_Code)) +
  geom_boxplot(alpha = 0.7) +
  labs(title = "Predicted pH by Brand Code — Test Set",
       x = "Brand Code", y = "Predicted pH") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

Exporting results #```{r} write.csv(results_out, “ABC_Beverage_pH_Predictions.csv”, row.names = FALSE) cat(“saved to: ABC_Beverage_pH_Predictions.csv”)

write.csv(comparison_df, “ABC_Beverage_Model_Comparison.csv”, row.names = FALSE) cat(“Model comparison saved to: ABC_Beverage_Model_Comparison.csv”)

saveRDS(final_fit, “ABC_Beverage_Final_Model.rds”) cat(“Final model saved to: ABC_Beverage_Final_Model.rds”) ```