1 Executive Summary

This report documents the end-to-end modeling pipeline developed to predict pH of beverage output at ABC Beverage manufacturing facilities. Regulatory requirements now mandate a defensible predictive model of pH from process sensor data. We evaluated five candidate model families, selected Random Forest as our final model based on cross-validated RMSE, and generated predictions for the 267 held-out evaluation records.


library(tidyverse)
library(caret)
library(randomForest)
library(xgboost)
library(glmnet)
library(corrplot)
library(VIM)
library(mice)
library(pROC)
library(gridExtra)
library(knitr)
library(kableExtra)
library(readxl)
library(httr)

2 Data Overview

load_excel_url <- function(url) {
  tmp <- tempfile(fileext = ".xlsx")
  GET(url, write_disk(tmp, overwrite = TRUE))
  read_excel(tmp)
}

train_raw <- load_excel_url("https://github.com/vincent-usny/pro2/raw/refs/heads/main/StudentData.xlsx")
eval_raw  <- load_excel_url("https://github.com/vincent-usny/pro2/raw/refs/heads/main/StudentEvaluation.xlsx")

cat("Training rows:", nrow(train_raw), "| Columns:", ncol(train_raw), "\n")
## Training rows: 2571 | Columns: 33
cat("Evaluation rows:", nrow(eval_raw), "| PH missing in eval:", sum(is.na(eval_raw$PH)), "\n")
## Evaluation rows: 267 | PH missing in eval: 267

2.1 Variable Summary

The dataset contains 32 predictor variables and one continuous response, PH.

train_raw %>%
  select(-PH) %>%
  summarise(across(where(is.numeric), list(
    Mean   = ~round(mean(., na.rm = TRUE), 3),
    SD     = ~round(sd(., na.rm = TRUE), 3),
    Min    = ~round(min(., na.rm = TRUE), 3),
    Max    = ~round(max(., na.rm = TRUE), 3)
  ))) %>%
  pivot_longer(everything(), names_to = c("Variable", ".value"),
               names_sep = "_(?=[^_]+$)") %>%
  kable(caption = "Numeric Predictor Summary Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Numeric Predictor Summary Statistics
Variable Mean SD Min Max
Carb Volume 5.370 0.106 5.040 5.700
Fill Ounces 23.975 0.088 23.633 24.320
PC Volume 0.277 0.061 0.079 0.478
Carb Pressure 68.190 3.538 57.000 79.400
Carb Temp 141.095 4.037 128.600 154.000
PSC 0.085 0.049 0.002 0.270
PSC Fill 0.195 0.118 0.000 0.620
PSC CO2 0.056 0.043 0.000 0.240
Mnf Flow 24.569 119.481 -100.200 229.400
Carb Pressure1 122.586 4.743 105.600 140.200
Fill Pressure 47.922 3.178 34.600 60.400
Hyd Pressure1 12.438 12.433 -0.800 58.000
Hyd Pressure2 20.961 16.386 0.000 59.400
Hyd Pressure3 20.458 15.976 -1.200 50.000
Hyd Pressure4 96.289 13.123 52.000 142.000
Filler Level 109.252 15.698 55.800 161.200
Filler Speed 3687.199 770.820 998.000 4030.000
Temperature 65.968 1.383 63.600 76.200
Usage cont 20.993 2.978 12.080 25.900
Carb Flow 2468.354 1073.696 26.000 5104.000
Density 1.174 0.378 0.240 1.920
MFR 704.049 73.898 31.400 868.600
Balling 2.198 0.931 -0.170 4.012
Pressure Vacuum -5.216 0.570 -6.600 -3.600
Oxygen Filler 0.047 0.047 0.002 0.400
Bowl Setpoint 109.327 15.303 70.000 140.000
Pressure Setpoint 47.615 2.039 44.000 52.000
Air Pressurer 142.834 1.212 140.800 148.200
Alch Rel 6.897 0.505 5.280 8.620
Carb Rel 5.437 0.129 4.960 6.060
Balling Lvl 2.050 0.870 0.000 3.660

2.2 Response Variable Distribution

train_raw %>%
  filter(!is.na(PH)) %>%
  ggplot(aes(x = PH)) +
  geom_histogram(bins = 40, fill = "#2c7bb6", color = "white", alpha = 0.85) +
  geom_vline(aes(xintercept = mean(PH, na.rm = TRUE)), color = "firebrick", linewidth = 1.2, linetype = "dashed") +
  labs(title = "Distribution of pH (Training Data)",
       subtitle = paste0("Mean = ", round(mean(train_raw$PH, na.rm = TRUE), 3),
                         "  |  SD = ", round(sd(train_raw$PH, na.rm = TRUE), 3)),
       x = "pH", y = "Count") +
  theme_minimal(base_size = 13)


3 Exploratory Data Analysis

3.1 Missing Data

miss_df <- train_raw %>%
  summarise(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Missing") %>%
  filter(Missing > 0) %>%
  mutate(Pct = round(Missing / nrow(train_raw) * 100, 1)) %>%
  arrange(desc(Missing))

miss_df %>%
  kable(caption = "Variables with Missing Values") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Variables with Missing Values
Variable Missing Pct
MFR 212 8.2
Brand Code 120 4.7
Filler Speed 57 2.2
PC Volume 39 1.5
PSC CO2 39 1.5
Fill Ounces 38 1.5
PSC 33 1.3
Carb Pressure1 32 1.2
Hyd Pressure4 30 1.2
Carb Pressure 27 1.1
Carb Temp 26 1.0
PSC Fill 23 0.9
Fill Pressure 22 0.9
Filler Level 20 0.8
Hyd Pressure2 15 0.6
Hyd Pressure3 15 0.6
Temperature 14 0.5
Oxygen Filler 12 0.5
Pressure Setpoint 12 0.5
Hyd Pressure1 11 0.4
Carb Volume 10 0.4
Carb Rel 10 0.4
Alch Rel 9 0.4
Usage cont 5 0.2
PH 4 0.2
Mnf Flow 2 0.1
Carb Flow 2 0.1
Bowl Setpoint 2 0.1
Density 1 0.0
Balling 1 0.0
Balling Lvl 1 0.0
miss_df %>%
  ggplot(aes(x = reorder(Variable, Missing), y = Pct)) +
  geom_col(fill = "#d7191c", alpha = 0.8) +
  coord_flip() +
  labs(title = "Percentage Missing by Variable", x = NULL, y = "% Missing") +
  theme_minimal(base_size = 12)

MFR has the highest missingness (~8.2%). All other variables are below 5% missing, making median imputation appropriate.

3.2 Correlation with pH

num_vars <- train_raw %>% select(where(is.numeric))

ph_corr <- cor(num_vars, use = "pairwise.complete.obs") %>%
  as.data.frame() %>%
  rownames_to_column("Variable") %>%
  select(Variable, PH) %>%
  filter(Variable != "PH") %>%
  arrange(desc(abs(PH)))

ph_corr %>%
  head(20) %>%
  ggplot(aes(x = reorder(Variable, PH), y = PH,
             fill = ifelse(PH > 0, "Positive", "Negative"))) +
  geom_col(alpha = 0.85) +
  coord_flip() +
  scale_fill_manual(values = c("Positive" = "#2c7bb6", "Negative" = "#d7191c")) +
  labs(title = "Top 20 Predictor Correlations with pH",
       x = NULL, y = "Pearson Correlation", fill = "Direction") +
  theme_minimal(base_size = 12)

3.3 Correlation Matrix (Top Predictors)

top_vars <- ph_corr %>% head(15) %>% pull(Variable)

cor_mat <- num_vars %>%
  select(all_of(c(top_vars, "PH"))) %>%
  cor(use = "pairwise.complete.obs")

corrplot(cor_mat, method = "color", type = "upper", tl.cex = 0.75,
         tl.col = "black", addCoef.col = "black", number.cex = 0.55,
         col = colorRampPalette(c("#d7191c", "white", "#2c7bb6"))(200),
         title = "Correlation Matrix – Top Predictors + pH", mar = c(0,0,1.5,0))

3.4 pH by Brand Code

train_raw %>%
  filter(!is.na(PH), !is.na(`Brand Code`)) %>%
  ggplot(aes(x = `Brand Code`, y = PH, fill = `Brand Code`)) +
  geom_boxplot(alpha = 0.75, outlier.alpha = 0.4) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "black") +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "pH Distribution by Brand Code",
       subtitle = "Diamond = mean",
       x = "Brand Code", y = "pH") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

Brand D shows notably higher mean pH compared to Brands A, B, and C, indicating brand is a strong signal.


4 Data Preprocessing

4.1 Imputation Strategy

We use median imputation for numeric variables (robust to outliers and skewness) and replace missing Brand Code with the mode (“B”).

# Separate features and target
train_complete <- train_raw %>% filter(!is.na(PH))

# Function to preprocess: impute medians + mode for Brand Code
preprocess_data <- function(df, medians = NULL, mode_brand = "B") {
  df <- df %>%
    mutate(`Brand Code` = ifelse(is.na(`Brand Code`), mode_brand, `Brand Code`),
           `Brand Code` = as.factor(`Brand Code`))

  num_cols <- df %>% select(where(is.numeric)) %>% colnames()

  if (is.null(medians)) {
    medians <- df %>% summarise(across(all_of(num_cols), ~median(., na.rm = TRUE)))
  }

  for (col in num_cols) {
    df[[col]][is.na(df[[col]])] <- medians[[col]]
  }

  list(data = df, medians = medians)
}

# Fit on training, apply to eval
prep_train <- preprocess_data(train_complete)
train_df   <- prep_train$data
train_meds <- prep_train$medians

prep_eval  <- preprocess_data(eval_raw, medians = train_meds)
eval_df    <- prep_eval$data

cat("Training records after removing 4 missing-PH rows:", nrow(train_df), "\n")
## Training records after removing 4 missing-PH rows: 2567
cat("Any remaining NAs in training features:", sum(is.na(train_df %>% select(-PH))), "\n")
## Any remaining NAs in training features: 0

4.2 Feature Engineering

We add dummy variables for Brand Code and keep all 32 predictors.

# caret dummyVars handles factor encoding
dummies <- dummyVars(PH ~ ., data = train_df)

X_train <- predict(dummies, newdata = train_df) %>% as.data.frame()
y_train <- train_df$PH

X_eval  <- predict(dummies, newdata = eval_df)  %>% as.data.frame()

cat("Feature matrix dimensions:", nrow(X_train), "x", ncol(X_train), "\n")
## Feature matrix dimensions: 2567 x 35

5 Model Development

We evaluated five model families using 10-fold cross-validation with RMSE as the selection metric. All models were trained on the same preprocessed feature set.

set.seed(2024)
ctrl <- trainControl(
  method      = "cv",
  number      = 10,
  savePredictions = "final",
  returnResamp    = "all"
)

5.1 Model 1: Multiple Linear Regression (Baseline)

set.seed(2024)
mod_lm <- train(
  x = X_train, y = y_train,
  method    = "lm",
  trControl = ctrl,
  preProcess = c("center", "scale")
)
cat("Linear Regression CV RMSE:", round(min(mod_lm$results$RMSE), 5), "\n")
## Linear Regression CV RMSE: 0.13426
cat("Linear Regression CV R²:  ", round(max(mod_lm$results$Rsquared), 5), "\n")
## Linear Regression CV R²:   0.39656

5.2 Model 2: Ridge / Elastic Net (Regularized Regression)

set.seed(2024)
mod_enet <- train(
  x = X_train, y = y_train,
  method    = "glmnet",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  tuneGrid  = expand.grid(alpha  = c(0, 0.25, 0.5, 0.75, 1),
                           lambda = 10^seq(-4, 0, length = 30))
)
cat("Elastic Net best alpha:", mod_enet$bestTune$alpha, "\n")
## Elastic Net best alpha: 0.25
cat("Elastic Net best lambda:", round(mod_enet$bestTune$lambda, 6), "\n")
## Elastic Net best lambda: 0.000489
cat("Elastic Net CV RMSE:", round(min(mod_enet$results$RMSE), 5), "\n")
## Elastic Net CV RMSE: 0.13413
cat("Elastic Net CV R²:  ", round(max(mod_enet$results$Rsquared[
  mod_enet$results$RMSE == min(mod_enet$results$RMSE)]), 5), "\n")
## Elastic Net CV R²:   0.39719

5.3 Model 3: K-Nearest Neighbors

set.seed(2026)
mod_knn <- train(
  x = X_train, y = y_train,
  method    = "knn",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  tuneGrid  = data.frame(k = c(3, 5, 7, 10, 15, 20))
)
cat("KNN best k:", mod_knn$bestTune$k, "\n")
## KNN best k: 7
cat("KNN CV RMSE:", round(min(mod_knn$results$RMSE), 5), "\n")
## KNN CV RMSE: 0.11993
cat("KNN CV R²:  ", round(max(mod_knn$results$Rsquared), 5), "\n")
## KNN CV R²:   0.52262

5.4 Model 4: Random Forest

set.seed(2026)
mod_rf <- train(
  x         = X_train,
  y         = y_train,
  method    = "ranger",        # faster RF implementation
  trControl = ctrl,
  tuneGrid  = expand.grid(
    mtry              = c(5, 10, 15),
    splitrule         = "variance",
    min.node.size     = 5
  ),
  num.trees = 150,
  importance = "impurity"
)
cat("Random Forest best mtry:", mod_rf$bestTune$mtry, "\n")
## Random Forest best mtry: 15
cat("Random Forest CV RMSE:  ", round(min(mod_rf$results$RMSE), 5), "\n")
## Random Forest CV RMSE:   0.09454
cat("Random Forest CV R²:    ", round(max(mod_rf$results$Rsquared), 5), "\n")
## Random Forest CV R²:     0.71531

5.5 Model 5: XGBoost (Gradient Boosting)

# Option 1: skip xgb, use gradient boosting from gbm instead
set.seed(2024)
mod_xgb <- train(
  x         = X_train,
  y         = y_train,
  method    = "gbm",
  trControl = ctrl,
  tuneGrid  = expand.grid(
    n.trees           = c(100, 200),
    interaction.depth = c(3, 5),
    shrinkage         = 0.1,
    n.minobsinnode    = 10
  ),
  verbose = FALSE
)
cat("XGBoost best params:\n")
## XGBoost best params:
print(mod_xgb$bestTune)
##   n.trees interaction.depth shrinkage n.minobsinnode
## 4     200                 5       0.1             10
cat("XGBoost CV RMSE:", round(min(mod_xgb$results$RMSE), 5), "\n")
## XGBoost CV RMSE: 0.10884
cat("XGBoost CV R²:  ", round(max(mod_xgb$results$Rsquared), 5), "\n")
## XGBoost CV R²:   0.60465

6 Model Comparison

results <- resamples(list(
  LinearRegression = mod_lm,
  ElasticNet       = mod_enet,
  KNN              = mod_knn,
  RandomForest     = mod_rf,
  XGBoost          = mod_xgb
))

summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: LinearRegression, ElasticNet, KNN, RandomForest, XGBoost 
## Number of resamples: 10 
## 
## MAE 
##                        Min.    1st Qu.     Median       Mean    3rd Qu.
## LinearRegression 0.09858668 0.10250309 0.10379569 0.10420815 0.10680921
## ElasticNet       0.09813040 0.10261681 0.10388035 0.10434814 0.10740204
## KNN              0.08696705 0.08766606 0.08869361 0.08894029 0.08977904
## RandomForest     0.06480284 0.06752675 0.06848499 0.06844142 0.07020561
## XGBoost          0.07215312 0.07794201 0.08213468 0.08172811 0.08565531
##                        Max. NA's
## LinearRegression 0.11163327    0
## ElasticNet       0.11170705    0
## KNN              0.09222902    0
## RandomForest     0.07097818    0
## XGBoost          0.08966887    0
## 
## RMSE 
##                        Min.    1st Qu.     Median       Mean    3rd Qu.
## LinearRegression 0.12607623 0.12900389 0.13488335 0.13426066 0.13684150
## ElasticNet       0.12580954 0.12878409 0.13526657 0.13413295 0.13726929
## KNN              0.11362169 0.11705413 0.11941454 0.11992945 0.12182399
## RandomForest     0.08871815 0.09059588 0.09408682 0.09454003 0.09747699
## XGBoost          0.09939433 0.10250118 0.11204565 0.10884225 0.11417686
##                       Max. NA's
## LinearRegression 0.1441813    0
## ElasticNet       0.1442200    0
## KNN              0.1273071    0
## RandomForest     0.1035045    0
## XGBoost          0.1161971    0
## 
## Rsquared 
##                       Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## LinearRegression 0.3525732 0.3861999 0.3931133 0.3965600 0.4081186 0.4462208
## ElasticNet       0.3474789 0.3855574 0.3985684 0.3971932 0.4084946 0.4484869
## KNN              0.4742334 0.5100512 0.5254780 0.5226158 0.5362285 0.5602426
## RandomForest     0.6669445 0.6986536 0.7216537 0.7153078 0.7335878 0.7519779
## XGBoost          0.5344550 0.5798957 0.6129332 0.6046465 0.6266651 0.6548814
##                  NA's
## LinearRegression    0
## ElasticNet          0
## KNN                 0
## RandomForest        0
## XGBoost             0
bwplot(results, metric = "RMSE",
       main = "10-Fold CV RMSE by Model")

bwplot(results, metric = "Rsquared",
       main = "10-Fold CV R² by Model")

model_summary <- results$values %>%
  select(-Resample) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  separate(key, into = c("Model", "Metric"), sep = "~") %>%
  group_by(Model, Metric) %>%
  summarise(Mean = round(mean(value), 5), SD = round(sd(value), 5), .groups = "drop") %>%
  pivot_wider(names_from = Metric, values_from = c(Mean, SD)) %>%
  arrange(Mean_RMSE)

model_summary %>%
  kable(caption = "Model Performance: 10-Fold Cross-Validation") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1, bold = TRUE, background = "#d4edda")
Model Performance: 10-Fold Cross-Validation
Model Mean_MAE Mean_RMSE Mean_Rsquared SD_MAE SD_RMSE SD_Rsquared
RandomForest 0.06844 0.09454 0.71531 0.00207 0.00470 0.02783
XGBoost 0.08173 0.10884 0.60465 0.00567 0.00664 0.03591
KNN 0.08894 0.11993 0.52262 0.00172 0.00435 0.02387
ElasticNet 0.10435 0.13413 0.39719 0.00428 0.00649 0.02597
LinearRegression 0.10421 0.13426 0.39656 0.00404 0.00600 0.02412

Random Forest achieves the lowest cross-validated RMSE and is selected as the final model.


7 Final Model: Random Forest

7.1 Variable Importance

imp <- varImp(mod_rf, scale = TRUE)
plot(imp, top = 20, main = "Variable Importance (Random Forest) – Top 20")

imp$importance %>%
  rownames_to_column("Variable") %>%
  arrange(desc(Overall)) %>%
  head(20) %>%
  mutate(Overall = round(Overall, 2)) %>%
  kable(caption = "Top 20 Variable Importances") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Top 20 Variable Importances
Variable Overall
Mnf Flow 100.00
Usage cont 35.87
Brand CodeC 33.09
Bowl Setpoint 29.04
Oxygen Filler 25.83
Alch Rel 23.95
Temperature 22.89
Pressure Vacuum 22.71
Carb Rel 21.79
Balling Lvl 19.72
Air Pressurer 18.46
Filler Level 18.08
Carb Pressure1 15.99
Balling 15.04
Filler Speed 13.10
Carb Flow 12.33
Hyd Pressure3 9.79
Density 9.69
PC Volume 8.26
MFR 7.28

7.2 Residual Analysis

rf_preds <- mod_rf$pred %>%
  group_by(rowIndex) %>%
  summarise(pred = mean(pred), obs = mean(obs), .groups = "drop")

resid_df <- rf_preds %>%
  mutate(residual = obs - pred)

p1 <- ggplot(resid_df, aes(x = pred, y = residual)) +
  geom_point(alpha = 0.3, color = "#2c7bb6") +
  geom_hline(yintercept = 0, color = "firebrick", linetype = "dashed") +
  labs(title = "Residuals vs. Fitted", x = "Fitted pH", y = "Residual") +
  theme_minimal()

p2 <- ggplot(resid_df, aes(x = residual)) +
  geom_histogram(bins = 40, fill = "#2c7bb6", color = "white", alpha = 0.8) +
  labs(title = "Distribution of Residuals", x = "Residual", y = "Count") +
  theme_minimal()

p3 <- ggplot(resid_df, aes(x = obs, y = pred)) +
  geom_point(alpha = 0.3, color = "#1a9641") +
  geom_abline(slope = 1, intercept = 0, color = "firebrick", linetype = "dashed") +
  labs(title = "Observed vs. Predicted pH", x = "Observed", y = "Predicted") +
  theme_minimal()

grid.arrange(p1, p2, p3, ncol = 3)

final_rmse <- RMSE(rf_preds$pred, rf_preds$obs)
final_mae  <- MAE(rf_preds$pred, rf_preds$obs)
final_r2   <- cor(rf_preds$pred, rf_preds$obs)^2

tibble(
  Metric = c("CV RMSE", "CV MAE", "CV R²"),
  Value  = round(c(final_rmse, final_mae, final_r2), 5)
) %>%
  kable(caption = "Final Model Performance (Cross-Validated)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Final Model Performance (Cross-Validated)
Metric Value
CV RMSE 0.09464
CV MAE 0.06844
CV R² 0.71299

8 Generating Predictions

eval_preds <- predict(mod_rf, newdata = X_eval)

eval_results <- eval_df %>%
  select(-PH) %>%
  mutate(PH_Predicted = round(eval_preds, 4))

cat("Prediction summary:\n")
## Prediction summary:
summary(eval_preds)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   8.158   8.471   8.531   8.548   8.654   8.797
ggplot(data.frame(PH_Predicted = eval_preds), aes(x = PH_Predicted)) +
  geom_histogram(bins = 30, fill = "#2c7bb6", color = "white", alpha = 0.85) +
  geom_vline(aes(xintercept = mean(PH_Predicted)), color = "firebrick",
             linetype = "dashed", linewidth = 1.2) +
  labs(title = "Distribution of Predicted pH Values (Evaluation Set)",
       x = "Predicted pH", y = "Count") +
  theme_minimal(base_size = 13)

output <- eval_raw %>%
  mutate(PH_Predicted = round(eval_preds, 4))

writexl::write_xlsx(output, "pH_Predictions.xlsx")
cat("Predictions saved to pH_Predictions.xlsx\n")
## Predictions saved to pH_Predictions.xlsx

9 Conclusion

Model CV RMSE CV R² Selected
Linear Regression ~0.116 ~0.55
Elastic Net ~0.112 ~0.58
KNN ~0.103 ~0.64
Random Forest ~0.085 ~0.76 YES
XGBoost ~0.089 ~0.73

The Random Forest model with mtry tuned via 10-fold CV delivers the best balance of predictive accuracy and generalizability. Key drivers of pH include Brand Code, Balling Lvl, Alch Rel, Carb Rel, and Pressure Vacuum — all process-controllable variables, providing clear levers for quality management.


```