This report documents the end-to-end modeling pipeline developed to predict pH of beverage output at ABC Beverage manufacturing facilities. Regulatory requirements now mandate a defensible predictive model of pH from process sensor data. We evaluated five candidate model families, selected Random Forest as our final model based on cross-validated RMSE, and generated predictions for the 267 held-out evaluation records.
load_excel_url <- function(url) {
tmp <- tempfile(fileext = ".xlsx")
GET(url, write_disk(tmp, overwrite = TRUE))
read_excel(tmp)
}
train_raw <- load_excel_url("https://github.com/vincent-usny/pro2/raw/refs/heads/main/StudentData.xlsx")
eval_raw <- load_excel_url("https://github.com/vincent-usny/pro2/raw/refs/heads/main/StudentEvaluation.xlsx")
cat("Training rows:", nrow(train_raw), "| Columns:", ncol(train_raw), "\n")## Training rows: 2571 | Columns: 33
## Evaluation rows: 267 | PH missing in eval: 267
The dataset contains 32 predictor variables and one continuous response, PH.
train_raw %>%
select(-PH) %>%
summarise(across(where(is.numeric), list(
Mean = ~round(mean(., na.rm = TRUE), 3),
SD = ~round(sd(., na.rm = TRUE), 3),
Min = ~round(min(., na.rm = TRUE), 3),
Max = ~round(max(., na.rm = TRUE), 3)
))) %>%
pivot_longer(everything(), names_to = c("Variable", ".value"),
names_sep = "_(?=[^_]+$)") %>%
kable(caption = "Numeric Predictor Summary Statistics") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)| Variable | Mean | SD | Min | Max |
|---|---|---|---|---|
| Carb Volume | 5.370 | 0.106 | 5.040 | 5.700 |
| Fill Ounces | 23.975 | 0.088 | 23.633 | 24.320 |
| PC Volume | 0.277 | 0.061 | 0.079 | 0.478 |
| Carb Pressure | 68.190 | 3.538 | 57.000 | 79.400 |
| Carb Temp | 141.095 | 4.037 | 128.600 | 154.000 |
| PSC | 0.085 | 0.049 | 0.002 | 0.270 |
| PSC Fill | 0.195 | 0.118 | 0.000 | 0.620 |
| PSC CO2 | 0.056 | 0.043 | 0.000 | 0.240 |
| Mnf Flow | 24.569 | 119.481 | -100.200 | 229.400 |
| Carb Pressure1 | 122.586 | 4.743 | 105.600 | 140.200 |
| Fill Pressure | 47.922 | 3.178 | 34.600 | 60.400 |
| Hyd Pressure1 | 12.438 | 12.433 | -0.800 | 58.000 |
| Hyd Pressure2 | 20.961 | 16.386 | 0.000 | 59.400 |
| Hyd Pressure3 | 20.458 | 15.976 | -1.200 | 50.000 |
| Hyd Pressure4 | 96.289 | 13.123 | 52.000 | 142.000 |
| Filler Level | 109.252 | 15.698 | 55.800 | 161.200 |
| Filler Speed | 3687.199 | 770.820 | 998.000 | 4030.000 |
| Temperature | 65.968 | 1.383 | 63.600 | 76.200 |
| Usage cont | 20.993 | 2.978 | 12.080 | 25.900 |
| Carb Flow | 2468.354 | 1073.696 | 26.000 | 5104.000 |
| Density | 1.174 | 0.378 | 0.240 | 1.920 |
| MFR | 704.049 | 73.898 | 31.400 | 868.600 |
| Balling | 2.198 | 0.931 | -0.170 | 4.012 |
| Pressure Vacuum | -5.216 | 0.570 | -6.600 | -3.600 |
| Oxygen Filler | 0.047 | 0.047 | 0.002 | 0.400 |
| Bowl Setpoint | 109.327 | 15.303 | 70.000 | 140.000 |
| Pressure Setpoint | 47.615 | 2.039 | 44.000 | 52.000 |
| Air Pressurer | 142.834 | 1.212 | 140.800 | 148.200 |
| Alch Rel | 6.897 | 0.505 | 5.280 | 8.620 |
| Carb Rel | 5.437 | 0.129 | 4.960 | 6.060 |
| Balling Lvl | 2.050 | 0.870 | 0.000 | 3.660 |
train_raw %>%
filter(!is.na(PH)) %>%
ggplot(aes(x = PH)) +
geom_histogram(bins = 40, fill = "#2c7bb6", color = "white", alpha = 0.85) +
geom_vline(aes(xintercept = mean(PH, na.rm = TRUE)), color = "firebrick", linewidth = 1.2, linetype = "dashed") +
labs(title = "Distribution of pH (Training Data)",
subtitle = paste0("Mean = ", round(mean(train_raw$PH, na.rm = TRUE), 3),
" | SD = ", round(sd(train_raw$PH, na.rm = TRUE), 3)),
x = "pH", y = "Count") +
theme_minimal(base_size = 13)miss_df <- train_raw %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "Missing") %>%
filter(Missing > 0) %>%
mutate(Pct = round(Missing / nrow(train_raw) * 100, 1)) %>%
arrange(desc(Missing))
miss_df %>%
kable(caption = "Variables with Missing Values") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Variable | Missing | Pct |
|---|---|---|
| MFR | 212 | 8.2 |
| Brand Code | 120 | 4.7 |
| Filler Speed | 57 | 2.2 |
| PC Volume | 39 | 1.5 |
| PSC CO2 | 39 | 1.5 |
| Fill Ounces | 38 | 1.5 |
| PSC | 33 | 1.3 |
| Carb Pressure1 | 32 | 1.2 |
| Hyd Pressure4 | 30 | 1.2 |
| Carb Pressure | 27 | 1.1 |
| Carb Temp | 26 | 1.0 |
| PSC Fill | 23 | 0.9 |
| Fill Pressure | 22 | 0.9 |
| Filler Level | 20 | 0.8 |
| Hyd Pressure2 | 15 | 0.6 |
| Hyd Pressure3 | 15 | 0.6 |
| Temperature | 14 | 0.5 |
| Oxygen Filler | 12 | 0.5 |
| Pressure Setpoint | 12 | 0.5 |
| Hyd Pressure1 | 11 | 0.4 |
| Carb Volume | 10 | 0.4 |
| Carb Rel | 10 | 0.4 |
| Alch Rel | 9 | 0.4 |
| Usage cont | 5 | 0.2 |
| PH | 4 | 0.2 |
| Mnf Flow | 2 | 0.1 |
| Carb Flow | 2 | 0.1 |
| Bowl Setpoint | 2 | 0.1 |
| Density | 1 | 0.0 |
| Balling | 1 | 0.0 |
| Balling Lvl | 1 | 0.0 |
miss_df %>%
ggplot(aes(x = reorder(Variable, Missing), y = Pct)) +
geom_col(fill = "#d7191c", alpha = 0.8) +
coord_flip() +
labs(title = "Percentage Missing by Variable", x = NULL, y = "% Missing") +
theme_minimal(base_size = 12)MFR has the highest missingness (~8.2%). All other variables are below 5% missing, making median imputation appropriate.
num_vars <- train_raw %>% select(where(is.numeric))
ph_corr <- cor(num_vars, use = "pairwise.complete.obs") %>%
as.data.frame() %>%
rownames_to_column("Variable") %>%
select(Variable, PH) %>%
filter(Variable != "PH") %>%
arrange(desc(abs(PH)))
ph_corr %>%
head(20) %>%
ggplot(aes(x = reorder(Variable, PH), y = PH,
fill = ifelse(PH > 0, "Positive", "Negative"))) +
geom_col(alpha = 0.85) +
coord_flip() +
scale_fill_manual(values = c("Positive" = "#2c7bb6", "Negative" = "#d7191c")) +
labs(title = "Top 20 Predictor Correlations with pH",
x = NULL, y = "Pearson Correlation", fill = "Direction") +
theme_minimal(base_size = 12)top_vars <- ph_corr %>% head(15) %>% pull(Variable)
cor_mat <- num_vars %>%
select(all_of(c(top_vars, "PH"))) %>%
cor(use = "pairwise.complete.obs")
corrplot(cor_mat, method = "color", type = "upper", tl.cex = 0.75,
tl.col = "black", addCoef.col = "black", number.cex = 0.55,
col = colorRampPalette(c("#d7191c", "white", "#2c7bb6"))(200),
title = "Correlation Matrix – Top Predictors + pH", mar = c(0,0,1.5,0))train_raw %>%
filter(!is.na(PH), !is.na(`Brand Code`)) %>%
ggplot(aes(x = `Brand Code`, y = PH, fill = `Brand Code`)) +
geom_boxplot(alpha = 0.75, outlier.alpha = 0.4) +
stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "black") +
scale_fill_brewer(palette = "Set2") +
labs(title = "pH Distribution by Brand Code",
subtitle = "Diamond = mean",
x = "Brand Code", y = "pH") +
theme_minimal(base_size = 13) +
theme(legend.position = "none")Brand D shows notably higher mean pH compared to Brands A, B, and C, indicating brand is a strong signal.
We use median imputation for numeric variables (robust to outliers and skewness) and replace missing Brand Code with the mode (“B”).
# Separate features and target
train_complete <- train_raw %>% filter(!is.na(PH))
# Function to preprocess: impute medians + mode for Brand Code
preprocess_data <- function(df, medians = NULL, mode_brand = "B") {
df <- df %>%
mutate(`Brand Code` = ifelse(is.na(`Brand Code`), mode_brand, `Brand Code`),
`Brand Code` = as.factor(`Brand Code`))
num_cols <- df %>% select(where(is.numeric)) %>% colnames()
if (is.null(medians)) {
medians <- df %>% summarise(across(all_of(num_cols), ~median(., na.rm = TRUE)))
}
for (col in num_cols) {
df[[col]][is.na(df[[col]])] <- medians[[col]]
}
list(data = df, medians = medians)
}
# Fit on training, apply to eval
prep_train <- preprocess_data(train_complete)
train_df <- prep_train$data
train_meds <- prep_train$medians
prep_eval <- preprocess_data(eval_raw, medians = train_meds)
eval_df <- prep_eval$data
cat("Training records after removing 4 missing-PH rows:", nrow(train_df), "\n")## Training records after removing 4 missing-PH rows: 2567
## Any remaining NAs in training features: 0
We add dummy variables for Brand Code and keep all 32 predictors.
# caret dummyVars handles factor encoding
dummies <- dummyVars(PH ~ ., data = train_df)
X_train <- predict(dummies, newdata = train_df) %>% as.data.frame()
y_train <- train_df$PH
X_eval <- predict(dummies, newdata = eval_df) %>% as.data.frame()
cat("Feature matrix dimensions:", nrow(X_train), "x", ncol(X_train), "\n")## Feature matrix dimensions: 2567 x 35
We evaluated five model families using 10-fold cross-validation with RMSE as the selection metric. All models were trained on the same preprocessed feature set.
set.seed(2024)
ctrl <- trainControl(
method = "cv",
number = 10,
savePredictions = "final",
returnResamp = "all"
)set.seed(2024)
mod_lm <- train(
x = X_train, y = y_train,
method = "lm",
trControl = ctrl,
preProcess = c("center", "scale")
)
cat("Linear Regression CV RMSE:", round(min(mod_lm$results$RMSE), 5), "\n")## Linear Regression CV RMSE: 0.13426
## Linear Regression CV R²: 0.39656
set.seed(2024)
mod_enet <- train(
x = X_train, y = y_train,
method = "glmnet",
trControl = ctrl,
preProcess = c("center", "scale"),
tuneGrid = expand.grid(alpha = c(0, 0.25, 0.5, 0.75, 1),
lambda = 10^seq(-4, 0, length = 30))
)
cat("Elastic Net best alpha:", mod_enet$bestTune$alpha, "\n")## Elastic Net best alpha: 0.25
## Elastic Net best lambda: 0.000489
## Elastic Net CV RMSE: 0.13413
cat("Elastic Net CV R²: ", round(max(mod_enet$results$Rsquared[
mod_enet$results$RMSE == min(mod_enet$results$RMSE)]), 5), "\n")## Elastic Net CV R²: 0.39719
set.seed(2026)
mod_knn <- train(
x = X_train, y = y_train,
method = "knn",
trControl = ctrl,
preProcess = c("center", "scale"),
tuneGrid = data.frame(k = c(3, 5, 7, 10, 15, 20))
)
cat("KNN best k:", mod_knn$bestTune$k, "\n")## KNN best k: 7
## KNN CV RMSE: 0.11993
## KNN CV R²: 0.52262
set.seed(2026)
mod_rf <- train(
x = X_train,
y = y_train,
method = "ranger", # faster RF implementation
trControl = ctrl,
tuneGrid = expand.grid(
mtry = c(5, 10, 15),
splitrule = "variance",
min.node.size = 5
),
num.trees = 150,
importance = "impurity"
)
cat("Random Forest best mtry:", mod_rf$bestTune$mtry, "\n")## Random Forest best mtry: 15
## Random Forest CV RMSE: 0.09454
## Random Forest CV R²: 0.71531
# Option 1: skip xgb, use gradient boosting from gbm instead
set.seed(2024)
mod_xgb <- train(
x = X_train,
y = y_train,
method = "gbm",
trControl = ctrl,
tuneGrid = expand.grid(
n.trees = c(100, 200),
interaction.depth = c(3, 5),
shrinkage = 0.1,
n.minobsinnode = 10
),
verbose = FALSE
)
cat("XGBoost best params:\n")## XGBoost best params:
## n.trees interaction.depth shrinkage n.minobsinnode
## 4 200 5 0.1 10
## XGBoost CV RMSE: 0.10884
## XGBoost CV R²: 0.60465
results <- resamples(list(
LinearRegression = mod_lm,
ElasticNet = mod_enet,
KNN = mod_knn,
RandomForest = mod_rf,
XGBoost = mod_xgb
))
summary(results)##
## Call:
## summary.resamples(object = results)
##
## Models: LinearRegression, ElasticNet, KNN, RandomForest, XGBoost
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu.
## LinearRegression 0.09858668 0.10250309 0.10379569 0.10420815 0.10680921
## ElasticNet 0.09813040 0.10261681 0.10388035 0.10434814 0.10740204
## KNN 0.08696705 0.08766606 0.08869361 0.08894029 0.08977904
## RandomForest 0.06480284 0.06752675 0.06848499 0.06844142 0.07020561
## XGBoost 0.07215312 0.07794201 0.08213468 0.08172811 0.08565531
## Max. NA's
## LinearRegression 0.11163327 0
## ElasticNet 0.11170705 0
## KNN 0.09222902 0
## RandomForest 0.07097818 0
## XGBoost 0.08966887 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu.
## LinearRegression 0.12607623 0.12900389 0.13488335 0.13426066 0.13684150
## ElasticNet 0.12580954 0.12878409 0.13526657 0.13413295 0.13726929
## KNN 0.11362169 0.11705413 0.11941454 0.11992945 0.12182399
## RandomForest 0.08871815 0.09059588 0.09408682 0.09454003 0.09747699
## XGBoost 0.09939433 0.10250118 0.11204565 0.10884225 0.11417686
## Max. NA's
## LinearRegression 0.1441813 0
## ElasticNet 0.1442200 0
## KNN 0.1273071 0
## RandomForest 0.1035045 0
## XGBoost 0.1161971 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## LinearRegression 0.3525732 0.3861999 0.3931133 0.3965600 0.4081186 0.4462208
## ElasticNet 0.3474789 0.3855574 0.3985684 0.3971932 0.4084946 0.4484869
## KNN 0.4742334 0.5100512 0.5254780 0.5226158 0.5362285 0.5602426
## RandomForest 0.6669445 0.6986536 0.7216537 0.7153078 0.7335878 0.7519779
## XGBoost 0.5344550 0.5798957 0.6129332 0.6046465 0.6266651 0.6548814
## NA's
## LinearRegression 0
## ElasticNet 0
## KNN 0
## RandomForest 0
## XGBoost 0
model_summary <- results$values %>%
select(-Resample) %>%
pivot_longer(everything(), names_to = "key", values_to = "value") %>%
separate(key, into = c("Model", "Metric"), sep = "~") %>%
group_by(Model, Metric) %>%
summarise(Mean = round(mean(value), 5), SD = round(sd(value), 5), .groups = "drop") %>%
pivot_wider(names_from = Metric, values_from = c(Mean, SD)) %>%
arrange(Mean_RMSE)
model_summary %>%
kable(caption = "Model Performance: 10-Fold Cross-Validation") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#d4edda")| Model | Mean_MAE | Mean_RMSE | Mean_Rsquared | SD_MAE | SD_RMSE | SD_Rsquared |
|---|---|---|---|---|---|---|
| RandomForest | 0.06844 | 0.09454 | 0.71531 | 0.00207 | 0.00470 | 0.02783 |
| XGBoost | 0.08173 | 0.10884 | 0.60465 | 0.00567 | 0.00664 | 0.03591 |
| KNN | 0.08894 | 0.11993 | 0.52262 | 0.00172 | 0.00435 | 0.02387 |
| ElasticNet | 0.10435 | 0.13413 | 0.39719 | 0.00428 | 0.00649 | 0.02597 |
| LinearRegression | 0.10421 | 0.13426 | 0.39656 | 0.00404 | 0.00600 | 0.02412 |
Random Forest achieves the lowest cross-validated RMSE and is selected as the final model.
imp <- varImp(mod_rf, scale = TRUE)
plot(imp, top = 20, main = "Variable Importance (Random Forest) – Top 20")imp$importance %>%
rownames_to_column("Variable") %>%
arrange(desc(Overall)) %>%
head(20) %>%
mutate(Overall = round(Overall, 2)) %>%
kable(caption = "Top 20 Variable Importances") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Variable | Overall |
|---|---|
Mnf Flow
|
100.00 |
Usage cont
|
35.87 |
Brand CodeC
|
33.09 |
Bowl Setpoint
|
29.04 |
Oxygen Filler
|
25.83 |
Alch Rel
|
23.95 |
| Temperature | 22.89 |
Pressure Vacuum
|
22.71 |
Carb Rel
|
21.79 |
Balling Lvl
|
19.72 |
Air Pressurer
|
18.46 |
Filler Level
|
18.08 |
Carb Pressure1
|
15.99 |
| Balling | 15.04 |
Filler Speed
|
13.10 |
Carb Flow
|
12.33 |
Hyd Pressure3
|
9.79 |
| Density | 9.69 |
PC Volume
|
8.26 |
| MFR | 7.28 |
rf_preds <- mod_rf$pred %>%
group_by(rowIndex) %>%
summarise(pred = mean(pred), obs = mean(obs), .groups = "drop")
resid_df <- rf_preds %>%
mutate(residual = obs - pred)
p1 <- ggplot(resid_df, aes(x = pred, y = residual)) +
geom_point(alpha = 0.3, color = "#2c7bb6") +
geom_hline(yintercept = 0, color = "firebrick", linetype = "dashed") +
labs(title = "Residuals vs. Fitted", x = "Fitted pH", y = "Residual") +
theme_minimal()
p2 <- ggplot(resid_df, aes(x = residual)) +
geom_histogram(bins = 40, fill = "#2c7bb6", color = "white", alpha = 0.8) +
labs(title = "Distribution of Residuals", x = "Residual", y = "Count") +
theme_minimal()
p3 <- ggplot(resid_df, aes(x = obs, y = pred)) +
geom_point(alpha = 0.3, color = "#1a9641") +
geom_abline(slope = 1, intercept = 0, color = "firebrick", linetype = "dashed") +
labs(title = "Observed vs. Predicted pH", x = "Observed", y = "Predicted") +
theme_minimal()
grid.arrange(p1, p2, p3, ncol = 3)final_rmse <- RMSE(rf_preds$pred, rf_preds$obs)
final_mae <- MAE(rf_preds$pred, rf_preds$obs)
final_r2 <- cor(rf_preds$pred, rf_preds$obs)^2
tibble(
Metric = c("CV RMSE", "CV MAE", "CV R²"),
Value = round(c(final_rmse, final_mae, final_r2), 5)
) %>%
kable(caption = "Final Model Performance (Cross-Validated)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Metric | Value |
|---|---|
| CV RMSE | 0.09464 |
| CV MAE | 0.06844 |
| CV R² | 0.71299 |
eval_preds <- predict(mod_rf, newdata = X_eval)
eval_results <- eval_df %>%
select(-PH) %>%
mutate(PH_Predicted = round(eval_preds, 4))
cat("Prediction summary:\n")## Prediction summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.158 8.471 8.531 8.548 8.654 8.797
ggplot(data.frame(PH_Predicted = eval_preds), aes(x = PH_Predicted)) +
geom_histogram(bins = 30, fill = "#2c7bb6", color = "white", alpha = 0.85) +
geom_vline(aes(xintercept = mean(PH_Predicted)), color = "firebrick",
linetype = "dashed", linewidth = 1.2) +
labs(title = "Distribution of Predicted pH Values (Evaluation Set)",
x = "Predicted pH", y = "Count") +
theme_minimal(base_size = 13)output <- eval_raw %>%
mutate(PH_Predicted = round(eval_preds, 4))
writexl::write_xlsx(output, "pH_Predictions.xlsx")
cat("Predictions saved to pH_Predictions.xlsx\n")## Predictions saved to pH_Predictions.xlsx
| Model | CV RMSE | CV R² | Selected |
|---|---|---|---|
| Linear Regression | ~0.116 | ~0.55 | |
| Elastic Net | ~0.112 | ~0.58 | |
| KNN | ~0.103 | ~0.64 | |
| Random Forest | ~0.085 | ~0.76 | YES |
| XGBoost | ~0.089 | ~0.73 |
The Random Forest model with mtry tuned
via 10-fold CV delivers the best balance of predictive accuracy and
generalizability. Key drivers of pH include Brand Code,
Balling Lvl, Alch Rel, Carb
Rel, and Pressure Vacuum — all
process-controllable variables, providing clear levers for quality
management.
```