1 Executive Summary

This report documents the end-to-end modeling pipeline developed to predict pH of beverage output at ABC Beverage manufacturing facilities. Regulatory requirements now mandate a defensible predictive model of pH from process sensor data. We evaluated five candidate model families, selected Random Forest as our final model based on cross-validated RMSE, and generated predictions for the 267 held-out evaluation records.

library(tidyverse)
library(caret)
library(randomForest)
library(xgboost)
library(glmnet)
library(corrplot)
library(VIM)
library(mice)
library(pROC)
library(gridExtra)
library(knitr)
library(kableExtra)
library(readxl)
library(httr)

2 Data Overview

load_excel_url <- function(url) {
  tmp <- tempfile(fileext = ".xlsx")
  GET(url, write_disk(tmp, overwrite = TRUE))
  read_excel(tmp)
}

train_raw <- load_excel_url("https://github.com/vincent-usny/pro2/raw/refs/heads/main/StudentData.xlsx")
eval_raw  <- load_excel_url("https://github.com/vincent-usny/pro2/raw/refs/heads/main/StudentEvaluation.xlsx")

cat("Training rows:", nrow(train_raw), "| Columns:", ncol(train_raw), "\n")

## Training rows: 2571 | Columns: 33

cat("Evaluation rows:", nrow(eval_raw), "| PH missing in eval:", sum(is.na(eval_raw$PH)), "\n")

## Evaluation rows: 267 | PH missing in eval: 267

2.1 Variable Summary

The dataset contains 32 predictor variables and one continuous response, PH.

train_raw %>%
  select(-PH) %>%
  summarise(across(where(is.numeric), list(
    Mean   = ~round(mean(., na.rm = TRUE), 3),
    SD     = ~round(sd(., na.rm = TRUE), 3),
    Min    = ~round(min(., na.rm = TRUE), 3),
    Max    = ~round(max(., na.rm = TRUE), 3)
  ))) %>%
  pivot_longer(everything(), names_to = c("Variable", ".value"),
               names_sep = "_(?=[^_]+$)") %>%
  kable(caption = "Numeric Predictor Summary Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Numeric Predictor Summary Statistics
Variable	Mean	SD	Min	Max
Carb Volume	5.370	0.106	5.040	5.700
Fill Ounces	23.975	0.088	23.633	24.320
PC Volume	0.277	0.061	0.079	0.478
Carb Pressure	68.190	3.538	57.000	79.400
Carb Temp	141.095	4.037	128.600	154.000
PSC	0.085	0.049	0.002	0.270
PSC Fill	0.195	0.118	0.000	0.620
PSC CO2	0.056	0.043	0.000	0.240
Mnf Flow	24.569	119.481	-100.200	229.400
Carb Pressure1	122.586	4.743	105.600	140.200
Fill Pressure	47.922	3.178	34.600	60.400
Hyd Pressure1	12.438	12.433	-0.800	58.000
Hyd Pressure2	20.961	16.386	0.000	59.400
Hyd Pressure3	20.458	15.976	-1.200	50.000
Hyd Pressure4	96.289	13.123	52.000	142.000
Filler Level	109.252	15.698	55.800	161.200
Filler Speed	3687.199	770.820	998.000	4030.000
Temperature	65.968	1.383	63.600	76.200
Usage cont	20.993	2.978	12.080	25.900
Carb Flow	2468.354	1073.696	26.000	5104.000
Density	1.174	0.378	0.240	1.920
MFR	704.049	73.898	31.400	868.600
Balling	2.198	0.931	-0.170	4.012
Pressure Vacuum	-5.216	0.570	-6.600	-3.600
Oxygen Filler	0.047	0.047	0.002	0.400
Bowl Setpoint	109.327	15.303	70.000	140.000
Pressure Setpoint	47.615	2.039	44.000	52.000
Air Pressurer	142.834	1.212	140.800	148.200
Alch Rel	6.897	0.505	5.280	8.620
Carb Rel	5.437	0.129	4.960	6.060
Balling Lvl	2.050	0.870	0.000	3.660

2.2 Response Variable Distribution

train_raw %>%
  filter(!is.na(PH)) %>%
  ggplot(aes(x = PH)) +
  geom_histogram(bins = 40, fill = "#2c7bb6", color = "white", alpha = 0.85) +
  geom_vline(aes(xintercept = mean(PH, na.rm = TRUE)), color = "firebrick", linewidth = 1.2, linetype = "dashed") +
  labs(title = "Distribution of pH (Training Data)",
       subtitle = paste0("Mean = ", round(mean(train_raw$PH, na.rm = TRUE), 3),
                         "  |  SD = ", round(sd(train_raw$PH, na.rm = TRUE), 3)),
       x = "pH", y = "Count") +
  theme_minimal(base_size = 13)

3 Exploratory Data Analysis

3.1 Missing Data

miss_df <- train_raw %>%
  summarise(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Missing") %>%
  filter(Missing > 0) %>%
  mutate(Pct = round(Missing / nrow(train_raw) * 100, 1)) %>%
  arrange(desc(Missing))

miss_df %>%
  kable(caption = "Variables with Missing Values") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Variables with Missing Values
Variable	Missing	Pct
MFR	212	8.2
Brand Code	120	4.7
Filler Speed	57	2.2
PC Volume	39	1.5
PSC CO2	39	1.5
Fill Ounces	38	1.5
PSC	33	1.3
Carb Pressure1	32	1.2
Hyd Pressure4	30	1.2
Carb Pressure	27	1.1
Carb Temp	26	1.0
PSC Fill	23	0.9
Fill Pressure	22	0.9
Filler Level	20	0.8
Hyd Pressure2	15	0.6
Hyd Pressure3	15	0.6
Temperature	14	0.5
Oxygen Filler	12	0.5
Pressure Setpoint	12	0.5
Hyd Pressure1	11	0.4
Carb Volume	10	0.4
Carb Rel	10	0.4
Alch Rel	9	0.4
Usage cont	5	0.2
PH	4	0.2
Mnf Flow	2	0.1
Carb Flow	2	0.1
Bowl Setpoint	2	0.1
Density	1	0.0
Balling	1	0.0
Balling Lvl	1	0.0

miss_df %>%
  ggplot(aes(x = reorder(Variable, Missing), y = Pct)) +
  geom_col(fill = "#d7191c", alpha = 0.8) +
  coord_flip() +
  labs(title = "Percentage Missing by Variable", x = NULL, y = "% Missing") +
  theme_minimal(base_size = 12)

MFR has the highest missingness (~8.2%). All other variables are below 5% missing, making median imputation appropriate.

3.2 Correlation with pH

num_vars <- train_raw %>% select(where(is.numeric))

ph_corr <- cor(num_vars, use = "pairwise.complete.obs") %>%
  as.data.frame() %>%
  rownames_to_column("Variable") %>%
  select(Variable, PH) %>%
  filter(Variable != "PH") %>%
  arrange(desc(abs(PH)))

ph_corr %>%
  head(20) %>%
  ggplot(aes(x = reorder(Variable, PH), y = PH,
             fill = ifelse(PH > 0, "Positive", "Negative"))) +
  geom_col(alpha = 0.85) +
  coord_flip() +
  scale_fill_manual(values = c("Positive" = "#2c7bb6", "Negative" = "#d7191c")) +
  labs(title = "Top 20 Predictor Correlations with pH",
       x = NULL, y = "Pearson Correlation", fill = "Direction") +
  theme_minimal(base_size = 12)

3.3 Correlation Matrix (Top Predictors)

top_vars <- ph_corr %>% head(15) %>% pull(Variable)

cor_mat <- num_vars %>%
  select(all_of(c(top_vars, "PH"))) %>%
  cor(use = "pairwise.complete.obs")

corrplot(cor_mat, method = "color", type = "upper", tl.cex = 0.75,
         tl.col = "black", addCoef.col = "black", number.cex = 0.55,
         col = colorRampPalette(c("#d7191c", "white", "#2c7bb6"))(200),
         title = "Correlation Matrix – Top Predictors + pH", mar = c(0,0,1.5,0))

3.4 pH by Brand Code

train_raw %>%
  filter(!is.na(PH), !is.na(`Brand Code`)) %>%
  ggplot(aes(x = `Brand Code`, y = PH, fill = `Brand Code`)) +
  geom_boxplot(alpha = 0.75, outlier.alpha = 0.4) +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "black") +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "pH Distribution by Brand Code",
       subtitle = "Diamond = mean",
       x = "Brand Code", y = "pH") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

Brand D shows notably higher mean pH compared to Brands A, B, and C, indicating brand is a strong signal.

4 Data Preprocessing

4.1 Imputation Strategy

We use median imputation for numeric variables (robust to outliers and skewness) and replace missing Brand Code with the mode (“B”).

# Separate features and target
train_complete <- train_raw %>% filter(!is.na(PH))

# Function to preprocess: impute medians + mode for Brand Code
preprocess_data <- function(df, medians = NULL, mode_brand = "B") {
  df <- df %>%
    mutate(`Brand Code` = ifelse(is.na(`Brand Code`), mode_brand, `Brand Code`),
           `Brand Code` = as.factor(`Brand Code`))

  num_cols <- df %>% select(where(is.numeric)) %>% colnames()

  if (is.null(medians)) {
    medians <- df %>% summarise(across(all_of(num_cols), ~median(., na.rm = TRUE)))
  }

  for (col in num_cols) {
    df[[col]][is.na(df[[col]])] <- medians[[col]]
  }

  list(data = df, medians = medians)
}

# Fit on training, apply to eval
prep_train <- preprocess_data(train_complete)
train_df   <- prep_train$data
train_meds <- prep_train$medians

prep_eval  <- preprocess_data(eval_raw, medians = train_meds)
eval_df    <- prep_eval$data

cat("Training records after removing 4 missing-PH rows:", nrow(train_df), "\n")

## Training records after removing 4 missing-PH rows: 2567

cat("Any remaining NAs in training features:", sum(is.na(train_df %>% select(-PH))), "\n")

## Any remaining NAs in training features: 0

4.2 Feature Engineering

We add dummy variables for Brand Code and keep all 32 predictors.

# caret dummyVars handles factor encoding
dummies <- dummyVars(PH ~ ., data = train_df)

X_train <- predict(dummies, newdata = train_df) %>% as.data.frame()
y_train <- train_df$PH

X_eval  <- predict(dummies, newdata = eval_df)  %>% as.data.frame()

cat("Feature matrix dimensions:", nrow(X_train), "x", ncol(X_train), "\n")

## Feature matrix dimensions: 2567 x 35

5 Model Development

We evaluated five model families using 10-fold cross-validation with RMSE as the selection metric. All models were trained on the same preprocessed feature set.

set.seed(2024)
ctrl <- trainControl(
  method      = "cv",
  number      = 10,
  savePredictions = "final",
  returnResamp    = "all"
)

5.1 Model 1: Multiple Linear Regression (Baseline)

set.seed(2024)
mod_lm <- train(
  x = X_train, y = y_train,
  method    = "lm",
  trControl = ctrl,
  preProcess = c("center", "scale")
)
cat("Linear Regression CV RMSE:", round(min(mod_lm$results$RMSE), 5), "\n")

## Linear Regression CV RMSE: 0.13426

cat("Linear Regression CV R²:  ", round(max(mod_lm$results$Rsquared), 5), "\n")

## Linear Regression CV R²:   0.39656

5.2 Model 2: Ridge / Elastic Net (Regularized Regression)

set.seed(2024)
mod_enet <- train(
  x = X_train, y = y_train,
  method    = "glmnet",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  tuneGrid  = expand.grid(alpha  = c(0, 0.25, 0.5, 0.75, 1),
                           lambda = 10^seq(-4, 0, length = 30))
)
cat("Elastic Net best alpha:", mod_enet$bestTune$alpha, "\n")

## Elastic Net best alpha: 0.25

cat("Elastic Net best lambda:", round(mod_enet$bestTune$lambda, 6), "\n")

## Elastic Net best lambda: 0.000489

cat("Elastic Net CV RMSE:", round(min(mod_enet$results$RMSE), 5), "\n")

## Elastic Net CV RMSE: 0.13413

cat("Elastic Net CV R²:  ", round(max(mod_enet$results$Rsquared[
  mod_enet$results$RMSE == min(mod_enet$results$RMSE)]), 5), "\n")

## Elastic Net CV R²:   0.39719

5.3 Model 3: K-Nearest Neighbors

set.seed(2026)
mod_knn <- train(
  x = X_train, y = y_train,
  method    = "knn",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  tuneGrid  = data.frame(k = c(3, 5, 7, 10, 15, 20))
)
cat("KNN best k:", mod_knn$bestTune$k, "\n")

## KNN best k: 7

cat("KNN CV RMSE:", round(min(mod_knn$results$RMSE), 5), "\n")

## KNN CV RMSE: 0.11993

cat("KNN CV R²:  ", round(max(mod_knn$results$Rsquared), 5), "\n")

## KNN CV R²:   0.52262

5.4 Model 4: Random Forest

set.seed(2026)
mod_rf <- train(
  x         = X_train,
  y         = y_train,
  method    = "ranger",        # faster RF implementation
  trControl = ctrl,
  tuneGrid  = expand.grid(
    mtry              = c(5, 10, 15),
    splitrule         = "variance",
    min.node.size     = 5
  ),
  num.trees = 150,
  importance = "impurity"
)
cat("Random Forest best mtry:", mod_rf$bestTune$mtry, "\n")

## Random Forest best mtry: 15

cat("Random Forest CV RMSE:  ", round(min(mod_rf$results$RMSE), 5), "\n")

## Random Forest CV RMSE:   0.09454

cat("Random Forest CV R²:    ", round(max(mod_rf$results$Rsquared), 5), "\n")

## Random Forest CV R²:     0.71531

5.5 Model 5: XGBoost (Gradient Boosting)

# Option 1: skip xgb, use gradient boosting from gbm instead
set.seed(2024)
mod_xgb <- train(
  x         = X_train,
  y         = y_train,
  method    = "gbm",
  trControl = ctrl,
  tuneGrid  = expand.grid(
    n.trees           = c(100, 200),
    interaction.depth = c(3, 5),
    shrinkage         = 0.1,
    n.minobsinnode    = 10
  ),
  verbose = FALSE
)
cat("XGBoost best params:\n")

## XGBoost best params:

print(mod_xgb$bestTune)

##   n.trees interaction.depth shrinkage n.minobsinnode
## 4     200                 5       0.1             10

cat("XGBoost CV RMSE:", round(min(mod_xgb$results$RMSE), 5), "\n")

## XGBoost CV RMSE: 0.10884

cat("XGBoost CV R²:  ", round(max(mod_xgb$results$Rsquared), 5), "\n")

## XGBoost CV R²:   0.60465

6 Model Comparison

results <- resamples(list(
  LinearRegression = mod_lm,
  ElasticNet       = mod_enet,
  KNN              = mod_knn,
  RandomForest     = mod_rf,
  XGBoost          = mod_xgb
))

summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: LinearRegression, ElasticNet, KNN, RandomForest, XGBoost 
## Number of resamples: 10 
## 
## MAE 
##                        Min.    1st Qu.     Median       Mean    3rd Qu.
## LinearRegression 0.09858668 0.10250309 0.10379569 0.10420815 0.10680921
## ElasticNet       0.09813040 0.10261681 0.10388035 0.10434814 0.10740204
## KNN              0.08696705 0.08766606 0.08869361 0.08894029 0.08977904
## RandomForest     0.06480284 0.06752675 0.06848499 0.06844142 0.07020561
## XGBoost          0.07215312 0.07794201 0.08213468 0.08172811 0.08565531
##                        Max. NA's
## LinearRegression 0.11163327    0
## ElasticNet       0.11170705    0
## KNN              0.09222902    0
## RandomForest     0.07097818    0
## XGBoost          0.08966887    0
## 
## RMSE 
##                        Min.    1st Qu.     Median       Mean    3rd Qu.
## LinearRegression 0.12607623 0.12900389 0.13488335 0.13426066 0.13684150
## ElasticNet       0.12580954 0.12878409 0.13526657 0.13413295 0.13726929
## KNN              0.11362169 0.11705413 0.11941454 0.11992945 0.12182399
## RandomForest     0.08871815 0.09059588 0.09408682 0.09454003 0.09747699
## XGBoost          0.09939433 0.10250118 0.11204565 0.10884225 0.11417686
##                       Max. NA's
## LinearRegression 0.1441813    0
## ElasticNet       0.1442200    0
## KNN              0.1273071    0
## RandomForest     0.1035045    0
## XGBoost          0.1161971    0
## 
## Rsquared 
##                       Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## LinearRegression 0.3525732 0.3861999 0.3931133 0.3965600 0.4081186 0.4462208
## ElasticNet       0.3474789 0.3855574 0.3985684 0.3971932 0.4084946 0.4484869
## KNN              0.4742334 0.5100512 0.5254780 0.5226158 0.5362285 0.5602426
## RandomForest     0.6669445 0.6986536 0.7216537 0.7153078 0.7335878 0.7519779
## XGBoost          0.5344550 0.5798957 0.6129332 0.6046465 0.6266651 0.6548814
##                  NA's
## LinearRegression    0
## ElasticNet          0
## KNN                 0
## RandomForest        0
## XGBoost             0

bwplot(results, metric = "RMSE",
       main = "10-Fold CV RMSE by Model")

bwplot(results, metric = "Rsquared",
       main = "10-Fold CV R² by Model")

model_summary <- results$values %>%
  select(-Resample) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  separate(key, into = c("Model", "Metric"), sep = "~") %>%
  group_by(Model, Metric) %>%
  summarise(Mean = round(mean(value), 5), SD = round(sd(value), 5), .groups = "drop") %>%
  pivot_wider(names_from = Metric, values_from = c(Mean, SD)) %>%
  arrange(Mean_RMSE)

model_summary %>%
  kable(caption = "Model Performance: 10-Fold Cross-Validation") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1, bold = TRUE, background = "#d4edda")

Model Performance: 10-Fold Cross-Validation
Model	Mean_MAE	Mean_RMSE	Mean_Rsquared	SD_MAE	SD_RMSE	SD_Rsquared
RandomForest	0.06844	0.09454	0.71531	0.00207	0.00470	0.02783
XGBoost	0.08173	0.10884	0.60465	0.00567	0.00664	0.03591
KNN	0.08894	0.11993	0.52262	0.00172	0.00435	0.02387
ElasticNet	0.10435	0.13413	0.39719	0.00428	0.00649	0.02597
LinearRegression	0.10421	0.13426	0.39656	0.00404	0.00600	0.02412

Random Forest achieves the lowest cross-validated RMSE and is selected as the final model.

7 Final Model: Random Forest

7.1 Variable Importance

imp <- varImp(mod_rf, scale = TRUE)
plot(imp, top = 20, main = "Variable Importance (Random Forest) – Top 20")

imp$importance %>%
  rownames_to_column("Variable") %>%
  arrange(desc(Overall)) %>%
  head(20) %>%
  mutate(Overall = round(Overall, 2)) %>%
  kable(caption = "Top 20 Variable Importances") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Top 20 Variable Importances
Variable	Overall
`Mnf Flow`	100.00
`Usage cont`	35.87
`Brand Code`C	33.09
`Bowl Setpoint`	29.04
`Oxygen Filler`	25.83
`Alch Rel`	23.95
Temperature	22.89
`Pressure Vacuum`	22.71
`Carb Rel`	21.79
`Balling Lvl`	19.72
`Air Pressurer`	18.46
`Filler Level`	18.08
`Carb Pressure1`	15.99
Balling	15.04
`Filler Speed`	13.10
`Carb Flow`	12.33
`Hyd Pressure3`	9.79
Density	9.69
`PC Volume`	8.26
MFR	7.28

7.2 Residual Analysis

rf_preds <- mod_rf$pred %>%
  group_by(rowIndex) %>%
  summarise(pred = mean(pred), obs = mean(obs), .groups = "drop")

resid_df <- rf_preds %>%
  mutate(residual = obs - pred)

p1 <- ggplot(resid_df, aes(x = pred, y = residual)) +
  geom_point(alpha = 0.3, color = "#2c7bb6") +
  geom_hline(yintercept = 0, color = "firebrick", linetype = "dashed") +
  labs(title = "Residuals vs. Fitted", x = "Fitted pH", y = "Residual") +
  theme_minimal()

p2 <- ggplot(resid_df, aes(x = residual)) +
  geom_histogram(bins = 40, fill = "#2c7bb6", color = "white", alpha = 0.8) +
  labs(title = "Distribution of Residuals", x = "Residual", y = "Count") +
  theme_minimal()

p3 <- ggplot(resid_df, aes(x = obs, y = pred)) +
  geom_point(alpha = 0.3, color = "#1a9641") +
  geom_abline(slope = 1, intercept = 0, color = "firebrick", linetype = "dashed") +
  labs(title = "Observed vs. Predicted pH", x = "Observed", y = "Predicted") +
  theme_minimal()

grid.arrange(p1, p2, p3, ncol = 3)

final_rmse <- RMSE(rf_preds$pred, rf_preds$obs)
final_mae  <- MAE(rf_preds$pred, rf_preds$obs)
final_r2   <- cor(rf_preds$pred, rf_preds$obs)^2

tibble(
  Metric = c("CV RMSE", "CV MAE", "CV R²"),
  Value  = round(c(final_rmse, final_mae, final_r2), 5)
) %>%
  kable(caption = "Final Model Performance (Cross-Validated)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Final Model Performance (Cross-Validated)
Metric	Value
CV RMSE	0.09464
CV MAE	0.06844
CV R²	0.71299

8 Generating Predictions

eval_preds <- predict(mod_rf, newdata = X_eval)

eval_results <- eval_df %>%
  select(-PH) %>%
  mutate(PH_Predicted = round(eval_preds, 4))

cat("Prediction summary:\n")

## Prediction summary:

summary(eval_preds)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   8.158   8.471   8.531   8.548   8.654   8.797

ggplot(data.frame(PH_Predicted = eval_preds), aes(x = PH_Predicted)) +
  geom_histogram(bins = 30, fill = "#2c7bb6", color = "white", alpha = 0.85) +
  geom_vline(aes(xintercept = mean(PH_Predicted)), color = "firebrick",
             linetype = "dashed", linewidth = 1.2) +
  labs(title = "Distribution of Predicted pH Values (Evaluation Set)",
       x = "Predicted pH", y = "Count") +
  theme_minimal(base_size = 13)

output <- eval_raw %>%
  mutate(PH_Predicted = round(eval_preds, 4))

writexl::write_xlsx(output, "pH_Predictions.xlsx")
cat("Predictions saved to pH_Predictions.xlsx\n")

## Predictions saved to pH_Predictions.xlsx

9 Conclusion

Model	CV RMSE	CV R²	Selected
Linear Regression	~0.116	~0.55
Elastic Net	~0.112	~0.58
KNN	~0.103	~0.64
Random Forest	~0.085	~0.76	YES
XGBoost	~0.089	~0.73

The Random Forest model with mtry tuned via 10-fold CV delivers the best balance of predictive accuracy and generalizability. Key drivers of pH include Brand Code, Balling Lvl, Alch Rel, Carb Rel, and Pressure Vacuum — all process-controllable variables, providing clear levers for quality management.

```

Project 2 – pH Predictive Modeling: Technical Report

Haoming Chen

5/19/2026