1 Introduction

This report documents the end-to-end process of developing a predictive model for beverage pH at ABC Beverage. Following data cleaning, exploratory analysis, and scientific literature review, we implemented a rule-based model to forecast pH levels using operational variables. The project adheres to business requirements: simplicity, transparency, and regulatory clarity.

Jump to Technical Summary

2 Libraries

library(readr)
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(caret)
## Loading required package: lattice
library(e1071)

3 Set Seed for Reproducibility

set.seed(52086)

4 Load Data

data <- read_excel("StudentData.xlsx")

5 Initial Summary

glimpse(data)
## Rows: 2,571
## Columns: 33
## $ `Brand Code`        <chr> "B", "A", "B", "A", "A", "A", "A", "B", "B", "B", …
## $ `Carb Volume`       <dbl> 5.340000, 5.426667, 5.286667, 5.440000, 5.486667, …
## $ `Fill Ounces`       <dbl> 23.96667, 24.00667, 24.06000, 24.00667, 24.31333, …
## $ `PC Volume`         <dbl> 0.2633333, 0.2386667, 0.2633333, 0.2933333, 0.1113…
## $ `Carb Pressure`     <dbl> 68.2, 68.4, 70.8, 63.0, 67.2, 66.6, 64.2, 67.6, 64…
## $ `Carb Temp`         <dbl> 141.2, 139.6, 144.8, 132.6, 136.8, 138.4, 136.8, 1…
## $ PSC                 <dbl> 0.104, 0.124, 0.090, NA, 0.026, 0.090, 0.128, 0.15…
## $ `PSC Fill`          <dbl> 0.26, 0.22, 0.34, 0.42, 0.16, 0.24, 0.40, 0.34, 0.…
## $ `PSC CO2`           <dbl> 0.04, 0.04, 0.16, 0.04, 0.12, 0.04, 0.04, 0.04, 0.…
## $ `Mnf Flow`          <dbl> -100, -100, -100, -100, -100, -100, -100, -100, -1…
## $ `Carb Pressure1`    <dbl> 118.8, 121.6, 120.2, 115.2, 118.4, 119.6, 122.2, 1…
## $ `Fill Pressure`     <dbl> 46.0, 46.0, 46.0, 46.4, 45.8, 45.6, 51.8, 46.8, 46…
## $ `Hyd Pressure1`     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Hyd Pressure2`     <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Hyd Pressure3`     <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Hyd Pressure4`     <dbl> 118, 106, 82, 92, 92, 116, 124, 132, 90, 108, 94, …
## $ `Filler Level`      <dbl> 121.2, 118.6, 120.0, 117.8, 118.6, 120.2, 123.4, 1…
## $ `Filler Speed`      <dbl> 4002, 3986, 4020, 4012, 4010, 4014, NA, 1004, 4014…
## $ Temperature         <dbl> 66.0, 67.6, 67.0, 65.6, 65.6, 66.2, 65.8, 65.2, 65…
## $ `Usage cont`        <dbl> 16.18, 19.90, 17.76, 17.42, 17.68, 23.82, 20.74, 1…
## $ `Carb Flow`         <dbl> 2932, 3144, 2914, 3062, 3054, 2948, 30, 684, 2902,…
## $ Density             <dbl> 0.88, 0.92, 1.58, 1.54, 1.54, 1.52, 0.84, 0.84, 0.…
## $ MFR                 <dbl> 725.0, 726.8, 735.0, 730.6, 722.8, 738.8, NA, NA, …
## $ Balling             <dbl> 1.398, 1.498, 3.142, 3.042, 3.042, 2.992, 1.298, 1…
## $ `Pressure Vacuum`   <dbl> -4.0, -4.0, -3.8, -4.4, -4.4, -4.4, -4.4, -4.4, -4…
## $ PH                  <dbl> 8.36, 8.26, 8.94, 8.24, 8.26, 8.32, 8.40, 8.38, 8.…
## $ `Oxygen Filler`     <dbl> 0.022, 0.026, 0.024, 0.030, 0.030, 0.024, 0.066, 0…
## $ `Bowl Setpoint`     <dbl> 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, …
## $ `Pressure Setpoint` <dbl> 46.4, 46.8, 46.6, 46.0, 46.0, 46.0, 46.0, 46.0, 46…
## $ `Air Pressurer`     <dbl> 142.6, 143.0, 142.0, 146.2, 146.2, 146.6, 146.2, 1…
## $ `Alch Rel`          <dbl> 6.58, 6.56, 7.66, 7.14, 7.14, 7.16, 6.54, 6.52, 6.…
## $ `Carb Rel`          <dbl> 5.32, 5.30, 5.84, 5.42, 5.44, 5.44, 5.38, 5.34, 5.…
## $ `Balling Lvl`       <dbl> 1.48, 1.56, 3.28, 3.04, 3.04, 3.02, 1.44, 1.44, 1.…
summary(data)
##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:2571        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23917  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27712  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##                     NA's   :10      NA's   :38      NA's   :39       
##  Carb Pressure     Carb Temp          PSC             PSC Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08457   Mean   :0.1954  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##  NA's   :27      NA's   :26      NA's   :33        NA's   :23      
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  65.20   Median :123.2   Median :46.40  
##  Mean   :0.05641   Mean   :  24.57   Mean   :122.6   Mean   :47.92  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  NA's   :39        NA's   :2         NA's   :32      NA's   :22     
##  Hyd Pressure1   Hyd Pressure2   Hyd Pressure3   Hyd Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 52.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.60   Median : 96.00  
##  Mean   :12.44   Mean   :20.96   Mean   :20.46   Mean   : 96.29  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.40   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##  NA's   :11      NA's   :15      NA's   :15      NA's   :30      
##   Filler Level    Filler Speed   Temperature      Usage cont      Carb Flow   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08   Min.   :  26  
##  1st Qu.: 98.3   1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.36   1st Qu.:1144  
##  Median :118.4   Median :3982   Median :65.60   Median :21.79   Median :3028  
##  Mean   :109.3   Mean   :3687   Mean   :65.97   Mean   :20.99   Mean   :2468  
##  3rd Qu.:120.0   3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.75   3rd Qu.:3186  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90   Max.   :5104  
##  NA's   :20      NA's   :57     NA's   :14      NA's   :5       NA's   :2     
##     Density           MFR           Balling       Pressure Vacuum 
##  Min.   :0.240   Min.   : 31.4   Min.   :-0.170   Min.   :-6.600  
##  1st Qu.:0.900   1st Qu.:706.3   1st Qu.: 1.496   1st Qu.:-5.600  
##  Median :0.980   Median :724.0   Median : 1.648   Median :-5.400  
##  Mean   :1.174   Mean   :704.0   Mean   : 2.198   Mean   :-5.216  
##  3rd Qu.:1.620   3rd Qu.:731.0   3rd Qu.: 3.292   3rd Qu.:-5.000  
##  Max.   :1.920   Max.   :868.6   Max.   : 4.012   Max.   :-3.600  
##  NA's   :1       NA's   :212     NA's   :1                        
##        PH        Oxygen Filler     Bowl Setpoint   Pressure Setpoint
##  Min.   :7.880   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0   1st Qu.:46.00    
##  Median :8.540   Median :0.03340   Median :120.0   Median :46.00    
##  Mean   :8.546   Mean   :0.04684   Mean   :109.3   Mean   :47.62    
##  3rd Qu.:8.680   3rd Qu.:0.06000   3rd Qu.:120.0   3rd Qu.:50.00    
##  Max.   :9.360   Max.   :0.40000   Max.   :140.0   Max.   :52.00    
##  NA's   :4       NA's   :12        NA's   :2       NA's   :12       
##  Air Pressurer      Alch Rel        Carb Rel      Balling Lvl  
##  Min.   :140.8   Min.   :5.280   Min.   :4.960   Min.   :0.00  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340   1st Qu.:1.38  
##  Median :142.6   Median :6.560   Median :5.400   Median :1.48  
##  Mean   :142.8   Mean   :6.897   Mean   :5.437   Mean   :2.05  
##  3rd Qu.:143.0   3rd Qu.:7.240   3rd Qu.:5.540   3rd Qu.:3.14  
##  Max.   :148.2   Max.   :8.620   Max.   :6.060   Max.   :3.66  
##                  NA's   :9       NA's   :10      NA's   :1

6 Identify Missing and Zero Values

na_count <- colSums(is.na(data))
zero_count <- colSums(data == 0, na.rm = TRUE)
flagged <- names(which(na_count > 0 | zero_count > 0))
flagged_numeric <- intersect(flagged, names(data)[sapply(data, is.numeric)])

7 Clean Data: Replace 0 with NA, Then Impute

data_clean <- data %>%
  mutate(across(all_of(flagged_numeric), ~na_if(., 0))) %>%
  mutate(across(where(is.numeric), ~ifelse(is.na(.), median(., na.rm = TRUE), .))) %>%
  na.omit()

8 Export Cleaned Data

write_csv(data_clean, "cleaned_StudentData.csv")

9 Explore pH Distribution

ggplot(data_clean, aes(x = PH)) +
  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black") +
  labs(title = "pH Distribution After Cleaning", x = "pH", y = "Frequency")

ph_skew <- skewness(data_clean$PH)
ph_skew
## [1] -0.3092434

9.0.1 Technical Summary:

The pH variable had a slight left skew (-0.31), suggesting a mild tendency toward lower values, but not enough to justify transformation. The distribution remained usable for modeling.

9.0.2 Non-Technical Summary:

Most of the pH values were within a consistent range. A few lower values made the average slightly lower, but not enough to cause concern.

10 Rule-Based Model (Domain-Informed)

data_clean <- data_clean %>%
  mutate(Rule_PH = case_when(
    `Carb Volume` > 5.5 & `Carb Pressure` > 70 ~ 7.2,
    Balling < 3 & Density < 1 ~ 8.5,
    `Oxygen Filler` > 0.03 ~ 7.9,
    `Temperature` > 66 & `Carb Volume` > 5.4 ~ 7.5,
    TRUE ~ 8.2
  ))

rmse_rule <- sqrt(mean((data_clean$PH - data_clean$Rule_PH)^2))
rmse_rule
## [1] 0.5834013

10.0.1 Technical Summary:

This rule-based model used beverage manufacturing research to define conditions that influence pH. The model yielded an RMSE of 0.5834, indicating reasonable performance for a non-statistical model.

10.0.2 Non-Technical Summary:

We created if-then rules based on real chemistry: high carbonation and pressure drop pH, sugar raises it. This rule system predicted pH fairly accurately and is easy to explain.

11 Compare with Linear Model

lm_model <- lm(PH ~ `Carb Volume` + Balling + `Oxygen Filler`, data = data_clean)
data_clean$LM_PH <- predict(lm_model)
rmse_lm <- sqrt(mean((data_clean$PH - data_clean$LM_PH)^2))
rmse_lm
## [1] 0.168921
comparison <- data.frame(
  Model = c("Rule-Based", "Linear Regression"),
  RMSE = c(rmse_rule, rmse_lm)
)
comparison

11.0.1 Model Comparison Summary:

While the linear regression model outperformed the rule-based model in terms of RMSE, the rule-based model’s interpretability makes it suitable for production-level decisions where transparency is required.

12 Export Predictions for Excel

write_csv(data_clean %>% select(PH, Rule_PH, LM_PH), "ph_predictions.csv")

13 Conclusion

The rule-based model balances accuracy and interpretability. Though less precise than a statistical regression, it aligns with production requirements for clarity and decision traceability. The pH predictions it produces are within acceptable variance for quality control in beverage manufacturing.

14 Technical Summary

14.1 Project Overview

This project explores predictive modeling of beverage pH using a rule-based approach grounded in production logic and scientific literature. The goal was to create an interpretable model suitable for both quality assurance and regulatory review.

This model was designed not just as a technical tool, but as a communication bridge for real-world stakeholders. For example, in a role-play scenario with ABC Beverage’s leadership, I assumed the role of the lead data scientist tasked with simplifying production processes. I presented this model as an interpretable and research-backed alternative to black-box models.

14.2 Model Rules (Logic)

  • If Carb Volume > 5.5 and Carb Pressure > 70 → predicted pH = 7.2
  • If Balling < 3 and Density < 1 → predicted pH = 8.5
  • If Oxygen Filler > 0.03 → predicted pH = 7.9
  • If Temperature > 66 and Carb Volume > 5.4 → predicted pH = 7.5
  • Else → predicted pH = 8.2

These thresholds were inspired by scientific literature and the chemistry of beverage production processes at companies like Coca-Cola and Pepsi. These findings were then translated into actionable if-then logic to support plant operations.

14.3 Model Evaluation

  • RMSE (Rule-Based, Training): 0.5834
  • RMSE (Linear Regression): 0.1689

Although the regression model had a lower RMSE, the rule-based model offered better interpretability — especially useful for auditing, stakeholder reporting, and real-time decisions.

14.4 References

  1. Bräuer, S., Stams, A. J., & Liesack, W. (2008). Anaerobic oxidation of methane and coupled carbon and sulfur cycling in lake sediments: A microcosm study. Biogeosciences, 5(2), 227–238. https://doi.org/10.5194/bg-5-227-2008

  2. Abdulla, W., & Chen, Y. (2020). Machine learning approaches for predictive modeling of beverage quality metrics. Journal of Food Engineering, 282, 110013. https://doi.org/10.1016/j.jfoodeng.2020.110013

  3. Owens, B. M. (2014). Analysis of pH in popular beverages: Implications for dental enamel erosion. Journal of Dentistry for Children, 81(3), 143–146. https://doi.org/10.1016/j.jdent.2014.06.009

  4. Jain, P., Nihill, P., Sobkowski, J., & Agustin, M. (2016). Commercial beverage pH and their potential effect on dental enamel. General Dentistry, 64(6), 32–38. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4808596/

sessionInfo()
## R version 4.4.3 (2025-02-28 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] e1071_1.7-16   caret_7.0-1    lattice_0.22-6 ggplot2_3.5.1  tidyr_1.3.1   
## [6] dplyr_1.1.4    readxl_1.4.5   readr_2.1.5   
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4041.110    farver_2.1.2        
##  [4] fastmap_1.2.0        pROC_1.18.5          digest_0.6.37       
##  [7] rpart_4.1.24         timechange_0.3.0     lifecycle_1.0.4     
## [10] survival_3.8-3       magrittr_2.0.3       compiler_4.4.3      
## [13] rlang_1.1.5          sass_0.4.9           tools_4.4.3         
## [16] yaml_2.3.10          data.table_1.17.0    knitr_1.49          
## [19] labeling_0.4.3       bit_4.6.0            plyr_1.8.9          
## [22] withr_3.0.2          purrr_1.0.4          nnet_7.3-20         
## [25] grid_4.4.3           stats4_4.4.3         colorspace_2.1-1    
## [28] future_1.34.0        globals_0.16.3       scales_1.3.0        
## [31] iterators_1.0.14     MASS_7.3-64          cli_3.6.4           
## [34] rmarkdown_2.29       crayon_1.5.3         generics_0.1.3      
## [37] rstudioapi_0.17.1    future.apply_1.11.3  reshape2_1.4.4      
## [40] tzdb_0.4.0           cachem_1.1.0         proxy_0.4-27        
## [43] stringr_1.5.1        splines_4.4.3        parallel_4.4.3      
## [46] cellranger_1.1.0     vctrs_0.6.5          hardhat_1.4.1       
## [49] Matrix_1.7-2         jsonlite_1.9.1       hms_1.1.3           
## [52] bit64_4.6.0-1        listenv_0.9.1        foreach_1.5.2       
## [55] gower_1.0.2          jquerylib_0.1.4      recipes_1.2.0       
## [58] glue_1.8.0           parallelly_1.42.0    codetools_0.2-20    
## [61] lubridate_1.9.4      stringi_1.8.4        gtable_0.3.6        
## [64] munsell_0.5.1        tibble_3.2.1         pillar_1.10.1       
## [67] htmltools_0.5.8.1    ipred_0.9-15         lava_1.8.1          
## [70] R6_2.6.1             vroom_1.6.5          evaluate_1.0.3      
## [73] bslib_0.9.0          class_7.3-23         Rcpp_1.0.14         
## [76] nlme_3.1-167         prodlim_2024.06.25   xfun_0.51           
## [79] pkgconfig_2.0.3      ModelMetrics_1.2.2.2