1 Executive Summary

Note: This is the technical report. For business-friendly recommendations, please refer to the separate “Non-Technical Report for Leadership.”

1.1 Project Objective

New regulations require ABC Beverage to understand manufacturing factors that control pH and provide a working predictive model. We have historical data from 2,567 production runs and must predict pH for 267 new runs.

1.2 Key Findings

  • Random Forest selected as final model (RMSE = 0.0921, MAE = 0.0685, R² = 0.726)
  • Top 3 predictors: Manufacturing Flow, Bowl Setpoint, Filler Level
  • Brand matters: Significant pH differences across brands (p < 0.001)
  • Predictions delivered: Excel file with 90% confidence intervals (±0.1514 pH)

1.3 Load Training and Testing data

# Set working directory (adjust to your path)
# setwd("your/folder/path")
# Read the training data
train_data <- read_excel("StudentData.xlsx")
test_data <- read_excel("StudentEvaluation.xlsx")

1.3.1 Training Data

# Check structure of training data
cat("=== TRAINING DATA ===\n")
## === TRAINING DATA ===
cat("Dimensions:", dim(train_data), "\n")
## Dimensions: 2571 33
cat("\nColumn names:\n")
## 
## Column names:
print(names(train_data))
##  [1] "Brand Code"        "Carb Volume"       "Fill Ounces"      
##  [4] "PC Volume"         "Carb Pressure"     "Carb Temp"        
##  [7] "PSC"               "PSC Fill"          "PSC CO2"          
## [10] "Mnf Flow"          "Carb Pressure1"    "Fill Pressure"    
## [13] "Hyd Pressure1"     "Hyd Pressure2"     "Hyd Pressure3"    
## [16] "Hyd Pressure4"     "Filler Level"      "Filler Speed"     
## [19] "Temperature"       "Usage cont"        "Carb Flow"        
## [22] "Density"           "MFR"               "Balling"          
## [25] "Pressure Vacuum"   "PH"                "Oxygen Filler"    
## [28] "Bowl Setpoint"     "Pressure Setpoint" "Air Pressurer"    
## [31] "Alch Rel"          "Carb Rel"          "Balling Lvl"
cat("\nFirst few rows:\n")
## 
## First few rows:
print(head(train_data,3))
## # A tibble: 3 × 33
##   `Brand Code` `Carb Volume` `Fill Ounces` `PC Volume` `Carb Pressure`
##   <chr>                <dbl>         <dbl>       <dbl>           <dbl>
## 1 B                     5.34          24.0       0.263            68.2
## 2 A                     5.43          24.0       0.239            68.4
## 3 B                     5.29          24.1       0.263            70.8
## # ℹ 28 more variables: `Carb Temp` <dbl>, PSC <dbl>, `PSC Fill` <dbl>,
## #   `PSC CO2` <dbl>, `Mnf Flow` <dbl>, `Carb Pressure1` <dbl>,
## #   `Fill Pressure` <dbl>, `Hyd Pressure1` <dbl>, `Hyd Pressure2` <dbl>,
## #   `Hyd Pressure3` <dbl>, `Hyd Pressure4` <dbl>, `Filler Level` <dbl>,
## #   `Filler Speed` <dbl>, Temperature <dbl>, `Usage cont` <dbl>,
## #   `Carb Flow` <dbl>, Density <dbl>, MFR <dbl>, Balling <dbl>,
## #   `Pressure Vacuum` <dbl>, PH <dbl>, `Oxygen Filler` <dbl>, …
cat("\nSummary of training data:\n")
## 
## Summary of training data:
print(summary(train_data))
##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:2571        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23917  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27712  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##                     NA's   :10      NA's   :38      NA's   :39       
##  Carb Pressure     Carb Temp          PSC             PSC Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08457   Mean   :0.1954  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##  NA's   :27      NA's   :26      NA's   :33        NA's   :23      
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  65.20   Median :123.2   Median :46.40  
##  Mean   :0.05641   Mean   :  24.57   Mean   :122.6   Mean   :47.92  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  NA's   :39        NA's   :2         NA's   :32      NA's   :22     
##  Hyd Pressure1   Hyd Pressure2   Hyd Pressure3   Hyd Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 52.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.60   Median : 96.00  
##  Mean   :12.44   Mean   :20.96   Mean   :20.46   Mean   : 96.29  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.40   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##  NA's   :11      NA's   :15      NA's   :15      NA's   :30      
##   Filler Level    Filler Speed   Temperature      Usage cont      Carb Flow   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08   Min.   :  26  
##  1st Qu.: 98.3   1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.36   1st Qu.:1144  
##  Median :118.4   Median :3982   Median :65.60   Median :21.79   Median :3028  
##  Mean   :109.3   Mean   :3687   Mean   :65.97   Mean   :20.99   Mean   :2468  
##  3rd Qu.:120.0   3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.75   3rd Qu.:3186  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90   Max.   :5104  
##  NA's   :20      NA's   :57     NA's   :14      NA's   :5       NA's   :2     
##     Density           MFR           Balling       Pressure Vacuum 
##  Min.   :0.240   Min.   : 31.4   Min.   :-0.170   Min.   :-6.600  
##  1st Qu.:0.900   1st Qu.:706.3   1st Qu.: 1.496   1st Qu.:-5.600  
##  Median :0.980   Median :724.0   Median : 1.648   Median :-5.400  
##  Mean   :1.174   Mean   :704.0   Mean   : 2.198   Mean   :-5.216  
##  3rd Qu.:1.620   3rd Qu.:731.0   3rd Qu.: 3.292   3rd Qu.:-5.000  
##  Max.   :1.920   Max.   :868.6   Max.   : 4.012   Max.   :-3.600  
##  NA's   :1       NA's   :212     NA's   :1                        
##        PH        Oxygen Filler     Bowl Setpoint   Pressure Setpoint
##  Min.   :7.880   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0   1st Qu.:46.00    
##  Median :8.540   Median :0.03340   Median :120.0   Median :46.00    
##  Mean   :8.546   Mean   :0.04684   Mean   :109.3   Mean   :47.62    
##  3rd Qu.:8.680   3rd Qu.:0.06000   3rd Qu.:120.0   3rd Qu.:50.00    
##  Max.   :9.360   Max.   :0.40000   Max.   :140.0   Max.   :52.00    
##  NA's   :4       NA's   :12        NA's   :2       NA's   :12       
##  Air Pressurer      Alch Rel        Carb Rel      Balling Lvl  
##  Min.   :140.8   Min.   :5.280   Min.   :4.960   Min.   :0.00  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340   1st Qu.:1.38  
##  Median :142.6   Median :6.560   Median :5.400   Median :1.48  
##  Mean   :142.8   Mean   :6.897   Mean   :5.437   Mean   :2.05  
##  3rd Qu.:143.0   3rd Qu.:7.240   3rd Qu.:5.540   3rd Qu.:3.14  
##  Max.   :148.2   Max.   :8.620   Max.   :6.060   Max.   :3.66  
##                  NA's   :9       NA's   :10      NA's   :1

Key Insights from Training Data:

  • pH is numeric in training data (range: 7.88 to 9.36, mean ~8.55) — this is our target variable

  • Dataset have missing values that need handling before modeling

  • 33 predictor variables available (excluding Brand Code which is categorical)

1.3.2 Test Data

cat("\n\n=== TEST DATA ===\n")
## 
## 
## === TEST DATA ===
cat("Dimensions:", dim(test_data), "\n")
## Dimensions: 267 33
cat("\nColumn names:\n")
## 
## Column names:
print(names(test_data))
##  [1] "Brand Code"        "Carb Volume"       "Fill Ounces"      
##  [4] "PC Volume"         "Carb Pressure"     "Carb Temp"        
##  [7] "PSC"               "PSC Fill"          "PSC CO2"          
## [10] "Mnf Flow"          "Carb Pressure1"    "Fill Pressure"    
## [13] "Hyd Pressure1"     "Hyd Pressure2"     "Hyd Pressure3"    
## [16] "Hyd Pressure4"     "Filler Level"      "Filler Speed"     
## [19] "Temperature"       "Usage cont"        "Carb Flow"        
## [22] "Density"           "MFR"               "Balling"          
## [25] "Pressure Vacuum"   "PH"                "Oxygen Filler"    
## [28] "Bowl Setpoint"     "Pressure Setpoint" "Air Pressurer"    
## [31] "Alch Rel"          "Carb Rel"          "Balling Lvl"
cat("\nFirst few rows:\n")
## 
## First few rows:
print(head(test_data,3))
## # A tibble: 3 × 33
##   `Brand Code` `Carb Volume` `Fill Ounces` `PC Volume` `Carb Pressure`
##   <chr>                <dbl>         <dbl>       <dbl>           <dbl>
## 1 D                     5.48          24.0       0.27             65.4
## 2 A                     5.39          24.0       0.227            63.2
## 3 B                     5.29          23.9       0.303            66.4
## # ℹ 28 more variables: `Carb Temp` <dbl>, PSC <dbl>, `PSC Fill` <dbl>,
## #   `PSC CO2` <dbl>, `Mnf Flow` <dbl>, `Carb Pressure1` <dbl>,
## #   `Fill Pressure` <dbl>, `Hyd Pressure1` <dbl>, `Hyd Pressure2` <dbl>,
## #   `Hyd Pressure3` <dbl>, `Hyd Pressure4` <dbl>, `Filler Level` <dbl>,
## #   `Filler Speed` <dbl>, Temperature <dbl>, `Usage cont` <dbl>,
## #   `Carb Flow` <dbl>, Density <dbl>, MFR <dbl>, Balling <dbl>,
## #   `Pressure Vacuum` <dbl>, PH <lgl>, `Oxygen Filler` <dbl>, …
cat("\nSummary of test data:\n")
## 
## Summary of test data:
print(summary(test_data))
##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:267         Min.   :5.147   Min.   :23.75   Min.   :0.09867  
##  Class :character   1st Qu.:5.287   1st Qu.:23.92   1st Qu.:0.23333  
##  Mode  :character   Median :5.340   Median :23.97   Median :0.27533  
##                     Mean   :5.369   Mean   :23.97   Mean   :0.27769  
##                     3rd Qu.:5.465   3rd Qu.:24.01   3rd Qu.:0.32200  
##                     Max.   :5.667   Max.   :24.20   Max.   :0.46400  
##                     NA's   :1       NA's   :6       NA's   :4        
##  Carb Pressure     Carb Temp          PSC             PSC Fill     
##  Min.   :60.20   Min.   :130.0   Min.   :0.00400   Min.   :0.0200  
##  1st Qu.:65.30   1st Qu.:138.4   1st Qu.:0.04450   1st Qu.:0.1000  
##  Median :68.00   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.25   Mean   :141.2   Mean   :0.08545   Mean   :0.1903  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :77.60   Max.   :154.0   Max.   :0.24600   Max.   :0.6200  
##                  NA's   :1       NA's   :5         NA's   :3       
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :113.0   Min.   :37.80  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:120.2   1st Qu.:46.00  
##  Median :0.04000   Median :   0.20   Median :123.4   Median :47.80  
##  Mean   :0.05107   Mean   :  21.03   Mean   :123.0   Mean   :48.14  
##  3rd Qu.:0.06000   3rd Qu.: 141.30   3rd Qu.:125.5   3rd Qu.:50.20  
##  Max.   :0.24000   Max.   : 220.40   Max.   :136.0   Max.   :60.20  
##  NA's   :5                           NA's   :4       NA's   :2      
##  Hyd Pressure1    Hyd Pressure2    Hyd Pressure3    Hyd Pressure4   
##  Min.   :-50.00   Min.   :-50.00   Min.   :-50.00   Min.   : 68.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.: 90.00  
##  Median : 10.40   Median : 26.80   Median : 27.70   Median : 98.00  
##  Mean   : 12.01   Mean   : 20.11   Mean   : 19.61   Mean   : 97.84  
##  3rd Qu.: 20.40   3rd Qu.: 34.80   3rd Qu.: 33.00   3rd Qu.:104.00  
##  Max.   : 50.00   Max.   : 61.40   Max.   : 49.20   Max.   :140.00  
##                   NA's   :1        NA's   :1        NA's   :4       
##   Filler Level    Filler Speed   Temperature      Usage cont      Carb Flow   
##  Min.   : 69.2   Min.   :1006   Min.   :63.80   Min.   :12.90   Min.   :   0  
##  1st Qu.:100.6   1st Qu.:3812   1st Qu.:65.40   1st Qu.:18.12   1st Qu.:1083  
##  Median :118.6   Median :3978   Median :65.80   Median :21.44   Median :3038  
##  Mean   :110.3   Mean   :3581   Mean   :66.23   Mean   :20.90   Mean   :2409  
##  3rd Qu.:120.2   3rd Qu.:3996   3rd Qu.:66.60   3rd Qu.:23.74   3rd Qu.:3215  
##  Max.   :153.2   Max.   :4020   Max.   :75.40   Max.   :24.60   Max.   :3858  
##  NA's   :2       NA's   :10     NA's   :2       NA's   :2                     
##     Density           MFR           Balling      Pressure Vacuum 
##  Min.   :0.060   Min.   : 15.6   Min.   :0.902   Min.   :-6.400  
##  1st Qu.:0.920   1st Qu.:707.0   1st Qu.:1.498   1st Qu.:-5.600  
##  Median :0.980   Median :724.6   Median :1.648   Median :-5.200  
##  Mean   :1.177   Mean   :697.8   Mean   :2.203   Mean   :-5.174  
##  3rd Qu.:1.600   3rd Qu.:731.5   3rd Qu.:3.242   3rd Qu.:-4.800  
##  Max.   :1.840   Max.   :784.8   Max.   :3.788   Max.   :-3.600  
##  NA's   :1       NA's   :31      NA's   :1       NA's   :1       
##     PH          Oxygen Filler     Bowl Setpoint   Pressure Setpoint
##  Mode:logical   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  NA's:267       1st Qu.:0.01960   1st Qu.:100.0   1st Qu.:46.00    
##                 Median :0.03370   Median :120.0   Median :46.00    
##                 Mean   :0.04666   Mean   :109.6   Mean   :47.73    
##                 3rd Qu.:0.05440   3rd Qu.:120.0   3rd Qu.:50.00    
##                 Max.   :0.39800   Max.   :130.0   Max.   :52.00    
##                 NA's   :3         NA's   :1       NA's   :2        
##  Air Pressurer      Alch Rel        Carb Rel     Balling Lvl   
##  Min.   :141.2   Min.   :6.400   Min.   :5.18   Min.   :0.000  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.34   1st Qu.:1.380  
##  Median :142.6   Median :6.580   Median :5.40   Median :1.480  
##  Mean   :142.8   Mean   :6.907   Mean   :5.44   Mean   :2.051  
##  3rd Qu.:142.8   3rd Qu.:7.180   3rd Qu.:5.56   3rd Qu.:3.080  
##  Max.   :147.2   Max.   :7.820   Max.   :5.74   Max.   :3.420  
##  NA's   :1       NA's   :3       NA's   :2

1.3.3 Check missing values

# Check for missing values in both datasets
cat("\n=== MISSING VALUES ===\n")
## 
## === MISSING VALUES ===
cat("Training data missing values per column:\n")
## Training data missing values per column:
print(colSums(is.na(train_data)))
##        Brand Code       Carb Volume       Fill Ounces         PC Volume 
##               120                10                38                39 
##     Carb Pressure         Carb Temp               PSC          PSC Fill 
##                27                26                33                23 
##           PSC CO2          Mnf Flow    Carb Pressure1     Fill Pressure 
##                39                 2                32                22 
##     Hyd Pressure1     Hyd Pressure2     Hyd Pressure3     Hyd Pressure4 
##                11                15                15                30 
##      Filler Level      Filler Speed       Temperature        Usage cont 
##                20                57                14                 5 
##         Carb Flow           Density               MFR           Balling 
##                 2                 1               212                 1 
##   Pressure Vacuum                PH     Oxygen Filler     Bowl Setpoint 
##                 0                 4                12                 2 
## Pressure Setpoint     Air Pressurer          Alch Rel          Carb Rel 
##                12                 0                 9                10 
##       Balling Lvl 
##                 1
cat("\nTest data missing values per column:\n")
## 
## Test data missing values per column:
print(colSums(is.na(test_data)))
##        Brand Code       Carb Volume       Fill Ounces         PC Volume 
##                 8                 1                 6                 4 
##     Carb Pressure         Carb Temp               PSC          PSC Fill 
##                 0                 1                 5                 3 
##           PSC CO2          Mnf Flow    Carb Pressure1     Fill Pressure 
##                 5                 0                 4                 2 
##     Hyd Pressure1     Hyd Pressure2     Hyd Pressure3     Hyd Pressure4 
##                 0                 1                 1                 4 
##      Filler Level      Filler Speed       Temperature        Usage cont 
##                 2                10                 2                 2 
##         Carb Flow           Density               MFR           Balling 
##                 0                 1                31                 1 
##   Pressure Vacuum                PH     Oxygen Filler     Bowl Setpoint 
##                 1               267                 3                 1 
## Pressure Setpoint     Air Pressurer          Alch Rel          Carb Rel 
##                 2                 1                 3                 2 
##       Balling Lvl 
##                 0

1.4 Problem Understanding

1.4.1 Business Goal:

ABC Beverage need to comply with new regulations requiring you to:

  • Understand your manufacturing process and which factors predict pH
  • Report a predictive model for pH to leadership

1.4.2 Data Situation:

  • Training data (StudentData.xlsx): Has historical manufacturing measurements INCLUDING pH values (it contains pH as numeric)
  • Test data (StudentEvaluation.xlsx): Has the SAME manufacturing measurements BUT pH is empty/logical (all NA values) — this is what we need to predict

Challenge: Missing values in BOTH datasets across many columns (Brand Code has 120 NAs in training, MFR has 212 NAs, etc.)

1.5 Data preprocessing

# Create a copy for preprocessing
train_clean <- train_data
test_clean <- test_data

# IDENTIFY COLUMN TYPES
cat("=== COLUMN TYPES IN TRAINING ===\n")
## === COLUMN TYPES IN TRAINING ===
col_types <- sapply(train_clean, class)
print(table(col_types))
## col_types
## character   numeric 
##         1        32
# Separate features and target in training
# Remove rows where PH is missing (only 4 NAs in training)
train_clean <- train_clean %>% filter(!is.na(PH))
cat("\nTraining rows after removing NA in PH:", nrow(train_clean), "\n")
## 
## Training rows after removing NA in PH: 2567
# For test data, PH is all NA - we'll keep it as placeholder
# But we need to ensure test has same structure as training features

# Handle Brand Code as factor (categorical)
train_clean$`Brand Code` <- as.factor(train_clean$`Brand Code`)
test_clean$`Brand Code` <- as.factor(test_clean$`Brand Code`)

# Check levels match
cat("\nBrand Code levels in training:", levels(train_clean$`Brand Code`), "\n")
## 
## Brand Code levels in training: A B C D
cat("Brand Code levels in test:", levels(test_clean$`Brand Code`), "\n")
## Brand Code levels in test: A B C D
# MISSING VALUE ANALYSIS
cat("\n=== MISSING VALUE SUMMARY ===\n")
## 
## === MISSING VALUE SUMMARY ===
missing_train <- sort(colSums(is.na(train_clean)), decreasing = TRUE)
missing_train <- missing_train[missing_train > 0]
cat("Columns with missing values in training:\n")
## Columns with missing values in training:
print(missing_train)
##               MFR        Brand Code      Filler Speed         PC Volume 
##               208               120                54                39 
##           PSC CO2       Fill Ounces               PSC    Carb Pressure1 
##                39                38                33                32 
##     Hyd Pressure4     Carb Pressure         Carb Temp          PSC Fill 
##                28                27                26                23 
##     Fill Pressure      Filler Level     Hyd Pressure2     Hyd Pressure3 
##                18                16                15                15 
##       Temperature Pressure Setpoint     Hyd Pressure1     Oxygen Filler 
##                12                12                11                11 
##       Carb Volume          Carb Rel          Alch Rel        Usage cont 
##                10                 8                 7                 5 
##         Carb Flow     Bowl Setpoint       Balling Lvl 
##                 2                 2                 1
# Identify columns with >30% missing in training
high_missing <- names(missing_train[missing_train > 0.3 * nrow(train_clean)])
cat("\nColumns with >30% missing:", high_missing, "\n")
## 
## Columns with >30% missing:
# For now, let's remove columns with >50% missing
threshold <- 0.5 * nrow(train_clean)
cols_to_remove <- names(missing_train[missing_train > threshold])
cat("Columns to remove (>50% missing):", cols_to_remove, "\n")
## Columns to remove (>50% missing):
# Remove these columns from both datasets
if(length(cols_to_remove) > 0) {
  train_clean <- train_clean %>% select(-one_of(cols_to_remove))
  test_clean <- test_clean %>% select(-one_of(cols_to_remove))
}

1.5.1 Handle Missing Data with Imputation

# For remaining missing values, use median/mode imputation
# Separate numeric and character columns
numeric_cols <- names(train_clean)[sapply(train_clean, is.numeric)]
factor_cols <- names(train_clean)[sapply(train_clean, is.factor)]

# Impute numeric columns with median
for(col in numeric_cols) {
  if(any(is.na(train_clean[[col]]))) {
    med_val <- median(train_clean[[col]], na.rm = TRUE)
    train_clean[[col]][is.na(train_clean[[col]])] <- med_val
  }
  if(any(is.na(test_clean[[col]]))) {
    med_val <- median(test_clean[[col]], na.rm = TRUE)
    test_clean[[col]][is.na(test_clean[[col]])] <- med_val
  }
}

# Impute factor columns with mode
for(col in factor_cols) {
  if(any(is.na(train_clean[[col]]))) {
    mode_val <- names(sort(table(train_clean[[col]]), decreasing = TRUE))[1]
    train_clean[[col]][is.na(train_clean[[col]])] <- mode_val
  }
  if(any(is.na(test_clean[[col]]))) {
    mode_val <- names(sort(table(test_clean[[col]]), decreasing = TRUE))[1]
    test_clean[[col]][is.na(test_clean[[col]])] <- mode_val
  }
}

# Verify no missing values remain
cat("\n=== MISSING VALUES AFTER CLEANING ===\n")
## 
## === MISSING VALUES AFTER CLEANING ===
cat("Training:", sum(is.na(train_clean)), "\n")
## Training: 0
cat("Test:", sum(is.na(test_clean)), "\n")
## Test: 267

1.5.1.1 Summary of handling missing values

  • Removed target variable NAs: Dropped 4 rows from training where pH was missing, leaving 2,567 complete training records

  • Handled missing values in predictors: Imputed numeric columns with median values and categorical (Brand Code) with mode; no columns exceeded 50% missing, so all 33 original features were retained

  • Aligned datasets: Ensured both training and test data have identical structure with same 33 columns and matching factor levels (A, B, C, D for Brand Code)

1.5.2 Outliers and Anomalies Detection

# Ensure factor
train_clean$Brand_Code <- as.factor(train_clean$'Brand Code')
test_clean$Brand_Code <- as.factor(test_clean$'Brand Code')

# 1. OUTLIER DETECTION using IQR method
cat("=== OUTLIER ANALYSIS ===\n")
## === OUTLIER ANALYSIS ===
# Get numeric columns (exclude Brand Code and PH)
numeric_cols <- names(train_clean)[sapply(train_clean, is.numeric)]
numeric_cols <- numeric_cols[!numeric_cols %in% c("PH")]

outlier_summary <- data.frame(
  Variable = character(),
  Outliers_Count = integer(),
  Outliers_Percent = numeric(),
  stringsAsFactors = FALSE
)

for(col in numeric_cols) {
  Q1 <- quantile(train_clean[[col]], 0.25, na.rm = TRUE)
  Q3 <- quantile(train_clean[[col]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  
  outliers <- sum(train_clean[[col]] < lower_bound | train_clean[[col]] > upper_bound, na.rm = TRUE)
  outlier_pct <- (outliers / nrow(train_clean)) * 100
  
  outlier_summary <- rbind(outlier_summary, 
                           data.frame(Variable = col, 
                                      Outliers_Count = outliers, 
                                      Outliers_Percent = round(outlier_pct, 2)))
}

# Show top 10 variables with most outliers
outlier_summary <- outlier_summary[order(-outlier_summary$Outliers_Percent), ]
cat("\nTop 10 variables with highest outlier percentage:\n")
## 
## Top 10 variables with highest outlier percentage:
print(head(outlier_summary, 10))
##           Variable Outliers_Count Outliers_Percent
## 17    Filler Speed            440            17.14
## 22             MFR            279            10.87
## 28   Air Pressurer            220             8.57
## 25   Oxygen Filler            193             7.52
## 24 Pressure Vacuum            125             4.87
## 18     Temperature            122             4.75
## 3        PC Volume             95             3.70
## 15   Hyd Pressure4             82             3.19
## 11   Fill Pressure             79             3.08
## 8          PSC CO2             77             3.00

1.5.2.1 Data Quality Findings

1.5.2.2 Top 3 variables with most outliers:

  • Filler Speed - 17.1% outliers (440 out of 2,567 rows)
  • MFR - 10.9% outliers (279 rows)
  • Air Pressurer - 8.6% outliers (220 rows)

Key insight: These are not data entry errors — they represent legitimate manufacturing variation. Filler speed extremes likely occur during production ramp-up/down or different product runs.

1.5.2.3 Recommendation

**Lets Keep outliers but we will document them. Why?

  • Manufacturing processes naturally have extreme values (startup, shutdown, different brands)

  • Removing 17% of data would bias our model

  • These extremes help predict pH across full operating range

1.5.3 Check Negative and zero values

# 2. CHECK FOR IMPOSSIBLE/NONSENSICAL VALUES
cat("\n=== LOGICAL CONSISTENCY CHECKS ===\n")
## 
## === LOGICAL CONSISTENCY CHECKS ===
# Check for negative pressures (should be impossible)
pressure_cols <- grep("Pressure", names(train_clean), value = TRUE)
for(col in pressure_cols) {
  negatives <- sum(train_clean[[col]] < 0, na.rm = TRUE)
  if(negatives > 0) {
    cat(col, ": ", negatives, "negative values (range:", min(train_clean[[col]]), "to", max(train_clean[[col]]), ")\n")
  }
}
## Hyd Pressure1 :  38 negative values (range: -0.8 to 58 )
## Hyd Pressure3 :  70 negative values (range: -1.2 to 50 )
## Pressure Vacuum :  2567 negative values (range: -6.6 to -3.6 )
# Check for zero or negative in volume/flow measurements
volume_cols <- c("Carb.Volume", "Fill.Ounces", "PC.Volume", "Carb.Flow")
for(col in volume_cols) {
  if(col %in% names(train_clean)) {
    zeros_or_neg <- sum(train_clean[[col]] <= 0, na.rm = TRUE)
    if(zeros_or_neg > 0) {
      cat(col, ": ", zeros_or_neg, "values <= 0\n")
    }
  }
}

# 3. CHECK DATA TYPES AND FORMATS
cat("\n=== DATA TYPE VERIFICATION ===\n")
## 
## === DATA TYPE VERIFICATION ===
cat("pH range in training:", range(train_clean$PH), "\n")
## pH range in training: 7.88 9.36
cat("pH should be between 0-14. All values valid:", 
    all(train_clean$PH >= 0 & train_clean$PH <= 14), "\n")
## pH should be between 0-14. All values valid: TRUE
# Check for duplicate rows
duplicates <- sum(duplicated(train_clean[, !names(train_clean) %in% "PH"]))
cat("Duplicate rows (excluding pH):", duplicates, "\n")
## Duplicate rows (excluding pH): 0

1.5.3.1 Key Findings from Quality Checks

Negative values detected :

  • Hyd Pressure1: 38 negative values (down to -0.8)

  • Hyd Pressure3: 70 negative values (down to -1.2)

_ Pressure Vacuum: ALL values negative (-6.6 to -3.6) — this is correct (vacuum = negative pressure)

No duplicate rows — data is clean

pH range: 7.88 to 9.36 (normal for beverages)

1.5.4 Save clean Datasets

# Save cleaned data
write.csv(train_clean, "training_data_cleaned.csv", row.names = FALSE)
write.csv(test_clean, "test_data_cleaned.csv", row.names = FALSE)

cat("\n=== CLEANING COMPLETE ===\n")
## 
## === CLEANING COMPLETE ===
cat("Final training dimensions:", dim(train_clean), "\n")
## Final training dimensions: 2567 34
cat("Final test dimensions:", dim(test_clean), "\n")
## Final test dimensions: 267 34

1.6 Exploratory Data Analysis

1.6.1 Correlation Matrix for pH Prediction

# EDA: Correlation Matrix for pH Prediction

# Ensure Brand Code is factor
train_clean$`Brand Code` <- as.factor(train_clean$`Brand Code`)

# Calculate correlations with pH
numeric_cols <- names(train_clean)[sapply(train_clean, is.numeric)]
target_correlations <- data.frame(
  Variable = character(),
  Correlation = numeric(),
  stringsAsFactors = FALSE
)

for(col in numeric_cols) {
  if(col != "PH") {
    cor_val <- cor(train_clean[[col]], train_clean$PH, use = "complete.obs")
    target_correlations <- rbind(target_correlations, 
                                  data.frame(Variable = col, 
                                             Correlation = round(cor_val, 3)))
  }
}

# Sort by absolute correlation
target_correlations <- target_correlations[order(-abs(target_correlations$Correlation)), ]

cat("=== TOP 15 VARIABLES MOST CORRELATED WITH pH ===\n")
## === TOP 15 VARIABLES MOST CORRELATED WITH pH ===
print(head(target_correlations, 15))
##             Variable Correlation
## 9           Mnf Flow      -0.447
## 26     Bowl Setpoint       0.349
## 16      Filler Level       0.322
## 19        Usage cont      -0.318
## 27 Pressure Setpoint      -0.306
## 14     Hyd Pressure3      -0.240
## 24   Pressure Vacuum       0.221
## 11     Fill Pressure      -0.212
## 13     Hyd Pressure2      -0.201
## 25     Oxygen Filler       0.166
## 30          Carb Rel       0.162
## 18       Temperature      -0.158
## 20         Carb Flow       0.156
## 29          Alch Rel       0.147
## 15     Hyd Pressure4      -0.140
# Create correlation matrix for top variables
top_vars <- head(target_correlations$Variable, 15)
top_vars <- c(top_vars, "PH")
cor_matrix <- cor(train_clean[, top_vars], use = "complete.obs")

# Visualize correlation matrix
corrplot(cor_matrix, 
         method = "color", 
         type = "upper",
         tl.cex = 0.8,
         tl.col = "black",
         title = "Top 15 Variable Correlations with pH",
         mar = c(0,0,2,0))

# Bar plot of top correlations with pH
ggplot(head(target_correlations, 15), aes(x = reorder(Variable, abs(Correlation)), 
                                            y = Correlation, 
                                            fill = Correlation > 0)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top 15 Factors Correlated with pH",
       x = "Process Variables",
       y = "Correlation with pH") +
  theme_minimal() +
  scale_fill_manual(values = c("darkred", "darkgreen"), 
                    labels = c("Negative", "Positive"),
                    name = "Direction")

# Summary statistics for top 3 predictors vs pH
top_3 <- head(target_correlations$Variable, 3)
cat("\n=== TOP 3 PREDICTORS SUMMARY ===\n")
## 
## === TOP 3 PREDICTORS SUMMARY ===
for(var in top_3) {
  cat("\n", var, "(Correlation:", target_correlations[target_correlations$Variable == var, "Correlation"], ")\n")
  print(summary(train_clean[[var]]))
}
## 
##  Mnf Flow (Correlation: -0.447 )
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -100.20 -100.00   70.20   24.63  140.80  229.40 
## 
##  Bowl Setpoint (Correlation: 0.349 )
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    70.0   100.0   120.0   109.4   120.0   140.0 
## 
##  Filler Level (Correlation: 0.322 )
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    55.8    98.7   118.4   109.3   120.0   161.2

1.6.1.1 Correlation Analysis Findings

1.6.1.2 Top pH Predictors Identified:

  • Mnf Flow (Manufacturing Flow): Strongest negative correlation (-0.447) — Higher flow rates = Lower pH

  • Bowl Setpoint: Strong positive correlation (0.349) — Higher setpoint = Higher pH

  • Filler Level: Moderate positive correlation (0.322) — Higher fill level = Higher pH

  • Usage cont: Negative correlation (-0.318) — More usage = Lower pH

  • Pressure Setpoint: Negative correlation (-0.306) — Higher pressure = Lower pH

Key Insight: Manufacturing flow rate (Mnf Flow) is the single most important factor controlling pH, followed by bowl setpoint and filler level.

1.6.2 Brand Analysis

# EDA: Brand Analysis - Does pH differ by brand?


# Calculate pH statistics by brand
brand_stats <- train_clean %>%
  group_by(`Brand Code`) %>%
  summarise(
    Count = n(),
    Mean_pH = round(mean(PH), 3),
    SD_pH = round(sd(PH), 3),
    Min_pH = round(min(PH), 3),
    Max_pH = round(max(PH), 3),
    .groups = 'drop'
  )

cat("=== PH STATISTICS BY BRAND ===\n")
## === PH STATISTICS BY BRAND ===
print(brand_stats)
## # A tibble: 4 × 6
##   `Brand Code` Count Mean_pH SD_pH Min_pH Max_pH
##   <fct>        <int>   <dbl> <dbl>  <dbl>  <dbl>
## 1 A              293    8.50 0.163   8.06   8.86
## 2 B             1355    8.56 0.171   8      8.94
## 3 C              304    8.41 0.177   7.88   9.36
## 4 D              615    8.60 0.135   8.16   8.92
# ANOVA test to check if brand differences are significant
anova_result <- aov(PH ~ `Brand Code`, data = train_clean)
cat("\n=== ANOVA RESULTS ===\n")
## 
## === ANOVA RESULTS ===
print(summary(anova_result))
##                Df Sum Sq Mean Sq F value Pr(>F)    
## `Brand Code`    3   8.24  2.7462   103.3 <2e-16 ***
## Residuals    2563  68.13  0.0266                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Visualization: Boxplot of pH by brand
ggplot(train_clean, aes(x = `Brand Code`, y = PH, fill = `Brand Code`)) +
  geom_boxplot() +
  labs(title = "pH Distribution by Brand Code",
       x = "Brand",
       y = "pH Level") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2") +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red")

# Visualization: Density plot
ggplot(train_clean, aes(x = PH, fill = `Brand Code`)) +
  geom_density(alpha = 0.5) +
  labs(title = "pH Density Distribution by Brand",
       x = "pH",
       y = "Density") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2")

1.6.2.1 Brand Analysis Summary

  • Significant pH differences across brands (p-value < 0.001) — Brand D has highest pH (8.60), Brand C lowest (8.41)

  • Brand B has largest volume (1,355 batches) and wide pH range (8.00 to 8.94)

  • Brand C shows most variation (SD = 0.177) with extreme values (7.88 to 9.36)

1.6.3 Interaction Effects

# EDA: Interaction Effects - Do combinations of factors predict pH better?

# Interaction plot: Mnf Flow × Brand on pH
ggplot(train_clean, aes(x = `Mnf Flow`, y = PH, color = `Brand Code`)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Interaction: Manufacturing Flow × Brand on pH",
       x = "Manufacturing Flow (Mnf Flow)",
       y = "pH",
       color = "Brand") +
  theme_minimal()

# Check interaction: Bowl Setpoint × Filler Level
ggplot(train_clean, aes(x = `Bowl Setpoint`, y = PH, color = `Filler Level`)) +
  geom_point(alpha = 0.4) +
  scale_color_gradient(low = "blue", high = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(title = "Interaction: Bowl Setpoint × Filler Level on pH",
       x = "Bowl Setpoint",
       y = "pH",
       color = "Filler Level") +
  theme_minimal()

# Calculate interaction strength
model_main <- lm(PH ~ `Mnf Flow` + `Brand Code`, data = train_clean)
model_interaction <- lm(PH ~ `Mnf Flow` * `Brand Code`, data = train_clean)

cat("=== INTERACTION EFFECT SIGNIFICANCE ===\n")
## === INTERACTION EFFECT SIGNIFICANCE ===
cat("Main effects R²:", round(summary(model_main)$r.squared, 4), "\n")
## Main effects R²: 0.305
cat("With interaction R²:", round(summary(model_interaction)$r.squared, 4), "\n")
## With interaction R²: 0.3227
cat("Improvement:", round(summary(model_interaction)$r.squared - summary(model_main)$r.squared, 4), "\n")
## Improvement: 0.0178
# ANOVA to test if interaction is significant
anova_compare <- anova(model_main, model_interaction)
print(anova_compare)
## Analysis of Variance Table
## 
## Model 1: PH ~ `Mnf Flow` + `Brand Code`
## Model 2: PH ~ `Mnf Flow` * `Brand Code`
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   2562 53.079                                  
## 2   2559 51.723  3    1.3561 22.364 2.723e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

1.6.3.1 Interaction Effects Summary

  • Interaction improves prediction by 1.8% (R² from 0.305 to 0.323) — statistically significant (p < 0.001)
  • Brand affects how Mnf Flow influences pH — relationship differs across brands A, B, C, D
  • Need to include Brand in final model — ignoring it reduces accuracy

1.6.4 pH Distribution Check

# EDA: Skewness Check for pH

# Load library
library(e1071)

# Calculate skewness
skew_value <- skewness(train_clean$PH)
cat("Skewness of pH:", round(skew_value, 4), "\n")
## Skewness of pH: -0.2906
# Interpretation rules of thumb
if(abs(skew_value) < 0.5) {
  cat("Interpretation: Approximately symmetric\n")
} else if(abs(skew_value) < 1) {
  cat("Interpretation: Moderately skewed\n")
} else {
  cat("Interpretation: Highly skewed\n")
}
## Interpretation: Approximately symmetric
# Quick visual: Simple histogram
hist(train_clean$PH, 
     main = "pH Distribution", 
     xlab = "pH",
     col = "lightblue",
     border = "black",
     breaks = 20)

# Add vertical line at mean
abline(v = mean(train_clean$PH), col = "red", lwd = 2, lty = 2)

# Show mean vs median
cat("\nMean:", round(mean(train_clean$PH), 4))
## 
## Mean: 8.5456
cat("\nMedian:", round(median(train_clean$PH), 4))
## 
## Median: 8.54
cat("\nDifference:", round(mean(train_clean$PH) - median(train_clean$PH), 4))
## 
## Difference: 0.0056

1.6.4.1 Skewness Summary

  • pH is approximately symmetric (skewness = -0.29, well within ±0.5 range)

  • Mean and median are nearly identical (8.5456 vs 8.5400, difference only 0.0056)

  • Linear regression is viable — no severe skewness requiring transformation

1.6.5 Multicollinearity Check

# EDA Multicollinearity - Are top predictors correlated with each other?


# Select top 10 predictors from earlier correlation analysis
top_predictors <- c("Mnf Flow", "Bowl Setpoint", "Filler Level", "Usage cont", 
                    "Pressure Setpoint", "Hyd Pressure3", "Pressure Vacuum", 
                    "Fill Pressure", "Hyd Pressure2", "Oxygen Filler")

# Create correlation matrix for top predictors
cor_matrix_top <- cor(train_clean[, top_predictors], use = "complete.obs")

# Visualize
corrplot::corrplot(cor_matrix_top, 
                   method = "number", 
                   type = "upper",
                   tl.cex = 0.8,
                   title = "Multicollinearity Among Top Predictors",
                   mar = c(0,0,2,0))

# Calculate VIF (Variance Inflation Factor)
# Build linear model with all top predictors
vif_model <- lm(PH ~ `Mnf Flow` + `Bowl Setpoint` + `Filler Level` + `Usage cont` + 
                      `Pressure Setpoint` + `Hyd Pressure3` + `Pressure Vacuum` + 
                      `Fill Pressure` + `Hyd Pressure2` + `Oxygen Filler`, 
                data = train_clean)

vif_values <- vif(vif_model)

cat("=== VARIANCE INFLATION FACTOR (VIF) ===\n")
## === VARIANCE INFLATION FACTOR (VIF) ===
cat("Rule of thumb: VIF > 5 indicates high multicollinearity\n")
## Rule of thumb: VIF > 5 indicates high multicollinearity
cat("VIF > 10 indicates severe multicollinearity\n\n")
## VIF > 10 indicates severe multicollinearity
print(data.frame(Variable = names(vif_values), VIF = round(vif_values, 2)))
##                                Variable   VIF
## `Mnf Flow`                   `Mnf Flow`  3.88
## `Bowl Setpoint`         `Bowl Setpoint`  8.80
## `Filler Level`           `Filler Level`  8.85
## `Usage cont`               `Usage cont`  1.41
## `Pressure Setpoint` `Pressure Setpoint`  2.00
## `Hyd Pressure3`         `Hyd Pressure3` 10.72
## `Pressure Vacuum`     `Pressure Vacuum`  1.62
## `Fill Pressure`         `Fill Pressure`  1.95
## `Hyd Pressure2`         `Hyd Pressure2`  7.62
## `Oxygen Filler`         `Oxygen Filler`  1.44
# Check if any VIF > 5
high_vif <- names(vif_values[vif_values > 5])
if(length(high_vif) > 0) {
  cat("\n High multicollinearity detected for:", paste(high_vif, collapse = ", "), "\n")
  cat("Recommendation: Remove or combine these variables\n")
} else {
  cat("\n No concerning multicollinearity (all VIF < 5)\n")
  cat("All top predictors can be used together\n")
}
## 
##  High multicollinearity detected for: `Bowl Setpoint`, `Filler Level`, `Hyd Pressure3`, `Hyd Pressure2` 
## Recommendation: Remove or combine these variables

1.6.5.1 Multicollinearity Summary

  • High correlation detected — Bowl Setpoint (VIF 8.8), Filler Level (8.85), Hyd Pressure3 (10.72), Hyd Pressure2 (7.62) are redundant
  • Impact on modeling — These variables measure similar process conditions; keeping all causes unstable coefficient estimates
  • Recommendation — Select representative variables (e.g., keep Filler Level, drop Hyd Pressure2/3) or use tree-based models (Random Forest/XGBoost) which handle redundancy naturally

1.6.6 Non-linear Relationships

# EDA: Check for non-linear relationships with pH


# Top 3 predictors vs pH with LOESS smooth curves
p1 <- ggplot(train_clean, aes(x = `Mnf Flow`, y = PH)) +
  geom_point(alpha = 0.3, size = 0.8) +
  geom_smooth(method = "loess", color = "red", se = TRUE) +
  geom_smooth(method = "lm", color = "blue", linetype = "dashed", se = FALSE) +
  labs(title = "Mnf Flow vs pH",
       x = "Manufacturing Flow",
       y = "pH") +
  theme_minimal()

p2 <- ggplot(train_clean, aes(x = `Bowl Setpoint`, y = PH)) +
  geom_point(alpha = 0.3, size = 0.8) +
  geom_smooth(method = "loess", color = "red", se = TRUE) +
  geom_smooth(method = "lm", color = "blue", linetype = "dashed", se = FALSE) +
  labs(title = "Bowl Setpoint vs pH",
       x = "Bowl Setpoint",
       y = "pH") +
  theme_minimal()

p3 <- ggplot(train_clean, aes(x = `Filler Level`, y = PH)) +
  geom_point(alpha = 0.3, size = 0.8) +
  geom_smooth(method = "loess", color = "red", se = TRUE) +
  geom_smooth(method = "lm", color = "blue", linetype = "dashed", se = FALSE) +
  labs(title = "Filler Level vs pH",
       x = "Filler Level",
       y = "pH") +
  theme_minimal()

# Arrange plots
library(gridExtra)
grid.arrange(p1, p2, p3, ncol = 3)

# Quick check: Is linear model sufficient?
model_linear <- lm(PH ~ `Mnf Flow`, data = train_clean)
model_poly <- lm(PH ~ poly(`Mnf Flow`, 2), data = train_clean)

cat("=== LINEAR vs POLYNOMIAL COMPARISON ===\n")
## === LINEAR vs POLYNOMIAL COMPARISON ===
cat("Linear R²:", round(summary(model_linear)$r.squared, 4), "\n")
## Linear R²: 0.1998
cat("Polynomial (degree 2) R²:", round(summary(model_poly)$r.squared, 4), "\n")
## Polynomial (degree 2) R²: 0.2153
if(summary(model_poly)$r.squared - summary(model_linear)$r.squared > 0.01) {
  cat("\n Non-linear pattern detected - polynomial improves fit by >1%\n")
} else {
  cat("\n Relationships are approximately linear\n")
}
## 
##  Non-linear pattern detected - polynomial improves fit by >1%

1.6.7 Non-linear Relationships Summary

  • Non-linear patterns detected — Polynomial model improves R² by 1.5% (0.200 to 0.215) for Mnf Flow vs pH
  • Practical implication — Simple linear regression may underfit; tree-based models (Random Forest, XGBoost) capture these curves automatically
  • Final recommendation — Use ensemble methods that handle both non-linearity and multicollinearity without manual feature selection

2 Next Step Modeling

Linear Regression (baseline) Random Forest (handles non-linearity) XGBoost (handles interactions)

2.0.1 Modeling section is performed by Sachi Kapoor

  • Part 2: model implementation
  • Part 3: testing the models and showing the results

The two cleaned files used here are:

  • training_data_cleaned.csv
  • test_data_cleaned.csv

From the earlier EDA, we already know two things:

  1. Brand matters and should stay in the modeling process.
  2. pH is not strongly skewed, so I am not going to log transform the target unless there is a strong reason to do that later.

2.1 Load the cleaned data

train_clean <- read.csv("training_data_cleaned.csv", check.names = FALSE)
test_clean  <- read.csv("test_data_cleaned.csv", check.names = FALSE)

cat("Training rows:", nrow(train_clean), "\n")
## Training rows: 2567
cat("Training columns:", ncol(train_clean), "\n")
## Training columns: 34
cat("Test rows:", nrow(test_clean), "\n")
## Test rows: 267
cat("Test columns:", ncol(test_clean), "\n")
## Test columns: 34

2.2 Make the column names model-friendly

One issue with this dataset is that several variable names have spaces in them, like Brand Code and Mnf Flow.

That is fine for basic viewing, but some modeling functions, especially the tree model through caret, can be picky about column names. To avoid that problem, I am standardizing the names once at the start. This does not change the data itself. It only makes the variable names easier for R to work with.

original_names <- names(train_clean)
safe_names <- make.names(original_names, unique = TRUE)

names(train_clean) <- safe_names
names(test_clean)  <- make.names(names(test_clean), unique = TRUE)

name_key <- data.frame(
  Original_Name = original_names,
  Modeling_Name = safe_names
)

kable(name_key, caption = "Original variable names and model-friendly names")
Original variable names and model-friendly names
Original_Name Modeling_Name
Brand Code Brand.Code
Carb Volume Carb.Volume
Fill Ounces Fill.Ounces
PC Volume PC.Volume
Carb Pressure Carb.Pressure
Carb Temp Carb.Temp
PSC PSC
PSC Fill PSC.Fill
PSC CO2 PSC.CO2
Mnf Flow Mnf.Flow
Carb Pressure1 Carb.Pressure1
Fill Pressure Fill.Pressure
Hyd Pressure1 Hyd.Pressure1
Hyd Pressure2 Hyd.Pressure2
Hyd Pressure3 Hyd.Pressure3
Hyd Pressure4 Hyd.Pressure4
Filler Level Filler.Level
Filler Speed Filler.Speed
Temperature Temperature
Usage cont Usage.cont
Carb Flow Carb.Flow
Density Density
MFR MFR
Balling Balling
Pressure Vacuum Pressure.Vacuum
PH PH
Oxygen Filler Oxygen.Filler
Bowl Setpoint Bowl.Setpoint
Pressure Setpoint Pressure.Setpoint
Air Pressurer Air.Pressurer
Alch Rel Alch.Rel
Carb Rel Carb.Rel
Balling Lvl Balling.Lvl
Brand_Code Brand_Code

2.3 Prepare the data

# Remove duplicate brand column if both versions exist
if ("Brand_Code" %in% names(train_clean)) {
  train_clean <- train_clean %>% select(-Brand_Code)
}

if ("Brand_Code" %in% names(test_clean)) {
  test_clean <- test_clean %>% select(-Brand_Code)
}

# Keep only rows with a known target in training
train_clean <- train_clean %>% filter(!is.na(PH))

# Convert Brand.Code to factor
train_clean$Brand.Code <- as.factor(train_clean$Brand.Code)
test_clean$Brand.Code  <- as.factor(test_clean$Brand.Code)

# Match factor levels between training and test data
all_levels <- union(levels(train_clean$Brand.Code), levels(test_clean$Brand.Code))
train_clean$Brand.Code <- factor(train_clean$Brand.Code, levels = all_levels)
test_clean$Brand.Code  <- factor(test_clean$Brand.Code, levels = all_levels)

# Split into training and validation sets
set.seed(42)
train_index <- createDataPartition(train_clean$PH, p = 0.80, list = FALSE)
train_set <- train_clean[train_index, ]
valid_set <- train_clean[-train_index, ]

cat("Modeling train rows:", nrow(train_set), "\n")
## Modeling train rows: 2055
cat("Validation rows:", nrow(valid_set), "\n")
## Validation rows: 512

2.4 Check missing values

missing_train <- sort(colSums(is.na(train_set)), decreasing = TRUE)
missing_train <- missing_train[missing_train > 0]
missing_train
## named numeric(0)

2.5 Handle missing values

Instead of using automatic imputation inside every model call, I am filling the missing values first. This makes the modeling steps more stable and easier to follow.

For Brand.Code, I use the most common category in the training set.

For numeric predictors, I use median imputation based on the training set.

# Fill missing Brand.Code with the most common value
mode_brand <- names(sort(table(train_set$Brand.Code), decreasing = TRUE))[1]

train_set$Brand.Code[is.na(train_set$Brand.Code)] <- mode_brand
valid_set$Brand.Code[is.na(valid_set$Brand.Code)] <- mode_brand
test_clean$Brand.Code[is.na(test_clean$Brand.Code)] <- mode_brand

# Median imputation for numeric predictors
numeric_cols <- names(train_set)[sapply(train_set, is.numeric)]
numeric_predictors <- setdiff(numeric_cols, "PH")

for (col in numeric_predictors) {
  med_value <- median(train_set[[col]], na.rm = TRUE)
  train_set[[col]][is.na(train_set[[col]])] <- med_value
  valid_set[[col]][is.na(valid_set[[col]])] <- med_value
  test_clean[[col]][is.na(test_clean[[col]])] <- med_value
}

# Also complete the full training set for the final model
train_clean$Brand.Code[is.na(train_clean$Brand.Code)] <- mode_brand

for (col in numeric_predictors) {
  med_value <- median(train_set[[col]], na.rm = TRUE)
  train_clean[[col]][is.na(train_clean[[col]])] <- med_value
}

# Quick check
colSums(is.na(train_set))
##        Brand.Code       Carb.Volume       Fill.Ounces         PC.Volume 
##                 0                 0                 0                 0 
##     Carb.Pressure         Carb.Temp               PSC          PSC.Fill 
##                 0                 0                 0                 0 
##           PSC.CO2          Mnf.Flow    Carb.Pressure1     Fill.Pressure 
##                 0                 0                 0                 0 
##     Hyd.Pressure1     Hyd.Pressure2     Hyd.Pressure3     Hyd.Pressure4 
##                 0                 0                 0                 0 
##      Filler.Level      Filler.Speed       Temperature        Usage.cont 
##                 0                 0                 0                 0 
##         Carb.Flow           Density               MFR           Balling 
##                 0                 0                 0                 0 
##   Pressure.Vacuum                PH     Oxygen.Filler     Bowl.Setpoint 
##                 0                 0                 0                 0 
## Pressure.Setpoint     Air.Pressurer          Alch.Rel          Carb.Rel 
##                 0                 0                 0                 0 
##       Balling.Lvl 
##                 0

2.6 Check the shape of the target

skew_value <- e1071::skewness(train_clean$PH)
cat("Skewness of PH:", round(skew_value, 4), "\n")
## Skewness of PH: -0.2906

Since the skewness is close to zero, I kept the target on its original scale.

2.7 Modeling plan

I am testing five different models:

  1. Linear Regression
  2. Ridge Regression
  3. Decision Tree
  4. Random Forest
  5. Gradient Boosting

This gives a mix of simpler models and more flexible nonlinear models.

ctrl <- trainControl(
  method = "cv",
  number = 5
)

2.8 Model 1: Linear Regression

set.seed(42)
model_lm <- train(
  PH ~ .,
  data = train_set,
  method = "lm",
  trControl = ctrl
)
model_lm
## Linear Regression 
## 
## 2055 samples
##   32 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1643, 1644, 1645, 1644, 1644 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.1351935  0.3858023  0.1047519
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

2.9 Model 2: Ridge Regression

set.seed(42)
model_ridge <- train(
  PH ~ .,
  data = train_set,
  method = "glmnet",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = seq(0.0001, 1, length = 20)
  )
)
model_ridge
## glmnet 
## 
## 2055 samples
##   32 predictor
## 
## Pre-processing: centered (34), scaled (34) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1643, 1644, 1645, 1644, 1644 
## Resampling results across tuning parameters:
## 
##   lambda      RMSE       Rsquared   MAE      
##   0.00010000  0.1358004  0.3799658  0.1059655
##   0.05272632  0.1387485  0.3600465  0.1096929
##   0.10535263  0.1413069  0.3450262  0.1120139
##   0.15797895  0.1433040  0.3341904  0.1136882
##   0.21060526  0.1449582  0.3256599  0.1150341
##   0.26323158  0.1463730  0.3186534  0.1161775
##   0.31585789  0.1476051  0.3128023  0.1171778
##   0.36848421  0.1486928  0.3078064  0.1180522
##   0.42111053  0.1496734  0.3034231  0.1188466
##   0.47373684  0.1505561  0.2996170  0.1195543
##   0.52636316  0.1513603  0.2962506  0.1202114
##   0.57898947  0.1520995  0.2932383  0.1208241
##   0.63161579  0.1527822  0.2905286  0.1213918
##   0.68424211  0.1534170  0.2880675  0.1219265
##   0.73686842  0.1540060  0.2858437  0.1224314
##   0.78949474  0.1545535  0.2838368  0.1229017
##   0.84212105  0.1550660  0.2819998  0.1233432
##   0.89474737  0.1555545  0.2802607  0.1237670
##   0.94737368  0.1560067  0.2787120  0.1241692
##   1.00000000  0.1564361  0.2772598  0.1245510
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 1e-04.

2.10 Model 3: Decision Tree

set.seed(42)
model_tree <- train(
  x = subset(train_set, select = -PH),
  y = train_set$PH,
  method = "rpart",
  trControl = ctrl,
  tuneLength = 10
)
model_tree
## CART 
## 
## 2055 samples
##   32 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1643, 1644, 1645, 1644, 1644 
## Resampling results across tuning parameters:
## 
##   cp          RMSE       Rsquared   MAE      
##   0.01093342  0.1301153  0.4333166  0.1003090
##   0.01109126  0.1301153  0.4333166  0.1003090
##   0.01324912  0.1330186  0.4077438  0.1030744
##   0.01432077  0.1353196  0.3867687  0.1054120
##   0.01456794  0.1359107  0.3815954  0.1055995
##   0.01938675  0.1396470  0.3464937  0.1095377
##   0.02866996  0.1404106  0.3391121  0.1100662
##   0.04210985  0.1447224  0.2999125  0.1139650
##   0.07021331  0.1506207  0.2397604  0.1175867
##   0.21545429  0.1626997  0.1827404  0.1296840
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01109126.
rpart.plot(model_tree$finalModel, main = "Decision Tree for pH Prediction")

2.11 Model 4: Random Forest

set.seed(42)
model_rf <- train(
  PH ~ .,
  data = train_set,
  method = "rf",
  trControl = ctrl,
  tuneLength = 5,
  importance = TRUE,
  ntree = 300
)
model_rf
## Random Forest 
## 
## 2055 samples
##   32 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1643, 1644, 1645, 1644, 1644 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE       
##    2    0.1138655  0.6121972  0.08560002
##   10    0.1021945  0.6684124  0.07451729
##   18    0.1006049  0.6730289  0.07283461
##   26    0.1001186  0.6716816  0.07208962
##   34    0.1008149  0.6638367  0.07188785
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 26.

2.12 Model 5: Gradient Boosting

set.seed(42)
model_gbm <- train(
  PH ~ .,
  data = train_set,
  method = "gbm",
  trControl = ctrl,
  verbose = FALSE,
  tuneLength = 5
)
model_gbm
## Stochastic Gradient Boosting 
## 
## 2055 samples
##   32 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1643, 1644, 1645, 1644, 1644 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE       Rsquared   MAE       
##   1                   50      0.1370280  0.3860059  0.10786037
##   1                  100      0.1331103  0.4134434  0.10433427
##   1                  150      0.1306928  0.4328776  0.10210430
##   1                  200      0.1293571  0.4413648  0.10083512
##   1                  250      0.1288080  0.4446195  0.09995516
##   2                   50      0.1290572  0.4553366  0.10106246
##   2                  100      0.1240966  0.4896051  0.09627840
##   2                  150      0.1215857  0.5078372  0.09355046
##   2                  200      0.1200410  0.5187605  0.09215074
##   2                  250      0.1189785  0.5259303  0.09093541
##   3                   50      0.1255740  0.4835986  0.09794601
##   3                  100      0.1203402  0.5194404  0.09269385
##   3                  150      0.1173839  0.5399920  0.08973551
##   3                  200      0.1160355  0.5496639  0.08831585
##   3                  250      0.1148768  0.5578661  0.08699483
##   4                   50      0.1220018  0.5121369  0.09441330
##   4                  100      0.1166098  0.5469151  0.08954224
##   4                  150      0.1146449  0.5593636  0.08732072
##   4                  200      0.1136345  0.5660380  0.08604221
##   4                  250      0.1120033  0.5779589  0.08489552
##   5                   50      0.1189395  0.5363747  0.09158402
##   5                  100      0.1144811  0.5633077  0.08729330
##   5                  150      0.1123623  0.5775069  0.08525031
##   5                  200      0.1106779  0.5890316  0.08368335
##   5                  250      0.1100211  0.5936124  0.08273296
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 250, interaction.depth =
##  5, shrinkage = 0.1 and n.minobsinnode = 10.

2.13 Make validation predictions

pred_lm    <- predict(model_lm, valid_set)
pred_ridge <- predict(model_ridge, valid_set)
pred_tree  <- predict(model_tree, subset(valid_set, select = -PH))
pred_rf    <- predict(model_rf, valid_set)
pred_gbm   <- predict(model_gbm, valid_set)

2.14 Compare model performance

I used RMSE, MAE, and R-squared to compare the models on the validation set.

get_metrics <- function(actual, predicted, model_name) {
  data.frame(
    Model = model_name,
    RMSE = RMSE(predicted, actual),
    MAE  = MAE(predicted, actual),
    R2   = R2(predicted, actual)
  )
}

results_table <- bind_rows(
  get_metrics(valid_set$PH, pred_lm, "Linear Regression"),
  get_metrics(valid_set$PH, pred_ridge, "Ridge Regression"),
  get_metrics(valid_set$PH, pred_tree, "Decision Tree"),
  get_metrics(valid_set$PH, pred_rf, "Random Forest"),
  get_metrics(valid_set$PH, pred_gbm, "Gradient Boosting")
) %>%
  arrange(RMSE)

kable(results_table, digits = 4, caption = "Validation performance comparison")
Validation performance comparison
Model RMSE MAE R2
Random Forest 0.0921 0.0685 0.7260
Gradient Boosting 0.1062 0.0806 0.6232
Decision Tree 0.1325 0.1041 0.4104
Linear Regression 0.1336 0.1044 0.4010
Ridge Regression 0.1338 0.1059 0.4006

2.15 Best model

best_model_name <- results_table$Model[1]
cat("Best model based on validation RMSE:", best_model_name, "\n")
## Best model based on validation RMSE: Random Forest

2.16 Variable importance

Even though I tested several models, Random Forest is especially useful because it can also show which predictors mattered most.

rf_importance <- varImp(model_rf)
rf_importance_df <- rf_importance$importance %>%
  rownames_to_column("Feature") %>%
  arrange(desc(Overall))

kable(head(rf_importance_df, 12), digits = 4, caption = "Top 12 important predictors from Random Forest")
Top 12 important predictors from Random Forest
Feature Overall
Mnf.Flow 100.0000
Brand.CodeC 79.7399
Oxygen.Filler 57.9940
Pressure.Vacuum 57.9121
Balling.Lvl 50.3985
Hyd.Pressure3 46.4134
Alch.Rel 44.4176
Air.Pressurer 43.9509
Temperature 41.0065
Carb.Rel 40.4981
Filler.Speed 34.7497
Carb.Flow 30.2658
plot(varImp(model_rf), top = 12, main = "Random Forest Variable Importance")

2.16.1 Refit the final model on all training data

After comparing the models, I refit the final model using all available training rows with known pH.

set.seed(42)
final_model <- train(
  PH ~ .,
  data = train_clean,
  method = "rf",
  trControl = ctrl,
  tuneGrid = model_rf$bestTune,
  importance = TRUE,
  ntree = 300
)
final_model
## Random Forest 
## 
## 2567 samples
##   32 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 2054, 2054, 2053, 2054, 2053 
## Resampling results:
## 
##   RMSE        Rsquared   MAE       
##   0.09467057  0.7104964  0.06802449
## 
## Tuning parameter 'mtry' was held constant at a value of 26

2.16.2 Predict pH for the evaluation data

final_predictions <- predict(final_model, test_clean)

prediction_output <- test_clean %>%
  mutate(
    Predicted_PH_Final_Model = round(final_predictions, 4),
    Final_Model = "Random Forest",
    Prediction_Note = "PH was blank in evaluation file, so the final model prediction is provided here."
  )

summary(prediction_output$Predicted_PH_Final_Model)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   8.150   8.471   8.533   8.548   8.661   8.800

2.16.2.1 Save the output files

write.csv(results_table, "model_validation_results.csv", row.names = FALSE)
write.csv(rf_importance_df, "random_forest_variable_importance.csv", row.names = FALSE)
write.csv(prediction_output, "ABC_Beverage_PH_Predictions.csv", row.names = FALSE)

2.16.3 Final thoughts

cat("Based on the validation results, I selected the model with the lowest RMSE as the final model. ")

Based on the validation results, I selected the model with the lowest RMSE as the final model.

cat("In this type of manufacturing data, tree-based models are often helpful because they can capture nonlinear patterns and interactions that a basic linear model may miss. ")

In this type of manufacturing data, tree-based models are often helpful because they can capture nonlinear patterns and interactions that a basic linear model may miss.

cat("That is especially relevant here because earlier analysis already suggested that Brand Code changes the relationship between Mnf Flow and pH.\n\n")

That is especially relevant here because earlier analysis already suggested that Brand Code changes the relationship between Mnf Flow and pH.

cat("The final prediction file created by this report is **ABC_Beverage_PH_Predictions.csv**. ")

The final prediction file created by this report is ABC_Beverage_PH_Predictions.csv.

cat("This file can be turned in as the model prediction output for the evaluation dataset.\n")

This file can be turned in as the model prediction output for the evaluation dataset.

2.16.3.1 Write all predictions on excel file

# Write all predictions on excel file  
# ============================================================
# Step 1: Read all your actual data files
# ============================================================

# Read the predictions CSV (test data with predictions)
predictions_raw <- read_csv("ABC_Beverage_PH_Predictions.csv")

# Read the model comparison results
model_results <- results_table # Your 5-model comparison

# Read the random forest variable importance
rf_importance <- read_csv("random_forest_variable_importance.csv")  # Your 34-feature importance

# ============================================================
# Extract actual metrics
# ============================================================

rf_metrics <- results_table %>% filter(Model == "Random Forest")
RMSE_actual <- rf_metrics$RMSE
R2_actual <- rf_metrics$R2

# 90% prediction interval (z = 1.645)
PI_width <- round(1.645 * RMSE_actual, 4)

# ============================================================
# Sheet 1: Predictions (main output)
# ============================================================

predictions_df <- predictions_raw %>%
  mutate(
    RowID = row_number(),
    PH_Predicted = Predicted_PH_Final_Model,
    PH_Lower_90CI = PH_Predicted - (1.645 * RMSE_actual),
    PH_Upper_90CI = PH_Predicted + (1.645 * RMSE_actual),
    Alert_Flag = case_when(
      PH_Predicted < 8.2 ~ "LOW - Review",
      PH_Predicted > 8.9 ~ "HIGH - Review",
      TRUE ~ "OK"
    )
  ) %>%
  select(RowID, Brand_Code = `Brand.Code`, Mnf.Flow, Bowl.Setpoint, 
         Filler.Level, PH_Predicted, PH_Lower_90CI, PH_Upper_90CI, 
         Alert_Flag, Final_Model)

# ============================================================
# Sheet 2: Model Metadata
# ============================================================

metadata_df <- data.frame(
  Metric = c("Model", "RMSE", "R_Squared", "Prediction_Interval_90%", "Test_Rows"),
  Value = c("Random Forest", round(RMSE_actual, 4), round(R2_actual, 4), 
            paste0("±", PI_width), nrow(predictions_raw))
)

# ============================================================
# Sheet 3: Variable Importance (from your rf_importance_df)
# ============================================================

importance_df <- rf_importance_df %>%
  mutate(Rank = row_number(), Importance = round(Overall, 2)) %>%
  select(Rank, Variable = Feature, Importance)

# ============================================================
# Sheet 4: Model Comparison
# ============================================================

comparison_df <- results_table %>%
  arrange(RMSE) %>%
  mutate(across(c(RMSE, MAE, R2), ~ round(., 4))) %>%
  select(Model, RMSE, MAE, R2)

# ============================================================
# Sheet 5: Brand Summary
# ============================================================

brand_df <- predictions_df %>%
  group_by(Brand_Code) %>%
  summarise(
    Count = n(),
    Avg_pH = round(mean(PH_Predicted), 3),
    Min_pH = round(min(PH_Predicted), 3),
    Max_pH = round(max(PH_Predicted), 3),
    .groups = "drop"
  )

# ============================================================
# Sheet 6: Executive Summary (simple)
# ============================================================

summary_df <- data.frame(
  Key_Finding = c("Total Predictions", "Model RMSE", "Model R²", "Top Predictor", 
                  "Lowest pH Brand", "Runs Needing Review"),
  Value = c(nrow(predictions_df), round(RMSE_actual, 4), round(R2_actual, 4),
            rf_importance_df$Feature[1],
            brand_df$Brand_Code[which.min(brand_df$Avg_pH)],
            sum(predictions_df$Alert_Flag != "OK"))
)

# ============================================================
# Write to Excel
# ============================================================

sheets_list <- list(
  "Predictions" = predictions_df,
  "Model_Metadata" = metadata_df,
  "Variable_Importance" = importance_df,
  "Model_Comparison" = comparison_df,
  "Brand_Summary" = brand_df,
  "Executive_Summary" = summary_df
)

write_xlsx(sheets_list, "ABC_Beverage_pH_Predictions.xlsx")
---
title: "pH Predictive Modeling for ABC Beverage"
subtitle: "DATA 624 - Project 2"
author: 
  - "Mehreen Ali Gillani"
  - "Sachi Kapoor"
  - "Izza Khan"
date: "2026-05-23"
output:
  html_document:
    toc: true
    toc_float:
      smooth_scroll: true
    toc_depth: 3
    number_sections: true
    code_folding: show
    code_download: true
    theme: flatly
    highlight: tango
    fig_width: 7
    fig_height: 5
    fig_align: center
    dev: png
    df_print: kable
    keep_md: false
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.align = "center",
  out.width = "80%"
)
# Load required libraries
library(tidyverse)
library(readxl)   # For reading Excel files
library(corrplot)
library(ggplot2)
library(car)
library(caret)      # For data splitting and preprocessing
library(randomForest) # Will try this model
library(xgboost)    # Will try this too
library(glmnet)
library(gbm)
library(rpart)
library(rpart.plot)
library(e1071)
library(knitr)
library(readr)      # For reading CSV
library(dplyr)      # For data manipulation
library(tidyr)      # For data cleaning
library(writexl)    # For writing Excel files
library(stringr)    # For string manipulation
```
# Executive Summary

> **Note:** This is the technical report. For business-friendly recommendations, please refer to the separate "Non-Technical Report for Leadership."

## Project Objective

New regulations require ABC Beverage to understand manufacturing factors that control pH and provide a working predictive model. We have historical data from 2,567 production runs and must predict pH for 267 new runs.

## Key Findings

- **Random Forest** selected as final model (RMSE = 0.0921, MAE = 0.0685, R² = 0.726)
- **Top 3 predictors:** Manufacturing Flow, Bowl Setpoint, Filler Level
- **Brand matters:** Significant pH differences across brands (p < 0.001)
- **Predictions delivered:** Excel file with 90% confidence intervals (±0.1514 pH)

## Load Training and Testing data
```{r read_files}

# Set working directory (adjust to your path)
# setwd("your/folder/path")
# Read the training data
train_data <- read_excel("StudentData.xlsx")
test_data <- read_excel("StudentEvaluation.xlsx")


```
### Training Data 

```{r training_dataset}
# Check structure of training data
cat("=== TRAINING DATA ===\n")
cat("Dimensions:", dim(train_data), "\n")
cat("\nColumn names:\n")
print(names(train_data))
cat("\nFirst few rows:\n")
print(head(train_data,3))
cat("\nSummary of training data:\n")
print(summary(train_data))
```

**Key Insights from Training Data:**

- pH is numeric in training data (range: 7.88 to 9.36, mean ~8.55) — this is our target variable

- Dataset have missing values that need handling before modeling

- 33 predictor variables available (excluding Brand Code which is categorical)

### Test Data 

```{r test_dataset}
cat("\n\n=== TEST DATA ===\n")
cat("Dimensions:", dim(test_data), "\n")
cat("\nColumn names:\n")
print(names(test_data))
cat("\nFirst few rows:\n")
print(head(test_data,3))
cat("\nSummary of test data:\n")
print(summary(test_data))
```

### Check missing values
```{r check_missingness}
# Check for missing values in both datasets
cat("\n=== MISSING VALUES ===\n")
cat("Training data missing values per column:\n")
print(colSums(is.na(train_data)))
cat("\nTest data missing values per column:\n")
print(colSums(is.na(test_data)))
```

## Problem Understanding

### Business Goal:
ABC Beverage need to comply with new regulations requiring you to:

- Understand your manufacturing process and which factors predict pH
- Report a predictive model for pH to leadership

### Data Situation:

- **Training data (StudentData.xlsx):** Has historical manufacturing measurements INCLUDING pH values (it contains pH as numeric)
- **Test data (StudentEvaluation.xlsx):** Has the SAME manufacturing measurements BUT pH is empty/logical (all NA values) — this is what we need to predict

**Challenge:** Missing values in BOTH datasets across many columns (Brand Code has 120 NAs in training, MFR has 212 NAs, etc.)

## Data preprocessing

```{r preprocessing}
# Create a copy for preprocessing
train_clean <- train_data
test_clean <- test_data

# IDENTIFY COLUMN TYPES
cat("=== COLUMN TYPES IN TRAINING ===\n")
col_types <- sapply(train_clean, class)
print(table(col_types))

# Separate features and target in training
# Remove rows where PH is missing (only 4 NAs in training)
train_clean <- train_clean %>% filter(!is.na(PH))
cat("\nTraining rows after removing NA in PH:", nrow(train_clean), "\n")

# For test data, PH is all NA - we'll keep it as placeholder
# But we need to ensure test has same structure as training features

# Handle Brand Code as factor (categorical)
train_clean$`Brand Code` <- as.factor(train_clean$`Brand Code`)
test_clean$`Brand Code` <- as.factor(test_clean$`Brand Code`)

# Check levels match
cat("\nBrand Code levels in training:", levels(train_clean$`Brand Code`), "\n")
cat("Brand Code levels in test:", levels(test_clean$`Brand Code`), "\n")

# MISSING VALUE ANALYSIS
cat("\n=== MISSING VALUE SUMMARY ===\n")
missing_train <- sort(colSums(is.na(train_clean)), decreasing = TRUE)
missing_train <- missing_train[missing_train > 0]
cat("Columns with missing values in training:\n")
print(missing_train)

# Identify columns with >30% missing in training
high_missing <- names(missing_train[missing_train > 0.3 * nrow(train_clean)])
cat("\nColumns with >30% missing:", high_missing, "\n")

# For now, let's remove columns with >50% missing
threshold <- 0.5 * nrow(train_clean)
cols_to_remove <- names(missing_train[missing_train > threshold])
cat("Columns to remove (>50% missing):", cols_to_remove, "\n")

# Remove these columns from both datasets
if(length(cols_to_remove) > 0) {
  train_clean <- train_clean %>% select(-one_of(cols_to_remove))
  test_clean <- test_clean %>% select(-one_of(cols_to_remove))
}
```
### Handle Missing Data with Imputation

```{r imputing}

# For remaining missing values, use median/mode imputation
# Separate numeric and character columns
numeric_cols <- names(train_clean)[sapply(train_clean, is.numeric)]
factor_cols <- names(train_clean)[sapply(train_clean, is.factor)]

# Impute numeric columns with median
for(col in numeric_cols) {
  if(any(is.na(train_clean[[col]]))) {
    med_val <- median(train_clean[[col]], na.rm = TRUE)
    train_clean[[col]][is.na(train_clean[[col]])] <- med_val
  }
  if(any(is.na(test_clean[[col]]))) {
    med_val <- median(test_clean[[col]], na.rm = TRUE)
    test_clean[[col]][is.na(test_clean[[col]])] <- med_val
  }
}

# Impute factor columns with mode
for(col in factor_cols) {
  if(any(is.na(train_clean[[col]]))) {
    mode_val <- names(sort(table(train_clean[[col]]), decreasing = TRUE))[1]
    train_clean[[col]][is.na(train_clean[[col]])] <- mode_val
  }
  if(any(is.na(test_clean[[col]]))) {
    mode_val <- names(sort(table(test_clean[[col]]), decreasing = TRUE))[1]
    test_clean[[col]][is.na(test_clean[[col]])] <- mode_val
  }
}

# Verify no missing values remain
cat("\n=== MISSING VALUES AFTER CLEANING ===\n")
cat("Training:", sum(is.na(train_clean)), "\n")
cat("Test:", sum(is.na(test_clean)), "\n")
```

#### Summary of handling missing values

- **Removed target variable NAs:** Dropped 4 rows from training where pH was missing, leaving 2,567 complete training records

- **Handled missing values in predictors:** Imputed numeric columns with median values and categorical (Brand Code) with mode; no columns exceeded 50% missing, so all 33 original features were retained

- **Aligned datasets:** Ensured both training and test data have identical structure with same 33 columns and matching factor levels (A, B, C, D for Brand Code)



### Outliers and Anomalies Detection


```{r outliers}

# Ensure factor
train_clean$Brand_Code <- as.factor(train_clean$'Brand Code')
test_clean$Brand_Code <- as.factor(test_clean$'Brand Code')

# 1. OUTLIER DETECTION using IQR method
cat("=== OUTLIER ANALYSIS ===\n")

# Get numeric columns (exclude Brand Code and PH)
numeric_cols <- names(train_clean)[sapply(train_clean, is.numeric)]
numeric_cols <- numeric_cols[!numeric_cols %in% c("PH")]

outlier_summary <- data.frame(
  Variable = character(),
  Outliers_Count = integer(),
  Outliers_Percent = numeric(),
  stringsAsFactors = FALSE
)

for(col in numeric_cols) {
  Q1 <- quantile(train_clean[[col]], 0.25, na.rm = TRUE)
  Q3 <- quantile(train_clean[[col]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  
  outliers <- sum(train_clean[[col]] < lower_bound | train_clean[[col]] > upper_bound, na.rm = TRUE)
  outlier_pct <- (outliers / nrow(train_clean)) * 100
  
  outlier_summary <- rbind(outlier_summary, 
                           data.frame(Variable = col, 
                                      Outliers_Count = outliers, 
                                      Outliers_Percent = round(outlier_pct, 2)))
}

# Show top 10 variables with most outliers
outlier_summary <- outlier_summary[order(-outlier_summary$Outliers_Percent), ]
cat("\nTop 10 variables with highest outlier percentage:\n")
print(head(outlier_summary, 10))
```

#### Data Quality Findings

#### Top 3 variables with most outliers:

- Filler Speed - 17.1% outliers (440 out of 2,567 rows)
- MFR - 10.9% outliers (279 rows)
- Air Pressurer - 8.6% outliers (220 rows)

**Key insight:** These are not data entry errors — they represent legitimate manufacturing variation. Filler speed extremes likely occur during production ramp-up/down or different product runs.

#### Recommendation

**Lets Keep outliers but we will document them. Why?

- Manufacturing processes naturally have extreme values (startup, shutdown, different brands)

- Removing 17% of data would bias our model

- These extremes help predict pH across full operating range

### Check Negative and zero values

```{r check_impossible}
# 2. CHECK FOR IMPOSSIBLE/NONSENSICAL VALUES
cat("\n=== LOGICAL CONSISTENCY CHECKS ===\n")

# Check for negative pressures (should be impossible)
pressure_cols <- grep("Pressure", names(train_clean), value = TRUE)
for(col in pressure_cols) {
  negatives <- sum(train_clean[[col]] < 0, na.rm = TRUE)
  if(negatives > 0) {
    cat(col, ": ", negatives, "negative values (range:", min(train_clean[[col]]), "to", max(train_clean[[col]]), ")\n")
  }
}

# Check for zero or negative in volume/flow measurements
volume_cols <- c("Carb.Volume", "Fill.Ounces", "PC.Volume", "Carb.Flow")
for(col in volume_cols) {
  if(col %in% names(train_clean)) {
    zeros_or_neg <- sum(train_clean[[col]] <= 0, na.rm = TRUE)
    if(zeros_or_neg > 0) {
      cat(col, ": ", zeros_or_neg, "values <= 0\n")
    }
  }
}

# 3. CHECK DATA TYPES AND FORMATS
cat("\n=== DATA TYPE VERIFICATION ===\n")
cat("pH range in training:", range(train_clean$PH), "\n")
cat("pH should be between 0-14. All values valid:", 
    all(train_clean$PH >= 0 & train_clean$PH <= 14), "\n")

# Check for duplicate rows
duplicates <- sum(duplicated(train_clean[, !names(train_clean) %in% "PH"]))
cat("Duplicate rows (excluding pH):", duplicates, "\n")
```

#### Key Findings from Quality Checks

**Negative values detected :**

- Hyd Pressure1: 38 negative values (down to -0.8)

- Hyd Pressure3: 70 negative values (down to -1.2)

_ Pressure Vacuum: ALL values negative (-6.6 to -3.6) — this is correct (vacuum = negative pressure)

*No duplicate rows — data is clean*

pH range: 7.88 to 9.36 (normal for beverages)



### Save clean Datasets

```{r save_cleanData}
# Save cleaned data
write.csv(train_clean, "training_data_cleaned.csv", row.names = FALSE)
write.csv(test_clean, "test_data_cleaned.csv", row.names = FALSE)

cat("\n=== CLEANING COMPLETE ===\n")
cat("Final training dimensions:", dim(train_clean), "\n")
cat("Final test dimensions:", dim(test_clean), "\n")
```

## Exploratory Data Analysis

### Correlation Matrix for pH Prediction

```{r eda_corr}
# EDA: Correlation Matrix for pH Prediction

# Ensure Brand Code is factor
train_clean$`Brand Code` <- as.factor(train_clean$`Brand Code`)

# Calculate correlations with pH
numeric_cols <- names(train_clean)[sapply(train_clean, is.numeric)]
target_correlations <- data.frame(
  Variable = character(),
  Correlation = numeric(),
  stringsAsFactors = FALSE
)

for(col in numeric_cols) {
  if(col != "PH") {
    cor_val <- cor(train_clean[[col]], train_clean$PH, use = "complete.obs")
    target_correlations <- rbind(target_correlations, 
                                  data.frame(Variable = col, 
                                             Correlation = round(cor_val, 3)))
  }
}

# Sort by absolute correlation
target_correlations <- target_correlations[order(-abs(target_correlations$Correlation)), ]

cat("=== TOP 15 VARIABLES MOST CORRELATED WITH pH ===\n")
print(head(target_correlations, 15))

# Create correlation matrix for top variables
top_vars <- head(target_correlations$Variable, 15)
top_vars <- c(top_vars, "PH")
cor_matrix <- cor(train_clean[, top_vars], use = "complete.obs")

# Visualize correlation matrix
corrplot(cor_matrix, 
         method = "color", 
         type = "upper",
         tl.cex = 0.8,
         tl.col = "black",
         title = "Top 15 Variable Correlations with pH",
         mar = c(0,0,2,0))

# Bar plot of top correlations with pH
ggplot(head(target_correlations, 15), aes(x = reorder(Variable, abs(Correlation)), 
                                            y = Correlation, 
                                            fill = Correlation > 0)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top 15 Factors Correlated with pH",
       x = "Process Variables",
       y = "Correlation with pH") +
  theme_minimal() +
  scale_fill_manual(values = c("darkred", "darkgreen"), 
                    labels = c("Negative", "Positive"),
                    name = "Direction")

# Summary statistics for top 3 predictors vs pH
top_3 <- head(target_correlations$Variable, 3)
cat("\n=== TOP 3 PREDICTORS SUMMARY ===\n")
for(var in top_3) {
  cat("\n", var, "(Correlation:", target_correlations[target_correlations$Variable == var, "Correlation"], ")\n")
  print(summary(train_clean[[var]]))
}
```

#### Correlation Analysis Findings

#### Top pH Predictors Identified:

- **Mnf Flow (Manufacturing Flow):** Strongest negative correlation (-0.447) — Higher flow rates = Lower pH

- **Bowl Setpoint:** Strong positive correlation (0.349) — Higher setpoint = Higher pH

- **Filler Level:** Moderate positive correlation (0.322) — Higher fill level = Higher pH

- **Usage cont:** Negative correlation (-0.318) — More usage = Lower pH

- **Pressure Setpoint:** Negative correlation (-0.306) — Higher pressure = Lower pH


**Key Insight:** Manufacturing flow rate (Mnf Flow) is the single most important factor controlling pH, followed by bowl setpoint and filler level.

### Brand Analysis

```{r bradn_analysis}
# EDA: Brand Analysis - Does pH differ by brand?


# Calculate pH statistics by brand
brand_stats <- train_clean %>%
  group_by(`Brand Code`) %>%
  summarise(
    Count = n(),
    Mean_pH = round(mean(PH), 3),
    SD_pH = round(sd(PH), 3),
    Min_pH = round(min(PH), 3),
    Max_pH = round(max(PH), 3),
    .groups = 'drop'
  )

cat("=== PH STATISTICS BY BRAND ===\n")
print(brand_stats)

# ANOVA test to check if brand differences are significant
anova_result <- aov(PH ~ `Brand Code`, data = train_clean)
cat("\n=== ANOVA RESULTS ===\n")
print(summary(anova_result))

# Visualization: Boxplot of pH by brand
ggplot(train_clean, aes(x = `Brand Code`, y = PH, fill = `Brand Code`)) +
  geom_boxplot() +
  labs(title = "pH Distribution by Brand Code",
       x = "Brand",
       y = "pH Level") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2") +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red")

# Visualization: Density plot
ggplot(train_clean, aes(x = PH, fill = `Brand Code`)) +
  geom_density(alpha = 0.5) +
  labs(title = "pH Density Distribution by Brand",
       x = "pH",
       y = "Density") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2")
```

#### Brand Analysis Summary

- **Significant pH differences across brands (p-value < 0.001)** — Brand D has highest pH (8.60), Brand C lowest (8.41)

- **Brand B** has largest volume (1,355 batches) and wide pH range (8.00 to 8.94)

- **Brand C** shows most variation (SD = 0.177) with extreme values (7.88 to 9.36)

### Interaction Effects

```{r Interaction_Effects}
# EDA: Interaction Effects - Do combinations of factors predict pH better?

# Interaction plot: Mnf Flow × Brand on pH
ggplot(train_clean, aes(x = `Mnf Flow`, y = PH, color = `Brand Code`)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Interaction: Manufacturing Flow × Brand on pH",
       x = "Manufacturing Flow (Mnf Flow)",
       y = "pH",
       color = "Brand") +
  theme_minimal()

# Check interaction: Bowl Setpoint × Filler Level
ggplot(train_clean, aes(x = `Bowl Setpoint`, y = PH, color = `Filler Level`)) +
  geom_point(alpha = 0.4) +
  scale_color_gradient(low = "blue", high = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(title = "Interaction: Bowl Setpoint × Filler Level on pH",
       x = "Bowl Setpoint",
       y = "pH",
       color = "Filler Level") +
  theme_minimal()

# Calculate interaction strength
model_main <- lm(PH ~ `Mnf Flow` + `Brand Code`, data = train_clean)
model_interaction <- lm(PH ~ `Mnf Flow` * `Brand Code`, data = train_clean)

cat("=== INTERACTION EFFECT SIGNIFICANCE ===\n")
cat("Main effects R²:", round(summary(model_main)$r.squared, 4), "\n")
cat("With interaction R²:", round(summary(model_interaction)$r.squared, 4), "\n")
cat("Improvement:", round(summary(model_interaction)$r.squared - summary(model_main)$r.squared, 4), "\n")

# ANOVA to test if interaction is significant
anova_compare <- anova(model_main, model_interaction)
print(anova_compare)
```

#### Interaction Effects Summary

- **Interaction improves prediction by 1.8% (R² from 0.305 to 0.323) —** statistically significant (p < 0.001)
- **Brand affects how Mnf Flow influences pH —** relationship differs across brands A, B, C, D
- **Need to include Brand in final model —** ignoring it reduces accuracy

### pH Distribution Check

```{r pH_skewness_check}
# EDA: Skewness Check for pH

# Load library
library(e1071)

# Calculate skewness
skew_value <- skewness(train_clean$PH)
cat("Skewness of pH:", round(skew_value, 4), "\n")

# Interpretation rules of thumb
if(abs(skew_value) < 0.5) {
  cat("Interpretation: Approximately symmetric\n")
} else if(abs(skew_value) < 1) {
  cat("Interpretation: Moderately skewed\n")
} else {
  cat("Interpretation: Highly skewed\n")
}

# Quick visual: Simple histogram
hist(train_clean$PH, 
     main = "pH Distribution", 
     xlab = "pH",
     col = "lightblue",
     border = "black",
     breaks = 20)

# Add vertical line at mean
abline(v = mean(train_clean$PH), col = "red", lwd = 2, lty = 2)

# Show mean vs median
cat("\nMean:", round(mean(train_clean$PH), 4))
cat("\nMedian:", round(median(train_clean$PH), 4))
cat("\nDifference:", round(mean(train_clean$PH) - median(train_clean$PH), 4))
```

#### Skewness Summary

- **pH is approximately symmetric** (skewness = -0.29, well within ±0.5 range)

- **Mean and median are nearly identical** (8.5456 vs 8.5400, difference only 0.0056)

- **Linear regression is viable** — no severe skewness requiring transformation


### Multicollinearity Check

```{r}
# EDA Multicollinearity - Are top predictors correlated with each other?


# Select top 10 predictors from earlier correlation analysis
top_predictors <- c("Mnf Flow", "Bowl Setpoint", "Filler Level", "Usage cont", 
                    "Pressure Setpoint", "Hyd Pressure3", "Pressure Vacuum", 
                    "Fill Pressure", "Hyd Pressure2", "Oxygen Filler")

# Create correlation matrix for top predictors
cor_matrix_top <- cor(train_clean[, top_predictors], use = "complete.obs")

# Visualize
corrplot::corrplot(cor_matrix_top, 
                   method = "number", 
                   type = "upper",
                   tl.cex = 0.8,
                   title = "Multicollinearity Among Top Predictors",
                   mar = c(0,0,2,0))

# Calculate VIF (Variance Inflation Factor)
# Build linear model with all top predictors
vif_model <- lm(PH ~ `Mnf Flow` + `Bowl Setpoint` + `Filler Level` + `Usage cont` + 
                      `Pressure Setpoint` + `Hyd Pressure3` + `Pressure Vacuum` + 
                      `Fill Pressure` + `Hyd Pressure2` + `Oxygen Filler`, 
                data = train_clean)

vif_values <- vif(vif_model)

cat("=== VARIANCE INFLATION FACTOR (VIF) ===\n")
cat("Rule of thumb: VIF > 5 indicates high multicollinearity\n")
cat("VIF > 10 indicates severe multicollinearity\n\n")

print(data.frame(Variable = names(vif_values), VIF = round(vif_values, 2)))

# Check if any VIF > 5
high_vif <- names(vif_values[vif_values > 5])
if(length(high_vif) > 0) {
  cat("\n High multicollinearity detected for:", paste(high_vif, collapse = ", "), "\n")
  cat("Recommendation: Remove or combine these variables\n")
} else {
  cat("\n No concerning multicollinearity (all VIF < 5)\n")
  cat("All top predictors can be used together\n")
}
```

#### Multicollinearity Summary

- **High correlation detected** — Bowl Setpoint (VIF 8.8), Filler Level (8.85), Hyd Pressure3 (10.72), Hyd Pressure2 (7.62) are redundant
- **Impact on modeling** — These variables measure similar process conditions; keeping all causes unstable coefficient estimates
- **Recommendation** — Select representative variables (e.g., keep Filler Level, drop Hyd Pressure2/3) or use tree-based models (Random Forest/XGBoost) which handle redundancy naturally


### Non-linear Relationships

```{r Check_for_non-linear_relationships}
# EDA: Check for non-linear relationships with pH


# Top 3 predictors vs pH with LOESS smooth curves
p1 <- ggplot(train_clean, aes(x = `Mnf Flow`, y = PH)) +
  geom_point(alpha = 0.3, size = 0.8) +
  geom_smooth(method = "loess", color = "red", se = TRUE) +
  geom_smooth(method = "lm", color = "blue", linetype = "dashed", se = FALSE) +
  labs(title = "Mnf Flow vs pH",
       x = "Manufacturing Flow",
       y = "pH") +
  theme_minimal()

p2 <- ggplot(train_clean, aes(x = `Bowl Setpoint`, y = PH)) +
  geom_point(alpha = 0.3, size = 0.8) +
  geom_smooth(method = "loess", color = "red", se = TRUE) +
  geom_smooth(method = "lm", color = "blue", linetype = "dashed", se = FALSE) +
  labs(title = "Bowl Setpoint vs pH",
       x = "Bowl Setpoint",
       y = "pH") +
  theme_minimal()

p3 <- ggplot(train_clean, aes(x = `Filler Level`, y = PH)) +
  geom_point(alpha = 0.3, size = 0.8) +
  geom_smooth(method = "loess", color = "red", se = TRUE) +
  geom_smooth(method = "lm", color = "blue", linetype = "dashed", se = FALSE) +
  labs(title = "Filler Level vs pH",
       x = "Filler Level",
       y = "pH") +
  theme_minimal()

# Arrange plots
library(gridExtra)
grid.arrange(p1, p2, p3, ncol = 3)

# Quick check: Is linear model sufficient?
model_linear <- lm(PH ~ `Mnf Flow`, data = train_clean)
model_poly <- lm(PH ~ poly(`Mnf Flow`, 2), data = train_clean)

cat("=== LINEAR vs POLYNOMIAL COMPARISON ===\n")
cat("Linear R²:", round(summary(model_linear)$r.squared, 4), "\n")
cat("Polynomial (degree 2) R²:", round(summary(model_poly)$r.squared, 4), "\n")

if(summary(model_poly)$r.squared - summary(model_linear)$r.squared > 0.01) {
  cat("\n Non-linear pattern detected - polynomial improves fit by >1%\n")
} else {
  cat("\n Relationships are approximately linear\n")
}
```

### Non-linear Relationships Summary

- **Non-linear patterns detected** — Polynomial model improves R² by 1.5% (0.200 to 0.215) for Mnf Flow vs pH
- **Practical implication** — Simple linear regression may underfit; tree-based models (Random Forest, XGBoost) capture these curves automatically
- **Final recommendation** — Use ensemble methods that handle both non-linearity and multicollinearity without manual feature selection


# Next Step Modeling 

Linear Regression (baseline)
Random Forest (handles non-linearity)
XGBoost (handles interactions)

### Modeling section is performed by Sachi Kapoor

- Part 2: model implementation
- Part 3: testing the models and showing the results

The two cleaned files used here are:

- `training_data_cleaned.csv`
- `test_data_cleaned.csv`

From the earlier EDA, we already know two things:

1. Brand matters and should stay in the modeling process.
2. pH is not strongly skewed, so I am not going to log transform the target unless there is a strong reason to do that later.


## Load the cleaned data

```{r load-data}
train_clean <- read.csv("training_data_cleaned.csv", check.names = FALSE)
test_clean  <- read.csv("test_data_cleaned.csv", check.names = FALSE)

cat("Training rows:", nrow(train_clean), "\n")
cat("Training columns:", ncol(train_clean), "\n")
cat("Test rows:", nrow(test_clean), "\n")
cat("Test columns:", ncol(test_clean), "\n")
```

## Make the column names model-friendly

One issue with this dataset is that several variable names have spaces in them, like `Brand Code` and `Mnf Flow`.

That is fine for basic viewing, but some modeling functions, especially the tree model through `caret`, can be picky about column names. To avoid that problem, I am standardizing the names once at the start. This does **not** change the data itself. It only makes the variable names easier for R to work with.

```{r clean-column-names}
original_names <- names(train_clean)
safe_names <- make.names(original_names, unique = TRUE)

names(train_clean) <- safe_names
names(test_clean)  <- make.names(names(test_clean), unique = TRUE)

name_key <- data.frame(
  Original_Name = original_names,
  Modeling_Name = safe_names
)

kable(name_key, caption = "Original variable names and model-friendly names")
```

## Prepare the data

```{r prepare-data}
# Remove duplicate brand column if both versions exist
if ("Brand_Code" %in% names(train_clean)) {
  train_clean <- train_clean %>% select(-Brand_Code)
}

if ("Brand_Code" %in% names(test_clean)) {
  test_clean <- test_clean %>% select(-Brand_Code)
}

# Keep only rows with a known target in training
train_clean <- train_clean %>% filter(!is.na(PH))

# Convert Brand.Code to factor
train_clean$Brand.Code <- as.factor(train_clean$Brand.Code)
test_clean$Brand.Code  <- as.factor(test_clean$Brand.Code)

# Match factor levels between training and test data
all_levels <- union(levels(train_clean$Brand.Code), levels(test_clean$Brand.Code))
train_clean$Brand.Code <- factor(train_clean$Brand.Code, levels = all_levels)
test_clean$Brand.Code  <- factor(test_clean$Brand.Code, levels = all_levels)

# Split into training and validation sets
set.seed(42)
train_index <- createDataPartition(train_clean$PH, p = 0.80, list = FALSE)
train_set <- train_clean[train_index, ]
valid_set <- train_clean[-train_index, ]

cat("Modeling train rows:", nrow(train_set), "\n")
cat("Validation rows:", nrow(valid_set), "\n")
```

## Check missing values

```{r missing-summary}
missing_train <- sort(colSums(is.na(train_set)), decreasing = TRUE)
missing_train <- missing_train[missing_train > 0]
missing_train
```

## Handle missing values

Instead of using automatic imputation inside every model call, I am filling the missing values first. This makes the modeling steps more stable and easier to follow.

For `Brand.Code`, I use the most common category in the training set.

For numeric predictors, I use median imputation based on the training set.

```{r manual-imputation}
# Fill missing Brand.Code with the most common value
mode_brand <- names(sort(table(train_set$Brand.Code), decreasing = TRUE))[1]

train_set$Brand.Code[is.na(train_set$Brand.Code)] <- mode_brand
valid_set$Brand.Code[is.na(valid_set$Brand.Code)] <- mode_brand
test_clean$Brand.Code[is.na(test_clean$Brand.Code)] <- mode_brand

# Median imputation for numeric predictors
numeric_cols <- names(train_set)[sapply(train_set, is.numeric)]
numeric_predictors <- setdiff(numeric_cols, "PH")

for (col in numeric_predictors) {
  med_value <- median(train_set[[col]], na.rm = TRUE)
  train_set[[col]][is.na(train_set[[col]])] <- med_value
  valid_set[[col]][is.na(valid_set[[col]])] <- med_value
  test_clean[[col]][is.na(test_clean[[col]])] <- med_value
}

# Also complete the full training set for the final model
train_clean$Brand.Code[is.na(train_clean$Brand.Code)] <- mode_brand

for (col in numeric_predictors) {
  med_value <- median(train_set[[col]], na.rm = TRUE)
  train_clean[[col]][is.na(train_clean[[col]])] <- med_value
}

# Quick check
colSums(is.na(train_set))
```

## Check the shape of the target

```{r skewness-check}
skew_value <- e1071::skewness(train_clean$PH)
cat("Skewness of PH:", round(skew_value, 4), "\n")
```

Since the skewness is close to zero, I kept the target on its original scale.

## Modeling plan

I am testing five different models:

1. Linear Regression
2. Ridge Regression
3. Decision Tree
4. Random Forest
5. Gradient Boosting

This gives a mix of simpler models and more flexible nonlinear models.

```{r train-control}
ctrl <- trainControl(
  method = "cv",
  number = 5
)
```

## Model 1: Linear Regression

```{r linear-model}
set.seed(42)
model_lm <- train(
  PH ~ .,
  data = train_set,
  method = "lm",
  trControl = ctrl
)
model_lm
```

## Model 2: Ridge Regression

```{r ridge-model}
set.seed(42)
model_ridge <- train(
  PH ~ .,
  data = train_set,
  method = "glmnet",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = seq(0.0001, 1, length = 20)
  )
)
model_ridge
```

## Model 3: Decision Tree

```{r tree-model}
set.seed(42)
model_tree <- train(
  x = subset(train_set, select = -PH),
  y = train_set$PH,
  method = "rpart",
  trControl = ctrl,
  tuneLength = 10
)
model_tree
```

```{r tree-plot, fig.width=10, fig.height=6}
rpart.plot(model_tree$finalModel, main = "Decision Tree for pH Prediction")
```

## Model 4: Random Forest

```{r rf-model}
set.seed(42)
model_rf <- train(
  PH ~ .,
  data = train_set,
  method = "rf",
  trControl = ctrl,
  tuneLength = 5,
  importance = TRUE,
  ntree = 300
)
model_rf
```

## Model 5: Gradient Boosting

```{r gbm-model}
set.seed(42)
model_gbm <- train(
  PH ~ .,
  data = train_set,
  method = "gbm",
  trControl = ctrl,
  verbose = FALSE,
  tuneLength = 5
)
model_gbm
```

## Make validation predictions

```{r predictions}
pred_lm    <- predict(model_lm, valid_set)
pred_ridge <- predict(model_ridge, valid_set)
pred_tree  <- predict(model_tree, subset(valid_set, select = -PH))
pred_rf    <- predict(model_rf, valid_set)
pred_gbm   <- predict(model_gbm, valid_set)
```

## Compare model performance

I used RMSE, MAE, and R-squared to compare the models on the validation set.

```{r results-table}
get_metrics <- function(actual, predicted, model_name) {
  data.frame(
    Model = model_name,
    RMSE = RMSE(predicted, actual),
    MAE  = MAE(predicted, actual),
    R2   = R2(predicted, actual)
  )
}

results_table <- bind_rows(
  get_metrics(valid_set$PH, pred_lm, "Linear Regression"),
  get_metrics(valid_set$PH, pred_ridge, "Ridge Regression"),
  get_metrics(valid_set$PH, pred_tree, "Decision Tree"),
  get_metrics(valid_set$PH, pred_rf, "Random Forest"),
  get_metrics(valid_set$PH, pred_gbm, "Gradient Boosting")
) %>%
  arrange(RMSE)

kable(results_table, digits = 4, caption = "Validation performance comparison")
```

## Best model

```{r best-model}
best_model_name <- results_table$Model[1]
cat("Best model based on validation RMSE:", best_model_name, "\n")
```

## Variable importance

Even though I tested several models, Random Forest is especially useful because it can also show which predictors mattered most.

```{r variable-importance}
rf_importance <- varImp(model_rf)
rf_importance_df <- rf_importance$importance %>%
  rownames_to_column("Feature") %>%
  arrange(desc(Overall))

kable(head(rf_importance_df, 12), digits = 4, caption = "Top 12 important predictors from Random Forest")
```

```{r importance-plot, fig.width=10, fig.height=7}
plot(varImp(model_rf), top = 12, main = "Random Forest Variable Importance")
```

### Refit the final model on all training data

After comparing the models, I refit the final model using all available training rows with known pH.

```{r final-model}
set.seed(42)
final_model <- train(
  PH ~ .,
  data = train_clean,
  method = "rf",
  trControl = ctrl,
  tuneGrid = model_rf$bestTune,
  importance = TRUE,
  ntree = 300
)
final_model
```

### Predict pH for the evaluation data

```{r final-predictions}
final_predictions <- predict(final_model, test_clean)

prediction_output <- test_clean %>%
  mutate(
    Predicted_PH_Final_Model = round(final_predictions, 4),
    Final_Model = "Random Forest",
    Prediction_Note = "PH was blank in evaluation file, so the final model prediction is provided here."
  )

summary(prediction_output$Predicted_PH_Final_Model)
```

#### Save the output files

```{r save-output}
write.csv(results_table, "model_validation_results.csv", row.names = FALSE)
write.csv(rf_importance_df, "random_forest_variable_importance.csv", row.names = FALSE)
write.csv(prediction_output, "ABC_Beverage_PH_Predictions.csv", row.names = FALSE)
```

### Final thoughts

```{r interpretation, results='asis'}
cat("Based on the validation results, I selected the model with the lowest RMSE as the final model. ")
cat("In this type of manufacturing data, tree-based models are often helpful because they can capture nonlinear patterns and interactions that a basic linear model may miss. ")
cat("That is especially relevant here because earlier analysis already suggested that Brand Code changes the relationship between Mnf Flow and pH.\n\n")

cat("The final prediction file created by this report is **ABC_Beverage_PH_Predictions.csv**. ")
cat("This file can be turned in as the model prediction output for the evaluation dataset.\n")
```

#### Write all predictions on excel file 
```{r}
# Write all predictions on excel file  
# ============================================================
# Step 1: Read all your actual data files
# ============================================================

# Read the predictions CSV (test data with predictions)
predictions_raw <- read_csv("ABC_Beverage_PH_Predictions.csv")

# Read the model comparison results
model_results <- results_table # Your 5-model comparison

# Read the random forest variable importance
rf_importance <- read_csv("random_forest_variable_importance.csv")  # Your 34-feature importance

# ============================================================
# Extract actual metrics
# ============================================================

rf_metrics <- results_table %>% filter(Model == "Random Forest")
RMSE_actual <- rf_metrics$RMSE
R2_actual <- rf_metrics$R2

# 90% prediction interval (z = 1.645)
PI_width <- round(1.645 * RMSE_actual, 4)

# ============================================================
# Sheet 1: Predictions (main output)
# ============================================================

predictions_df <- predictions_raw %>%
  mutate(
    RowID = row_number(),
    PH_Predicted = Predicted_PH_Final_Model,
    PH_Lower_90CI = PH_Predicted - (1.645 * RMSE_actual),
    PH_Upper_90CI = PH_Predicted + (1.645 * RMSE_actual),
    Alert_Flag = case_when(
      PH_Predicted < 8.2 ~ "LOW - Review",
      PH_Predicted > 8.9 ~ "HIGH - Review",
      TRUE ~ "OK"
    )
  ) %>%
  select(RowID, Brand_Code = `Brand.Code`, Mnf.Flow, Bowl.Setpoint, 
         Filler.Level, PH_Predicted, PH_Lower_90CI, PH_Upper_90CI, 
         Alert_Flag, Final_Model)

# ============================================================
# Sheet 2: Model Metadata
# ============================================================

metadata_df <- data.frame(
  Metric = c("Model", "RMSE", "R_Squared", "Prediction_Interval_90%", "Test_Rows"),
  Value = c("Random Forest", round(RMSE_actual, 4), round(R2_actual, 4), 
            paste0("±", PI_width), nrow(predictions_raw))
)

# ============================================================
# Sheet 3: Variable Importance (from your rf_importance_df)
# ============================================================

importance_df <- rf_importance_df %>%
  mutate(Rank = row_number(), Importance = round(Overall, 2)) %>%
  select(Rank, Variable = Feature, Importance)

# ============================================================
# Sheet 4: Model Comparison
# ============================================================

comparison_df <- results_table %>%
  arrange(RMSE) %>%
  mutate(across(c(RMSE, MAE, R2), ~ round(., 4))) %>%
  select(Model, RMSE, MAE, R2)

# ============================================================
# Sheet 5: Brand Summary
# ============================================================

brand_df <- predictions_df %>%
  group_by(Brand_Code) %>%
  summarise(
    Count = n(),
    Avg_pH = round(mean(PH_Predicted), 3),
    Min_pH = round(min(PH_Predicted), 3),
    Max_pH = round(max(PH_Predicted), 3),
    .groups = "drop"
  )

# ============================================================
# Sheet 6: Executive Summary (simple)
# ============================================================

summary_df <- data.frame(
  Key_Finding = c("Total Predictions", "Model RMSE", "Model R²", "Top Predictor", 
                  "Lowest pH Brand", "Runs Needing Review"),
  Value = c(nrow(predictions_df), round(RMSE_actual, 4), round(R2_actual, 4),
            rf_importance_df$Feature[1],
            brand_df$Brand_Code[which.min(brand_df$Avg_pH)],
            sum(predictions_df$Alert_Flag != "OK"))
)

# ============================================================
# Write to Excel
# ============================================================

sheets_list <- list(
  "Predictions" = predictions_df,
  "Model_Metadata" = metadata_df,
  "Variable_Importance" = importance_df,
  "Model_Comparison" = comparison_df,
  "Brand_Summary" = brand_df,
  "Executive_Summary" = summary_df
)

write_xlsx(sheets_list, "ABC_Beverage_pH_Predictions.xlsx")

```
