read_project_xlsx <- function(paths, fallback = NULL) {
  for (path in paths) {
    if (file.exists(path)) {
      return(read_excel(path))
    }
  }

  if (!is.null(fallback)) {
    return(fallback)
  }

  stop("None of these files exist: ", paste(paths, collapse = ", "))
}

mode_value <- function(x) {
  x <- x[!is.na(x)]
  if (!length(x)) return(NA)
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

nrmse_calc <- function(actual, pred) {
  actual <- as.numeric(actual)
  pred <- as.numeric(pred)
  rng <- diff(range(actual, na.rm = TRUE))
  if (is.na(rng) || rng == 0) return(NA_real_)
  sqrt(mean((actual - pred)^2, na.rm = TRUE)) / rng
}

metric_row <- function(model, obs, pred) {
  tibble(
    model = model,
    rmse = caret::RMSE(pred, obs),
    mae = caret::MAE(pred, obs),
    r2 = caret::R2(pred, obs)
  )
}

top_abs_coef <- function(coef_obj, n = 10) {
  coef_df <- as.matrix(coef_obj)
  tibble(
    term = rownames(coef_df),
    estimate = as.numeric(coef_df[, 1])
  ) |>
    filter(term != "(Intercept)") |>
    mutate(abs_estimate = abs(estimate)) |>
    arrange(desc(abs_estimate)) |>
    slice_head(n = n) |>
    select(term, estimate)
}

plot_obs_pred <- function(df, title) {
  ggplot(df, aes(x = observed, y = predicted)) +
    geom_point(alpha = 0.5) +
    geom_abline(
      slope = 1,
      intercept = 0,
      linetype = 2,
      linewidth = 0.35,
      color = "gray45"
    ) +
    labs(title = title, x = "Observed", y = "Predicted")
}

prepare_mnf_sensitivity <- function(df, suspicious_values = c(-100, -100.2)) {
  base_df <- df |>
    mutate(Mnf.Flow.Flag = as.integer(Mnf.Flow %in% suspicious_values))

  raw_imputed <- missForest(
    as.data.frame(base_df |> mutate(Brand.Code = as.factor(Brand.Code))),
    verbose = FALSE
  )$ximp |>
    as_tibble()

  recode_imputed <- missForest(
    as.data.frame(
      base_df |>
        mutate(
          Mnf.Flow = ifelse(Mnf.Flow %in% suspicious_values, NA, Mnf.Flow),
          Brand.Code = as.factor(Brand.Code)
        )
    ),
    verbose = FALSE
  )$ximp |>
    as_tibble()

  numeric_corr <- raw_imputed |>
    select(where(is.numeric)) |>
    cor(use = "pairwise.complete.obs")

  high_corr <- findCorrelation(numeric_corr, cutoff = 0.80, names = TRUE)
  high_corr <- setdiff(high_corr, c("Mnf.Flow", "Mnf.Flow.Flag"))

  if (length(high_corr) > 0) {
    raw_imputed <- raw_imputed |> select(-all_of(high_corr))
    recode_imputed <- recode_imputed |> select(-all_of(high_corr))
  }

  list(
    raw = raw_imputed,
    recode = recode_imputed,
    removed_predictors = high_corr
  )
}

fit_tree_sensitivity <- function(model_data, seed = 2026) {
  set.seed(seed)
  idx <- createDataPartition(model_data$PH, p = 0.80, list = FALSE)

  train_data <- model_data[idx, ]
  test_data <- model_data[-idx, ]

  ctrl <- trainControl(method = "cv", number = 5)

  rf_model <- train(
    PH ~ .,
    data = train_data,
    method = "rf",
    trControl = ctrl,
    ntree = 300,
    tuneLength = 3,
    importance = TRUE
  )
  rf_pred <- predict(rf_model, newdata = test_data)

  gbm_model <- train(
    PH ~ .,
    data = train_data,
    method = "gbm",
    trControl = ctrl,
    verbose = FALSE,
    tuneLength = 3
  )
  gbm_pred <- predict(gbm_model, newdata = test_data)

  metrics <- bind_rows(
    metric_row("Random forest", test_data$PH, rf_pred),
    metric_row("Gradient boosting", test_data$PH, gbm_pred)
  )

  list(
    metrics = metrics,
    best = metrics |> arrange(rmse) |> slice(1),
    rf_model = rf_model,
    gbm_model = gbm_model,
    rf_pred = rf_pred,
    gbm_pred = gbm_pred,
    test_data = test_data
  )
}

1 Introduction

This report is written for ABC Beverage leadership and the production team. Our goal is to turn the historical process data into a practical pH prediction workflow that we can explain, defend, and use to guide manufacturing decisions. We treat the analysis as a sequence of decisions: first we clean the data carefully, then we test which transformations stabilize the measurements, then we compare models from simple to complex, and finally we select the approach that best balances accuracy, stability, and interpretability.

One theme emerges early and stays important throughout the analysis: Mnf Flow behaves like a broader process-state variable. It sits alongside direct process controls such as carbonation, temperature, and pressure, but it may also be absorbing information about residence time, mixing, line condition, and operating mode. That helps explain why the model often treats it as a stronger signal than any single temperature or pressure column. By contrast, many temperature and pressure effects are already represented indirectly through carbonation variables like Carb Volume and Carb Rel, while the remaining temperature and pressure fields are spread across several columns and are often tightly controlled. In short, flow can capture more of the real operating variation that matters for pH.

2 Data Cleaning Journey

2.1 Load and inspect the data

#student_raw <- read_project_xlsx(c("Project2/StudentData.xlsx", "StudentData.xlsx"))
student_raw <- read_excel("Training Data.xlsx")
student_data <- student_raw |> rename_with(make.names)

eval_paths <- c(
  "Project2/StudentEvaluation.xlsx",
  "Project2/StudentEvaluation (1).xlsx",
  "StudentEvaluation.xlsx",
  "StudentEvaluation (1).xlsx"
)

evaluation_raw <- read_project_xlsx(eval_paths, fallback = student_raw)
evaluation_data <- evaluation_raw |> rename_with(make.names)

data_overview <- tibble(
  rows = nrow(student_data),
  cols = ncol(student_data),
  complete_rows = sum(complete.cases(student_data)),
  complete_share = round(100 * complete_rows / rows, 1)
)

data_overview |> kable()

rows	cols	complete_rows	complete_share
2571	33	2038	79.3

str(student_data)

## tibble [2,571 × 33] (S3: tbl_df/tbl/data.frame)
##  $ Brand.Code       : chr [1:2571] "B" "A" "B" "A" ...
##  $ Carb.Volume      : num [1:2571] 5.34 5.43 5.29 5.44 5.49 ...
##  $ Fill.Ounces      : num [1:2571] 24 24 24.1 24 24.3 ...
##  $ PC.Volume        : num [1:2571] 0.263 0.239 0.263 0.293 0.111 ...
##  $ Carb.Pressure    : num [1:2571] 68.2 68.4 70.8 63 67.2 66.6 64.2 67.6 64.2 72 ...
##  $ Carb.Temp        : num [1:2571] 141 140 145 133 137 ...
##  $ PSC              : num [1:2571] 0.104 0.124 0.09 NA 0.026 0.09 0.128 0.154 0.132 0.014 ...
##  $ PSC.Fill         : num [1:2571] 0.26 0.22 0.34 0.42 0.16 ...
##  $ PSC.CO2          : num [1:2571] 0.04 0.04 0.16 0.04 0.12 ...
##  $ Mnf.Flow         : num [1:2571] -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 ...
##  $ Carb.Pressure1   : num [1:2571] 119 122 120 115 118 ...
##  $ Fill.Pressure    : num [1:2571] 46 46 46 46.4 45.8 45.6 51.8 46.8 46 45.2 ...
##  $ Hyd.Pressure1    : num [1:2571] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Hyd.Pressure2    : num [1:2571] NA NA NA 0 0 0 0 0 0 0 ...
##  $ Hyd.Pressure3    : num [1:2571] NA NA NA 0 0 0 0 0 0 0 ...
##  $ Hyd.Pressure4    : num [1:2571] 118 106 82 92 92 116 124 132 90 108 ...
##  $ Filler.Level     : num [1:2571] 121 119 120 118 119 ...
##  $ Filler.Speed     : num [1:2571] 4002 3986 4020 4012 4010 ...
##  $ Temperature      : num [1:2571] 66 67.6 67 65.6 65.6 66.2 65.8 65.2 65.4 66.6 ...
##  $ Usage.cont       : num [1:2571] 16.2 19.9 17.8 17.4 17.7 ...
##  $ Carb.Flow        : num [1:2571] 2932 3144 2914 3062 3054 ...
##  $ Density          : num [1:2571] 0.88 0.92 1.58 1.54 1.54 1.52 0.84 0.84 0.9 0.9 ...
##  $ MFR              : num [1:2571] 725 727 735 731 723 ...
##  $ Balling          : num [1:2571] 1.4 1.5 3.14 3.04 3.04 ...
##  $ Pressure.Vacuum  : num [1:2571] -4 -4 -3.8 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 ...
##  $ PH               : num [1:2571] 8.36 8.26 8.94 8.24 8.26 8.32 8.4 8.38 8.38 8.5 ...
##  $ Oxygen.Filler    : num [1:2571] 0.022 0.026 0.024 0.03 0.03 0.024 0.066 0.046 0.064 0.022 ...
##  $ Bowl.Setpoint    : num [1:2571] 120 120 120 120 120 120 120 120 120 120 ...
##  $ Pressure.Setpoint: num [1:2571] 46.4 46.8 46.6 46 46 46 46 46 46 46 ...
##  $ Air.Pressurer    : num [1:2571] 143 143 142 146 146 ...
##  $ Alch.Rel         : num [1:2571] 6.58 6.56 7.66 7.14 7.14 7.16 6.54 6.52 6.52 6.54 ...
##  $ Carb.Rel         : num [1:2571] 5.32 5.3 5.84 5.42 5.44 5.44 5.38 5.34 5.34 5.34 ...
##  $ Balling.Lvl      : num [1:2571] 1.48 1.56 3.28 3.04 3.04 3.02 1.44 1.44 1.44 1.38 ...

The raw data contain 2571 rows and 33 columns. No separate evaluation workbook was found in the project folder, so the code falls back to the main dataset when it needs a scoring example.

2.2 Outlier review

predictor_outlier_tbl <- student_data |>
  select(where(is.numeric), -PH) |>
  summarise(across(everything(), list(
    n_out = ~ {
      q1 <- quantile(., 0.25, na.rm = TRUE)
      q3 <- quantile(., 0.75, na.rm = TRUE)
      iqr <- q3 - q1
      lo <- q1 - 1.5 * iqr
      hi <- q3 + 1.5 * iqr
      sum(. < lo | . > hi, na.rm = TRUE)
    },
    min = ~ min(., na.rm = TRUE),
    max = ~ max(., na.rm = TRUE)
  ), .names = "{.col}__{.fn}")) |>
  pivot_longer(everything(), names_to = c("variable", "stat"), names_sep = "__") |>
  pivot_wider(names_from = stat, values_from = value) |>
  mutate(outlier_share = round(100 * n_out / nrow(student_data), 1)) |>
  arrange(desc(n_out), desc(abs(max)), desc(abs(min)))

mnf_flow_flag_tbl <- tibble(
  suspicious_value = c(-100, -100.2),
  count = c(
    sum(student_data$Mnf.Flow == -100, na.rm = TRUE),
    sum(student_data$Mnf.Flow == -100.2, na.rm = TRUE)
  )
) |>
  mutate(share = round(100 * count / nrow(student_data), 1))

predictor_outlier_tbl |> head(10) |> kable(digits = 2)

variable	n_out	min	max	outlier_share
Filler.Speed	439	998.00	4030.00	17.1
MFR	238	31.40	868.60	9.3
Air.Pressurer	220	140.80	148.20	8.6
Oxygen.Filler	189	0.00	0.40	7.4
Pressure.Vacuum	125	-6.60	-3.60	4.9
Temperature	123	63.60	76.20	4.8
PC.Volume	86	0.08	0.48	3.3
Hyd.Pressure4	83	52.00	142.00	3.2
Fill.Pressure	79	34.60	60.40	3.1
PSC.CO2	77	0.00	0.24	3.0

mnf_flow_flag_tbl |> kable(digits = 2)

suspicious_value	count	share
-100.0	606	23.6
-100.2	578	22.5

The predictor review does not show a broad outlier problem across the entire file. Most columns behave like ordinary process measurements, and the standard IQR rule only flags a subset of variables that are already expected to move with machine settings and operating state. The biggest practical concern is Mnf Flow, not because it has the largest IQR outlier count, but because it contains repeated negative codes that do not look like ordinary flow readings. Carb Flow also shows a wide spread and was checked separately with Box-Cox, but it does not show the same repeated-code pattern. In plain English: the data have some extreme values, but the only values that look potentially erroneous enough to test separately are the Mnf Flow codes -100 and -100.2.

2.3 Missingness and near-zero variance

missing_counts <- student_data |>
  summarise(across(everything(), ~ sum(is.na(.)))) |>
  pivot_longer(everything(), names_to = "variable", values_to = "n_missing") |>
  arrange(desc(n_missing))

nzv_vars <- tibble(variable = names(student_data)[nearZeroVar(student_data)])
if (nrow(nzv_vars) == 0) nzv_vars <- tibble(variable = "None")

missing_counts |> head(12) |> kable()

variable	n_missing
MFR	212
Brand.Code	120
Filler.Speed	57
PC.Volume	39
PSC.CO2	39
Fill.Ounces	38
PSC	33
Carb.Pressure1	32
Hyd.Pressure4	30
Carb.Pressure	27
Carb.Temp	26
PSC.Fill	23

nzv_vars |> kable()

variable
Hyd.Pressure1

ggplot(student_data, aes(x = "", y = PH)) +
  geom_boxplot(width = 0.35, fill = "grey92", color = "grey25", outlier.size = 0.8) +
  labs(
    title = "Response distribution before cleaning",
    x = NULL,
    y = "PH"
  )

The missingness table shows why a full complete-case drop would be too aggressive. 2038 rows are complete, which means 533 rows would be lost if we excluded every observation with any missing value. In a production setting, every row represents process history that took time and money to collect, so we prefer to preserve that signal whenever we can do so responsibly.

2.4 Why imputation won

complete_data <- student_data |> filter(if_all(everything(), ~ !is.na(.)))

imputation_benchmark <- function(df, iter = 5) {
  numeric_idx <- 2:ncol(df)
  nrmse_values <- matrix(NA_real_, nrow = iter, ncol = 6)
  brand_values <- matrix(NA_real_, nrow = iter, ncol = 3)
  colnames(nrmse_values) <- c("mean", "median", "mode", "knn", "mice", "forest")
  colnames(brand_values) <- c("mode", "mice", "forest")

  for (i in seq_len(iter)) {
    missing_data <- df |>
      mutate(across(everything(), function(x) {
        idx <- sample(seq_along(x), size = floor(0.05 * length(x)))
        x[idx] <- NA
        x
      }))

    missing_rows <- lapply(missing_data, function(x) which(is.na(x)))

    mean_impute <- missing_data |>
      mutate(
        across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)),
        Brand.Code = factor(
          ifelse(is.na(Brand.Code), mode_value(Brand.Code), as.character(Brand.Code)),
          levels = levels(df$Brand.Code)
        )
      )

    median_impute <- missing_data |>
      mutate(
        across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)),
        Brand.Code = factor(
          ifelse(is.na(Brand.Code), mode_value(Brand.Code), as.character(Brand.Code)),
          levels = levels(df$Brand.Code)
        )
      )

    mode_impute <- missing_data |>
      mutate(
        across(where(is.numeric), ~ ifelse(is.na(.), as.numeric(mode_value(.)), .)),
        Brand.Code = factor(
          ifelse(is.na(Brand.Code), mode_value(Brand.Code), as.character(Brand.Code)),
          levels = levels(df$Brand.Code)
        )
      )

    knn_impute <- bind_cols(
      Brand.Code = missing_data$Brand.Code,
      as.data.frame(impute.knn(as.matrix(missing_data |> select(-Brand.Code)))$data)
    )

    mice_impute <- complete(
      mice(missing_data |> mutate(Brand.Code = as.factor(Brand.Code)), printFlag = FALSE),
      1
    )

    forest_impute <- missForest(
      as.data.frame(missing_data |> mutate(Brand.Code = as.factor(Brand.Code)))
    )$ximp

    for (j in numeric_idx) {
      actual_values <- df[[j]][missing_rows[[j]]]
      nrmse_values[i, "mean"] <- ifelse(is.na(nrmse_values[i, "mean"]), nrmse_calc(actual_values, mean_impute[[j]][missing_rows[[j]]]), nrmse_values[i, "mean"])
      nrmse_values[i, "median"] <- ifelse(is.na(nrmse_values[i, "median"]), nrmse_calc(actual_values, median_impute[[j]][missing_rows[[j]]]), nrmse_values[i, "median"])
      nrmse_values[i, "mode"] <- ifelse(is.na(nrmse_values[i, "mode"]), nrmse_calc(actual_values, mode_impute[[j]][missing_rows[[j]]]), nrmse_values[i, "mode"])
      nrmse_values[i, "knn"] <- ifelse(is.na(nrmse_values[i, "knn"]), nrmse_calc(actual_values, knn_impute[[j]][missing_rows[[j]]]), nrmse_values[i, "knn"])
      nrmse_values[i, "mice"] <- ifelse(is.na(nrmse_values[i, "mice"]), nrmse_calc(actual_values, mice_impute[[j]][missing_rows[[j]]]), nrmse_values[i, "mice"])
      nrmse_values[i, "forest"] <- ifelse(is.na(nrmse_values[i, "forest"]), nrmse_calc(actual_values, forest_impute[[j]][missing_rows[[j]]]), nrmse_values[i, "forest"])
    }

    actual_brand <- as.character(df[[1]][missing_rows[[1]]])
    brand_values[i, "mode"] <- mean(actual_brand == as.character(mode_impute[[1]][missing_rows[[1]]]))
    brand_values[i, "mice"] <- mean(actual_brand == as.character(mice_impute[[1]][missing_rows[[1]]]))
    brand_values[i, "forest"] <- mean(actual_brand == as.character(forest_impute[[1]][missing_rows[[1]]]))
  }

  nrmse_tbl <- tibble(
    method = colnames(nrmse_values),
    nrmse = colMeans(nrmse_values, na.rm = TRUE)
  ) |> arrange(nrmse)

  brand_tbl <- tibble(
    method = colnames(brand_values),
    accuracy = colMeans(brand_values, na.rm = TRUE)
  ) |> arrange(desc(accuracy))

  list(nrmse = nrmse_tbl, brand = brand_tbl)
}

imputation_results <- imputation_benchmark(complete_data, iter = 5)

## Cluster size 2038 broken into 1913 125 
## Cluster size 1913 broken into 538 1375 
## Done cluster 538 
## Done cluster 1375 
## Done cluster 1913 
## Done cluster 125 
## Imputation sequence (missing proportion):  Brand.Code (0.049558390578999) Carb.Volume (0.049558390578999) Fill.Ounces (0.049558390578999) PC.Volume (0.049558390578999) Carb.Pressure (0.049558390578999) Carb.Temp (0.049558390578999) PSC (0.049558390578999) PSC.Fill (0.049558390578999) PSC.CO2 (0.049558390578999) Mnf.Flow (0.049558390578999) Carb.Pressure1 (0.049558390578999) Fill.Pressure (0.049558390578999) Hyd.Pressure1 (0.049558390578999) Hyd.Pressure2 (0.049558390578999) Hyd.Pressure3 (0.049558390578999) Hyd.Pressure4 (0.049558390578999) Filler.Level (0.049558390578999) Filler.Speed (0.049558390578999) Temperature (0.049558390578999) Usage.cont (0.049558390578999) Carb.Flow (0.049558390578999) Density (0.049558390578999) MFR (0.049558390578999) Balling (0.049558390578999) Pressure.Vacuum (0.049558390578999) PH (0.049558390578999) Oxygen.Filler (0.049558390578999) Bowl.Setpoint (0.049558390578999) Pressure.Setpoint (0.049558390578999) Air.Pressurer (0.049558390578999) Alch.Rel (0.049558390578999) Carb.Rel (0.049558390578999) Balling.Lvl (0.049558390578999) 
##   missForest iteration 1 in progress...done!
##     OOB errors MSE:            0.0220489708, 0.0018933281, 0.0059714057, 0.0016801838, 2.2053445498, 2.9213006236, 0.0021246546, 0.0125009269, 0.0016506376, 661.3480458945, 10.671351772, 1.7386230921, 21.8365831269, 10.6946090352, 6.3937934191, 34.1215687953, 18.8747964642, 14962.9874266325, 0.5508271496, 4.2355502434, 100700.362148042, 0.0025523335, 748.5856739485, 0.0045802997, 0.0586634749, 0.0103606782, 0.0008586879, 9.3889697099, 0.428208204, 0.5977747011, 0.0020270984, 0.0017181531, 0.0024779845
##     OOB errors NMSE:           0.1374280046, 0.1804556835, 0.8404967995, 0.5571637905, 0.1894071453, 0.1861621636, 0.9233983539, 0.8877087081, 0.9446077939, 0.0442583844, 0.5386857282, 0.215262847, 0.1429993517, 0.0410347098, 0.0261606294, 0.3039710372, 0.0775005607, 0.1234573774, 0.57216595, 0.5065750281, 0.1235857165, 0.0176858776, 0.1707023405, 0.0052333348, 0.1779297365, 0.3571888591, 0.5012949596, 0.0400135332, 0.1030323554, 0.3809732839, 0.0079223327, 0.1109166683, 0.0032697625
##     diff. convergence measure: 0.7139803392
##     time:                      22.26 seconds
## 
##   missForest iteration 2 in progress...done!
##     OOB errors MSE:            0.017106729, 0.0017906205, 0.0058651622, 0.001630099, 1.8779843387, 3.0534755341, 0.0021086787, 0.0124619318, 0.0016522125, 538.6333605743, 10.0717429595, 1.5830886544, 19.685785489, 9.3948838934, 6.0025065903, 33.0261273429, 14.9255808813, 14648.1322513655, 0.5258289474, 4.1905473725, 89100.9665703089, 0.0020467252, 742.1658995423, 0.0032319827, 0.0550844844, 0.0098371036, 0.0008597954, 9.1811778602, 0.4211080202, 0.591041048, 0.0018330058, 0.0017165574, 0.0024466016
##     OOB errors NMSE:           0.1066237356, 0.1706664809, 0.8255426367, 0.5405552082, 0.1612916461, 0.194585113, 0.9164550256, 0.8849396075, 0.9455090391, 0.0360461371, 0.5084177063, 0.1960057774, 0.1289146084, 0.0360477258, 0.0245596534, 0.2942123278, 0.0612849463, 0.1208595543, 0.5461993285, 0.5011926505, 0.1093502204, 0.0141823675, 0.1692384192, 0.0036927818, 0.1670744499, 0.3391383983, 0.5019414965, 0.0391279742, 0.1013239606, 0.3766817976, 0.0071637775, 0.1108136552, 0.0032283519
##     diff. convergence measure: 0.0089630984
##     time:                      21.11 seconds
## 
##   missForest iteration 3 in progress...done!
##     OOB errors MSE:            0.0171018334, 0.0017801178, 0.0058974609, 0.0016251967, 1.8806964106, 3.0242208772, 0.0020935528, 0.0123645131, 0.001644546, 529.5522013738, 10.106383153, 1.5716032118, 19.4236469582, 9.5743668397, 6.0056111622, 32.9750160307, 14.4909368769, 15506.6986225724, 0.5278798962, 4.2094014965, 88458.5075912977, 0.0020382543, 709.8390999503, 0.0031573626, 0.0559548701, 0.0099591524, 0.0008482133, 9.7342015549, 0.4216681567, 0.5855406392, 0.0019007478, 0.0017134779, 0.0024168358
##     OOB errors NMSE:           0.1065932222, 0.1696654523, 0.8300887915, 0.5389295697, 0.1615245738, 0.1927208371, 0.9098811463, 0.8780217687, 0.9411217414, 0.0354384126, 0.5101663299, 0.1945837388, 0.1271979644, 0.0367363934, 0.024572356, 0.2937570041, 0.0595002831, 0.1279434574, 0.5483297302, 0.5034476181, 0.108561755, 0.0141236697, 0.1618668376, 0.0036075228, 0.1697143802, 0.3433460851, 0.4951799896, 0.0414848283, 0.1014587366, 0.3731762816, 0.0074285276, 0.1106148539, 0.0031890752
##     diff. convergence measure: 0.0005725341
##     time:                      20.89 seconds
## 
##   missForest iteration 4 in progress...done!
##     OOB errors MSE:            0.0169134429, 0.0017784515, 0.0058801945, 0.0016224482, 1.7848695926, 3.1195830798, 0.0020967885, 0.0123090384, 0.0016437888, 522.2356164465, 10.0774346485, 1.563306558, 19.2984799136, 9.2478668753, 6.0325783935, 33.0513206663, 14.7667101461, 15755.6257522953, 0.5309432615, 4.2491760268, 89541.0676579173, 0.0019833297, 744.6201789175, 0.0031828728, 0.0557572254, 0.009795656, 0.0008492966, 9.8065118173, 0.422216931, 0.5919926751, 0.0018630822, 0.0016954077, 0.0024850476
##     OOB errors NMSE:           0.1054190117, 0.1695066354, 0.8276584868, 0.5380181348, 0.1532944385, 0.198797868, 0.9112874113, 0.8740824338, 0.9406884377, 0.034948776, 0.5087050205, 0.1935565114, 0.1263782937, 0.0354836284, 0.0246826942, 0.2944367618, 0.0606326176, 0.129997318, 0.5515117688, 0.5082046822, 0.1098903397, 0.0137430809, 0.1697980762, 0.0036366701, 0.1691149124, 0.3377094744, 0.4958123941, 0.0417929973, 0.1015907787, 0.3772882878, 0.0072813225, 0.1094483205, 0.0032790824
##     diff. convergence measure: -0.0001122343
##     time:                      20.8 seconds
## 
## Cluster size 2038 broken into 588 1450 
## Done cluster 588 
## Done cluster 1450 
## Imputation sequence (missing proportion):  Brand.Code (0.049558390578999) Carb.Volume (0.049558390578999) Fill.Ounces (0.049558390578999) PC.Volume (0.049558390578999) Carb.Pressure (0.049558390578999) Carb.Temp (0.049558390578999) PSC (0.049558390578999) PSC.Fill (0.049558390578999) PSC.CO2 (0.049558390578999) Mnf.Flow (0.049558390578999) Carb.Pressure1 (0.049558390578999) Fill.Pressure (0.049558390578999) Hyd.Pressure1 (0.049558390578999) Hyd.Pressure2 (0.049558390578999) Hyd.Pressure3 (0.049558390578999) Hyd.Pressure4 (0.049558390578999) Filler.Level (0.049558390578999) Filler.Speed (0.049558390578999) Temperature (0.049558390578999) Usage.cont (0.049558390578999) Carb.Flow (0.049558390578999) Density (0.049558390578999) MFR (0.049558390578999) Balling (0.049558390578999) Pressure.Vacuum (0.049558390578999) PH (0.049558390578999) Oxygen.Filler (0.049558390578999) Bowl.Setpoint (0.049558390578999) Pressure.Setpoint (0.049558390578999) Air.Pressurer (0.049558390578999) Alch.Rel (0.049558390578999) Carb.Rel (0.049558390578999) Balling.Lvl (0.049558390578999) 
##   missForest iteration 1 in progress...done!
##     OOB errors MSE:            0.0231514864, 0.0019294385, 0.0060197399, 0.0016952915, 2.2209738466, 2.9962795846, 0.0021094265, 0.0123515057, 0.0016736835, 724.0355451055, 10.5442463723, 1.7562479484, 22.1014749642, 10.7983763765, 6.6732693948, 34.9222120789, 19.0351207612, 22065.4510968499, 0.5574880513, 4.301394063, 93890.5603261959, 0.0026491476, 719.1616943522, 0.00447632, 0.0586821708, 0.0100287848, 0.0008519986, 9.1953631649, 0.4385685537, 0.6028764466, 0.0022355043, 0.0016904743, 0.0025374554
##     OOB errors NMSE:           0.1437080087, 0.1819377244, 0.839178937, 0.5562961965, 0.1903217178, 0.1907419281, 0.9133097153, 0.8872986497, 0.9521174886, 0.0485246408, 0.5388796131, 0.2157944179, 0.1468671423, 0.0417182982, 0.0271203822, 0.3139060437, 0.0785798664, 0.1827182044, 0.5778890526, 0.5137134911, 0.1142282318, 0.0183579263, 0.1771978085, 0.0050883565, 0.1799429365, 0.3439570403, 0.4949810727, 0.0392894696, 0.1049052311, 0.391510705, 0.0087106722, 0.110007713, 0.0033468486
##     diff. convergence measure: 0.7111471051
##     time:                      20.36 seconds
## 
##   missForest iteration 2 in progress...done!
##     OOB errors MSE:            0.0176596109, 0.0018471926, 0.0059594669, 0.0016293207, 1.9362517316, 2.9784251778, 0.0021091284, 0.0121933162, 0.001669535, 533.6457912514, 9.9810762061, 1.6414730623, 19.8972350294, 9.321554201, 6.3160055442, 33.5054178929, 14.9194602669, 14631.327648066, 0.534768659, 4.3206056293, 84004.0157973131, 0.0020935962, 700.2994907269, 0.0033966562, 0.0561703625, 0.0095829597, 0.0008431904, 9.2252945428, 0.4333886228, 0.5945702997, 0.0019209841, 0.0016717213, 0.0024771162
##     OOB errors NMSE:           0.1096183408, 0.1741822874, 0.8307766075, 0.5346484284, 0.1659230505, 0.1896053239, 0.9131806738, 0.8759347494, 0.949757483, 0.0357647777, 0.5100979524, 0.2016917511, 0.1322196845, 0.0360127638, 0.0256684504, 0.3011708752, 0.0615897955, 0.1211581809, 0.5543382553, 0.5160079195, 0.1022001589, 0.014508095, 0.1725502568, 0.0038610728, 0.1722407307, 0.3286665843, 0.4898638495, 0.0394173588, 0.1036661959, 0.3861166554, 0.0074851401, 0.1087873577, 0.0032672625
##     diff. convergence measure: 0.0109141655
##     time:                      20.73 seconds
## 
##   missForest iteration 3 in progress...done!
##     OOB errors MSE:            0.0173841207, 0.00183782, 0.0059660576, 0.0016287838, 1.8934968309, 2.9221344897, 0.0020994149, 0.0121201234, 0.0016754333, 529.2171612545, 10.0449723375, 1.6199098081, 19.4883467292, 9.3605263396, 6.3127216489, 33.4665590092, 14.418633053, 14161.7086938488, 0.5355232951, 4.3171820117, 81803.9437079376, 0.0020544626, 712.6524683953, 0.0035268384, 0.055995344, 0.0096189609, 0.0008487301, 9.8532799938, 0.4350913373, 0.5900224807, 0.0019652692, 0.0016533322, 0.0024688919
##     OOB errors NMSE:           0.1079082929, 0.1732984923, 0.8316953821, 0.5344722445, 0.1622592585, 0.1860218818, 0.9089750605, 0.870676773, 0.9531129254, 0.0354679723, 0.5133634605, 0.199042222, 0.1295025692, 0.0361633282, 0.0256551045, 0.3008215835, 0.0595223048, 0.1172693897, 0.555120507, 0.5155990384, 0.0995235283, 0.0142369092, 0.1755939681, 0.0040090546, 0.1717040542, 0.3299013163, 0.493082225, 0.0421005824, 0.1040734838, 0.3831632811, 0.0076576975, 0.1075906854, 0.0032564148
##     diff. convergence measure: 0.0006102145
##     time:                      20.84 seconds
## 
##   missForest iteration 4 in progress...done!
##     OOB errors MSE:            0.0176633478, 0.0018356686, 0.0059581957, 0.0016290046, 1.7925581248, 3.1173361114, 0.0021042694, 0.0121865207, 0.0016640616, 505.4875083687, 10.0512838066, 1.6309459247, 19.5428896838, 9.1453797553, 6.4614212026, 33.4385287686, 14.8035785388, 14462.4056817869, 0.535746934, 4.2965272774, 83672.5242202308, 0.0020821474, 698.6483425434, 0.0034518533, 0.0555246453, 0.0097070966, 0.0008388561, 9.4138526135, 0.4466296874, 0.5941804682, 0.001875169, 0.0016690362, 0.0025127783
##     OOB errors NMSE:           0.1096415364, 0.1730956281, 0.8305993996, 0.5345446937, 0.1536095268, 0.1984483369, 0.9110768749, 0.8754465762, 0.9466438394, 0.0338776182, 0.5136860177, 0.2003982563, 0.1298650142, 0.0353321339, 0.0262594243, 0.3005696274, 0.0611114182, 0.1197593825, 0.5553523298, 0.5131322531, 0.1017968628, 0.0144287577, 0.1721434223, 0.0039238169, 0.1702607042, 0.3329241046, 0.4873457416, 0.0402230199, 0.1068334475, 0.3858634971, 0.0073066209, 0.1086126244, 0.0033143
##     diff. convergence measure: -0.0001692671
##     time:                      21.66 seconds
## 
## Cluster size 2038 broken into 588 1450 
## Done cluster 588 
## Done cluster 1450 
## Imputation sequence (missing proportion):  Brand.Code (0.049558390578999) Carb.Volume (0.049558390578999) Fill.Ounces (0.049558390578999) PC.Volume (0.049558390578999) Carb.Pressure (0.049558390578999) Carb.Temp (0.049558390578999) PSC (0.049558390578999) PSC.Fill (0.049558390578999) PSC.CO2 (0.049558390578999) Mnf.Flow (0.049558390578999) Carb.Pressure1 (0.049558390578999) Fill.Pressure (0.049558390578999) Hyd.Pressure1 (0.049558390578999) Hyd.Pressure2 (0.049558390578999) Hyd.Pressure3 (0.049558390578999) Hyd.Pressure4 (0.049558390578999) Filler.Level (0.049558390578999) Filler.Speed (0.049558390578999) Temperature (0.049558390578999) Usage.cont (0.049558390578999) Carb.Flow (0.049558390578999) Density (0.049558390578999) MFR (0.049558390578999) Balling (0.049558390578999) Pressure.Vacuum (0.049558390578999) PH (0.049558390578999) Oxygen.Filler (0.049558390578999) Bowl.Setpoint (0.049558390578999) Pressure.Setpoint (0.049558390578999) Air.Pressurer (0.049558390578999) Alch.Rel (0.049558390578999) Carb.Rel (0.049558390578999) Balling.Lvl (0.049558390578999) 
##   missForest iteration 1 in progress...done!
##     OOB errors MSE:            0.0231514864, 0.0019294385, 0.0060197399, 0.0016952915, 2.2209738466, 2.9962795846, 0.0021094265, 0.0123515057, 0.0016736835, 724.0355451055, 10.5442463723, 1.7562479484, 22.1014749642, 10.7983763765, 6.6732693948, 34.9222120789, 19.0351207612, 22065.4510968499, 0.5574880513, 4.301394063, 93890.5603261959, 0.0026491476, 719.1616943522, 0.00447632, 0.0586821708, 0.0100287848, 0.0008519986, 9.1953631649, 0.4385685537, 0.6028764466, 0.0022355043, 0.0016904743, 0.0025374554
##     OOB errors NMSE:           0.1437080087, 0.1819377244, 0.839178937, 0.5562961965, 0.1903217178, 0.1907419281, 0.9133097153, 0.8872986497, 0.9521174886, 0.0485246408, 0.5388796131, 0.2157944179, 0.1468671423, 0.0417182982, 0.0271203822, 0.3139060437, 0.0785798664, 0.1827182044, 0.5778890526, 0.5137134911, 0.1142282318, 0.0183579263, 0.1771978085, 0.0050883565, 0.1799429365, 0.3439570403, 0.4949810727, 0.0392894696, 0.1049052311, 0.391510705, 0.0087106722, 0.110007713, 0.0033468486
##     diff. convergence measure: 0.7111471051
##     time:                      20.62 seconds
## 
##   missForest iteration 2 in progress...done!
##     OOB errors MSE:            0.0176596109, 0.0018471926, 0.0059594669, 0.0016293207, 1.9362517316, 2.9784251778, 0.0021091284, 0.0121933162, 0.001669535, 533.6457912514, 9.9810762061, 1.6414730623, 19.8972350294, 9.321554201, 6.3160055442, 33.5054178929, 14.9194602669, 14631.327648066, 0.534768659, 4.3206056293, 84004.0157973131, 0.0020935962, 700.2994907269, 0.0033966562, 0.0561703625, 0.0095829597, 0.0008431904, 9.2252945428, 0.4333886228, 0.5945702997, 0.0019209841, 0.0016717213, 0.0024771162
##     OOB errors NMSE:           0.1096183408, 0.1741822874, 0.8307766075, 0.5346484284, 0.1659230505, 0.1896053239, 0.9131806738, 0.8759347494, 0.949757483, 0.0357647777, 0.5100979524, 0.2016917511, 0.1322196845, 0.0360127638, 0.0256684504, 0.3011708752, 0.0615897955, 0.1211581809, 0.5543382553, 0.5160079195, 0.1022001589, 0.014508095, 0.1725502568, 0.0038610728, 0.1722407307, 0.3286665843, 0.4898638495, 0.0394173588, 0.1036661959, 0.3861166554, 0.0074851401, 0.1087873577, 0.0032672625
##     diff. convergence measure: 0.0109141655
##     time:                      20.92 seconds
## 
##   missForest iteration 3 in progress...done!
##     OOB errors MSE:            0.0173841207, 0.00183782, 0.0059660576, 0.0016287838, 1.8934968309, 2.9221344897, 0.0020994149, 0.0121201234, 0.0016754333, 529.2171612545, 10.0449723375, 1.6199098081, 19.4883467292, 9.3605263396, 6.3127216489, 33.4665590092, 14.418633053, 14161.7086938488, 0.5355232951, 4.3171820117, 81803.9437079376, 0.0020544626, 712.6524683953, 0.0035268384, 0.055995344, 0.0096189609, 0.0008487301, 9.8532799938, 0.4350913373, 0.5900224807, 0.0019652692, 0.0016533322, 0.0024688919
##     OOB errors NMSE:           0.1079082929, 0.1732984923, 0.8316953821, 0.5344722445, 0.1622592585, 0.1860218818, 0.9089750605, 0.870676773, 0.9531129254, 0.0354679723, 0.5133634605, 0.199042222, 0.1295025692, 0.0361633282, 0.0256551045, 0.3008215835, 0.0595223048, 0.1172693897, 0.555120507, 0.5155990384, 0.0995235283, 0.0142369092, 0.1755939681, 0.0040090546, 0.1717040542, 0.3299013163, 0.493082225, 0.0421005824, 0.1040734838, 0.3831632811, 0.0076576975, 0.1075906854, 0.0032564148
##     diff. convergence measure: 0.0006102145
##     time:                      20.72 seconds
## 
##   missForest iteration 4 in progress...done!
##     OOB errors MSE:            0.0176633478, 0.0018356686, 0.0059581957, 0.0016290046, 1.7925581248, 3.1173361114, 0.0021042694, 0.0121865207, 0.0016640616, 505.4875083687, 10.0512838066, 1.6309459247, 19.5428896838, 9.1453797553, 6.4614212026, 33.4385287686, 14.8035785388, 14462.4056817869, 0.535746934, 4.2965272774, 83672.5242202308, 0.0020821474, 698.6483425434, 0.0034518533, 0.0555246453, 0.0097070966, 0.0008388561, 9.4138526135, 0.4466296874, 0.5941804682, 0.001875169, 0.0016690362, 0.0025127783
##     OOB errors NMSE:           0.1096415364, 0.1730956281, 0.8305993996, 0.5345446937, 0.1536095268, 0.1984483369, 0.9110768749, 0.8754465762, 0.9466438394, 0.0338776182, 0.5136860177, 0.2003982563, 0.1298650142, 0.0353321339, 0.0262594243, 0.3005696274, 0.0611114182, 0.1197593825, 0.5553523298, 0.5131322531, 0.1017968628, 0.0144287577, 0.1721434223, 0.0039238169, 0.1702607042, 0.3329241046, 0.4873457416, 0.0402230199, 0.1068334475, 0.3858634971, 0.0073066209, 0.1086126244, 0.0033143
##     diff. convergence measure: -0.0001692671
##     time:                      20.86 seconds
## 
## Cluster size 2038 broken into 588 1450 
## Done cluster 588 
## Done cluster 1450 
## Imputation sequence (missing proportion):  Brand.Code (0.049558390578999) Carb.Volume (0.049558390578999) Fill.Ounces (0.049558390578999) PC.Volume (0.049558390578999) Carb.Pressure (0.049558390578999) Carb.Temp (0.049558390578999) PSC (0.049558390578999) PSC.Fill (0.049558390578999) PSC.CO2 (0.049558390578999) Mnf.Flow (0.049558390578999) Carb.Pressure1 (0.049558390578999) Fill.Pressure (0.049558390578999) Hyd.Pressure1 (0.049558390578999) Hyd.Pressure2 (0.049558390578999) Hyd.Pressure3 (0.049558390578999) Hyd.Pressure4 (0.049558390578999) Filler.Level (0.049558390578999) Filler.Speed (0.049558390578999) Temperature (0.049558390578999) Usage.cont (0.049558390578999) Carb.Flow (0.049558390578999) Density (0.049558390578999) MFR (0.049558390578999) Balling (0.049558390578999) Pressure.Vacuum (0.049558390578999) PH (0.049558390578999) Oxygen.Filler (0.049558390578999) Bowl.Setpoint (0.049558390578999) Pressure.Setpoint (0.049558390578999) Air.Pressurer (0.049558390578999) Alch.Rel (0.049558390578999) Carb.Rel (0.049558390578999) Balling.Lvl (0.049558390578999) 
##   missForest iteration 1 in progress...done!
##     OOB errors MSE:            0.0231514864, 0.0019294385, 0.0060197399, 0.0016952915, 2.2209738466, 2.9962795846, 0.0021094265, 0.0123515057, 0.0016736835, 724.0355451055, 10.5442463723, 1.7562479484, 22.1014749642, 10.7983763765, 6.6732693948, 34.9222120789, 19.0351207612, 22065.4510968499, 0.5574880513, 4.301394063, 93890.5603261959, 0.0026491476, 719.1616943522, 0.00447632, 0.0586821708, 0.0100287848, 0.0008519986, 9.1953631649, 0.4385685537, 0.6028764466, 0.0022355043, 0.0016904743, 0.0025374554
##     OOB errors NMSE:           0.1437080087, 0.1819377244, 0.839178937, 0.5562961965, 0.1903217178, 0.1907419281, 0.9133097153, 0.8872986497, 0.9521174886, 0.0485246408, 0.5388796131, 0.2157944179, 0.1468671423, 0.0417182982, 0.0271203822, 0.3139060437, 0.0785798664, 0.1827182044, 0.5778890526, 0.5137134911, 0.1142282318, 0.0183579263, 0.1771978085, 0.0050883565, 0.1799429365, 0.3439570403, 0.4949810727, 0.0392894696, 0.1049052311, 0.391510705, 0.0087106722, 0.110007713, 0.0033468486
##     diff. convergence measure: 0.7111471051
##     time:                      20.49 seconds
## 
##   missForest iteration 2 in progress...done!
##     OOB errors MSE:            0.0176596109, 0.0018471926, 0.0059594669, 0.0016293207, 1.9362517316, 2.9784251778, 0.0021091284, 0.0121933162, 0.001669535, 533.6457912514, 9.9810762061, 1.6414730623, 19.8972350294, 9.321554201, 6.3160055442, 33.5054178929, 14.9194602669, 14631.327648066, 0.534768659, 4.3206056293, 84004.0157973131, 0.0020935962, 700.2994907269, 0.0033966562, 0.0561703625, 0.0095829597, 0.0008431904, 9.2252945428, 0.4333886228, 0.5945702997, 0.0019209841, 0.0016717213, 0.0024771162
##     OOB errors NMSE:           0.1096183408, 0.1741822874, 0.8307766075, 0.5346484284, 0.1659230505, 0.1896053239, 0.9131806738, 0.8759347494, 0.949757483, 0.0357647777, 0.5100979524, 0.2016917511, 0.1322196845, 0.0360127638, 0.0256684504, 0.3011708752, 0.0615897955, 0.1211581809, 0.5543382553, 0.5160079195, 0.1022001589, 0.014508095, 0.1725502568, 0.0038610728, 0.1722407307, 0.3286665843, 0.4898638495, 0.0394173588, 0.1036661959, 0.3861166554, 0.0074851401, 0.1087873577, 0.0032672625
##     diff. convergence measure: 0.0109141655
##     time:                      20.62 seconds
## 
##   missForest iteration 3 in progress...done!
##     OOB errors MSE:            0.0173841207, 0.00183782, 0.0059660576, 0.0016287838, 1.8934968309, 2.9221344897, 0.0020994149, 0.0121201234, 0.0016754333, 529.2171612545, 10.0449723375, 1.6199098081, 19.4883467292, 9.3605263396, 6.3127216489, 33.4665590092, 14.418633053, 14161.7086938488, 0.5355232951, 4.3171820117, 81803.9437079376, 0.0020544626, 712.6524683953, 0.0035268384, 0.055995344, 0.0096189609, 0.0008487301, 9.8532799938, 0.4350913373, 0.5900224807, 0.0019652692, 0.0016533322, 0.0024688919
##     OOB errors NMSE:           0.1079082929, 0.1732984923, 0.8316953821, 0.5344722445, 0.1622592585, 0.1860218818, 0.9089750605, 0.870676773, 0.9531129254, 0.0354679723, 0.5133634605, 0.199042222, 0.1295025692, 0.0361633282, 0.0256551045, 0.3008215835, 0.0595223048, 0.1172693897, 0.555120507, 0.5155990384, 0.0995235283, 0.0142369092, 0.1755939681, 0.0040090546, 0.1717040542, 0.3299013163, 0.493082225, 0.0421005824, 0.1040734838, 0.3831632811, 0.0076576975, 0.1075906854, 0.0032564148
##     diff. convergence measure: 0.0006102145
##     time:                      20.64 seconds
## 
##   missForest iteration 4 in progress...done!
##     OOB errors MSE:            0.0176633478, 0.0018356686, 0.0059581957, 0.0016290046, 1.7925581248, 3.1173361114, 0.0021042694, 0.0121865207, 0.0016640616, 505.4875083687, 10.0512838066, 1.6309459247, 19.5428896838, 9.1453797553, 6.4614212026, 33.4385287686, 14.8035785388, 14462.4056817869, 0.535746934, 4.2965272774, 83672.5242202308, 0.0020821474, 698.6483425434, 0.0034518533, 0.0555246453, 0.0097070966, 0.0008388561, 9.4138526135, 0.4466296874, 0.5941804682, 0.001875169, 0.0016690362, 0.0025127783
##     OOB errors NMSE:           0.1096415364, 0.1730956281, 0.8305993996, 0.5345446937, 0.1536095268, 0.1984483369, 0.9110768749, 0.8754465762, 0.9466438394, 0.0338776182, 0.5136860177, 0.2003982563, 0.1298650142, 0.0353321339, 0.0262594243, 0.3005696274, 0.0611114182, 0.1197593825, 0.5553523298, 0.5131322531, 0.1017968628, 0.0144287577, 0.1721434223, 0.0039238169, 0.1702607042, 0.3329241046, 0.4873457416, 0.0402230199, 0.1068334475, 0.3858634971, 0.0073066209, 0.1086126244, 0.0033143
##     diff. convergence measure: -0.0001692671
##     time:                      20.83 seconds
## 
## Cluster size 2038 broken into 588 1450 
## Done cluster 588 
## Done cluster 1450 
## Imputation sequence (missing proportion):  Brand.Code (0.049558390578999) Carb.Volume (0.049558390578999) Fill.Ounces (0.049558390578999) PC.Volume (0.049558390578999) Carb.Pressure (0.049558390578999) Carb.Temp (0.049558390578999) PSC (0.049558390578999) PSC.Fill (0.049558390578999) PSC.CO2 (0.049558390578999) Mnf.Flow (0.049558390578999) Carb.Pressure1 (0.049558390578999) Fill.Pressure (0.049558390578999) Hyd.Pressure1 (0.049558390578999) Hyd.Pressure2 (0.049558390578999) Hyd.Pressure3 (0.049558390578999) Hyd.Pressure4 (0.049558390578999) Filler.Level (0.049558390578999) Filler.Speed (0.049558390578999) Temperature (0.049558390578999) Usage.cont (0.049558390578999) Carb.Flow (0.049558390578999) Density (0.049558390578999) MFR (0.049558390578999) Balling (0.049558390578999) Pressure.Vacuum (0.049558390578999) PH (0.049558390578999) Oxygen.Filler (0.049558390578999) Bowl.Setpoint (0.049558390578999) Pressure.Setpoint (0.049558390578999) Air.Pressurer (0.049558390578999) Alch.Rel (0.049558390578999) Carb.Rel (0.049558390578999) Balling.Lvl (0.049558390578999) 
##   missForest iteration 1 in progress...done!
##     OOB errors MSE:            0.0231514864, 0.0019294385, 0.0060197399, 0.0016952915, 2.2209738466, 2.9962795846, 0.0021094265, 0.0123515057, 0.0016736835, 724.0355451055, 10.5442463723, 1.7562479484, 22.1014749642, 10.7983763765, 6.6732693948, 34.9222120789, 19.0351207612, 22065.4510968499, 0.5574880513, 4.301394063, 93890.5603261959, 0.0026491476, 719.1616943522, 0.00447632, 0.0586821708, 0.0100287848, 0.0008519986, 9.1953631649, 0.4385685537, 0.6028764466, 0.0022355043, 0.0016904743, 0.0025374554
##     OOB errors NMSE:           0.1437080087, 0.1819377244, 0.839178937, 0.5562961965, 0.1903217178, 0.1907419281, 0.9133097153, 0.8872986497, 0.9521174886, 0.0485246408, 0.5388796131, 0.2157944179, 0.1468671423, 0.0417182982, 0.0271203822, 0.3139060437, 0.0785798664, 0.1827182044, 0.5778890526, 0.5137134911, 0.1142282318, 0.0183579263, 0.1771978085, 0.0050883565, 0.1799429365, 0.3439570403, 0.4949810727, 0.0392894696, 0.1049052311, 0.391510705, 0.0087106722, 0.110007713, 0.0033468486
##     diff. convergence measure: 0.7111471051
##     time:                      20.44 seconds
## 
##   missForest iteration 2 in progress...done!
##     OOB errors MSE:            0.0176596109, 0.0018471926, 0.0059594669, 0.0016293207, 1.9362517316, 2.9784251778, 0.0021091284, 0.0121933162, 0.001669535, 533.6457912514, 9.9810762061, 1.6414730623, 19.8972350294, 9.321554201, 6.3160055442, 33.5054178929, 14.9194602669, 14631.327648066, 0.534768659, 4.3206056293, 84004.0157973131, 0.0020935962, 700.2994907269, 0.0033966562, 0.0561703625, 0.0095829597, 0.0008431904, 9.2252945428, 0.4333886228, 0.5945702997, 0.0019209841, 0.0016717213, 0.0024771162
##     OOB errors NMSE:           0.1096183408, 0.1741822874, 0.8307766075, 0.5346484284, 0.1659230505, 0.1896053239, 0.9131806738, 0.8759347494, 0.949757483, 0.0357647777, 0.5100979524, 0.2016917511, 0.1322196845, 0.0360127638, 0.0256684504, 0.3011708752, 0.0615897955, 0.1211581809, 0.5543382553, 0.5160079195, 0.1022001589, 0.014508095, 0.1725502568, 0.0038610728, 0.1722407307, 0.3286665843, 0.4898638495, 0.0394173588, 0.1036661959, 0.3861166554, 0.0074851401, 0.1087873577, 0.0032672625
##     diff. convergence measure: 0.0109141655
##     time:                      20.88 seconds
## 
##   missForest iteration 3 in progress...done!
##     OOB errors MSE:            0.0173841207, 0.00183782, 0.0059660576, 0.0016287838, 1.8934968309, 2.9221344897, 0.0020994149, 0.0121201234, 0.0016754333, 529.2171612545, 10.0449723375, 1.6199098081, 19.4883467292, 9.3605263396, 6.3127216489, 33.4665590092, 14.418633053, 14161.7086938488, 0.5355232951, 4.3171820117, 81803.9437079376, 0.0020544626, 712.6524683953, 0.0035268384, 0.055995344, 0.0096189609, 0.0008487301, 9.8532799938, 0.4350913373, 0.5900224807, 0.0019652692, 0.0016533322, 0.0024688919
##     OOB errors NMSE:           0.1079082929, 0.1732984923, 0.8316953821, 0.5344722445, 0.1622592585, 0.1860218818, 0.9089750605, 0.870676773, 0.9531129254, 0.0354679723, 0.5133634605, 0.199042222, 0.1295025692, 0.0361633282, 0.0256551045, 0.3008215835, 0.0595223048, 0.1172693897, 0.555120507, 0.5155990384, 0.0995235283, 0.0142369092, 0.1755939681, 0.0040090546, 0.1717040542, 0.3299013163, 0.493082225, 0.0421005824, 0.1040734838, 0.3831632811, 0.0076576975, 0.1075906854, 0.0032564148
##     diff. convergence measure: 0.0006102145
##     time:                      20.76 seconds
## 
##   missForest iteration 4 in progress...done!
##     OOB errors MSE:            0.0176633478, 0.0018356686, 0.0059581957, 0.0016290046, 1.7925581248, 3.1173361114, 0.0021042694, 0.0121865207, 0.0016640616, 505.4875083687, 10.0512838066, 1.6309459247, 19.5428896838, 9.1453797553, 6.4614212026, 33.4385287686, 14.8035785388, 14462.4056817869, 0.535746934, 4.2965272774, 83672.5242202308, 0.0020821474, 698.6483425434, 0.0034518533, 0.0555246453, 0.0097070966, 0.0008388561, 9.4138526135, 0.4466296874, 0.5941804682, 0.001875169, 0.0016690362, 0.0025127783
##     OOB errors NMSE:           0.1096415364, 0.1730956281, 0.8305993996, 0.5345446937, 0.1536095268, 0.1984483369, 0.9110768749, 0.8754465762, 0.9466438394, 0.0338776182, 0.5136860177, 0.2003982563, 0.1298650142, 0.0353321339, 0.0262594243, 0.3005696274, 0.0611114182, 0.1197593825, 0.5553523298, 0.5131322531, 0.1017968628, 0.0144287577, 0.1721434223, 0.0039238169, 0.1702607042, 0.3329241046, 0.4873457416, 0.0402230199, 0.1068334475, 0.3858634971, 0.0073066209, 0.1086126244, 0.0033143
##     diff. convergence measure: -0.0001692671
##     time:                      21.03 seconds

imputation_results$nrmse |> kable(digits = 4)

method	nrmse
mice	0.0784
forest	0.0883
knn	0.1882
mean	0.2295
median	0.2370
mode	0.2739

imputation_results$brand |> kable(digits = 4)

method	accuracy
forest	0.9624
mice	0.8535
mode	NaN

ggplot(imputation_results$nrmse, aes(x = reorder(method, nrmse), y = nrmse)) +
  geom_col(aes(fill = nrmse), color = NA, width = 0.65) +
  scale_fill_gradient(low = "darkgreen", high = "darkred") +
  coord_flip() +
  labs(
    title = "Imputation comparison by NRMSE",
    x = NULL,
    y = "Average NRMSE"
  )

The benchmark shows that forest-based imputation is the strongest recovery method on these data. KNN and MICE are reasonable alternatives, but they were not as consistently accurate in this experiment, so forest imputation becomes the foundation for the cleaned modeling set.

2.5 Final cleaning pipeline

### Get Exact Imputation Method for Both Training and Test Sets
student_data_updated = student_data |> filter(!is.na(PH))
student_data_predictors = student_data_updated |> select(-PH)
student_data_ph = student_data_updated |> select(PH)

imputation_model <- missForest(
  as.data.frame(student_data_predictors |> mutate(Brand.Code = as.factor(Brand.Code))))

## Imputation sequence (missing proportion):  Mnf.Flow (0) Density (0) Balling (0) Pressure.Vacuum (0) Air.Pressurer (0) Balling.Lvl (0.000389559797428905) Carb.Flow (0.000779119594857811) Bowl.Setpoint (0.000779119594857811) Usage.cont (0.00194779898714453) Alch.Rel (0.00272691858200234) Carb.Rel (0.00311647837943124) Carb.Volume (0.00389559797428905) Hyd.Pressure1 (0.00428515777171796) Oxygen.Filler (0.00428515777171796) Temperature (0.00467471756914686) Pressure.Setpoint (0.00467471756914686) Hyd.Pressure2 (0.00584339696143358) Hyd.Pressure3 (0.00584339696143358) Filler.Level (0.00623295675886249) Fill.Pressure (0.0070120763537203) PSC.Fill (0.00895987534086482) Carb.Temp (0.0101285547331515) Carb.Pressure (0.0105181145305804) Hyd.Pressure4 (0.0109076743280093) Carb.Pressure1 (0.012465913517725) PSC (0.0128554733151539) Fill.Ounces (0.0148032723022984) PC.Volume (0.0151928320997273) PSC.CO2 (0.0151928320997273) Filler.Speed (0.0210362290611609) Brand.Code (0.0467471756914686) MFR (0.0810284378652123) 
##   missForest iteration 1 in progress...done!
##     OOB errors MSE:            621.0370077231, 0.0029825319, 0.0056859805, 0.0614672408, 0.6313308105, 0.0085331535, 151572.630810374, 13.5584233471, 4.6616638946, 0.0115684666, 0.002816241, 0.0024145787, 32.9018289049, 0.0010671208, 0.961541945, 0.6485090165, 10.3782430288, 7.1001252058, 23.9910699297, 3.190527561, 0.0124559855, 3.382566403, 2.0077623136, 58.5779861513, 10.784407184, 0.0022116049, 0.0064545142, 0.0019735777, 0.0017581789, 34590.6837907357, 0.020904566, 964.164402375
##     OOB errors NMSE:           0.043505155, 0.0209963841, 0.0065831216, 0.1890187631, 0.4294462589, 0.0113070795, 0.1323351123, 0.0580245735, 0.5264868123, 0.0453338305, 0.1699253867, 0.2133750013, 0.2129282671, 0.5254764002, 0.5057929954, 0.1560831868, 0.0387031312, 0.0278451775, 0.0973885232, 0.3161180588, 0.8981313898, 0.2079370628, 0.1604051287, 0.341600717, 0.4827861692, 0.9121995153, 0.8440219656, 0.5376404255, 0.9484284922, 0.0584211399, 0.1281818087, 0.176630715
##     diff. convergence measure: 0.6773859145
##     time:                      26.58 seconds
## 
##   missForest iteration 2 in progress...done!
##     OOB errors MSE:            633.8502553524, 0.0029292733, 0.0054220633, 0.0609500755, 0.6302748279, 0.0102124998, 136474.715684955, 12.0629008777, 4.6743668242, 0.0111590671, 0.0027777531, 0.0023682077, 31.861534849, 0.001062578, 0.9437449332, 0.6450062706, 9.908001165, 6.7399608614, 24.1763566575, 3.2409765134, 0.0123001285, 2.8823333824, 1.97976993, 56.4341672121, 10.852063966, 0.0022080017, 0.0064144227, 0.0019973047, 0.0017551331, 22887.1437214671, 0.0206453623, 956.9230778777
##     OOB errors NMSE:           0.0444027542, 0.0206214552, 0.0062775633, 0.1874284209, 0.4287279544, 0.0135323415, 0.1191534166, 0.0516243416, 0.527921477, 0.0437294997, 0.1676031203, 0.2092772262, 0.206195875, 0.5232394234, 0.4964313613, 0.1552401458, 0.0369494786, 0.0264326897, 0.0981406698, 0.3211165503, 0.8868934129, 0.1771861557, 0.1581687475, 0.3290989201, 0.4858149642, 0.9107133374, 0.8387794127, 0.5441041358, 0.9467854965, 0.0386547151, 0.1265924336, 0.1753041359
##     diff. convergence measure: 0.0039814744
##     time:                      26.52 seconds
## 
##   missForest iteration 3 in progress...done!
##     OOB errors MSE:            634.9125112319, 0.0029818241, 0.0054347158, 0.061142584, 0.6319074392, 0.0101489654, 138890.851255719, 12.0280688644, 4.6755417573, 0.011492321, 0.0027865968, 0.0023944467, 31.7968476061, 0.0010698133, 0.9445289973, 0.6366975716, 9.7942842444, 6.7790974652, 24.3309477191, 3.2970937635, 0.0124125077, 2.990202591, 1.9512955395, 54.3366549551, 10.7262645856, 0.0022016034, 0.0064437625, 0.0019935158, 0.0017680073, 23368.3469352782, 0.0206552863, 959.66750011
##     OOB errors NMSE:           0.0444771679, 0.0209914014, 0.0062922122, 0.1880204066, 0.4298384955, 0.0134481536, 0.1212628976, 0.0514752746, 0.5280541736, 0.0450354354, 0.1681367276, 0.211595947, 0.2057772435, 0.5268022442, 0.4968437969, 0.1532404076, 0.036525399, 0.0265861751, 0.0987682114, 0.3266766578, 0.8949964529, 0.1838172174, 0.1558938576, 0.3168671631, 0.4801832962, 0.9080743066, 0.8426160249, 0.5430719526, 0.9537303182, 0.0394674322, 0.1266532853, 0.1758069021
##     diff. convergence measure: -0.0004709523
##     time:                      26.26 seconds

clean_data <- imputation_model$ximp |> as_tibble() |>
  bind_cols(student_data_ph)

numeric_corr <- clean_data |>
  select(where(is.numeric)) |>
  cor(use = "pairwise.complete.obs")

high_corr <- findCorrelation(numeric_corr, cutoff = 0.80, names = TRUE)
if (length(high_corr) == 0) high_corr <- character(0)

model_data <- clean_data
if (length(high_corr) > 0) {
  model_data <- model_data |> select(-all_of(high_corr))
}

clean_check <- model_data |>
  summarise(across(everything(), ~ sum(is.na(.))))

tibble(removed_predictors = if (length(high_corr) == 0) "None" else high_corr) |> kable()

removed_predictors
Hyd.Pressure3
Balling
Density
Balling.Lvl
Alch.Rel
Filler.Level
Filler.Speed
Carb.Temp

clean_check |> kable()

Brand.Code	Carb.Volume	Fill.Ounces	PC.Volume	Carb.Pressure	PSC	PSC.Fill	PSC.CO2	Mnf.Flow	Carb.Pressure1	Fill.Pressure	Hyd.Pressure1	Hyd.Pressure2	Hyd.Pressure4	Temperature	Usage.cont	Carb.Flow	MFR	Pressure.Vacuum	Oxygen.Filler	Bowl.Setpoint	Pressure.Setpoint	Air.Pressurer	Carb.Rel	PH
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0

ggplot(model_data, aes(x = "", y = PH)) +
  geom_boxplot(width = 0.35, fill = "grey90", color = "grey25", outlier.size = 0.7) +
  labs(
    title = "Response distribution after cleaning",
    x = NULL,
    y = "PH"
  )

The cleaned data keep the full set of observations after imputation and remove only the strongest redundant predictors. That gives us a stable starting point for modeling while avoiding unnecessary data loss and reducing the chance that highly correlated sensors will distort later fits.

2.6 Mnf Flow sensitivity check

mnf_suspicious_values <- c(-100, -100.2)

mnf_suspicious_summary <- tibble(
  suspicious_value = mnf_suspicious_values,
  count = c(
    sum(student_data$Mnf.Flow == -100, na.rm = TRUE),
    sum(student_data$Mnf.Flow == -100.2, na.rm = TRUE)
  )
) |>
  mutate(share = round(100 * count / nrow(student_data), 1))

mnf_sensitivity <- prepare_mnf_sensitivity(student_data, mnf_suspicious_values)

mnf_raw_data <- mnf_sensitivity$raw
mnf_recode_data <- mnf_sensitivity$recode
mnf_removed_predictors <- mnf_sensitivity$removed_predictors

mnf_suspicious_summary |> kable()

suspicious_value	count	share
-100.0	606	23.6
-100.2	578	22.5

tibble(
  removed_predictors = if (length(mnf_removed_predictors) == 0) "None" else mnf_removed_predictors
) |> kable()

removed_predictors
Hyd.Pressure3
Balling
Density
Balling.Lvl
Alch.Rel
Filler.Level
Filler.Speed
Carb.Temp

The negative Mnf Flow values are not rare: there are 1184 rows with -100 or -100.2. That is enough to justify a sensitivity check, because these values could either be legitimate coded readings or placeholders for an abnormal sensor state. The comparison below keeps a suspicious-value flag in both versions, then runs the same tree-model workflow twice: once with the raw values left in place and once with them recoded as missing before imputation.

mnf_raw_fit <- fit_tree_sensitivity(mnf_raw_data)
mnf_recode_fit <- fit_tree_sensitivity(mnf_recode_data)

mnf_sensitivity_metrics <- bind_rows(
  mnf_raw_fit$metrics |> mutate(version = "Raw Mnf Flow"),
  mnf_recode_fit$metrics |> mutate(version = "Recoded as missing")
) |>
  select(version, model, rmse, mae, r2)

mnf_best_tbl <- tibble(
  version = c("Raw Mnf Flow", "Recoded as missing"),
  winner = c(mnf_raw_fit$best$model[[1]], mnf_recode_fit$best$model[[1]]),
  winner_rmse = c(mnf_raw_fit$best$rmse, mnf_recode_fit$best$rmse)
)

mnf_sensitivity_metrics |> kable(digits = 4)

version	model	rmse	mae	r2
Raw Mnf Flow	Random forest	0.0982	0.0715	0.6595
Raw Mnf Flow	Gradient boosting	0.1152	0.0899	0.5279
Recoded as missing	Random forest	0.0950	0.0704	0.6878
Recoded as missing	Gradient boosting	0.1127	0.0883	0.5493

mnf_best_tbl |> kable(digits = 4)

version	winner	winner_rmse
Raw Mnf Flow	Random forest	0.0982
Recoded as missing	Random forest	0.0950

ggplot(mnf_sensitivity_metrics, aes(x = model, y = rmse, fill = version)) +
  geom_col(position = position_dodge(width = 0.7), width = 0.6) +
  coord_flip() +
  labs(
    title = "Tree-model RMSE with Mnf Flow handled two ways",
    x = NULL,
    y = "RMSE",
    fill = NULL
  )

The sensitivity check shows that recoding -100 and -100.2 as missing barely changes the tree-model outcome. Random forest remains the better performer in both versions, and the RMSE/MAE values move only slightly. That tells us the odd Mnf Flow values are worth flagging, but they are not driving the main conclusion. In practical terms, the model is robust to how we handle those codes, so we can keep the main story focused on the broader process signal rather than on those individual anomalies.

2.7 Box Cox analysis

numeric_data <- model_data |> select(where(is.numeric))

positive_numeric_names <- names(Filter(function(x) min(x, na.rm = TRUE) > 0, numeric_data))
nonpositive_numeric_names <- setdiff(names(numeric_data), positive_numeric_names)

boxcox_tbl <- bind_rows(lapply(positive_numeric_names, function(v) {
  trans <- BoxCoxTrans(numeric_data[[v]])
  tibble(
    variable = v,
    lambda = unname(trans$lambda),
    transform_need = abs(unname(trans$lambda) - 1)
  )
})) |>
  arrange(desc(transform_need))

boxcox_ignored <- tibble(variable = nonpositive_numeric_names)
if (nrow(boxcox_ignored) == 0) boxcox_ignored <- tibble(variable = "None")

carb_flow_bc <- predict(BoxCoxTrans(model_data$Carb.Flow), model_data$Carb.Flow)

carb_flow_boxcox <- tibble(
  scale = rep(c("Raw Carb Flow", "Box-Cox transformed Carb Flow"), each = nrow(model_data)),
  value = c(model_data$Carb.Flow, carb_flow_bc)
)

carb_flow_qq <- carb_flow_boxcox |>
  mutate(scale = factor(scale, levels = c("Raw Carb Flow", "Box-Cox transformed Carb Flow")))

boxcox_tbl |> head(12) |> kable(digits = 4)

variable	lambda	transform_need
Carb.Volume	-2.0	3.0
Temperature	-2.0	3.0
Pressure.Setpoint	-2.0	3.0
Air.Pressurer	-2.0	3.0
Carb.Rel	-2.0	3.0
Fill.Pressure	-0.7	1.7
Hyd.Pressure4	-0.3	1.3
Carb.Pressure	-0.2	1.2
Fill.Ounces	2.0	1.0
Usage.cont	2.0	1.0
MFR	2.0	1.0
Bowl.Setpoint	2.0	1.0

boxcox_ignored |> kable()

variable
PSC.Fill
PSC.CO2
Mnf.Flow
Hyd.Pressure1
Hyd.Pressure2
Pressure.Vacuum

ggplot(boxcox_tbl |> slice_head(n = 12), aes(x = reorder(variable, transform_need), y = lambda, fill = transform_need)) +
  geom_col(color = NA, width = 0.65) +
  scale_fill_gradient(low = "darkgreen", high = "darkred") +
  geom_hline(yintercept = 1, linetype = "dashed", linewidth = 0.3, color = "grey50") +
  coord_flip() +
  labs(
    title = "Box-Cox lambda profile by predictor",
    x = NULL,
    y = "Lambda"
  )

The Box-Cox check helps us see which predictors are naturally skewed and would benefit from a power transform before modeling. Some variables include zeros or negative values, so they cannot be transformed directly without a shift. For that reason, we treat Box-Cox as a diagnostic step that informs the modeling strategy rather than as a one-size-fits-all cleanup rule.

ggplot(model_data, aes(x = Carb.Flow)) +
  geom_histogram(bins = 25, fill = "grey70", color = NA) +
  labs(
    title = "Raw Carb Flow distribution",
    x = "Carb Flow",
    y = "Count"
  )

ggplot(carb_flow_boxcox, aes(x = value)) +
  geom_histogram(bins = 25, fill = "grey70", color = NA) +
  facet_wrap(~scale, scales = "free_x") +
  labs(
    title = "Carb Flow before and after Box-Cox transformation",
    x = "Carb Flow",
    y = "Count"
  )

ggplot(carb_flow_qq, aes(sample = value)) +
  stat_qq(size = 0.45, alpha = 0.5) +
  stat_qq_line(color = "grey40", linewidth = 0.4) +
  facet_wrap(~scale, scales = "free") +
  labs(
    title = "QQ check before and after Box-Cox transformation",
    x = "Theoretical quantiles",
    y = "Sample quantiles"
  )

3 Modeling Journey

3.1 Split the cleaned data

set.seed(2026)
idx <- createDataPartition(model_data$PH, p = 0.80, list = FALSE)

train_data <- model_data[idx, ]
test_data <- model_data[-idx, ]

x_all <- model.matrix(PH ~ ., data = model_data)[, -1]
x_train <- x_all[idx, ]
x_test <- x_all[-idx, ]
y_train <- train_data$PH
y_test <- test_data$PH

ctrl <- trainControl(method = "cv", number = 5)

3.2 Baseline linear regression

lm_model <- lm(PH ~ ., data = train_data)
lm_pred <- predict(lm_model, newdata = test_data)
lm_metrics <- metric_row("Linear regression", y_test, lm_pred)
lm_adj_r2 <- summary(lm_model)$adj.r.squared

lm_coef_tbl <- as.data.frame(coef(summary(lm_model)))
lm_coef_tbl$term <- rownames(lm_coef_tbl)
lm_coef_tbl <- lm_coef_tbl[, c(ncol(lm_coef_tbl), 1:(ncol(lm_coef_tbl) - 1))]
lm_coef_tbl <- lm_coef_tbl[order(lm_coef_tbl$`Pr(>|t|)`), ]

lm_coef_tbl |> head(12) |> kable(digits = 4)

	term	Estimate	Std. Error	t value	Pr(>\|t\|)
Mnf.Flow	Mnf.Flow	-0.0006	0.0000	-12.9816	0.0000
(Intercept)	(Intercept)	10.8404	1.0023	10.8156	0.0000
Carb.Pressure1	Carb.Pressure1	0.0066	0.0008	8.1610	0.0000
Bowl.Setpoint	Bowl.Setpoint	0.0022	0.0003	7.1699	0.0000
Usage.cont	Usage.cont	-0.0079	0.0013	-6.1238	0.0000
Brand.CodeD	Brand.CodeD	0.0776	0.0138	5.6205	0.0000
Temperature	Temperature	-0.0135	0.0025	-5.3908	0.0000
Brand.CodeB	Brand.CodeB	0.0636	0.0124	5.1137	0.0000
Brand.CodeC	Brand.CodeC	-0.0608	0.0146	-4.1573	0.0000
Oxygen.Filler	Oxygen.Filler	-0.3373	0.0821	-4.1086	0.0000
Pressure.Setpoint	Pressure.Setpoint	-0.0077	0.0022	-3.4693	0.0005
Fill.Pressure	Fill.Pressure	0.0040	0.0013	2.9568	0.0031

We start with ordinary linear regression because leadership should be able to see a transparent baseline before we move to more flexible methods. This model gives us p-values, coefficient signs, and a quick read on which measurements matter most, but it also exposes the limits of a purely linear view when several sensors move together.

pred_lm <- tibble(
  model = "Linear regression",
  observed = y_test,
  predicted = lm_pred,
  residual = y_test - lm_pred
)
plot_obs_pred(pred_lm, "Linear regression: observed vs predicted")

3.3 Ridge regression

ridge_path <- glmnet(x_train, y_train, alpha = 0)
ridge_cv <- cv.glmnet(x_train, y_train, alpha = 0, nfolds = 5)
ridge_pred <- as.numeric(predict(ridge_cv, newx = x_test, s = "lambda.min"))
ridge_metrics <- metric_row("Ridge", y_test, ridge_pred)
ridge_coef_tbl <- top_abs_coef(coef(ridge_cv, s = "lambda.min"), n = 12)

op <- par(mar = c(5, 4, 6, 2) + 0.1)
plot(ridge_path, xvar = "lambda", label = FALSE, main = "", xlab = "log(lambda)")
title(main = "Ridge coefficient path", line = 3)

plot(ridge_cv, main = "", xlab = "log(lambda)", ylab = "Cross-validation error")
title(main = "Ridge cross-validation curve", line = 3)

par(op)

Ridge is our first safeguard against correlated sensors. It keeps every predictor in the model but shrinks the coefficients, which helps stabilize the fit when several process variables are telling us almost the same thing.

ridge_coef_tbl |> kable(digits = 4)

term	estimate
Oxygen.Filler	-0.2612
PC.Volume	-0.1240
Fill.Ounces	-0.0998
PSC.CO2	-0.0976
Carb.Rel	0.0747
Carb.Volume	-0.0705
Brand.CodeD	0.0648
Brand.CodeC	-0.0643
PSC	-0.0612
Brand.CodeB	0.0566
PSC.Fill	-0.0511
Temperature	-0.0137

3.4 Lasso regression

lasso_path <- glmnet(x_train, y_train, alpha = 1)
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1, nfolds = 5)
lasso_pred <- as.numeric(predict(lasso_cv, newx = x_test, s = "lambda.min"))
lasso_metrics <- metric_row("Lasso", y_test, lasso_pred)
lasso_coef_tbl <- top_abs_coef(coef(lasso_cv, s = "lambda.min"), n = 12)
lasso_nonzero <- sum(as.matrix(coef(lasso_cv, s = "lambda.min"))[, 1] != 0) - 1

op <- par(mar = c(5, 4, 6, 2) + 0.1)
plot(lasso_path, xvar = "lambda", label = FALSE, main = "", xlab = "log(lambda)")
title(main = "Lasso coefficient path", line = 3)

plot(lasso_cv, main = "", xlab = "log(lambda)", ylab = "Cross-validation error")
title(main = "Lasso cross-validation curve", line = 3)

par(op)

Lasso takes the same idea one step further by not only shrinking coefficients, but also setting some of them to zero. That gives us a more compact story about which process variables appear most important if we want a leaner model.

lasso_coef_tbl |> kable(digits = 4)

term	estimate
Oxygen.Filler	-0.3110
PC.Volume	-0.1215
Fill.Ounces	-0.0973
PSC.CO2	-0.0902
Brand.CodeD	0.0725
Carb.Rel	0.0637
Carb.Volume	-0.0634
Brand.CodeC	-0.0630
Brand.CodeB	0.0599
PSC.Fill	-0.0506
PSC	-0.0456
Temperature	-0.0132

3.5 Partial least squares

pls_grid <- data.frame(ncomp = 1:min(10, ncol(train_data) - 1))
pls_model <- train(
  PH ~ .,
  data = train_data,
  method = "pls",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  tuneGrid = pls_grid
)
pls_pred <- predict(pls_model, newdata = test_data)
pls_metrics <- metric_row("PLS", y_test, pls_pred)
pls_importance <- varImp(pls_model)$importance |>
  tibble::rownames_to_column("term") |>
  as_tibble() |>
  arrange(desc(Overall))

ggplot(as_tibble(pls_model$results), aes(ncomp, RMSE)) +
  geom_line() +
  geom_point() +
  labs(
    title = "PLS tuning curve",
    x = "Number of components",
    y = "RMSE"
  )

PLS gives us a different way to handle correlated process measurements by compressing them into response-aware components. It is useful as a middle ground between direct regression and fully nonlinear methods, so it serves as a good checkpoint in the journey.

pls_importance |> head(12) |> kable(digits = 4)

term	Overall
Mnf.Flow	100.0000
Brand.CodeC	78.3040
Bowl.Setpoint	72.3439
Usage.cont	67.6834
Pressure.Setpoint	54.9554
Fill.Pressure	48.1418
Pressure.Vacuum	45.2396
Temperature	43.3939
Hyd.Pressure2	42.4797
Brand.CodeB	39.4568
Brand.CodeD	36.6921
Oxygen.Filler	35.7157

3.6 Random forest

rf_model <- train(
  PH ~ .,
  data = train_data,
  method = "rf",
  trControl = ctrl,
  ntree = 300,
  tuneLength = 3,
  importance = TRUE
)
rf_pred <- predict(rf_model, newdata = test_data)
rf_metrics <- metric_row("Random forest", y_test, rf_pred)

rf_importance_tbl <- varImp(rf_model)$importance |>
  tibble::rownames_to_column("term") |>
  as_tibble() |>
  arrange(desc(Overall)) |>
  slice_head(n = 10)

ggplot(rf_importance_tbl, aes(x = reorder(term, Overall), y = Overall)) +
  geom_col(aes(fill = Overall), color = NA, width = 0.65) +
  scale_fill_gradient(low = "darkred", high = "darkgreen") +
  coord_flip() +
  labs(
    title = "Random forest variable importance",
    x = NULL,
    y = "Overall importance"
  )

Random forest moves us beyond straight-line relationships and into a model that can capture nonlinear interactions among process variables. In a manufacturing setting, that matters because pH is often shaped by combinations of pressures, flows, temperatures, and filler behavior rather than any single measurement on its own.

The importance ranking also gives us a useful process clue. Mnf Flow is acting like a broad proxy for the state of the line, not just a simple speed reading. It can reflect how long product stays in the system, how well it mixes, and whether the line is operating in a normal or unusual mode. That broad role makes it informative in a way that a single temperature or pressure column may not be. The carbonation variables already carry much of the chemistry tied to dissolved CO2, so some of the temperature and pressure signal is likely being absorbed there. On top of that, temperature and pressure are split across several fields, which dilutes the effect of any one column, while flow is concentrated in one place and can vary more visibly across runs.

3.7 Gradient boosting

gbm_model <- train(
  PH ~ .,
  data = train_data,
  method = "gbm",
  trControl = ctrl,
  verbose = FALSE,
  tuneLength = 3
)
gbm_pred <- predict(gbm_model, newdata = test_data)
gbm_metrics <- metric_row("Gradient boosting", y_test, gbm_pred)

gbm_importance_tbl <- varImp(gbm_model)$importance |>
  tibble::rownames_to_column("term") |>
  as_tibble() |>
  arrange(desc(Overall)) |>
  slice_head(n = 10)

ggplot(gbm_importance_tbl, aes(x = reorder(term, Overall), y = Overall)) +
  geom_col(aes(fill = Overall), color = NA, width = 0.65) +
  scale_fill_gradient(low = "darkred", high = "darkgreen") +
  coord_flip() +
  labs(
    title = "Gradient boosting variable importance",
    x = NULL,
    y = "Overall importance"
  )

Boosting is the most flexible model in this sequence, so it is a natural challenger when the process contains several weak nonlinear effects that add up. The tradeoff is that it asks us to give up some transparency in exchange for potential gains in predictive power.

In this run, gradient boosting improved on the linear baselines, but it did not beat random forest. That matters because GBM is not automatically better just because it is more complex. It depends heavily on tuning choices such as the number of trees, interaction depth, and learning rate. Here the model was only tuned lightly, so it may not have reached its best bias-variance tradeoff. Random forest, by contrast, is often more forgiving on moderately noisy process data because many trees are averaged together, which can reduce variance without requiring as much careful tuning.

The reason GBM never fully overtakes random forest is also consistent with the process story. If one variable, like Mnf Flow, is carrying a lot of broad operating information while temperature and pressure are partially folded into other carbonation measures, the model may benefit more from a robust ensemble that can average over that uneven signal. GBM can still work well, but it usually needs more deliberate tuning to separate subtle effects from the stronger process-state pattern that random forest is already capturing.

3.8 Neural network addendum

nn_grid <- expand.grid(
  size = c(1, 3, 5),
  decay = c(0, 0.001, 0.01)
)

nn_model <- train(
  x = x_train,
  y = y_train,
  method = "nnet",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  tuneGrid = nn_grid,
  trace = FALSE,
  MaxNWts = 5000,
  maxit = 500
)

nn_pred <- predict(nn_model, newdata = x_test)
nn_metrics <- metric_row("Neural network", y_test, nn_pred)

ggplot(as_tibble(nn_model$results), aes(x = size, y = decay, fill = RMSE)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(RMSE, 4)), color = "grey95", fontface = "bold", size = 3) +
  scale_fill_gradient(
    low = "darkgreen",
    high = "darkred",
    guide = guide_colorbar(
      barwidth = grid::unit(5.5, "cm"),
      barheight = grid::unit(0.4, "cm"),
      ticks = FALSE
    )
  ) +
  theme(legend.text = element_blank()) +
  labs(
    title = "Neural network tuning surface",
    x = "Hidden units",
    y = "Decay",
    fill = "RMSE"
  )

A neural network is worth trying here because the cleaned data set is large enough for a modest nonlinear model, but still small enough that a shallow network is the sensible scale. Using the matrix form of the predictors keeps the comparison aligned with ridge, lasso, and PLS while still letting the network learn nonlinear interactions.

The real question is whether that flexibility helps more than the structured splits in the tree-based models. If the neural net does not win, it still contributes useful evidence about the shape of the process signal: it would suggest that the data reward grouped rules and averaging more than a smooth nonlinear surface.

3.9 Non-linear Models: MARS

mars_grid = expand.grid(.degree = 1:3,
                        .nprune = 2:40)
set.seed(4802)
mars_model = train(x_train, y_train, method = "earth",
                   tuneGrid = mars_grid, trControl = trainControl(method = "cv"))
mars_model

## Multivariate Adaptive Regression Spline 
## 
## 2055 samples
##   26 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1849, 1849, 1850, 1849, 1850, 1849, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE       Rsquared   MAE       
##   1        2      0.1530804  0.2199326  0.11992367
##   1        3      0.1447556  0.3026516  0.11317537
##   1        4      0.1431707  0.3185217  0.11138224
##   1        5      0.1422096  0.3269543  0.11087623
##   1        6      0.1415654  0.3337418  0.11045948
##   1        7      0.1395315  0.3534574  0.10819975
##   1        8      0.1373283  0.3734254  0.10684295
##   1        9      0.1371869  0.3748793  0.10652854
##   1       10      0.1366040  0.3805543  0.10591282
##   1       11      0.1358162  0.3871270  0.10491793
##   1       12      0.1354421  0.3903183  0.10477593
##   1       13      0.1355102  0.3896758  0.10479063
##   1       14      0.1350368  0.3939793  0.10437076
##   1       15      0.1353007  0.3919157  0.10461137
##   1       16      0.1348352  0.3960488  0.10412560
##   1       17      0.1348041  0.3965046  0.10400771
##   1       18      0.1348532  0.3959228  0.10396929
##   1       19      0.1349107  0.3957335  0.10381036
##   1       20      0.1348068  0.3969842  0.10371997
##   1       21      0.1348806  0.3965845  0.10374351
##   1       22      0.1350050  0.3955238  0.10385199
##   1       23      0.1350981  0.3948238  0.10388658
##   1       24      0.1352442  0.3937749  0.10388120
##   1       25      0.1353387  0.3933090  0.10381274
##   1       26      0.1350132  0.3961041  0.10352188
##   1       27      0.1349634  0.3965455  0.10343309
##   1       28      0.1349084  0.3971027  0.10340304
##   1       29      0.1349451  0.3971491  0.10339319
##   1       30      0.1349447  0.3971752  0.10331820
##   1       31      0.1350587  0.3963988  0.10343699
##   1       32      0.1351117  0.3959191  0.10343229
##   1       33      0.1348912  0.3977610  0.10325484
##   1       34      0.1349092  0.3977948  0.10324660
##   1       35      0.1349072  0.3977686  0.10323973
##   1       36      0.1349072  0.3977686  0.10323973
##   1       37      0.1349072  0.3977686  0.10323973
##   1       38      0.1349072  0.3977686  0.10323973
##   1       39      0.1349072  0.3977686  0.10323973
##   1       40      0.1349072  0.3977686  0.10323973
##   2        2      0.1530804  0.2199326  0.11992367
##   2        3      0.1472295  0.2787374  0.11589472
##   2        4      0.1446334  0.3034280  0.11428082
##   2        5      0.1423643  0.3256869  0.11147829
##   2        6      0.1412510  0.3371803  0.11030493
##   2        7      0.1392800  0.3561903  0.10868920
##   2        8      0.1381175  0.3664563  0.10663767
##   2        9      0.1366320  0.3812187  0.10541640
##   2       10      0.1360049  0.3869177  0.10510501
##   2       11      0.1345853  0.4002541  0.10392173
##   2       12      0.1327328  0.4160324  0.10234370
##   2       13      0.1319401  0.4220520  0.10147932
##   2       14      0.1310271  0.4300779  0.10078022
##   2       15      0.1302503  0.4372052  0.10009143
##   2       16      0.1299984  0.4390567  0.09972055
##   2       17      0.1300561  0.4386617  0.09962475
##   2       18      0.1292271  0.4454814  0.09889966
##   2       19      0.1291958  0.4460462  0.09854813
##   2       20      0.1293103  0.4456901  0.09859703
##   2       21      0.1291066  0.4473245  0.09851171
##   2       22      0.1291824  0.4471135  0.09842115
##   2       23      0.1288751  0.4496328  0.09824632
##   2       24      0.1286738  0.4513078  0.09802570
##   2       25      0.1287770  0.4507290  0.09818558
##   2       26      0.1287136  0.4511480  0.09826289
##   2       27      0.1286456  0.4519299  0.09821330
##   2       28      0.1288934  0.4502725  0.09833212
##   2       29      0.1286258  0.4521189  0.09827586
##   2       30      0.1286226  0.4521221  0.09822689
##   2       31      0.1288090  0.4511722  0.09848295
##   2       32      0.1287120  0.4519608  0.09839870
##   2       33      0.1286208  0.4526833  0.09823894
##   2       34      0.1285134  0.4534777  0.09814826
##   2       35      0.1285418  0.4534861  0.09828997
##   2       36      0.1285876  0.4532308  0.09832435
##   2       37      0.1285876  0.4532308  0.09832435
##   2       38      0.1285280  0.4538101  0.09823408
##   2       39      0.1285280  0.4538101  0.09823408
##   2       40      0.1285259  0.4537433  0.09819948
##   3        2      0.1528737  0.2224256  0.11952946
##   3        3      0.1479996  0.2725603  0.11565283
##   3        4      0.1446025  0.3065619  0.11332085
##   3        5      0.1421860  0.3311396  0.11152093
##   3        6      0.1403264  0.3481586  0.10963260
##   3        7      0.1383726  0.3668156  0.10757688
##   3        8      0.1376644  0.3735815  0.10670812
##   3        9      0.1357488  0.3901300  0.10480473
##   3       10      0.1413520  0.3867188  0.10489436
##   3       11      0.1419360  0.3941660  0.10449480
##   3       12      0.1415904  0.4017852  0.10379752
##   3       13      0.1406109  0.4103714  0.10239061
##   3       14      0.1396718  0.4184280  0.10132080
##   3       15      0.1396462  0.4209129  0.10090530
##   3       16      0.1388287  0.4276349  0.10060519
##   3       17      0.1385654  0.4302713  0.10013700
##   3       18      0.1385349  0.4311069  0.09967114
##   3       19      0.1386348  0.4308935  0.09965815
##   3       20      0.1387075  0.4295826  0.09991349
##   3       21      0.1384601  0.4325544  0.09963313
##   3       22      0.1380869  0.4357017  0.09932260
##   3       23      0.1384123  0.4344252  0.09923021
##   3       24      0.1390915  0.4341468  0.09915279
##   3       25      0.1390596  0.4344183  0.09907787
##   3       26      0.1391192  0.4349731  0.09913553
##   3       27      0.1387637  0.4365060  0.09888459
##   3       28      0.1383958  0.4367571  0.09879473
##   3       29      0.1380311  0.4402865  0.09870461
##   3       30      0.1378934  0.4415655  0.09864432
##   3       31      0.1383598  0.4389247  0.09903042
##   3       32      0.1386127  0.4392583  0.09917142
##   3       33      0.1385429  0.4408078  0.09882112
##   3       34      0.1386128  0.4399771  0.09890200
##   3       35      0.1384344  0.4407022  0.09875892
##   3       36      0.1385096  0.4402435  0.09881147
##   3       37      0.1384239  0.4412483  0.09867646
##   3       38      0.1383911  0.4414845  0.09868745
##   3       39      0.1382946  0.4422462  0.09856212
##   3       40      0.1382559  0.4425589  0.09853185
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 34 and degree = 2.

mars_predict = predict(mars_model, x_test)
mars_values = data.frame(obs = y_test, pred = mars_predict) |>
  rename("pred" = y)
defaultSummary(mars_values)

##       RMSE   Rsquared        MAE 
## 0.12451112 0.47347262 0.09510244

mars_metrics <- metric_row("MARS", y_test, mars_predict)

3.10 Non-linear Models: SVM

set.seed(4802)
svm_model = train(x_train, y_train, method = "svmRadial",
                  preProcess = c("center", "scale"), tuneLength = 14,
                  trControl = trainControl(method = "cv"))
svm_model

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 2055 samples
##   26 predictor
## 
## Pre-processing: centered (26), scaled (26) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1849, 1849, 1850, 1849, 1850, 1849, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE       Rsquared   MAE       
##      0.25  0.1274390  0.4667806  0.09472354
##      0.50  0.1249602  0.4853215  0.09180308
##      1.00  0.1230379  0.4990139  0.08958601
##      2.00  0.1221784  0.5053651  0.08869118
##      4.00  0.1217612  0.5092987  0.08856583
##      8.00  0.1220978  0.5095259  0.08921912
##     16.00  0.1233072  0.5072926  0.09045709
##     32.00  0.1263891  0.4955804  0.09334229
##     64.00  0.1319430  0.4704690  0.09752783
##    128.00  0.1408356  0.4319201  0.10484662
##    256.00  0.1456818  0.4154403  0.10919258
##    512.00  0.1510843  0.3955517  0.11328195
##   1024.00  0.1514337  0.3946032  0.11357131
##   2048.00  0.1514337  0.3946032  0.11357131
## 
## Tuning parameter 'sigma' was held constant at a value of 0.02473474
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.02473474 and C = 4.

svm_predict = predict(svm_model, x_test)
svm_values = data.frame(obs = y_test, pred = svm_predict)
defaultSummary(svm_values)

##       RMSE   Rsquared        MAE 
## 0.11909427 0.51195037 0.08793649

svm_metrics <- metric_row("SVM", y_test, svm_predict)

3.11 Non-linear Models: KNN

set.seed(4802)
knn_model = train(x_train, y_train, method = "knn", preProc = c("center", "scale"),
                 tuneGrid = data.frame(.k = 1:20),
                 trControl = trainControl(method = "cv"))
knn_model

## k-Nearest Neighbors 
## 
## 2055 samples
##   26 predictor
## 
## Pre-processing: centered (26), scaled (26) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1849, 1849, 1850, 1849, 1850, 1849, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE       
##    1  0.1531553  0.3685101  0.10416892
##    2  0.1333942  0.4518124  0.09504035
##    3  0.1264984  0.4854762  0.09145719
##    4  0.1245714  0.4953773  0.09078836
##    5  0.1239387  0.4975112  0.09053362
##    6  0.1232905  0.5010337  0.09027578
##    7  0.1232877  0.4998761  0.09064188
##    8  0.1233476  0.4997777  0.09118220
##    9  0.1233922  0.4993114  0.09166419
##   10  0.1235735  0.4985427  0.09226155
##   11  0.1246036  0.4909626  0.09310106
##   12  0.1253195  0.4853954  0.09371691
##   13  0.1257149  0.4823119  0.09432147
##   14  0.1263369  0.4774003  0.09494671
##   15  0.1266877  0.4752064  0.09536990
##   16  0.1271108  0.4718543  0.09599436
##   17  0.1273332  0.4703405  0.09608894
##   18  0.1279337  0.4653327  0.09681282
##   19  0.1282629  0.4622493  0.09725985
##   20  0.1287082  0.4584443  0.09769344
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 7.

knn_predict = predict(knn_model, x_test)
knn_values = data.frame(obs = y_test, pred = knn_predict)
defaultSummary(knn_values)

##       RMSE   Rsquared        MAE 
## 0.12296905 0.48156846 0.09067453

knn_metrics <- metric_row("KNN", y_test, knn_predict)

3.12 Compare the models

model_metrics <- bind_rows(
  lm_metrics,
  ridge_metrics,
  lasso_metrics,
  pls_metrics,
  rf_metrics,
  gbm_metrics,
  nn_metrics,
  mars_metrics,
  svm_metrics,
  knn_metrics,
) |>
  mutate(
    adjusted_r2 = c(lm_adj_r2, rep(NA_real_, 9))
  ) |>
  select(model, rmse, mae, r2, adjusted_r2)

best_model <- model_metrics |> arrange(rmse) |> slice(1)
best_model_name <- best_model$model[[1]]

model_metrics |> arrange(rmse) |> kable(digits = 4)

model	rmse	mae	r2	adjusted_r2
Random forest	0.1059	0.0739	0.6142788	NA
SVM	0.1191	0.0879	0.5119504	NA
KNN	0.1230	0.0907	0.4815685	NA
MARS	0.1245	0.0951	0.4734726	NA
Gradient boosting	0.1258	0.0955	0.4567741	NA
Ridge	0.1365	0.1061	0.3579047	NA
Lasso	0.1366	0.1061	0.3569850	NA
Linear regression	0.1366	0.1060	0.3574204	0.3838
PLS	0.1366	0.1058	0.3573964	NA
Neural network	7.5521	7.5502	NA	NA

ggplot(model_metrics, aes(x = reorder(model, rmse), y = rmse, fill = rmse)) +
  geom_col(color = NA, width = 0.65) +
  scale_fill_gradient(low = "darkgreen", high = "darkred") +
  coord_flip() +
  labs(
    title = "Test-set RMSE by model",
    x = NULL,
    y = "RMSE"
  )

prediction_comparison <- bind_rows(
  tibble(model = "Linear regression", observed = y_test, predicted = lm_pred),
  tibble(model = "Ridge", observed = y_test, predicted = ridge_pred),
  tibble(model = "Lasso", observed = y_test, predicted = lasso_pred),
  tibble(model = "PLS", observed = y_test, predicted = pls_pred),
  tibble(model = "Random forest", observed = y_test, predicted = rf_pred),
  tibble(model = "Gradient boosting", observed = y_test, predicted = gbm_pred),
  tibble(model = "Neural network", observed = y_test, predicted = nn_pred),
  tibble(model = "MARS", observed = y_test, predicted = mars_predict),
  tibble(model = "SVM", observed = y_test, predicted = svm_predict),
  tibble(model = "KNN", observed = y_test, predicted = knn_predict)
) |>
  mutate(residual = observed - predicted)

ggplot(prediction_comparison, aes(x = model, y = residual)) +
  geom_boxplot(width = 0.55, fill = "grey92", color = "grey25", outlier.size = 0.7) +
  coord_flip() +
  geom_hline(yintercept = 0, linetype = 2, color = "gray45", linewidth = 0.35) +
  labs(
    title = "Residual spread by model",
    x = NULL,
    y = "Observed - predicted"
  )

ggplot(prediction_comparison, aes(x = observed, y = predicted)) +
  geom_point(alpha = 0.45, size = 0.9) +
  geom_abline(
    slope = 1,
    intercept = 0,
    linetype = 2,
    color = "gray45",
    linewidth = 0.35
  ) +
  facet_wrap(~ model, ncol = 2, scales = "free") +
  labs(
    title = "Observed vs predicted across candidate models",
    x = "Observed PH",
    y = "Predicted PH"
  )

The winner is Random forest, because it gives the lowest test RMSE in this run while still being backed up by a strong residual pattern and a more competitive R^2 than the simpler linear baselines. Ridge and lasso still matter because they show how shrinkage improves stability, but the selected tree-based model handles the process nonlinearity more effectively.

Gradient boosting is still an important checkpoint in the journey even though it does not win here. It gives a clear step up from ordinary regression, ridge, lasso, and PLS, which tells us the data do contain nonlinear structure. The fact that it still trails random forest suggests either that the boosting hyperparameters need more work or that the random forest averaging strategy fits this data better under the current cleaning pipeline. In other words, boosting was competitive, but not yet competitive enough to replace the forest model.

The Mnf Flow sensitivity check reinforces that interpretation. When the suspicious -100 and -100.2 values are recoded as missing and imputed, the ranking of the tree models does not change: random forest still wins, and gradient boosting stays in second place. That means the odd flow readings are not the reason the forest won. They are a data-quality concern worth noting, but the overall model comparison is stable whether we leave those values in or treat them as missing.

prediction_map <- list(
  "Linear regression" = lm_pred,
  "Ridge" = ridge_pred,
  "Lasso" = lasso_pred,
  "PLS" = pls_pred,
  "Random forest" = rf_pred,
  "Gradient boosting" = gbm_pred,
  "Neural network" = nn_pred,
  "MARS" = mars_predict,
  "SVM" = svm_predict,
  "KNN" = knn_predict
)

selected_predictions <- tibble(
  observed = y_test,
  predicted = prediction_map[[best_model_name]],
  residual = y_test - prediction_map[[best_model_name]]
)

write.csv(
  selected_predictions,
  "ABC_Beverage_Selected_Model_Predictions.csv",
  row.names = FALSE
)

3.13 Optional scoring set

if (any(file.exists(eval_paths))) {
  scoring_data <- evaluation_data |> select(any_of(names(model_data)))
  scoring_data <- scoring_data |> mutate(Brand.Code = factor(Brand.Code, levels = levels(model_data$Brand.Code)))
  head(scoring_data)
} else {
  head(model_data)
}

## # A tibble: 6 × 25
##   Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure   PSC PSC.Fill
##   <fct>            <dbl>       <dbl>     <dbl>         <dbl> <dbl>    <dbl>
## 1 B                 5.34        24.0     0.263          68.2 0.104    0.260
## 2 A                 5.43        24.0     0.239          68.4 0.124    0.220
## 3 B                 5.29        24.1     0.263          70.8 0.09     0.34 
## 4 A                 5.44        24.0     0.293          63   0.118    0.420
## 5 A                 5.49        24.3     0.111          67.2 0.026    0.16 
## 6 A                 5.38        23.9     0.269          66.6 0.09     0.240
## # ℹ 18 more variables: PSC.CO2 <dbl>, Mnf.Flow <dbl>, Carb.Pressure1 <dbl>,
## #   Fill.Pressure <dbl>, Hyd.Pressure1 <dbl>, Hyd.Pressure2 <dbl>,
## #   Hyd.Pressure4 <dbl>, Temperature <dbl>, Usage.cont <dbl>, Carb.Flow <dbl>,
## #   MFR <dbl>, Pressure.Vacuum <dbl>, Oxygen.Filler <dbl>, Bowl.Setpoint <dbl>,
## #   Pressure.Setpoint <dbl>, Air.Pressurer <dbl>, Carb.Rel <dbl>, PH <dbl>

This optional chunk keeps the report portable. If a separate evaluation workbook is available later, the same modeling workflow can score it directly without changing the rest of the document.

4 Using Model to Predict pH for Test Dataset

data_to_predict = read_excel("Test Data.xlsx") |> rename_with(make.names)
data_to_predict_predictors = data_to_predict |> select(-PH)

data_to_predict_updated = missForestPredict(imputation_model, 
  newdata = data_to_predict_predictors |> mutate(Brand.Code = as.factor(Brand.Code)))

data_to_predict_updated |>
  summarise(across(everything(), ~ sum(is.na(.))))

## # A tibble: 1 × 32
##   Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp   PSC
##        <int>       <int>       <int>     <int>         <int>     <int> <int>
## 1          0           0           0         0             0         0     0
## # ℹ 25 more variables: PSC.Fill <int>, PSC.CO2 <int>, Mnf.Flow <int>,
## #   Carb.Pressure1 <int>, Fill.Pressure <int>, Hyd.Pressure1 <int>,
## #   Hyd.Pressure2 <int>, Hyd.Pressure3 <int>, Hyd.Pressure4 <int>,
## #   Filler.Level <int>, Filler.Speed <int>, Temperature <int>,
## #   Usage.cont <int>, Carb.Flow <int>, Density <int>, MFR <int>, Balling <int>,
## #   Pressure.Vacuum <int>, Oxygen.Filler <int>, Bowl.Setpoint <int>,
## #   Pressure.Setpoint <int>, Air.Pressurer <int>, Alch.Rel <int>, …

data_to_predict_updated = data_to_predict_updated |> select(-all_of(high_corr))

predicted_ph = predict(rf_model, newdata = data_to_predict_updated)

final_data = data_to_predict_updated |>
  bind_cols(data.frame(predicted_ph) |> rename("PH" = predicted_ph))

final_data |> head()

## # A tibble: 6 × 25
##   Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure   PSC PSC.Fill
##   <fct>            <dbl>       <dbl>     <dbl>         <dbl> <dbl>    <dbl>
## 1 D                 5.48        24.0     0.27           65.4 0.236    0.400
## 2 A                 5.39        24.0     0.227          63.2 0.042    0.220
## 3 B                 5.29        23.9     0.303          66.4 0.068    0.100
## 4 B                 5.27        23.9     0.186          64.8 0.004    0.200
## 5 B                 5.41        24.2     0.16           69.4 0.04     0.3  
## 6 B                 5.29        24.1     0.212          73.4 0.078    0.220
## # ℹ 18 more variables: PSC.CO2 <dbl>, Mnf.Flow <dbl>, Carb.Pressure1 <dbl>,
## #   Fill.Pressure <dbl>, Hyd.Pressure1 <dbl>, Hyd.Pressure2 <dbl>,
## #   Hyd.Pressure4 <dbl>, Temperature <dbl>, Usage.cont <dbl>, Carb.Flow <dbl>,
## #   MFR <dbl>, Pressure.Vacuum <dbl>, Oxygen.Filler <dbl>, Bowl.Setpoint <dbl>,
## #   Pressure.Setpoint <dbl>, Air.Pressurer <dbl>, Carb.Rel <dbl>, PH <dbl>

ABC Beverage - Full Data & Modeling Journey

Randy Howk

2026-05-21