read_project_xlsx <- function(paths, fallback = NULL) {
for (path in paths) {
if (file.exists(path)) {
return(read_excel(path))
}
}
if (!is.null(fallback)) {
return(fallback)
}
stop("None of these files exist: ", paste(paths, collapse = ", "))
}
mode_value <- function(x) {
x <- x[!is.na(x)]
if (!length(x)) return(NA)
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
nrmse_calc <- function(actual, pred) {
actual <- as.numeric(actual)
pred <- as.numeric(pred)
rng <- diff(range(actual, na.rm = TRUE))
if (is.na(rng) || rng == 0) return(NA_real_)
sqrt(mean((actual - pred)^2, na.rm = TRUE)) / rng
}
metric_row <- function(model, obs, pred) {
tibble(
model = model,
rmse = caret::RMSE(pred, obs),
mae = caret::MAE(pred, obs),
r2 = caret::R2(pred, obs)
)
}
top_abs_coef <- function(coef_obj, n = 10) {
coef_df <- as.matrix(coef_obj)
tibble(
term = rownames(coef_df),
estimate = as.numeric(coef_df[, 1])
) |>
filter(term != "(Intercept)") |>
mutate(abs_estimate = abs(estimate)) |>
arrange(desc(abs_estimate)) |>
slice_head(n = n) |>
select(term, estimate)
}
plot_obs_pred <- function(df, title) {
ggplot(df, aes(x = observed, y = predicted)) +
geom_point(alpha = 0.5) +
geom_abline(
slope = 1,
intercept = 0,
linetype = 2,
linewidth = 0.35,
color = "gray45"
) +
labs(title = title, x = "Observed", y = "Predicted")
}
prepare_mnf_sensitivity <- function(df, suspicious_values = c(-100, -100.2)) {
base_df <- df |>
mutate(Mnf.Flow.Flag = as.integer(Mnf.Flow %in% suspicious_values))
raw_imputed <- missForest(
as.data.frame(base_df |> mutate(Brand.Code = as.factor(Brand.Code))),
verbose = FALSE
)$ximp |>
as_tibble()
recode_imputed <- missForest(
as.data.frame(
base_df |>
mutate(
Mnf.Flow = ifelse(Mnf.Flow %in% suspicious_values, NA, Mnf.Flow),
Brand.Code = as.factor(Brand.Code)
)
),
verbose = FALSE
)$ximp |>
as_tibble()
numeric_corr <- raw_imputed |>
select(where(is.numeric)) |>
cor(use = "pairwise.complete.obs")
high_corr <- findCorrelation(numeric_corr, cutoff = 0.80, names = TRUE)
high_corr <- setdiff(high_corr, c("Mnf.Flow", "Mnf.Flow.Flag"))
if (length(high_corr) > 0) {
raw_imputed <- raw_imputed |> select(-all_of(high_corr))
recode_imputed <- recode_imputed |> select(-all_of(high_corr))
}
list(
raw = raw_imputed,
recode = recode_imputed,
removed_predictors = high_corr
)
}
fit_tree_sensitivity <- function(model_data, seed = 2026) {
set.seed(seed)
idx <- createDataPartition(model_data$PH, p = 0.80, list = FALSE)
train_data <- model_data[idx, ]
test_data <- model_data[-idx, ]
ctrl <- trainControl(method = "cv", number = 5)
rf_model <- train(
PH ~ .,
data = train_data,
method = "rf",
trControl = ctrl,
ntree = 300,
tuneLength = 3,
importance = TRUE
)
rf_pred <- predict(rf_model, newdata = test_data)
gbm_model <- train(
PH ~ .,
data = train_data,
method = "gbm",
trControl = ctrl,
verbose = FALSE,
tuneLength = 3
)
gbm_pred <- predict(gbm_model, newdata = test_data)
metrics <- bind_rows(
metric_row("Random forest", test_data$PH, rf_pred),
metric_row("Gradient boosting", test_data$PH, gbm_pred)
)
list(
metrics = metrics,
best = metrics |> arrange(rmse) |> slice(1),
rf_model = rf_model,
gbm_model = gbm_model,
rf_pred = rf_pred,
gbm_pred = gbm_pred,
test_data = test_data
)
}
This report is written for ABC Beverage leadership and the production team. Our goal is to turn the historical process data into a practical pH prediction workflow that we can explain, defend, and use to guide manufacturing decisions. We treat the analysis as a sequence of decisions: first we clean the data carefully, then we test which transformations stabilize the measurements, then we compare models from simple to complex, and finally we select the approach that best balances accuracy, stability, and interpretability.
One theme emerges early and stays important throughout the analysis:
Mnf Flow behaves like a broader process-state variable. It
sits alongside direct process controls such as carbonation, temperature,
and pressure, but it may also be absorbing information about residence
time, mixing, line condition, and operating mode. That helps explain why
the model often treats it as a stronger signal than any single
temperature or pressure column. By contrast, many temperature and
pressure effects are already represented indirectly through carbonation
variables like Carb Volume and Carb Rel, while
the remaining temperature and pressure fields are spread across several
columns and are often tightly controlled. In short, flow can capture
more of the real operating variation that matters for pH.
#student_raw <- read_project_xlsx(c("Project2/StudentData.xlsx", "StudentData.xlsx"))
student_raw <- read_excel("Training Data.xlsx")
student_data <- student_raw |> rename_with(make.names)
eval_paths <- c(
"Project2/StudentEvaluation.xlsx",
"Project2/StudentEvaluation (1).xlsx",
"StudentEvaluation.xlsx",
"StudentEvaluation (1).xlsx"
)
evaluation_raw <- read_project_xlsx(eval_paths, fallback = student_raw)
evaluation_data <- evaluation_raw |> rename_with(make.names)
data_overview <- tibble(
rows = nrow(student_data),
cols = ncol(student_data),
complete_rows = sum(complete.cases(student_data)),
complete_share = round(100 * complete_rows / rows, 1)
)
data_overview |> kable()
| rows | cols | complete_rows | complete_share |
|---|---|---|---|
| 2571 | 33 | 2038 | 79.3 |
str(student_data)
## tibble [2,571 × 33] (S3: tbl_df/tbl/data.frame)
## $ Brand.Code : chr [1:2571] "B" "A" "B" "A" ...
## $ Carb.Volume : num [1:2571] 5.34 5.43 5.29 5.44 5.49 ...
## $ Fill.Ounces : num [1:2571] 24 24 24.1 24 24.3 ...
## $ PC.Volume : num [1:2571] 0.263 0.239 0.263 0.293 0.111 ...
## $ Carb.Pressure : num [1:2571] 68.2 68.4 70.8 63 67.2 66.6 64.2 67.6 64.2 72 ...
## $ Carb.Temp : num [1:2571] 141 140 145 133 137 ...
## $ PSC : num [1:2571] 0.104 0.124 0.09 NA 0.026 0.09 0.128 0.154 0.132 0.014 ...
## $ PSC.Fill : num [1:2571] 0.26 0.22 0.34 0.42 0.16 ...
## $ PSC.CO2 : num [1:2571] 0.04 0.04 0.16 0.04 0.12 ...
## $ Mnf.Flow : num [1:2571] -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 ...
## $ Carb.Pressure1 : num [1:2571] 119 122 120 115 118 ...
## $ Fill.Pressure : num [1:2571] 46 46 46 46.4 45.8 45.6 51.8 46.8 46 45.2 ...
## $ Hyd.Pressure1 : num [1:2571] 0 0 0 0 0 0 0 0 0 0 ...
## $ Hyd.Pressure2 : num [1:2571] NA NA NA 0 0 0 0 0 0 0 ...
## $ Hyd.Pressure3 : num [1:2571] NA NA NA 0 0 0 0 0 0 0 ...
## $ Hyd.Pressure4 : num [1:2571] 118 106 82 92 92 116 124 132 90 108 ...
## $ Filler.Level : num [1:2571] 121 119 120 118 119 ...
## $ Filler.Speed : num [1:2571] 4002 3986 4020 4012 4010 ...
## $ Temperature : num [1:2571] 66 67.6 67 65.6 65.6 66.2 65.8 65.2 65.4 66.6 ...
## $ Usage.cont : num [1:2571] 16.2 19.9 17.8 17.4 17.7 ...
## $ Carb.Flow : num [1:2571] 2932 3144 2914 3062 3054 ...
## $ Density : num [1:2571] 0.88 0.92 1.58 1.54 1.54 1.52 0.84 0.84 0.9 0.9 ...
## $ MFR : num [1:2571] 725 727 735 731 723 ...
## $ Balling : num [1:2571] 1.4 1.5 3.14 3.04 3.04 ...
## $ Pressure.Vacuum : num [1:2571] -4 -4 -3.8 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 ...
## $ PH : num [1:2571] 8.36 8.26 8.94 8.24 8.26 8.32 8.4 8.38 8.38 8.5 ...
## $ Oxygen.Filler : num [1:2571] 0.022 0.026 0.024 0.03 0.03 0.024 0.066 0.046 0.064 0.022 ...
## $ Bowl.Setpoint : num [1:2571] 120 120 120 120 120 120 120 120 120 120 ...
## $ Pressure.Setpoint: num [1:2571] 46.4 46.8 46.6 46 46 46 46 46 46 46 ...
## $ Air.Pressurer : num [1:2571] 143 143 142 146 146 ...
## $ Alch.Rel : num [1:2571] 6.58 6.56 7.66 7.14 7.14 7.16 6.54 6.52 6.52 6.54 ...
## $ Carb.Rel : num [1:2571] 5.32 5.3 5.84 5.42 5.44 5.44 5.38 5.34 5.34 5.34 ...
## $ Balling.Lvl : num [1:2571] 1.48 1.56 3.28 3.04 3.04 3.02 1.44 1.44 1.44 1.38 ...
The raw data contain 2571 rows and 33 columns. No separate evaluation workbook was found in the project folder, so the code falls back to the main dataset when it needs a scoring example.
predictor_outlier_tbl <- student_data |>
select(where(is.numeric), -PH) |>
summarise(across(everything(), list(
n_out = ~ {
q1 <- quantile(., 0.25, na.rm = TRUE)
q3 <- quantile(., 0.75, na.rm = TRUE)
iqr <- q3 - q1
lo <- q1 - 1.5 * iqr
hi <- q3 + 1.5 * iqr
sum(. < lo | . > hi, na.rm = TRUE)
},
min = ~ min(., na.rm = TRUE),
max = ~ max(., na.rm = TRUE)
), .names = "{.col}__{.fn}")) |>
pivot_longer(everything(), names_to = c("variable", "stat"), names_sep = "__") |>
pivot_wider(names_from = stat, values_from = value) |>
mutate(outlier_share = round(100 * n_out / nrow(student_data), 1)) |>
arrange(desc(n_out), desc(abs(max)), desc(abs(min)))
mnf_flow_flag_tbl <- tibble(
suspicious_value = c(-100, -100.2),
count = c(
sum(student_data$Mnf.Flow == -100, na.rm = TRUE),
sum(student_data$Mnf.Flow == -100.2, na.rm = TRUE)
)
) |>
mutate(share = round(100 * count / nrow(student_data), 1))
predictor_outlier_tbl |> head(10) |> kable(digits = 2)
| variable | n_out | min | max | outlier_share |
|---|---|---|---|---|
| Filler.Speed | 439 | 998.00 | 4030.00 | 17.1 |
| MFR | 238 | 31.40 | 868.60 | 9.3 |
| Air.Pressurer | 220 | 140.80 | 148.20 | 8.6 |
| Oxygen.Filler | 189 | 0.00 | 0.40 | 7.4 |
| Pressure.Vacuum | 125 | -6.60 | -3.60 | 4.9 |
| Temperature | 123 | 63.60 | 76.20 | 4.8 |
| PC.Volume | 86 | 0.08 | 0.48 | 3.3 |
| Hyd.Pressure4 | 83 | 52.00 | 142.00 | 3.2 |
| Fill.Pressure | 79 | 34.60 | 60.40 | 3.1 |
| PSC.CO2 | 77 | 0.00 | 0.24 | 3.0 |
mnf_flow_flag_tbl |> kable(digits = 2)
| suspicious_value | count | share |
|---|---|---|
| -100.0 | 606 | 23.6 |
| -100.2 | 578 | 22.5 |
The predictor review does not show a broad outlier problem across the
entire file. Most columns behave like ordinary process measurements, and
the standard IQR rule only flags a subset of variables that are already
expected to move with machine settings and operating state. The biggest
practical concern is Mnf Flow, not because it has the
largest IQR outlier count, but because it contains repeated negative
codes that do not look like ordinary flow readings.
Carb Flow also shows a wide spread and was checked
separately with Box-Cox, but it does not show the same repeated-code
pattern. In plain English: the data have some extreme values, but the
only values that look potentially erroneous enough to test separately
are the Mnf Flow codes -100 and
-100.2.
missing_counts <- student_data |>
summarise(across(everything(), ~ sum(is.na(.)))) |>
pivot_longer(everything(), names_to = "variable", values_to = "n_missing") |>
arrange(desc(n_missing))
nzv_vars <- tibble(variable = names(student_data)[nearZeroVar(student_data)])
if (nrow(nzv_vars) == 0) nzv_vars <- tibble(variable = "None")
missing_counts |> head(12) |> kable()
| variable | n_missing |
|---|---|
| MFR | 212 |
| Brand.Code | 120 |
| Filler.Speed | 57 |
| PC.Volume | 39 |
| PSC.CO2 | 39 |
| Fill.Ounces | 38 |
| PSC | 33 |
| Carb.Pressure1 | 32 |
| Hyd.Pressure4 | 30 |
| Carb.Pressure | 27 |
| Carb.Temp | 26 |
| PSC.Fill | 23 |
nzv_vars |> kable()
| variable |
|---|
| Hyd.Pressure1 |
ggplot(student_data, aes(x = "", y = PH)) +
geom_boxplot(width = 0.35, fill = "grey92", color = "grey25", outlier.size = 0.8) +
labs(
title = "Response distribution before cleaning",
x = NULL,
y = "PH"
)
The missingness table shows why a full complete-case drop would be too aggressive. 2038 rows are complete, which means 533 rows would be lost if we excluded every observation with any missing value. In a production setting, every row represents process history that took time and money to collect, so we prefer to preserve that signal whenever we can do so responsibly.
complete_data <- student_data |> filter(if_all(everything(), ~ !is.na(.)))
imputation_benchmark <- function(df, iter = 5) {
numeric_idx <- 2:ncol(df)
nrmse_values <- matrix(NA_real_, nrow = iter, ncol = 6)
brand_values <- matrix(NA_real_, nrow = iter, ncol = 3)
colnames(nrmse_values) <- c("mean", "median", "mode", "knn", "mice", "forest")
colnames(brand_values) <- c("mode", "mice", "forest")
for (i in seq_len(iter)) {
missing_data <- df |>
mutate(across(everything(), function(x) {
idx <- sample(seq_along(x), size = floor(0.05 * length(x)))
x[idx] <- NA
x
}))
missing_rows <- lapply(missing_data, function(x) which(is.na(x)))
mean_impute <- missing_data |>
mutate(
across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)),
Brand.Code = factor(
ifelse(is.na(Brand.Code), mode_value(Brand.Code), as.character(Brand.Code)),
levels = levels(df$Brand.Code)
)
)
median_impute <- missing_data |>
mutate(
across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)),
Brand.Code = factor(
ifelse(is.na(Brand.Code), mode_value(Brand.Code), as.character(Brand.Code)),
levels = levels(df$Brand.Code)
)
)
mode_impute <- missing_data |>
mutate(
across(where(is.numeric), ~ ifelse(is.na(.), as.numeric(mode_value(.)), .)),
Brand.Code = factor(
ifelse(is.na(Brand.Code), mode_value(Brand.Code), as.character(Brand.Code)),
levels = levels(df$Brand.Code)
)
)
knn_impute <- bind_cols(
Brand.Code = missing_data$Brand.Code,
as.data.frame(impute.knn(as.matrix(missing_data |> select(-Brand.Code)))$data)
)
mice_impute <- complete(
mice(missing_data |> mutate(Brand.Code = as.factor(Brand.Code)), printFlag = FALSE),
1
)
forest_impute <- missForest(
as.data.frame(missing_data |> mutate(Brand.Code = as.factor(Brand.Code)))
)$ximp
for (j in numeric_idx) {
actual_values <- df[[j]][missing_rows[[j]]]
nrmse_values[i, "mean"] <- ifelse(is.na(nrmse_values[i, "mean"]), nrmse_calc(actual_values, mean_impute[[j]][missing_rows[[j]]]), nrmse_values[i, "mean"])
nrmse_values[i, "median"] <- ifelse(is.na(nrmse_values[i, "median"]), nrmse_calc(actual_values, median_impute[[j]][missing_rows[[j]]]), nrmse_values[i, "median"])
nrmse_values[i, "mode"] <- ifelse(is.na(nrmse_values[i, "mode"]), nrmse_calc(actual_values, mode_impute[[j]][missing_rows[[j]]]), nrmse_values[i, "mode"])
nrmse_values[i, "knn"] <- ifelse(is.na(nrmse_values[i, "knn"]), nrmse_calc(actual_values, knn_impute[[j]][missing_rows[[j]]]), nrmse_values[i, "knn"])
nrmse_values[i, "mice"] <- ifelse(is.na(nrmse_values[i, "mice"]), nrmse_calc(actual_values, mice_impute[[j]][missing_rows[[j]]]), nrmse_values[i, "mice"])
nrmse_values[i, "forest"] <- ifelse(is.na(nrmse_values[i, "forest"]), nrmse_calc(actual_values, forest_impute[[j]][missing_rows[[j]]]), nrmse_values[i, "forest"])
}
actual_brand <- as.character(df[[1]][missing_rows[[1]]])
brand_values[i, "mode"] <- mean(actual_brand == as.character(mode_impute[[1]][missing_rows[[1]]]))
brand_values[i, "mice"] <- mean(actual_brand == as.character(mice_impute[[1]][missing_rows[[1]]]))
brand_values[i, "forest"] <- mean(actual_brand == as.character(forest_impute[[1]][missing_rows[[1]]]))
}
nrmse_tbl <- tibble(
method = colnames(nrmse_values),
nrmse = colMeans(nrmse_values, na.rm = TRUE)
) |> arrange(nrmse)
brand_tbl <- tibble(
method = colnames(brand_values),
accuracy = colMeans(brand_values, na.rm = TRUE)
) |> arrange(desc(accuracy))
list(nrmse = nrmse_tbl, brand = brand_tbl)
}
imputation_results <- imputation_benchmark(complete_data, iter = 5)
## Cluster size 2038 broken into 1913 125
## Cluster size 1913 broken into 538 1375
## Done cluster 538
## Done cluster 1375
## Done cluster 1913
## Done cluster 125
## Imputation sequence (missing proportion): Brand.Code (0.049558390578999) Carb.Volume (0.049558390578999) Fill.Ounces (0.049558390578999) PC.Volume (0.049558390578999) Carb.Pressure (0.049558390578999) Carb.Temp (0.049558390578999) PSC (0.049558390578999) PSC.Fill (0.049558390578999) PSC.CO2 (0.049558390578999) Mnf.Flow (0.049558390578999) Carb.Pressure1 (0.049558390578999) Fill.Pressure (0.049558390578999) Hyd.Pressure1 (0.049558390578999) Hyd.Pressure2 (0.049558390578999) Hyd.Pressure3 (0.049558390578999) Hyd.Pressure4 (0.049558390578999) Filler.Level (0.049558390578999) Filler.Speed (0.049558390578999) Temperature (0.049558390578999) Usage.cont (0.049558390578999) Carb.Flow (0.049558390578999) Density (0.049558390578999) MFR (0.049558390578999) Balling (0.049558390578999) Pressure.Vacuum (0.049558390578999) PH (0.049558390578999) Oxygen.Filler (0.049558390578999) Bowl.Setpoint (0.049558390578999) Pressure.Setpoint (0.049558390578999) Air.Pressurer (0.049558390578999) Alch.Rel (0.049558390578999) Carb.Rel (0.049558390578999) Balling.Lvl (0.049558390578999)
## missForest iteration 1 in progress...done!
## OOB errors MSE: 0.0220489708, 0.0018933281, 0.0059714057, 0.0016801838, 2.2053445498, 2.9213006236, 0.0021246546, 0.0125009269, 0.0016506376, 661.3480458945, 10.671351772, 1.7386230921, 21.8365831269, 10.6946090352, 6.3937934191, 34.1215687953, 18.8747964642, 14962.9874266325, 0.5508271496, 4.2355502434, 100700.362148042, 0.0025523335, 748.5856739485, 0.0045802997, 0.0586634749, 0.0103606782, 0.0008586879, 9.3889697099, 0.428208204, 0.5977747011, 0.0020270984, 0.0017181531, 0.0024779845
## OOB errors NMSE: 0.1374280046, 0.1804556835, 0.8404967995, 0.5571637905, 0.1894071453, 0.1861621636, 0.9233983539, 0.8877087081, 0.9446077939, 0.0442583844, 0.5386857282, 0.215262847, 0.1429993517, 0.0410347098, 0.0261606294, 0.3039710372, 0.0775005607, 0.1234573774, 0.57216595, 0.5065750281, 0.1235857165, 0.0176858776, 0.1707023405, 0.0052333348, 0.1779297365, 0.3571888591, 0.5012949596, 0.0400135332, 0.1030323554, 0.3809732839, 0.0079223327, 0.1109166683, 0.0032697625
## diff. convergence measure: 0.7139803392
## time: 22.26 seconds
##
## missForest iteration 2 in progress...done!
## OOB errors MSE: 0.017106729, 0.0017906205, 0.0058651622, 0.001630099, 1.8779843387, 3.0534755341, 0.0021086787, 0.0124619318, 0.0016522125, 538.6333605743, 10.0717429595, 1.5830886544, 19.685785489, 9.3948838934, 6.0025065903, 33.0261273429, 14.9255808813, 14648.1322513655, 0.5258289474, 4.1905473725, 89100.9665703089, 0.0020467252, 742.1658995423, 0.0032319827, 0.0550844844, 0.0098371036, 0.0008597954, 9.1811778602, 0.4211080202, 0.591041048, 0.0018330058, 0.0017165574, 0.0024466016
## OOB errors NMSE: 0.1066237356, 0.1706664809, 0.8255426367, 0.5405552082, 0.1612916461, 0.194585113, 0.9164550256, 0.8849396075, 0.9455090391, 0.0360461371, 0.5084177063, 0.1960057774, 0.1289146084, 0.0360477258, 0.0245596534, 0.2942123278, 0.0612849463, 0.1208595543, 0.5461993285, 0.5011926505, 0.1093502204, 0.0141823675, 0.1692384192, 0.0036927818, 0.1670744499, 0.3391383983, 0.5019414965, 0.0391279742, 0.1013239606, 0.3766817976, 0.0071637775, 0.1108136552, 0.0032283519
## diff. convergence measure: 0.0089630984
## time: 21.11 seconds
##
## missForest iteration 3 in progress...done!
## OOB errors MSE: 0.0171018334, 0.0017801178, 0.0058974609, 0.0016251967, 1.8806964106, 3.0242208772, 0.0020935528, 0.0123645131, 0.001644546, 529.5522013738, 10.106383153, 1.5716032118, 19.4236469582, 9.5743668397, 6.0056111622, 32.9750160307, 14.4909368769, 15506.6986225724, 0.5278798962, 4.2094014965, 88458.5075912977, 0.0020382543, 709.8390999503, 0.0031573626, 0.0559548701, 0.0099591524, 0.0008482133, 9.7342015549, 0.4216681567, 0.5855406392, 0.0019007478, 0.0017134779, 0.0024168358
## OOB errors NMSE: 0.1065932222, 0.1696654523, 0.8300887915, 0.5389295697, 0.1615245738, 0.1927208371, 0.9098811463, 0.8780217687, 0.9411217414, 0.0354384126, 0.5101663299, 0.1945837388, 0.1271979644, 0.0367363934, 0.024572356, 0.2937570041, 0.0595002831, 0.1279434574, 0.5483297302, 0.5034476181, 0.108561755, 0.0141236697, 0.1618668376, 0.0036075228, 0.1697143802, 0.3433460851, 0.4951799896, 0.0414848283, 0.1014587366, 0.3731762816, 0.0074285276, 0.1106148539, 0.0031890752
## diff. convergence measure: 0.0005725341
## time: 20.89 seconds
##
## missForest iteration 4 in progress...done!
## OOB errors MSE: 0.0169134429, 0.0017784515, 0.0058801945, 0.0016224482, 1.7848695926, 3.1195830798, 0.0020967885, 0.0123090384, 0.0016437888, 522.2356164465, 10.0774346485, 1.563306558, 19.2984799136, 9.2478668753, 6.0325783935, 33.0513206663, 14.7667101461, 15755.6257522953, 0.5309432615, 4.2491760268, 89541.0676579173, 0.0019833297, 744.6201789175, 0.0031828728, 0.0557572254, 0.009795656, 0.0008492966, 9.8065118173, 0.422216931, 0.5919926751, 0.0018630822, 0.0016954077, 0.0024850476
## OOB errors NMSE: 0.1054190117, 0.1695066354, 0.8276584868, 0.5380181348, 0.1532944385, 0.198797868, 0.9112874113, 0.8740824338, 0.9406884377, 0.034948776, 0.5087050205, 0.1935565114, 0.1263782937, 0.0354836284, 0.0246826942, 0.2944367618, 0.0606326176, 0.129997318, 0.5515117688, 0.5082046822, 0.1098903397, 0.0137430809, 0.1697980762, 0.0036366701, 0.1691149124, 0.3377094744, 0.4958123941, 0.0417929973, 0.1015907787, 0.3772882878, 0.0072813225, 0.1094483205, 0.0032790824
## diff. convergence measure: -0.0001122343
## time: 20.8 seconds
##
## Cluster size 2038 broken into 588 1450
## Done cluster 588
## Done cluster 1450
## Imputation sequence (missing proportion): Brand.Code (0.049558390578999) Carb.Volume (0.049558390578999) Fill.Ounces (0.049558390578999) PC.Volume (0.049558390578999) Carb.Pressure (0.049558390578999) Carb.Temp (0.049558390578999) PSC (0.049558390578999) PSC.Fill (0.049558390578999) PSC.CO2 (0.049558390578999) Mnf.Flow (0.049558390578999) Carb.Pressure1 (0.049558390578999) Fill.Pressure (0.049558390578999) Hyd.Pressure1 (0.049558390578999) Hyd.Pressure2 (0.049558390578999) Hyd.Pressure3 (0.049558390578999) Hyd.Pressure4 (0.049558390578999) Filler.Level (0.049558390578999) Filler.Speed (0.049558390578999) Temperature (0.049558390578999) Usage.cont (0.049558390578999) Carb.Flow (0.049558390578999) Density (0.049558390578999) MFR (0.049558390578999) Balling (0.049558390578999) Pressure.Vacuum (0.049558390578999) PH (0.049558390578999) Oxygen.Filler (0.049558390578999) Bowl.Setpoint (0.049558390578999) Pressure.Setpoint (0.049558390578999) Air.Pressurer (0.049558390578999) Alch.Rel (0.049558390578999) Carb.Rel (0.049558390578999) Balling.Lvl (0.049558390578999)
## missForest iteration 1 in progress...done!
## OOB errors MSE: 0.0231514864, 0.0019294385, 0.0060197399, 0.0016952915, 2.2209738466, 2.9962795846, 0.0021094265, 0.0123515057, 0.0016736835, 724.0355451055, 10.5442463723, 1.7562479484, 22.1014749642, 10.7983763765, 6.6732693948, 34.9222120789, 19.0351207612, 22065.4510968499, 0.5574880513, 4.301394063, 93890.5603261959, 0.0026491476, 719.1616943522, 0.00447632, 0.0586821708, 0.0100287848, 0.0008519986, 9.1953631649, 0.4385685537, 0.6028764466, 0.0022355043, 0.0016904743, 0.0025374554
## OOB errors NMSE: 0.1437080087, 0.1819377244, 0.839178937, 0.5562961965, 0.1903217178, 0.1907419281, 0.9133097153, 0.8872986497, 0.9521174886, 0.0485246408, 0.5388796131, 0.2157944179, 0.1468671423, 0.0417182982, 0.0271203822, 0.3139060437, 0.0785798664, 0.1827182044, 0.5778890526, 0.5137134911, 0.1142282318, 0.0183579263, 0.1771978085, 0.0050883565, 0.1799429365, 0.3439570403, 0.4949810727, 0.0392894696, 0.1049052311, 0.391510705, 0.0087106722, 0.110007713, 0.0033468486
## diff. convergence measure: 0.7111471051
## time: 20.36 seconds
##
## missForest iteration 2 in progress...done!
## OOB errors MSE: 0.0176596109, 0.0018471926, 0.0059594669, 0.0016293207, 1.9362517316, 2.9784251778, 0.0021091284, 0.0121933162, 0.001669535, 533.6457912514, 9.9810762061, 1.6414730623, 19.8972350294, 9.321554201, 6.3160055442, 33.5054178929, 14.9194602669, 14631.327648066, 0.534768659, 4.3206056293, 84004.0157973131, 0.0020935962, 700.2994907269, 0.0033966562, 0.0561703625, 0.0095829597, 0.0008431904, 9.2252945428, 0.4333886228, 0.5945702997, 0.0019209841, 0.0016717213, 0.0024771162
## OOB errors NMSE: 0.1096183408, 0.1741822874, 0.8307766075, 0.5346484284, 0.1659230505, 0.1896053239, 0.9131806738, 0.8759347494, 0.949757483, 0.0357647777, 0.5100979524, 0.2016917511, 0.1322196845, 0.0360127638, 0.0256684504, 0.3011708752, 0.0615897955, 0.1211581809, 0.5543382553, 0.5160079195, 0.1022001589, 0.014508095, 0.1725502568, 0.0038610728, 0.1722407307, 0.3286665843, 0.4898638495, 0.0394173588, 0.1036661959, 0.3861166554, 0.0074851401, 0.1087873577, 0.0032672625
## diff. convergence measure: 0.0109141655
## time: 20.73 seconds
##
## missForest iteration 3 in progress...done!
## OOB errors MSE: 0.0173841207, 0.00183782, 0.0059660576, 0.0016287838, 1.8934968309, 2.9221344897, 0.0020994149, 0.0121201234, 0.0016754333, 529.2171612545, 10.0449723375, 1.6199098081, 19.4883467292, 9.3605263396, 6.3127216489, 33.4665590092, 14.418633053, 14161.7086938488, 0.5355232951, 4.3171820117, 81803.9437079376, 0.0020544626, 712.6524683953, 0.0035268384, 0.055995344, 0.0096189609, 0.0008487301, 9.8532799938, 0.4350913373, 0.5900224807, 0.0019652692, 0.0016533322, 0.0024688919
## OOB errors NMSE: 0.1079082929, 0.1732984923, 0.8316953821, 0.5344722445, 0.1622592585, 0.1860218818, 0.9089750605, 0.870676773, 0.9531129254, 0.0354679723, 0.5133634605, 0.199042222, 0.1295025692, 0.0361633282, 0.0256551045, 0.3008215835, 0.0595223048, 0.1172693897, 0.555120507, 0.5155990384, 0.0995235283, 0.0142369092, 0.1755939681, 0.0040090546, 0.1717040542, 0.3299013163, 0.493082225, 0.0421005824, 0.1040734838, 0.3831632811, 0.0076576975, 0.1075906854, 0.0032564148
## diff. convergence measure: 0.0006102145
## time: 20.84 seconds
##
## missForest iteration 4 in progress...done!
## OOB errors MSE: 0.0176633478, 0.0018356686, 0.0059581957, 0.0016290046, 1.7925581248, 3.1173361114, 0.0021042694, 0.0121865207, 0.0016640616, 505.4875083687, 10.0512838066, 1.6309459247, 19.5428896838, 9.1453797553, 6.4614212026, 33.4385287686, 14.8035785388, 14462.4056817869, 0.535746934, 4.2965272774, 83672.5242202308, 0.0020821474, 698.6483425434, 0.0034518533, 0.0555246453, 0.0097070966, 0.0008388561, 9.4138526135, 0.4466296874, 0.5941804682, 0.001875169, 0.0016690362, 0.0025127783
## OOB errors NMSE: 0.1096415364, 0.1730956281, 0.8305993996, 0.5345446937, 0.1536095268, 0.1984483369, 0.9110768749, 0.8754465762, 0.9466438394, 0.0338776182, 0.5136860177, 0.2003982563, 0.1298650142, 0.0353321339, 0.0262594243, 0.3005696274, 0.0611114182, 0.1197593825, 0.5553523298, 0.5131322531, 0.1017968628, 0.0144287577, 0.1721434223, 0.0039238169, 0.1702607042, 0.3329241046, 0.4873457416, 0.0402230199, 0.1068334475, 0.3858634971, 0.0073066209, 0.1086126244, 0.0033143
## diff. convergence measure: -0.0001692671
## time: 21.66 seconds
##
## Cluster size 2038 broken into 588 1450
## Done cluster 588
## Done cluster 1450
## Imputation sequence (missing proportion): Brand.Code (0.049558390578999) Carb.Volume (0.049558390578999) Fill.Ounces (0.049558390578999) PC.Volume (0.049558390578999) Carb.Pressure (0.049558390578999) Carb.Temp (0.049558390578999) PSC (0.049558390578999) PSC.Fill (0.049558390578999) PSC.CO2 (0.049558390578999) Mnf.Flow (0.049558390578999) Carb.Pressure1 (0.049558390578999) Fill.Pressure (0.049558390578999) Hyd.Pressure1 (0.049558390578999) Hyd.Pressure2 (0.049558390578999) Hyd.Pressure3 (0.049558390578999) Hyd.Pressure4 (0.049558390578999) Filler.Level (0.049558390578999) Filler.Speed (0.049558390578999) Temperature (0.049558390578999) Usage.cont (0.049558390578999) Carb.Flow (0.049558390578999) Density (0.049558390578999) MFR (0.049558390578999) Balling (0.049558390578999) Pressure.Vacuum (0.049558390578999) PH (0.049558390578999) Oxygen.Filler (0.049558390578999) Bowl.Setpoint (0.049558390578999) Pressure.Setpoint (0.049558390578999) Air.Pressurer (0.049558390578999) Alch.Rel (0.049558390578999) Carb.Rel (0.049558390578999) Balling.Lvl (0.049558390578999)
## missForest iteration 1 in progress...done!
## OOB errors MSE: 0.0231514864, 0.0019294385, 0.0060197399, 0.0016952915, 2.2209738466, 2.9962795846, 0.0021094265, 0.0123515057, 0.0016736835, 724.0355451055, 10.5442463723, 1.7562479484, 22.1014749642, 10.7983763765, 6.6732693948, 34.9222120789, 19.0351207612, 22065.4510968499, 0.5574880513, 4.301394063, 93890.5603261959, 0.0026491476, 719.1616943522, 0.00447632, 0.0586821708, 0.0100287848, 0.0008519986, 9.1953631649, 0.4385685537, 0.6028764466, 0.0022355043, 0.0016904743, 0.0025374554
## OOB errors NMSE: 0.1437080087, 0.1819377244, 0.839178937, 0.5562961965, 0.1903217178, 0.1907419281, 0.9133097153, 0.8872986497, 0.9521174886, 0.0485246408, 0.5388796131, 0.2157944179, 0.1468671423, 0.0417182982, 0.0271203822, 0.3139060437, 0.0785798664, 0.1827182044, 0.5778890526, 0.5137134911, 0.1142282318, 0.0183579263, 0.1771978085, 0.0050883565, 0.1799429365, 0.3439570403, 0.4949810727, 0.0392894696, 0.1049052311, 0.391510705, 0.0087106722, 0.110007713, 0.0033468486
## diff. convergence measure: 0.7111471051
## time: 20.62 seconds
##
## missForest iteration 2 in progress...done!
## OOB errors MSE: 0.0176596109, 0.0018471926, 0.0059594669, 0.0016293207, 1.9362517316, 2.9784251778, 0.0021091284, 0.0121933162, 0.001669535, 533.6457912514, 9.9810762061, 1.6414730623, 19.8972350294, 9.321554201, 6.3160055442, 33.5054178929, 14.9194602669, 14631.327648066, 0.534768659, 4.3206056293, 84004.0157973131, 0.0020935962, 700.2994907269, 0.0033966562, 0.0561703625, 0.0095829597, 0.0008431904, 9.2252945428, 0.4333886228, 0.5945702997, 0.0019209841, 0.0016717213, 0.0024771162
## OOB errors NMSE: 0.1096183408, 0.1741822874, 0.8307766075, 0.5346484284, 0.1659230505, 0.1896053239, 0.9131806738, 0.8759347494, 0.949757483, 0.0357647777, 0.5100979524, 0.2016917511, 0.1322196845, 0.0360127638, 0.0256684504, 0.3011708752, 0.0615897955, 0.1211581809, 0.5543382553, 0.5160079195, 0.1022001589, 0.014508095, 0.1725502568, 0.0038610728, 0.1722407307, 0.3286665843, 0.4898638495, 0.0394173588, 0.1036661959, 0.3861166554, 0.0074851401, 0.1087873577, 0.0032672625
## diff. convergence measure: 0.0109141655
## time: 20.92 seconds
##
## missForest iteration 3 in progress...done!
## OOB errors MSE: 0.0173841207, 0.00183782, 0.0059660576, 0.0016287838, 1.8934968309, 2.9221344897, 0.0020994149, 0.0121201234, 0.0016754333, 529.2171612545, 10.0449723375, 1.6199098081, 19.4883467292, 9.3605263396, 6.3127216489, 33.4665590092, 14.418633053, 14161.7086938488, 0.5355232951, 4.3171820117, 81803.9437079376, 0.0020544626, 712.6524683953, 0.0035268384, 0.055995344, 0.0096189609, 0.0008487301, 9.8532799938, 0.4350913373, 0.5900224807, 0.0019652692, 0.0016533322, 0.0024688919
## OOB errors NMSE: 0.1079082929, 0.1732984923, 0.8316953821, 0.5344722445, 0.1622592585, 0.1860218818, 0.9089750605, 0.870676773, 0.9531129254, 0.0354679723, 0.5133634605, 0.199042222, 0.1295025692, 0.0361633282, 0.0256551045, 0.3008215835, 0.0595223048, 0.1172693897, 0.555120507, 0.5155990384, 0.0995235283, 0.0142369092, 0.1755939681, 0.0040090546, 0.1717040542, 0.3299013163, 0.493082225, 0.0421005824, 0.1040734838, 0.3831632811, 0.0076576975, 0.1075906854, 0.0032564148
## diff. convergence measure: 0.0006102145
## time: 20.72 seconds
##
## missForest iteration 4 in progress...done!
## OOB errors MSE: 0.0176633478, 0.0018356686, 0.0059581957, 0.0016290046, 1.7925581248, 3.1173361114, 0.0021042694, 0.0121865207, 0.0016640616, 505.4875083687, 10.0512838066, 1.6309459247, 19.5428896838, 9.1453797553, 6.4614212026, 33.4385287686, 14.8035785388, 14462.4056817869, 0.535746934, 4.2965272774, 83672.5242202308, 0.0020821474, 698.6483425434, 0.0034518533, 0.0555246453, 0.0097070966, 0.0008388561, 9.4138526135, 0.4466296874, 0.5941804682, 0.001875169, 0.0016690362, 0.0025127783
## OOB errors NMSE: 0.1096415364, 0.1730956281, 0.8305993996, 0.5345446937, 0.1536095268, 0.1984483369, 0.9110768749, 0.8754465762, 0.9466438394, 0.0338776182, 0.5136860177, 0.2003982563, 0.1298650142, 0.0353321339, 0.0262594243, 0.3005696274, 0.0611114182, 0.1197593825, 0.5553523298, 0.5131322531, 0.1017968628, 0.0144287577, 0.1721434223, 0.0039238169, 0.1702607042, 0.3329241046, 0.4873457416, 0.0402230199, 0.1068334475, 0.3858634971, 0.0073066209, 0.1086126244, 0.0033143
## diff. convergence measure: -0.0001692671
## time: 20.86 seconds
##
## Cluster size 2038 broken into 588 1450
## Done cluster 588
## Done cluster 1450
## Imputation sequence (missing proportion): Brand.Code (0.049558390578999) Carb.Volume (0.049558390578999) Fill.Ounces (0.049558390578999) PC.Volume (0.049558390578999) Carb.Pressure (0.049558390578999) Carb.Temp (0.049558390578999) PSC (0.049558390578999) PSC.Fill (0.049558390578999) PSC.CO2 (0.049558390578999) Mnf.Flow (0.049558390578999) Carb.Pressure1 (0.049558390578999) Fill.Pressure (0.049558390578999) Hyd.Pressure1 (0.049558390578999) Hyd.Pressure2 (0.049558390578999) Hyd.Pressure3 (0.049558390578999) Hyd.Pressure4 (0.049558390578999) Filler.Level (0.049558390578999) Filler.Speed (0.049558390578999) Temperature (0.049558390578999) Usage.cont (0.049558390578999) Carb.Flow (0.049558390578999) Density (0.049558390578999) MFR (0.049558390578999) Balling (0.049558390578999) Pressure.Vacuum (0.049558390578999) PH (0.049558390578999) Oxygen.Filler (0.049558390578999) Bowl.Setpoint (0.049558390578999) Pressure.Setpoint (0.049558390578999) Air.Pressurer (0.049558390578999) Alch.Rel (0.049558390578999) Carb.Rel (0.049558390578999) Balling.Lvl (0.049558390578999)
## missForest iteration 1 in progress...done!
## OOB errors MSE: 0.0231514864, 0.0019294385, 0.0060197399, 0.0016952915, 2.2209738466, 2.9962795846, 0.0021094265, 0.0123515057, 0.0016736835, 724.0355451055, 10.5442463723, 1.7562479484, 22.1014749642, 10.7983763765, 6.6732693948, 34.9222120789, 19.0351207612, 22065.4510968499, 0.5574880513, 4.301394063, 93890.5603261959, 0.0026491476, 719.1616943522, 0.00447632, 0.0586821708, 0.0100287848, 0.0008519986, 9.1953631649, 0.4385685537, 0.6028764466, 0.0022355043, 0.0016904743, 0.0025374554
## OOB errors NMSE: 0.1437080087, 0.1819377244, 0.839178937, 0.5562961965, 0.1903217178, 0.1907419281, 0.9133097153, 0.8872986497, 0.9521174886, 0.0485246408, 0.5388796131, 0.2157944179, 0.1468671423, 0.0417182982, 0.0271203822, 0.3139060437, 0.0785798664, 0.1827182044, 0.5778890526, 0.5137134911, 0.1142282318, 0.0183579263, 0.1771978085, 0.0050883565, 0.1799429365, 0.3439570403, 0.4949810727, 0.0392894696, 0.1049052311, 0.391510705, 0.0087106722, 0.110007713, 0.0033468486
## diff. convergence measure: 0.7111471051
## time: 20.49 seconds
##
## missForest iteration 2 in progress...done!
## OOB errors MSE: 0.0176596109, 0.0018471926, 0.0059594669, 0.0016293207, 1.9362517316, 2.9784251778, 0.0021091284, 0.0121933162, 0.001669535, 533.6457912514, 9.9810762061, 1.6414730623, 19.8972350294, 9.321554201, 6.3160055442, 33.5054178929, 14.9194602669, 14631.327648066, 0.534768659, 4.3206056293, 84004.0157973131, 0.0020935962, 700.2994907269, 0.0033966562, 0.0561703625, 0.0095829597, 0.0008431904, 9.2252945428, 0.4333886228, 0.5945702997, 0.0019209841, 0.0016717213, 0.0024771162
## OOB errors NMSE: 0.1096183408, 0.1741822874, 0.8307766075, 0.5346484284, 0.1659230505, 0.1896053239, 0.9131806738, 0.8759347494, 0.949757483, 0.0357647777, 0.5100979524, 0.2016917511, 0.1322196845, 0.0360127638, 0.0256684504, 0.3011708752, 0.0615897955, 0.1211581809, 0.5543382553, 0.5160079195, 0.1022001589, 0.014508095, 0.1725502568, 0.0038610728, 0.1722407307, 0.3286665843, 0.4898638495, 0.0394173588, 0.1036661959, 0.3861166554, 0.0074851401, 0.1087873577, 0.0032672625
## diff. convergence measure: 0.0109141655
## time: 20.62 seconds
##
## missForest iteration 3 in progress...done!
## OOB errors MSE: 0.0173841207, 0.00183782, 0.0059660576, 0.0016287838, 1.8934968309, 2.9221344897, 0.0020994149, 0.0121201234, 0.0016754333, 529.2171612545, 10.0449723375, 1.6199098081, 19.4883467292, 9.3605263396, 6.3127216489, 33.4665590092, 14.418633053, 14161.7086938488, 0.5355232951, 4.3171820117, 81803.9437079376, 0.0020544626, 712.6524683953, 0.0035268384, 0.055995344, 0.0096189609, 0.0008487301, 9.8532799938, 0.4350913373, 0.5900224807, 0.0019652692, 0.0016533322, 0.0024688919
## OOB errors NMSE: 0.1079082929, 0.1732984923, 0.8316953821, 0.5344722445, 0.1622592585, 0.1860218818, 0.9089750605, 0.870676773, 0.9531129254, 0.0354679723, 0.5133634605, 0.199042222, 0.1295025692, 0.0361633282, 0.0256551045, 0.3008215835, 0.0595223048, 0.1172693897, 0.555120507, 0.5155990384, 0.0995235283, 0.0142369092, 0.1755939681, 0.0040090546, 0.1717040542, 0.3299013163, 0.493082225, 0.0421005824, 0.1040734838, 0.3831632811, 0.0076576975, 0.1075906854, 0.0032564148
## diff. convergence measure: 0.0006102145
## time: 20.64 seconds
##
## missForest iteration 4 in progress...done!
## OOB errors MSE: 0.0176633478, 0.0018356686, 0.0059581957, 0.0016290046, 1.7925581248, 3.1173361114, 0.0021042694, 0.0121865207, 0.0016640616, 505.4875083687, 10.0512838066, 1.6309459247, 19.5428896838, 9.1453797553, 6.4614212026, 33.4385287686, 14.8035785388, 14462.4056817869, 0.535746934, 4.2965272774, 83672.5242202308, 0.0020821474, 698.6483425434, 0.0034518533, 0.0555246453, 0.0097070966, 0.0008388561, 9.4138526135, 0.4466296874, 0.5941804682, 0.001875169, 0.0016690362, 0.0025127783
## OOB errors NMSE: 0.1096415364, 0.1730956281, 0.8305993996, 0.5345446937, 0.1536095268, 0.1984483369, 0.9110768749, 0.8754465762, 0.9466438394, 0.0338776182, 0.5136860177, 0.2003982563, 0.1298650142, 0.0353321339, 0.0262594243, 0.3005696274, 0.0611114182, 0.1197593825, 0.5553523298, 0.5131322531, 0.1017968628, 0.0144287577, 0.1721434223, 0.0039238169, 0.1702607042, 0.3329241046, 0.4873457416, 0.0402230199, 0.1068334475, 0.3858634971, 0.0073066209, 0.1086126244, 0.0033143
## diff. convergence measure: -0.0001692671
## time: 20.83 seconds
##
## Cluster size 2038 broken into 588 1450
## Done cluster 588
## Done cluster 1450
## Imputation sequence (missing proportion): Brand.Code (0.049558390578999) Carb.Volume (0.049558390578999) Fill.Ounces (0.049558390578999) PC.Volume (0.049558390578999) Carb.Pressure (0.049558390578999) Carb.Temp (0.049558390578999) PSC (0.049558390578999) PSC.Fill (0.049558390578999) PSC.CO2 (0.049558390578999) Mnf.Flow (0.049558390578999) Carb.Pressure1 (0.049558390578999) Fill.Pressure (0.049558390578999) Hyd.Pressure1 (0.049558390578999) Hyd.Pressure2 (0.049558390578999) Hyd.Pressure3 (0.049558390578999) Hyd.Pressure4 (0.049558390578999) Filler.Level (0.049558390578999) Filler.Speed (0.049558390578999) Temperature (0.049558390578999) Usage.cont (0.049558390578999) Carb.Flow (0.049558390578999) Density (0.049558390578999) MFR (0.049558390578999) Balling (0.049558390578999) Pressure.Vacuum (0.049558390578999) PH (0.049558390578999) Oxygen.Filler (0.049558390578999) Bowl.Setpoint (0.049558390578999) Pressure.Setpoint (0.049558390578999) Air.Pressurer (0.049558390578999) Alch.Rel (0.049558390578999) Carb.Rel (0.049558390578999) Balling.Lvl (0.049558390578999)
## missForest iteration 1 in progress...done!
## OOB errors MSE: 0.0231514864, 0.0019294385, 0.0060197399, 0.0016952915, 2.2209738466, 2.9962795846, 0.0021094265, 0.0123515057, 0.0016736835, 724.0355451055, 10.5442463723, 1.7562479484, 22.1014749642, 10.7983763765, 6.6732693948, 34.9222120789, 19.0351207612, 22065.4510968499, 0.5574880513, 4.301394063, 93890.5603261959, 0.0026491476, 719.1616943522, 0.00447632, 0.0586821708, 0.0100287848, 0.0008519986, 9.1953631649, 0.4385685537, 0.6028764466, 0.0022355043, 0.0016904743, 0.0025374554
## OOB errors NMSE: 0.1437080087, 0.1819377244, 0.839178937, 0.5562961965, 0.1903217178, 0.1907419281, 0.9133097153, 0.8872986497, 0.9521174886, 0.0485246408, 0.5388796131, 0.2157944179, 0.1468671423, 0.0417182982, 0.0271203822, 0.3139060437, 0.0785798664, 0.1827182044, 0.5778890526, 0.5137134911, 0.1142282318, 0.0183579263, 0.1771978085, 0.0050883565, 0.1799429365, 0.3439570403, 0.4949810727, 0.0392894696, 0.1049052311, 0.391510705, 0.0087106722, 0.110007713, 0.0033468486
## diff. convergence measure: 0.7111471051
## time: 20.44 seconds
##
## missForest iteration 2 in progress...done!
## OOB errors MSE: 0.0176596109, 0.0018471926, 0.0059594669, 0.0016293207, 1.9362517316, 2.9784251778, 0.0021091284, 0.0121933162, 0.001669535, 533.6457912514, 9.9810762061, 1.6414730623, 19.8972350294, 9.321554201, 6.3160055442, 33.5054178929, 14.9194602669, 14631.327648066, 0.534768659, 4.3206056293, 84004.0157973131, 0.0020935962, 700.2994907269, 0.0033966562, 0.0561703625, 0.0095829597, 0.0008431904, 9.2252945428, 0.4333886228, 0.5945702997, 0.0019209841, 0.0016717213, 0.0024771162
## OOB errors NMSE: 0.1096183408, 0.1741822874, 0.8307766075, 0.5346484284, 0.1659230505, 0.1896053239, 0.9131806738, 0.8759347494, 0.949757483, 0.0357647777, 0.5100979524, 0.2016917511, 0.1322196845, 0.0360127638, 0.0256684504, 0.3011708752, 0.0615897955, 0.1211581809, 0.5543382553, 0.5160079195, 0.1022001589, 0.014508095, 0.1725502568, 0.0038610728, 0.1722407307, 0.3286665843, 0.4898638495, 0.0394173588, 0.1036661959, 0.3861166554, 0.0074851401, 0.1087873577, 0.0032672625
## diff. convergence measure: 0.0109141655
## time: 20.88 seconds
##
## missForest iteration 3 in progress...done!
## OOB errors MSE: 0.0173841207, 0.00183782, 0.0059660576, 0.0016287838, 1.8934968309, 2.9221344897, 0.0020994149, 0.0121201234, 0.0016754333, 529.2171612545, 10.0449723375, 1.6199098081, 19.4883467292, 9.3605263396, 6.3127216489, 33.4665590092, 14.418633053, 14161.7086938488, 0.5355232951, 4.3171820117, 81803.9437079376, 0.0020544626, 712.6524683953, 0.0035268384, 0.055995344, 0.0096189609, 0.0008487301, 9.8532799938, 0.4350913373, 0.5900224807, 0.0019652692, 0.0016533322, 0.0024688919
## OOB errors NMSE: 0.1079082929, 0.1732984923, 0.8316953821, 0.5344722445, 0.1622592585, 0.1860218818, 0.9089750605, 0.870676773, 0.9531129254, 0.0354679723, 0.5133634605, 0.199042222, 0.1295025692, 0.0361633282, 0.0256551045, 0.3008215835, 0.0595223048, 0.1172693897, 0.555120507, 0.5155990384, 0.0995235283, 0.0142369092, 0.1755939681, 0.0040090546, 0.1717040542, 0.3299013163, 0.493082225, 0.0421005824, 0.1040734838, 0.3831632811, 0.0076576975, 0.1075906854, 0.0032564148
## diff. convergence measure: 0.0006102145
## time: 20.76 seconds
##
## missForest iteration 4 in progress...done!
## OOB errors MSE: 0.0176633478, 0.0018356686, 0.0059581957, 0.0016290046, 1.7925581248, 3.1173361114, 0.0021042694, 0.0121865207, 0.0016640616, 505.4875083687, 10.0512838066, 1.6309459247, 19.5428896838, 9.1453797553, 6.4614212026, 33.4385287686, 14.8035785388, 14462.4056817869, 0.535746934, 4.2965272774, 83672.5242202308, 0.0020821474, 698.6483425434, 0.0034518533, 0.0555246453, 0.0097070966, 0.0008388561, 9.4138526135, 0.4466296874, 0.5941804682, 0.001875169, 0.0016690362, 0.0025127783
## OOB errors NMSE: 0.1096415364, 0.1730956281, 0.8305993996, 0.5345446937, 0.1536095268, 0.1984483369, 0.9110768749, 0.8754465762, 0.9466438394, 0.0338776182, 0.5136860177, 0.2003982563, 0.1298650142, 0.0353321339, 0.0262594243, 0.3005696274, 0.0611114182, 0.1197593825, 0.5553523298, 0.5131322531, 0.1017968628, 0.0144287577, 0.1721434223, 0.0039238169, 0.1702607042, 0.3329241046, 0.4873457416, 0.0402230199, 0.1068334475, 0.3858634971, 0.0073066209, 0.1086126244, 0.0033143
## diff. convergence measure: -0.0001692671
## time: 21.03 seconds
imputation_results$nrmse |> kable(digits = 4)
| method | nrmse |
|---|---|
| mice | 0.0784 |
| forest | 0.0883 |
| knn | 0.1882 |
| mean | 0.2295 |
| median | 0.2370 |
| mode | 0.2739 |
imputation_results$brand |> kable(digits = 4)
| method | accuracy |
|---|---|
| forest | 0.9624 |
| mice | 0.8535 |
| mode | NaN |
ggplot(imputation_results$nrmse, aes(x = reorder(method, nrmse), y = nrmse)) +
geom_col(aes(fill = nrmse), color = NA, width = 0.65) +
scale_fill_gradient(low = "darkgreen", high = "darkred") +
coord_flip() +
labs(
title = "Imputation comparison by NRMSE",
x = NULL,
y = "Average NRMSE"
)
The benchmark shows that forest-based imputation is the strongest recovery method on these data. KNN and MICE are reasonable alternatives, but they were not as consistently accurate in this experiment, so forest imputation becomes the foundation for the cleaned modeling set.
### Get Exact Imputation Method for Both Training and Test Sets
student_data_updated = student_data |> filter(!is.na(PH))
student_data_predictors = student_data_updated |> select(-PH)
student_data_ph = student_data_updated |> select(PH)
imputation_model <- missForest(
as.data.frame(student_data_predictors |> mutate(Brand.Code = as.factor(Brand.Code))))
## Imputation sequence (missing proportion): Mnf.Flow (0) Density (0) Balling (0) Pressure.Vacuum (0) Air.Pressurer (0) Balling.Lvl (0.000389559797428905) Carb.Flow (0.000779119594857811) Bowl.Setpoint (0.000779119594857811) Usage.cont (0.00194779898714453) Alch.Rel (0.00272691858200234) Carb.Rel (0.00311647837943124) Carb.Volume (0.00389559797428905) Hyd.Pressure1 (0.00428515777171796) Oxygen.Filler (0.00428515777171796) Temperature (0.00467471756914686) Pressure.Setpoint (0.00467471756914686) Hyd.Pressure2 (0.00584339696143358) Hyd.Pressure3 (0.00584339696143358) Filler.Level (0.00623295675886249) Fill.Pressure (0.0070120763537203) PSC.Fill (0.00895987534086482) Carb.Temp (0.0101285547331515) Carb.Pressure (0.0105181145305804) Hyd.Pressure4 (0.0109076743280093) Carb.Pressure1 (0.012465913517725) PSC (0.0128554733151539) Fill.Ounces (0.0148032723022984) PC.Volume (0.0151928320997273) PSC.CO2 (0.0151928320997273) Filler.Speed (0.0210362290611609) Brand.Code (0.0467471756914686) MFR (0.0810284378652123)
## missForest iteration 1 in progress...done!
## OOB errors MSE: 621.0370077231, 0.0029825319, 0.0056859805, 0.0614672408, 0.6313308105, 0.0085331535, 151572.630810374, 13.5584233471, 4.6616638946, 0.0115684666, 0.002816241, 0.0024145787, 32.9018289049, 0.0010671208, 0.961541945, 0.6485090165, 10.3782430288, 7.1001252058, 23.9910699297, 3.190527561, 0.0124559855, 3.382566403, 2.0077623136, 58.5779861513, 10.784407184, 0.0022116049, 0.0064545142, 0.0019735777, 0.0017581789, 34590.6837907357, 0.020904566, 964.164402375
## OOB errors NMSE: 0.043505155, 0.0209963841, 0.0065831216, 0.1890187631, 0.4294462589, 0.0113070795, 0.1323351123, 0.0580245735, 0.5264868123, 0.0453338305, 0.1699253867, 0.2133750013, 0.2129282671, 0.5254764002, 0.5057929954, 0.1560831868, 0.0387031312, 0.0278451775, 0.0973885232, 0.3161180588, 0.8981313898, 0.2079370628, 0.1604051287, 0.341600717, 0.4827861692, 0.9121995153, 0.8440219656, 0.5376404255, 0.9484284922, 0.0584211399, 0.1281818087, 0.176630715
## diff. convergence measure: 0.6773859145
## time: 26.58 seconds
##
## missForest iteration 2 in progress...done!
## OOB errors MSE: 633.8502553524, 0.0029292733, 0.0054220633, 0.0609500755, 0.6302748279, 0.0102124998, 136474.715684955, 12.0629008777, 4.6743668242, 0.0111590671, 0.0027777531, 0.0023682077, 31.861534849, 0.001062578, 0.9437449332, 0.6450062706, 9.908001165, 6.7399608614, 24.1763566575, 3.2409765134, 0.0123001285, 2.8823333824, 1.97976993, 56.4341672121, 10.852063966, 0.0022080017, 0.0064144227, 0.0019973047, 0.0017551331, 22887.1437214671, 0.0206453623, 956.9230778777
## OOB errors NMSE: 0.0444027542, 0.0206214552, 0.0062775633, 0.1874284209, 0.4287279544, 0.0135323415, 0.1191534166, 0.0516243416, 0.527921477, 0.0437294997, 0.1676031203, 0.2092772262, 0.206195875, 0.5232394234, 0.4964313613, 0.1552401458, 0.0369494786, 0.0264326897, 0.0981406698, 0.3211165503, 0.8868934129, 0.1771861557, 0.1581687475, 0.3290989201, 0.4858149642, 0.9107133374, 0.8387794127, 0.5441041358, 0.9467854965, 0.0386547151, 0.1265924336, 0.1753041359
## diff. convergence measure: 0.0039814744
## time: 26.52 seconds
##
## missForest iteration 3 in progress...done!
## OOB errors MSE: 634.9125112319, 0.0029818241, 0.0054347158, 0.061142584, 0.6319074392, 0.0101489654, 138890.851255719, 12.0280688644, 4.6755417573, 0.011492321, 0.0027865968, 0.0023944467, 31.7968476061, 0.0010698133, 0.9445289973, 0.6366975716, 9.7942842444, 6.7790974652, 24.3309477191, 3.2970937635, 0.0124125077, 2.990202591, 1.9512955395, 54.3366549551, 10.7262645856, 0.0022016034, 0.0064437625, 0.0019935158, 0.0017680073, 23368.3469352782, 0.0206552863, 959.66750011
## OOB errors NMSE: 0.0444771679, 0.0209914014, 0.0062922122, 0.1880204066, 0.4298384955, 0.0134481536, 0.1212628976, 0.0514752746, 0.5280541736, 0.0450354354, 0.1681367276, 0.211595947, 0.2057772435, 0.5268022442, 0.4968437969, 0.1532404076, 0.036525399, 0.0265861751, 0.0987682114, 0.3266766578, 0.8949964529, 0.1838172174, 0.1558938576, 0.3168671631, 0.4801832962, 0.9080743066, 0.8426160249, 0.5430719526, 0.9537303182, 0.0394674322, 0.1266532853, 0.1758069021
## diff. convergence measure: -0.0004709523
## time: 26.26 seconds
clean_data <- imputation_model$ximp |> as_tibble() |>
bind_cols(student_data_ph)
numeric_corr <- clean_data |>
select(where(is.numeric)) |>
cor(use = "pairwise.complete.obs")
high_corr <- findCorrelation(numeric_corr, cutoff = 0.80, names = TRUE)
if (length(high_corr) == 0) high_corr <- character(0)
model_data <- clean_data
if (length(high_corr) > 0) {
model_data <- model_data |> select(-all_of(high_corr))
}
clean_check <- model_data |>
summarise(across(everything(), ~ sum(is.na(.))))
tibble(removed_predictors = if (length(high_corr) == 0) "None" else high_corr) |> kable()
| removed_predictors |
|---|
| Hyd.Pressure3 |
| Balling |
| Density |
| Balling.Lvl |
| Alch.Rel |
| Filler.Level |
| Filler.Speed |
| Carb.Temp |
clean_check |> kable()
| Brand.Code | Carb.Volume | Fill.Ounces | PC.Volume | Carb.Pressure | PSC | PSC.Fill | PSC.CO2 | Mnf.Flow | Carb.Pressure1 | Fill.Pressure | Hyd.Pressure1 | Hyd.Pressure2 | Hyd.Pressure4 | Temperature | Usage.cont | Carb.Flow | MFR | Pressure.Vacuum | Oxygen.Filler | Bowl.Setpoint | Pressure.Setpoint | Air.Pressurer | Carb.Rel | PH |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ggplot(model_data, aes(x = "", y = PH)) +
geom_boxplot(width = 0.35, fill = "grey90", color = "grey25", outlier.size = 0.7) +
labs(
title = "Response distribution after cleaning",
x = NULL,
y = "PH"
)
The cleaned data keep the full set of observations after imputation and remove only the strongest redundant predictors. That gives us a stable starting point for modeling while avoiding unnecessary data loss and reducing the chance that highly correlated sensors will distort later fits.
mnf_suspicious_values <- c(-100, -100.2)
mnf_suspicious_summary <- tibble(
suspicious_value = mnf_suspicious_values,
count = c(
sum(student_data$Mnf.Flow == -100, na.rm = TRUE),
sum(student_data$Mnf.Flow == -100.2, na.rm = TRUE)
)
) |>
mutate(share = round(100 * count / nrow(student_data), 1))
mnf_sensitivity <- prepare_mnf_sensitivity(student_data, mnf_suspicious_values)
mnf_raw_data <- mnf_sensitivity$raw
mnf_recode_data <- mnf_sensitivity$recode
mnf_removed_predictors <- mnf_sensitivity$removed_predictors
mnf_suspicious_summary |> kable()
| suspicious_value | count | share |
|---|---|---|
| -100.0 | 606 | 23.6 |
| -100.2 | 578 | 22.5 |
tibble(
removed_predictors = if (length(mnf_removed_predictors) == 0) "None" else mnf_removed_predictors
) |> kable()
| removed_predictors |
|---|
| Hyd.Pressure3 |
| Balling |
| Density |
| Balling.Lvl |
| Alch.Rel |
| Filler.Level |
| Filler.Speed |
| Carb.Temp |
The negative Mnf Flow values are not rare: there are
1184 rows with -100 or -100.2. That is enough
to justify a sensitivity check, because these values could either be
legitimate coded readings or placeholders for an abnormal sensor state.
The comparison below keeps a suspicious-value flag in both versions,
then runs the same tree-model workflow twice: once with the raw values
left in place and once with them recoded as missing before
imputation.
mnf_raw_fit <- fit_tree_sensitivity(mnf_raw_data)
mnf_recode_fit <- fit_tree_sensitivity(mnf_recode_data)
mnf_sensitivity_metrics <- bind_rows(
mnf_raw_fit$metrics |> mutate(version = "Raw Mnf Flow"),
mnf_recode_fit$metrics |> mutate(version = "Recoded as missing")
) |>
select(version, model, rmse, mae, r2)
mnf_best_tbl <- tibble(
version = c("Raw Mnf Flow", "Recoded as missing"),
winner = c(mnf_raw_fit$best$model[[1]], mnf_recode_fit$best$model[[1]]),
winner_rmse = c(mnf_raw_fit$best$rmse, mnf_recode_fit$best$rmse)
)
mnf_sensitivity_metrics |> kable(digits = 4)
| version | model | rmse | mae | r2 |
|---|---|---|---|---|
| Raw Mnf Flow | Random forest | 0.0982 | 0.0715 | 0.6595 |
| Raw Mnf Flow | Gradient boosting | 0.1152 | 0.0899 | 0.5279 |
| Recoded as missing | Random forest | 0.0950 | 0.0704 | 0.6878 |
| Recoded as missing | Gradient boosting | 0.1127 | 0.0883 | 0.5493 |
mnf_best_tbl |> kable(digits = 4)
| version | winner | winner_rmse |
|---|---|---|
| Raw Mnf Flow | Random forest | 0.0982 |
| Recoded as missing | Random forest | 0.0950 |
ggplot(mnf_sensitivity_metrics, aes(x = model, y = rmse, fill = version)) +
geom_col(position = position_dodge(width = 0.7), width = 0.6) +
coord_flip() +
labs(
title = "Tree-model RMSE with Mnf Flow handled two ways",
x = NULL,
y = "RMSE",
fill = NULL
)
The sensitivity check shows that recoding -100 and
-100.2 as missing barely changes the tree-model outcome.
Random forest remains the better performer in both versions, and the
RMSE/MAE values move only slightly. That tells us the odd
Mnf Flow values are worth flagging, but they are not
driving the main conclusion. In practical terms, the model is robust to
how we handle those codes, so we can keep the main story focused on the
broader process signal rather than on those individual anomalies.
numeric_data <- model_data |> select(where(is.numeric))
positive_numeric_names <- names(Filter(function(x) min(x, na.rm = TRUE) > 0, numeric_data))
nonpositive_numeric_names <- setdiff(names(numeric_data), positive_numeric_names)
boxcox_tbl <- bind_rows(lapply(positive_numeric_names, function(v) {
trans <- BoxCoxTrans(numeric_data[[v]])
tibble(
variable = v,
lambda = unname(trans$lambda),
transform_need = abs(unname(trans$lambda) - 1)
)
})) |>
arrange(desc(transform_need))
boxcox_ignored <- tibble(variable = nonpositive_numeric_names)
if (nrow(boxcox_ignored) == 0) boxcox_ignored <- tibble(variable = "None")
carb_flow_bc <- predict(BoxCoxTrans(model_data$Carb.Flow), model_data$Carb.Flow)
carb_flow_boxcox <- tibble(
scale = rep(c("Raw Carb Flow", "Box-Cox transformed Carb Flow"), each = nrow(model_data)),
value = c(model_data$Carb.Flow, carb_flow_bc)
)
carb_flow_qq <- carb_flow_boxcox |>
mutate(scale = factor(scale, levels = c("Raw Carb Flow", "Box-Cox transformed Carb Flow")))
boxcox_tbl |> head(12) |> kable(digits = 4)
| variable | lambda | transform_need |
|---|---|---|
| Carb.Volume | -2.0 | 3.0 |
| Temperature | -2.0 | 3.0 |
| Pressure.Setpoint | -2.0 | 3.0 |
| Air.Pressurer | -2.0 | 3.0 |
| Carb.Rel | -2.0 | 3.0 |
| Fill.Pressure | -0.7 | 1.7 |
| Hyd.Pressure4 | -0.3 | 1.3 |
| Carb.Pressure | -0.2 | 1.2 |
| Fill.Ounces | 2.0 | 1.0 |
| Usage.cont | 2.0 | 1.0 |
| MFR | 2.0 | 1.0 |
| Bowl.Setpoint | 2.0 | 1.0 |
boxcox_ignored |> kable()
| variable |
|---|
| PSC.Fill |
| PSC.CO2 |
| Mnf.Flow |
| Hyd.Pressure1 |
| Hyd.Pressure2 |
| Pressure.Vacuum |
ggplot(boxcox_tbl |> slice_head(n = 12), aes(x = reorder(variable, transform_need), y = lambda, fill = transform_need)) +
geom_col(color = NA, width = 0.65) +
scale_fill_gradient(low = "darkgreen", high = "darkred") +
geom_hline(yintercept = 1, linetype = "dashed", linewidth = 0.3, color = "grey50") +
coord_flip() +
labs(
title = "Box-Cox lambda profile by predictor",
x = NULL,
y = "Lambda"
)
The Box-Cox check helps us see which predictors are naturally skewed and would benefit from a power transform before modeling. Some variables include zeros or negative values, so they cannot be transformed directly without a shift. For that reason, we treat Box-Cox as a diagnostic step that informs the modeling strategy rather than as a one-size-fits-all cleanup rule.
ggplot(model_data, aes(x = Carb.Flow)) +
geom_histogram(bins = 25, fill = "grey70", color = NA) +
labs(
title = "Raw Carb Flow distribution",
x = "Carb Flow",
y = "Count"
)
ggplot(carb_flow_boxcox, aes(x = value)) +
geom_histogram(bins = 25, fill = "grey70", color = NA) +
facet_wrap(~scale, scales = "free_x") +
labs(
title = "Carb Flow before and after Box-Cox transformation",
x = "Carb Flow",
y = "Count"
)
ggplot(carb_flow_qq, aes(sample = value)) +
stat_qq(size = 0.45, alpha = 0.5) +
stat_qq_line(color = "grey40", linewidth = 0.4) +
facet_wrap(~scale, scales = "free") +
labs(
title = "QQ check before and after Box-Cox transformation",
x = "Theoretical quantiles",
y = "Sample quantiles"
)
set.seed(2026)
idx <- createDataPartition(model_data$PH, p = 0.80, list = FALSE)
train_data <- model_data[idx, ]
test_data <- model_data[-idx, ]
x_all <- model.matrix(PH ~ ., data = model_data)[, -1]
x_train <- x_all[idx, ]
x_test <- x_all[-idx, ]
y_train <- train_data$PH
y_test <- test_data$PH
ctrl <- trainControl(method = "cv", number = 5)
lm_model <- lm(PH ~ ., data = train_data)
lm_pred <- predict(lm_model, newdata = test_data)
lm_metrics <- metric_row("Linear regression", y_test, lm_pred)
lm_adj_r2 <- summary(lm_model)$adj.r.squared
lm_coef_tbl <- as.data.frame(coef(summary(lm_model)))
lm_coef_tbl$term <- rownames(lm_coef_tbl)
lm_coef_tbl <- lm_coef_tbl[, c(ncol(lm_coef_tbl), 1:(ncol(lm_coef_tbl) - 1))]
lm_coef_tbl <- lm_coef_tbl[order(lm_coef_tbl$`Pr(>|t|)`), ]
lm_coef_tbl |> head(12) |> kable(digits = 4)
| term | Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|---|
| Mnf.Flow | Mnf.Flow | -0.0006 | 0.0000 | -12.9816 | 0.0000 |
| (Intercept) | (Intercept) | 10.8404 | 1.0023 | 10.8156 | 0.0000 |
| Carb.Pressure1 | Carb.Pressure1 | 0.0066 | 0.0008 | 8.1610 | 0.0000 |
| Bowl.Setpoint | Bowl.Setpoint | 0.0022 | 0.0003 | 7.1699 | 0.0000 |
| Usage.cont | Usage.cont | -0.0079 | 0.0013 | -6.1238 | 0.0000 |
| Brand.CodeD | Brand.CodeD | 0.0776 | 0.0138 | 5.6205 | 0.0000 |
| Temperature | Temperature | -0.0135 | 0.0025 | -5.3908 | 0.0000 |
| Brand.CodeB | Brand.CodeB | 0.0636 | 0.0124 | 5.1137 | 0.0000 |
| Brand.CodeC | Brand.CodeC | -0.0608 | 0.0146 | -4.1573 | 0.0000 |
| Oxygen.Filler | Oxygen.Filler | -0.3373 | 0.0821 | -4.1086 | 0.0000 |
| Pressure.Setpoint | Pressure.Setpoint | -0.0077 | 0.0022 | -3.4693 | 0.0005 |
| Fill.Pressure | Fill.Pressure | 0.0040 | 0.0013 | 2.9568 | 0.0031 |
We start with ordinary linear regression because leadership should be able to see a transparent baseline before we move to more flexible methods. This model gives us p-values, coefficient signs, and a quick read on which measurements matter most, but it also exposes the limits of a purely linear view when several sensors move together.
pred_lm <- tibble(
model = "Linear regression",
observed = y_test,
predicted = lm_pred,
residual = y_test - lm_pred
)
plot_obs_pred(pred_lm, "Linear regression: observed vs predicted")
ridge_path <- glmnet(x_train, y_train, alpha = 0)
ridge_cv <- cv.glmnet(x_train, y_train, alpha = 0, nfolds = 5)
ridge_pred <- as.numeric(predict(ridge_cv, newx = x_test, s = "lambda.min"))
ridge_metrics <- metric_row("Ridge", y_test, ridge_pred)
ridge_coef_tbl <- top_abs_coef(coef(ridge_cv, s = "lambda.min"), n = 12)
op <- par(mar = c(5, 4, 6, 2) + 0.1)
plot(ridge_path, xvar = "lambda", label = FALSE, main = "", xlab = "log(lambda)")
title(main = "Ridge coefficient path", line = 3)
plot(ridge_cv, main = "", xlab = "log(lambda)", ylab = "Cross-validation error")
title(main = "Ridge cross-validation curve", line = 3)
par(op)
Ridge is our first safeguard against correlated sensors. It keeps every predictor in the model but shrinks the coefficients, which helps stabilize the fit when several process variables are telling us almost the same thing.
ridge_coef_tbl |> kable(digits = 4)
| term | estimate |
|---|---|
| Oxygen.Filler | -0.2612 |
| PC.Volume | -0.1240 |
| Fill.Ounces | -0.0998 |
| PSC.CO2 | -0.0976 |
| Carb.Rel | 0.0747 |
| Carb.Volume | -0.0705 |
| Brand.CodeD | 0.0648 |
| Brand.CodeC | -0.0643 |
| PSC | -0.0612 |
| Brand.CodeB | 0.0566 |
| PSC.Fill | -0.0511 |
| Temperature | -0.0137 |
lasso_path <- glmnet(x_train, y_train, alpha = 1)
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1, nfolds = 5)
lasso_pred <- as.numeric(predict(lasso_cv, newx = x_test, s = "lambda.min"))
lasso_metrics <- metric_row("Lasso", y_test, lasso_pred)
lasso_coef_tbl <- top_abs_coef(coef(lasso_cv, s = "lambda.min"), n = 12)
lasso_nonzero <- sum(as.matrix(coef(lasso_cv, s = "lambda.min"))[, 1] != 0) - 1
op <- par(mar = c(5, 4, 6, 2) + 0.1)
plot(lasso_path, xvar = "lambda", label = FALSE, main = "", xlab = "log(lambda)")
title(main = "Lasso coefficient path", line = 3)
plot(lasso_cv, main = "", xlab = "log(lambda)", ylab = "Cross-validation error")
title(main = "Lasso cross-validation curve", line = 3)
par(op)
Lasso takes the same idea one step further by not only shrinking coefficients, but also setting some of them to zero. That gives us a more compact story about which process variables appear most important if we want a leaner model.
lasso_coef_tbl |> kable(digits = 4)
| term | estimate |
|---|---|
| Oxygen.Filler | -0.3110 |
| PC.Volume | -0.1215 |
| Fill.Ounces | -0.0973 |
| PSC.CO2 | -0.0902 |
| Brand.CodeD | 0.0725 |
| Carb.Rel | 0.0637 |
| Carb.Volume | -0.0634 |
| Brand.CodeC | -0.0630 |
| Brand.CodeB | 0.0599 |
| PSC.Fill | -0.0506 |
| PSC | -0.0456 |
| Temperature | -0.0132 |
pls_grid <- data.frame(ncomp = 1:min(10, ncol(train_data) - 1))
pls_model <- train(
PH ~ .,
data = train_data,
method = "pls",
trControl = ctrl,
preProcess = c("center", "scale"),
tuneGrid = pls_grid
)
pls_pred <- predict(pls_model, newdata = test_data)
pls_metrics <- metric_row("PLS", y_test, pls_pred)
pls_importance <- varImp(pls_model)$importance |>
tibble::rownames_to_column("term") |>
as_tibble() |>
arrange(desc(Overall))
ggplot(as_tibble(pls_model$results), aes(ncomp, RMSE)) +
geom_line() +
geom_point() +
labs(
title = "PLS tuning curve",
x = "Number of components",
y = "RMSE"
)
PLS gives us a different way to handle correlated process measurements by compressing them into response-aware components. It is useful as a middle ground between direct regression and fully nonlinear methods, so it serves as a good checkpoint in the journey.
pls_importance |> head(12) |> kable(digits = 4)
| term | Overall |
|---|---|
| Mnf.Flow | 100.0000 |
| Brand.CodeC | 78.3040 |
| Bowl.Setpoint | 72.3439 |
| Usage.cont | 67.6834 |
| Pressure.Setpoint | 54.9554 |
| Fill.Pressure | 48.1418 |
| Pressure.Vacuum | 45.2396 |
| Temperature | 43.3939 |
| Hyd.Pressure2 | 42.4797 |
| Brand.CodeB | 39.4568 |
| Brand.CodeD | 36.6921 |
| Oxygen.Filler | 35.7157 |
rf_model <- train(
PH ~ .,
data = train_data,
method = "rf",
trControl = ctrl,
ntree = 300,
tuneLength = 3,
importance = TRUE
)
rf_pred <- predict(rf_model, newdata = test_data)
rf_metrics <- metric_row("Random forest", y_test, rf_pred)
rf_importance_tbl <- varImp(rf_model)$importance |>
tibble::rownames_to_column("term") |>
as_tibble() |>
arrange(desc(Overall)) |>
slice_head(n = 10)
ggplot(rf_importance_tbl, aes(x = reorder(term, Overall), y = Overall)) +
geom_col(aes(fill = Overall), color = NA, width = 0.65) +
scale_fill_gradient(low = "darkred", high = "darkgreen") +
coord_flip() +
labs(
title = "Random forest variable importance",
x = NULL,
y = "Overall importance"
)
Random forest moves us beyond straight-line relationships and into a model that can capture nonlinear interactions among process variables. In a manufacturing setting, that matters because pH is often shaped by combinations of pressures, flows, temperatures, and filler behavior rather than any single measurement on its own.
The importance ranking also gives us a useful process clue.
Mnf Flow is acting like a broad proxy for the state of the
line, not just a simple speed reading. It can reflect how long product
stays in the system, how well it mixes, and whether the line is
operating in a normal or unusual mode. That broad role makes it
informative in a way that a single temperature or pressure column may
not be. The carbonation variables already carry much of the chemistry
tied to dissolved CO2, so some of the temperature and pressure signal is
likely being absorbed there. On top of that, temperature and pressure
are split across several fields, which dilutes the effect of any one
column, while flow is concentrated in one place and can vary more
visibly across runs.
gbm_model <- train(
PH ~ .,
data = train_data,
method = "gbm",
trControl = ctrl,
verbose = FALSE,
tuneLength = 3
)
gbm_pred <- predict(gbm_model, newdata = test_data)
gbm_metrics <- metric_row("Gradient boosting", y_test, gbm_pred)
gbm_importance_tbl <- varImp(gbm_model)$importance |>
tibble::rownames_to_column("term") |>
as_tibble() |>
arrange(desc(Overall)) |>
slice_head(n = 10)
ggplot(gbm_importance_tbl, aes(x = reorder(term, Overall), y = Overall)) +
geom_col(aes(fill = Overall), color = NA, width = 0.65) +
scale_fill_gradient(low = "darkred", high = "darkgreen") +
coord_flip() +
labs(
title = "Gradient boosting variable importance",
x = NULL,
y = "Overall importance"
)
Boosting is the most flexible model in this sequence, so it is a natural challenger when the process contains several weak nonlinear effects that add up. The tradeoff is that it asks us to give up some transparency in exchange for potential gains in predictive power.
In this run, gradient boosting improved on the linear baselines, but it did not beat random forest. That matters because GBM is not automatically better just because it is more complex. It depends heavily on tuning choices such as the number of trees, interaction depth, and learning rate. Here the model was only tuned lightly, so it may not have reached its best bias-variance tradeoff. Random forest, by contrast, is often more forgiving on moderately noisy process data because many trees are averaged together, which can reduce variance without requiring as much careful tuning.
The reason GBM never fully overtakes random forest is also consistent
with the process story. If one variable, like Mnf Flow, is
carrying a lot of broad operating information while temperature and
pressure are partially folded into other carbonation measures, the model
may benefit more from a robust ensemble that can average over that
uneven signal. GBM can still work well, but it usually needs more
deliberate tuning to separate subtle effects from the stronger
process-state pattern that random forest is already capturing.
nn_grid <- expand.grid(
size = c(1, 3, 5),
decay = c(0, 0.001, 0.01)
)
nn_model <- train(
x = x_train,
y = y_train,
method = "nnet",
trControl = ctrl,
preProcess = c("center", "scale"),
tuneGrid = nn_grid,
trace = FALSE,
MaxNWts = 5000,
maxit = 500
)
nn_pred <- predict(nn_model, newdata = x_test)
nn_metrics <- metric_row("Neural network", y_test, nn_pred)
ggplot(as_tibble(nn_model$results), aes(x = size, y = decay, fill = RMSE)) +
geom_tile(color = "white") +
geom_text(aes(label = round(RMSE, 4)), color = "grey95", fontface = "bold", size = 3) +
scale_fill_gradient(
low = "darkgreen",
high = "darkred",
guide = guide_colorbar(
barwidth = grid::unit(5.5, "cm"),
barheight = grid::unit(0.4, "cm"),
ticks = FALSE
)
) +
theme(legend.text = element_blank()) +
labs(
title = "Neural network tuning surface",
x = "Hidden units",
y = "Decay",
fill = "RMSE"
)
A neural network is worth trying here because the cleaned data set is large enough for a modest nonlinear model, but still small enough that a shallow network is the sensible scale. Using the matrix form of the predictors keeps the comparison aligned with ridge, lasso, and PLS while still letting the network learn nonlinear interactions.
The real question is whether that flexibility helps more than the structured splits in the tree-based models. If the neural net does not win, it still contributes useful evidence about the shape of the process signal: it would suggest that the data reward grouped rules and averaging more than a smooth nonlinear surface.
mars_grid = expand.grid(.degree = 1:3,
.nprune = 2:40)
set.seed(4802)
mars_model = train(x_train, y_train, method = "earth",
tuneGrid = mars_grid, trControl = trainControl(method = "cv"))
mars_model
## Multivariate Adaptive Regression Spline
##
## 2055 samples
## 26 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1849, 1849, 1850, 1849, 1850, 1849, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 0.1530804 0.2199326 0.11992367
## 1 3 0.1447556 0.3026516 0.11317537
## 1 4 0.1431707 0.3185217 0.11138224
## 1 5 0.1422096 0.3269543 0.11087623
## 1 6 0.1415654 0.3337418 0.11045948
## 1 7 0.1395315 0.3534574 0.10819975
## 1 8 0.1373283 0.3734254 0.10684295
## 1 9 0.1371869 0.3748793 0.10652854
## 1 10 0.1366040 0.3805543 0.10591282
## 1 11 0.1358162 0.3871270 0.10491793
## 1 12 0.1354421 0.3903183 0.10477593
## 1 13 0.1355102 0.3896758 0.10479063
## 1 14 0.1350368 0.3939793 0.10437076
## 1 15 0.1353007 0.3919157 0.10461137
## 1 16 0.1348352 0.3960488 0.10412560
## 1 17 0.1348041 0.3965046 0.10400771
## 1 18 0.1348532 0.3959228 0.10396929
## 1 19 0.1349107 0.3957335 0.10381036
## 1 20 0.1348068 0.3969842 0.10371997
## 1 21 0.1348806 0.3965845 0.10374351
## 1 22 0.1350050 0.3955238 0.10385199
## 1 23 0.1350981 0.3948238 0.10388658
## 1 24 0.1352442 0.3937749 0.10388120
## 1 25 0.1353387 0.3933090 0.10381274
## 1 26 0.1350132 0.3961041 0.10352188
## 1 27 0.1349634 0.3965455 0.10343309
## 1 28 0.1349084 0.3971027 0.10340304
## 1 29 0.1349451 0.3971491 0.10339319
## 1 30 0.1349447 0.3971752 0.10331820
## 1 31 0.1350587 0.3963988 0.10343699
## 1 32 0.1351117 0.3959191 0.10343229
## 1 33 0.1348912 0.3977610 0.10325484
## 1 34 0.1349092 0.3977948 0.10324660
## 1 35 0.1349072 0.3977686 0.10323973
## 1 36 0.1349072 0.3977686 0.10323973
## 1 37 0.1349072 0.3977686 0.10323973
## 1 38 0.1349072 0.3977686 0.10323973
## 1 39 0.1349072 0.3977686 0.10323973
## 1 40 0.1349072 0.3977686 0.10323973
## 2 2 0.1530804 0.2199326 0.11992367
## 2 3 0.1472295 0.2787374 0.11589472
## 2 4 0.1446334 0.3034280 0.11428082
## 2 5 0.1423643 0.3256869 0.11147829
## 2 6 0.1412510 0.3371803 0.11030493
## 2 7 0.1392800 0.3561903 0.10868920
## 2 8 0.1381175 0.3664563 0.10663767
## 2 9 0.1366320 0.3812187 0.10541640
## 2 10 0.1360049 0.3869177 0.10510501
## 2 11 0.1345853 0.4002541 0.10392173
## 2 12 0.1327328 0.4160324 0.10234370
## 2 13 0.1319401 0.4220520 0.10147932
## 2 14 0.1310271 0.4300779 0.10078022
## 2 15 0.1302503 0.4372052 0.10009143
## 2 16 0.1299984 0.4390567 0.09972055
## 2 17 0.1300561 0.4386617 0.09962475
## 2 18 0.1292271 0.4454814 0.09889966
## 2 19 0.1291958 0.4460462 0.09854813
## 2 20 0.1293103 0.4456901 0.09859703
## 2 21 0.1291066 0.4473245 0.09851171
## 2 22 0.1291824 0.4471135 0.09842115
## 2 23 0.1288751 0.4496328 0.09824632
## 2 24 0.1286738 0.4513078 0.09802570
## 2 25 0.1287770 0.4507290 0.09818558
## 2 26 0.1287136 0.4511480 0.09826289
## 2 27 0.1286456 0.4519299 0.09821330
## 2 28 0.1288934 0.4502725 0.09833212
## 2 29 0.1286258 0.4521189 0.09827586
## 2 30 0.1286226 0.4521221 0.09822689
## 2 31 0.1288090 0.4511722 0.09848295
## 2 32 0.1287120 0.4519608 0.09839870
## 2 33 0.1286208 0.4526833 0.09823894
## 2 34 0.1285134 0.4534777 0.09814826
## 2 35 0.1285418 0.4534861 0.09828997
## 2 36 0.1285876 0.4532308 0.09832435
## 2 37 0.1285876 0.4532308 0.09832435
## 2 38 0.1285280 0.4538101 0.09823408
## 2 39 0.1285280 0.4538101 0.09823408
## 2 40 0.1285259 0.4537433 0.09819948
## 3 2 0.1528737 0.2224256 0.11952946
## 3 3 0.1479996 0.2725603 0.11565283
## 3 4 0.1446025 0.3065619 0.11332085
## 3 5 0.1421860 0.3311396 0.11152093
## 3 6 0.1403264 0.3481586 0.10963260
## 3 7 0.1383726 0.3668156 0.10757688
## 3 8 0.1376644 0.3735815 0.10670812
## 3 9 0.1357488 0.3901300 0.10480473
## 3 10 0.1413520 0.3867188 0.10489436
## 3 11 0.1419360 0.3941660 0.10449480
## 3 12 0.1415904 0.4017852 0.10379752
## 3 13 0.1406109 0.4103714 0.10239061
## 3 14 0.1396718 0.4184280 0.10132080
## 3 15 0.1396462 0.4209129 0.10090530
## 3 16 0.1388287 0.4276349 0.10060519
## 3 17 0.1385654 0.4302713 0.10013700
## 3 18 0.1385349 0.4311069 0.09967114
## 3 19 0.1386348 0.4308935 0.09965815
## 3 20 0.1387075 0.4295826 0.09991349
## 3 21 0.1384601 0.4325544 0.09963313
## 3 22 0.1380869 0.4357017 0.09932260
## 3 23 0.1384123 0.4344252 0.09923021
## 3 24 0.1390915 0.4341468 0.09915279
## 3 25 0.1390596 0.4344183 0.09907787
## 3 26 0.1391192 0.4349731 0.09913553
## 3 27 0.1387637 0.4365060 0.09888459
## 3 28 0.1383958 0.4367571 0.09879473
## 3 29 0.1380311 0.4402865 0.09870461
## 3 30 0.1378934 0.4415655 0.09864432
## 3 31 0.1383598 0.4389247 0.09903042
## 3 32 0.1386127 0.4392583 0.09917142
## 3 33 0.1385429 0.4408078 0.09882112
## 3 34 0.1386128 0.4399771 0.09890200
## 3 35 0.1384344 0.4407022 0.09875892
## 3 36 0.1385096 0.4402435 0.09881147
## 3 37 0.1384239 0.4412483 0.09867646
## 3 38 0.1383911 0.4414845 0.09868745
## 3 39 0.1382946 0.4422462 0.09856212
## 3 40 0.1382559 0.4425589 0.09853185
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 34 and degree = 2.
mars_predict = predict(mars_model, x_test)
mars_values = data.frame(obs = y_test, pred = mars_predict) |>
rename("pred" = y)
defaultSummary(mars_values)
## RMSE Rsquared MAE
## 0.12451112 0.47347262 0.09510244
mars_metrics <- metric_row("MARS", y_test, mars_predict)
set.seed(4802)
svm_model = train(x_train, y_train, method = "svmRadial",
preProcess = c("center", "scale"), tuneLength = 14,
trControl = trainControl(method = "cv"))
svm_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 2055 samples
## 26 predictor
##
## Pre-processing: centered (26), scaled (26)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1849, 1849, 1850, 1849, 1850, 1849, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.1274390 0.4667806 0.09472354
## 0.50 0.1249602 0.4853215 0.09180308
## 1.00 0.1230379 0.4990139 0.08958601
## 2.00 0.1221784 0.5053651 0.08869118
## 4.00 0.1217612 0.5092987 0.08856583
## 8.00 0.1220978 0.5095259 0.08921912
## 16.00 0.1233072 0.5072926 0.09045709
## 32.00 0.1263891 0.4955804 0.09334229
## 64.00 0.1319430 0.4704690 0.09752783
## 128.00 0.1408356 0.4319201 0.10484662
## 256.00 0.1456818 0.4154403 0.10919258
## 512.00 0.1510843 0.3955517 0.11328195
## 1024.00 0.1514337 0.3946032 0.11357131
## 2048.00 0.1514337 0.3946032 0.11357131
##
## Tuning parameter 'sigma' was held constant at a value of 0.02473474
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.02473474 and C = 4.
svm_predict = predict(svm_model, x_test)
svm_values = data.frame(obs = y_test, pred = svm_predict)
defaultSummary(svm_values)
## RMSE Rsquared MAE
## 0.11909427 0.51195037 0.08793649
svm_metrics <- metric_row("SVM", y_test, svm_predict)
set.seed(4802)
knn_model = train(x_train, y_train, method = "knn", preProc = c("center", "scale"),
tuneGrid = data.frame(.k = 1:20),
trControl = trainControl(method = "cv"))
knn_model
## k-Nearest Neighbors
##
## 2055 samples
## 26 predictor
##
## Pre-processing: centered (26), scaled (26)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1849, 1849, 1850, 1849, 1850, 1849, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 0.1531553 0.3685101 0.10416892
## 2 0.1333942 0.4518124 0.09504035
## 3 0.1264984 0.4854762 0.09145719
## 4 0.1245714 0.4953773 0.09078836
## 5 0.1239387 0.4975112 0.09053362
## 6 0.1232905 0.5010337 0.09027578
## 7 0.1232877 0.4998761 0.09064188
## 8 0.1233476 0.4997777 0.09118220
## 9 0.1233922 0.4993114 0.09166419
## 10 0.1235735 0.4985427 0.09226155
## 11 0.1246036 0.4909626 0.09310106
## 12 0.1253195 0.4853954 0.09371691
## 13 0.1257149 0.4823119 0.09432147
## 14 0.1263369 0.4774003 0.09494671
## 15 0.1266877 0.4752064 0.09536990
## 16 0.1271108 0.4718543 0.09599436
## 17 0.1273332 0.4703405 0.09608894
## 18 0.1279337 0.4653327 0.09681282
## 19 0.1282629 0.4622493 0.09725985
## 20 0.1287082 0.4584443 0.09769344
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 7.
knn_predict = predict(knn_model, x_test)
knn_values = data.frame(obs = y_test, pred = knn_predict)
defaultSummary(knn_values)
## RMSE Rsquared MAE
## 0.12296905 0.48156846 0.09067453
knn_metrics <- metric_row("KNN", y_test, knn_predict)
model_metrics <- bind_rows(
lm_metrics,
ridge_metrics,
lasso_metrics,
pls_metrics,
rf_metrics,
gbm_metrics,
nn_metrics,
mars_metrics,
svm_metrics,
knn_metrics,
) |>
mutate(
adjusted_r2 = c(lm_adj_r2, rep(NA_real_, 9))
) |>
select(model, rmse, mae, r2, adjusted_r2)
best_model <- model_metrics |> arrange(rmse) |> slice(1)
best_model_name <- best_model$model[[1]]
model_metrics |> arrange(rmse) |> kable(digits = 4)
| model | rmse | mae | r2 | adjusted_r2 |
|---|---|---|---|---|
| Random forest | 0.1059 | 0.0739 | 0.6142788 | NA |
| SVM | 0.1191 | 0.0879 | 0.5119504 | NA |
| KNN | 0.1230 | 0.0907 | 0.4815685 | NA |
| MARS | 0.1245 | 0.0951 | 0.4734726 | NA |
| Gradient boosting | 0.1258 | 0.0955 | 0.4567741 | NA |
| Ridge | 0.1365 | 0.1061 | 0.3579047 | NA |
| Lasso | 0.1366 | 0.1061 | 0.3569850 | NA |
| Linear regression | 0.1366 | 0.1060 | 0.3574204 | 0.3838 |
| PLS | 0.1366 | 0.1058 | 0.3573964 | NA |
| Neural network | 7.5521 | 7.5502 | NA | NA |
ggplot(model_metrics, aes(x = reorder(model, rmse), y = rmse, fill = rmse)) +
geom_col(color = NA, width = 0.65) +
scale_fill_gradient(low = "darkgreen", high = "darkred") +
coord_flip() +
labs(
title = "Test-set RMSE by model",
x = NULL,
y = "RMSE"
)
prediction_comparison <- bind_rows(
tibble(model = "Linear regression", observed = y_test, predicted = lm_pred),
tibble(model = "Ridge", observed = y_test, predicted = ridge_pred),
tibble(model = "Lasso", observed = y_test, predicted = lasso_pred),
tibble(model = "PLS", observed = y_test, predicted = pls_pred),
tibble(model = "Random forest", observed = y_test, predicted = rf_pred),
tibble(model = "Gradient boosting", observed = y_test, predicted = gbm_pred),
tibble(model = "Neural network", observed = y_test, predicted = nn_pred),
tibble(model = "MARS", observed = y_test, predicted = mars_predict),
tibble(model = "SVM", observed = y_test, predicted = svm_predict),
tibble(model = "KNN", observed = y_test, predicted = knn_predict)
) |>
mutate(residual = observed - predicted)
ggplot(prediction_comparison, aes(x = model, y = residual)) +
geom_boxplot(width = 0.55, fill = "grey92", color = "grey25", outlier.size = 0.7) +
coord_flip() +
geom_hline(yintercept = 0, linetype = 2, color = "gray45", linewidth = 0.35) +
labs(
title = "Residual spread by model",
x = NULL,
y = "Observed - predicted"
)
ggplot(prediction_comparison, aes(x = observed, y = predicted)) +
geom_point(alpha = 0.45, size = 0.9) +
geom_abline(
slope = 1,
intercept = 0,
linetype = 2,
color = "gray45",
linewidth = 0.35
) +
facet_wrap(~ model, ncol = 2, scales = "free") +
labs(
title = "Observed vs predicted across candidate models",
x = "Observed PH",
y = "Predicted PH"
)
The winner is Random forest, because it gives the lowest test RMSE in
this run while still being backed up by a strong residual pattern and a
more competitive R^2 than the simpler linear baselines.
Ridge and lasso still matter because they show how shrinkage improves
stability, but the selected tree-based model handles the process
nonlinearity more effectively.
Gradient boosting is still an important checkpoint in the journey even though it does not win here. It gives a clear step up from ordinary regression, ridge, lasso, and PLS, which tells us the data do contain nonlinear structure. The fact that it still trails random forest suggests either that the boosting hyperparameters need more work or that the random forest averaging strategy fits this data better under the current cleaning pipeline. In other words, boosting was competitive, but not yet competitive enough to replace the forest model.
The Mnf Flow sensitivity check reinforces that
interpretation. When the suspicious -100 and
-100.2 values are recoded as missing and imputed, the
ranking of the tree models does not change: random forest still wins,
and gradient boosting stays in second place. That means the odd flow
readings are not the reason the forest won. They are a data-quality
concern worth noting, but the overall model comparison is stable whether
we leave those values in or treat them as missing.
prediction_map <- list(
"Linear regression" = lm_pred,
"Ridge" = ridge_pred,
"Lasso" = lasso_pred,
"PLS" = pls_pred,
"Random forest" = rf_pred,
"Gradient boosting" = gbm_pred,
"Neural network" = nn_pred,
"MARS" = mars_predict,
"SVM" = svm_predict,
"KNN" = knn_predict
)
selected_predictions <- tibble(
observed = y_test,
predicted = prediction_map[[best_model_name]],
residual = y_test - prediction_map[[best_model_name]]
)
write.csv(
selected_predictions,
"ABC_Beverage_Selected_Model_Predictions.csv",
row.names = FALSE
)
if (any(file.exists(eval_paths))) {
scoring_data <- evaluation_data |> select(any_of(names(model_data)))
scoring_data <- scoring_data |> mutate(Brand.Code = factor(Brand.Code, levels = levels(model_data$Brand.Code)))
head(scoring_data)
} else {
head(model_data)
}
## # A tibble: 6 × 25
## Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure PSC PSC.Fill
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 B 5.34 24.0 0.263 68.2 0.104 0.260
## 2 A 5.43 24.0 0.239 68.4 0.124 0.220
## 3 B 5.29 24.1 0.263 70.8 0.09 0.34
## 4 A 5.44 24.0 0.293 63 0.118 0.420
## 5 A 5.49 24.3 0.111 67.2 0.026 0.16
## 6 A 5.38 23.9 0.269 66.6 0.09 0.240
## # ℹ 18 more variables: PSC.CO2 <dbl>, Mnf.Flow <dbl>, Carb.Pressure1 <dbl>,
## # Fill.Pressure <dbl>, Hyd.Pressure1 <dbl>, Hyd.Pressure2 <dbl>,
## # Hyd.Pressure4 <dbl>, Temperature <dbl>, Usage.cont <dbl>, Carb.Flow <dbl>,
## # MFR <dbl>, Pressure.Vacuum <dbl>, Oxygen.Filler <dbl>, Bowl.Setpoint <dbl>,
## # Pressure.Setpoint <dbl>, Air.Pressurer <dbl>, Carb.Rel <dbl>, PH <dbl>
This optional chunk keeps the report portable. If a separate evaluation workbook is available later, the same modeling workflow can score it directly without changing the rest of the document.
data_to_predict = read_excel("Test Data.xlsx") |> rename_with(make.names)
data_to_predict_predictors = data_to_predict |> select(-PH)
data_to_predict_updated = missForestPredict(imputation_model,
newdata = data_to_predict_predictors |> mutate(Brand.Code = as.factor(Brand.Code)))
data_to_predict_updated |>
summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 32
## Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC
## <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0
## # ℹ 25 more variables: PSC.Fill <int>, PSC.CO2 <int>, Mnf.Flow <int>,
## # Carb.Pressure1 <int>, Fill.Pressure <int>, Hyd.Pressure1 <int>,
## # Hyd.Pressure2 <int>, Hyd.Pressure3 <int>, Hyd.Pressure4 <int>,
## # Filler.Level <int>, Filler.Speed <int>, Temperature <int>,
## # Usage.cont <int>, Carb.Flow <int>, Density <int>, MFR <int>, Balling <int>,
## # Pressure.Vacuum <int>, Oxygen.Filler <int>, Bowl.Setpoint <int>,
## # Pressure.Setpoint <int>, Air.Pressurer <int>, Alch.Rel <int>, …
data_to_predict_updated = data_to_predict_updated |> select(-all_of(high_corr))
predicted_ph = predict(rf_model, newdata = data_to_predict_updated)
final_data = data_to_predict_updated |>
bind_cols(data.frame(predicted_ph) |> rename("PH" = predicted_ph))
final_data |> head()
## # A tibble: 6 × 25
## Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure PSC PSC.Fill
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 D 5.48 24.0 0.27 65.4 0.236 0.400
## 2 A 5.39 24.0 0.227 63.2 0.042 0.220
## 3 B 5.29 23.9 0.303 66.4 0.068 0.100
## 4 B 5.27 23.9 0.186 64.8 0.004 0.200
## 5 B 5.41 24.2 0.16 69.4 0.04 0.3
## 6 B 5.29 24.1 0.212 73.4 0.078 0.220
## # ℹ 18 more variables: PSC.CO2 <dbl>, Mnf.Flow <dbl>, Carb.Pressure1 <dbl>,
## # Fill.Pressure <dbl>, Hyd.Pressure1 <dbl>, Hyd.Pressure2 <dbl>,
## # Hyd.Pressure4 <dbl>, Temperature <dbl>, Usage.cont <dbl>, Carb.Flow <dbl>,
## # MFR <dbl>, Pressure.Vacuum <dbl>, Oxygen.Filler <dbl>, Bowl.Setpoint <dbl>,
## # Pressure.Setpoint <dbl>, Air.Pressurer <dbl>, Carb.Rel <dbl>, PH <dbl>