library(tidyverse)

## Warning: package 'lubridate' was built under R version 4.4.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readxl)
library(pls)

## 
## Attaching package: 'pls'
## 
## The following object is masked from 'package:stats':
## 
##     loadings

library(elasticnet)

## Loading required package: lars
## Loaded lars 1.3

library(DataExplorer)
library(DT)

## Warning: package 'DT' was built under R version 4.4.3

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:pls':
## 
##     R2
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(ggplot2)

Instructions

This is role playing. I am your new boss. I am in charge of production at ABC Beverage and you are a team of data scientists reporting to me. My leadership has told me that new regulations are requiring us to understand our manufacturing process, the predictive factors and be able to report to them our predictive model of PH.

Please use the historical data set I am providing. Build and report the factors in BOTH a technical and non-technical report. I like to use Word and Excel. Please provide your non-technical report in a business friendly readable document and your predictions in an Excel readable format. The technical report should show clearly the models you tested and how you selected your final approach.

Please submit both Rpubs links and .rmd files or other readable formats for technical and non-technical reports. Also submit the excel file showing the prediction of your models for pH. # Load & Review Train Data

train <- read_excel("C:/Users/Admin/Downloads/StudentData.xlsx")

#review
glimpse(train)

## Rows: 2,571
## Columns: 33
## $ `Brand Code`        <chr> "B", "A", "B", "A", "A", "A", "A", "B", "B", "B", …
## $ `Carb Volume`       <dbl> 5.340000, 5.426667, 5.286667, 5.440000, 5.486667, …
## $ `Fill Ounces`       <dbl> 23.96667, 24.00667, 24.06000, 24.00667, 24.31333, …
## $ `PC Volume`         <dbl> 0.2633333, 0.2386667, 0.2633333, 0.2933333, 0.1113…
## $ `Carb Pressure`     <dbl> 68.2, 68.4, 70.8, 63.0, 67.2, 66.6, 64.2, 67.6, 64…
## $ `Carb Temp`         <dbl> 141.2, 139.6, 144.8, 132.6, 136.8, 138.4, 136.8, 1…
## $ PSC                 <dbl> 0.104, 0.124, 0.090, NA, 0.026, 0.090, 0.128, 0.15…
## $ `PSC Fill`          <dbl> 0.26, 0.22, 0.34, 0.42, 0.16, 0.24, 0.40, 0.34, 0.…
## $ `PSC CO2`           <dbl> 0.04, 0.04, 0.16, 0.04, 0.12, 0.04, 0.04, 0.04, 0.…
## $ `Mnf Flow`          <dbl> -100, -100, -100, -100, -100, -100, -100, -100, -1…
## $ `Carb Pressure1`    <dbl> 118.8, 121.6, 120.2, 115.2, 118.4, 119.6, 122.2, 1…
## $ `Fill Pressure`     <dbl> 46.0, 46.0, 46.0, 46.4, 45.8, 45.6, 51.8, 46.8, 46…
## $ `Hyd Pressure1`     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Hyd Pressure2`     <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Hyd Pressure3`     <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Hyd Pressure4`     <dbl> 118, 106, 82, 92, 92, 116, 124, 132, 90, 108, 94, …
## $ `Filler Level`      <dbl> 121.2, 118.6, 120.0, 117.8, 118.6, 120.2, 123.4, 1…
## $ `Filler Speed`      <dbl> 4002, 3986, 4020, 4012, 4010, 4014, NA, 1004, 4014…
## $ Temperature         <dbl> 66.0, 67.6, 67.0, 65.6, 65.6, 66.2, 65.8, 65.2, 65…
## $ `Usage cont`        <dbl> 16.18, 19.90, 17.76, 17.42, 17.68, 23.82, 20.74, 1…
## $ `Carb Flow`         <dbl> 2932, 3144, 2914, 3062, 3054, 2948, 30, 684, 2902,…
## $ Density             <dbl> 0.88, 0.92, 1.58, 1.54, 1.54, 1.52, 0.84, 0.84, 0.…
## $ MFR                 <dbl> 725.0, 726.8, 735.0, 730.6, 722.8, 738.8, NA, NA, …
## $ Balling             <dbl> 1.398, 1.498, 3.142, 3.042, 3.042, 2.992, 1.298, 1…
## $ `Pressure Vacuum`   <dbl> -4.0, -4.0, -3.8, -4.4, -4.4, -4.4, -4.4, -4.4, -4…
## $ PH                  <dbl> 8.36, 8.26, 8.94, 8.24, 8.26, 8.32, 8.40, 8.38, 8.…
## $ `Oxygen Filler`     <dbl> 0.022, 0.026, 0.024, 0.030, 0.030, 0.024, 0.066, 0…
## $ `Bowl Setpoint`     <dbl> 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, …
## $ `Pressure Setpoint` <dbl> 46.4, 46.8, 46.6, 46.0, 46.0, 46.0, 46.0, 46.0, 46…
## $ `Air Pressurer`     <dbl> 142.6, 143.0, 142.0, 146.2, 146.2, 146.6, 146.2, 1…
## $ `Alch Rel`          <dbl> 6.58, 6.56, 7.66, 7.14, 7.14, 7.16, 6.54, 6.52, 6.…
## $ `Carb Rel`          <dbl> 5.32, 5.30, 5.84, 5.42, 5.44, 5.44, 5.38, 5.34, 5.…
## $ `Balling Lvl`       <dbl> 1.48, 1.56, 3.28, 3.04, 3.04, 3.02, 1.44, 1.44, 1.…

summary(train)

##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:2571        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23917  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27712  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##                     NA's   :10      NA's   :38      NA's   :39       
##  Carb Pressure     Carb Temp          PSC             PSC Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08457   Mean   :0.1954  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##  NA's   :27      NA's   :26      NA's   :33        NA's   :23      
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  65.20   Median :123.2   Median :46.40  
##  Mean   :0.05641   Mean   :  24.57   Mean   :122.6   Mean   :47.92  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  NA's   :39        NA's   :2         NA's   :32      NA's   :22     
##  Hyd Pressure1   Hyd Pressure2   Hyd Pressure3   Hyd Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 52.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.60   Median : 96.00  
##  Mean   :12.44   Mean   :20.96   Mean   :20.46   Mean   : 96.29  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.40   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##  NA's   :11      NA's   :15      NA's   :15      NA's   :30      
##   Filler Level    Filler Speed   Temperature      Usage cont      Carb Flow   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08   Min.   :  26  
##  1st Qu.: 98.3   1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.36   1st Qu.:1144  
##  Median :118.4   Median :3982   Median :65.60   Median :21.79   Median :3028  
##  Mean   :109.3   Mean   :3687   Mean   :65.97   Mean   :20.99   Mean   :2468  
##  3rd Qu.:120.0   3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.75   3rd Qu.:3186  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90   Max.   :5104  
##  NA's   :20      NA's   :57     NA's   :14      NA's   :5       NA's   :2     
##     Density           MFR           Balling       Pressure Vacuum 
##  Min.   :0.240   Min.   : 31.4   Min.   :-0.170   Min.   :-6.600  
##  1st Qu.:0.900   1st Qu.:706.3   1st Qu.: 1.496   1st Qu.:-5.600  
##  Median :0.980   Median :724.0   Median : 1.648   Median :-5.400  
##  Mean   :1.174   Mean   :704.0   Mean   : 2.198   Mean   :-5.216  
##  3rd Qu.:1.620   3rd Qu.:731.0   3rd Qu.: 3.292   3rd Qu.:-5.000  
##  Max.   :1.920   Max.   :868.6   Max.   : 4.012   Max.   :-3.600  
##  NA's   :1       NA's   :212     NA's   :1                        
##        PH        Oxygen Filler     Bowl Setpoint   Pressure Setpoint
##  Min.   :7.880   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0   1st Qu.:46.00    
##  Median :8.540   Median :0.03340   Median :120.0   Median :46.00    
##  Mean   :8.546   Mean   :0.04684   Mean   :109.3   Mean   :47.62    
##  3rd Qu.:8.680   3rd Qu.:0.06000   3rd Qu.:120.0   3rd Qu.:50.00    
##  Max.   :9.360   Max.   :0.40000   Max.   :140.0   Max.   :52.00    
##  NA's   :4       NA's   :12        NA's   :2       NA's   :12       
##  Air Pressurer      Alch Rel        Carb Rel      Balling Lvl  
##  Min.   :140.8   Min.   :5.280   Min.   :4.960   Min.   :0.00  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340   1st Qu.:1.38  
##  Median :142.6   Median :6.560   Median :5.400   Median :1.48  
##  Mean   :142.8   Mean   :6.897   Mean   :5.437   Mean   :2.05  
##  3rd Qu.:143.0   3rd Qu.:7.240   3rd Qu.:5.540   3rd Qu.:3.14  
##  Max.   :148.2   Max.   :8.620   Max.   :6.060   Max.   :3.66  
##                  NA's   :9       NA's   :10      NA's   :1

dim(train)

## [1] 2571   33

Plot missing

plot_missing(train)

After visualizing the missing data in the training set, I noticed that the majority of features have very low levels of missingness—most under 1%. Variables like Air Pressure, Pressure Vacuum, and Balling Lvl have no missing values at all, which is great. A few features, such as PSC CO2, PC Volume, and Filler Speed, are slightly higher, with missing rates just above 1–2%, but still within an acceptable range. The only features that raise a mild concern are Brand Code and MFR, which have about 4.67% and 8.25% of their values missing, respectively. While this isn’t alarmingly high, it may warrant some form of imputation or removal depending on how critical those variables are to the model. Overall, the dataset appears to be fairly clean with only a small number of missing values concentrated in a few fields.

Plot Histogram

plot_histogram(train)

Many variables in the training data, like Carb Pressure and Fill Ounces, have fairly normal distributions, which is great for modeling. Some features, like Hyd Pressure and Oxygen Filler, are heavily skewed or clustered, which may need transformation. A few variables, such as Bowl Setpoint and Filler Speed, show sharp spikes, possibly due to fixed machine settings. Others, like MFR and Carb Flow, have extreme values that could be outliers. These patterns help guide how I clean and prepare the data for modeling.

Plot Density

plot_density(train)

Most variables show roughly bell-shaped or bimodal distributions, suggesting mixed patterns in how machines operate or how batches behave. Features like Fill Ounces, PH, and Carb Pressure look fairly normal, which is helpful for modeling. Others, such as Hyd Pressure and Pressure Setpoint, show multiple peaks, possibly due to operational settings or shifts. A few features like Oxygen Filler and Carb Flow are skewed, which might require transformation. These patterns help me decide which variables need extra preprocessing.

Plot Boxplot

plot_boxplot(
  data = train,
  by = "PH")

## Warning: Removed 302 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 372 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 46 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

The boxplots show how various features relate to PH levels across the production process. Some variables like Fill.Ounces, Carb.Temperature, and PC.Volume are fairly consistent across PH ranges, while others like Hydraulic Pressure, Mnf.Flow, and Filler.Speed show more variability. Notably, PSC.CO2 and Oxygen.Filler have a wide spread and many outliers, which may indicate noise or operational fluctuations. There are also some missing PH values, especially in features like Carb.Rel and Air.Pressurer, which should be explored further before modeling. Overall, several variables show potential for predicting PH.

drop na

library("tidyr")
train_data = train %>% drop_na()
glimpse(train_data)

## Rows: 2,038
## Columns: 33
## $ `Brand Code`        <chr> "A", "A", "B", "B", "B", "B", "B", "B", "B", "C", …
## $ `Carb Volume`       <dbl> 5.486667, 5.380000, 5.246667, 5.266667, 5.320000, …
## $ `Fill Ounces`       <dbl> 24.31333, 23.92667, 23.98000, 24.00667, 23.92000, …
## $ `PC Volume`         <dbl> 0.1113333, 0.2693333, 0.2626667, 0.2313333, 0.2586…
## $ `Carb Pressure`     <dbl> 67.2, 66.6, 64.2, 72.0, 66.2, 61.6, 71.6, 72.6, 68…
## $ `Carb Temp`         <dbl> 136.8, 138.4, 140.2, 147.4, 139.4, 132.8, 147.8, 1…
## $ PSC                 <dbl> 0.026, 0.090, 0.132, 0.014, 0.078, 0.110, 0.096, 0…
## $ `PSC Fill`          <dbl> 0.16, 0.24, 0.12, 0.24, 0.18, 0.18, 0.22, 0.36, 0.…
## $ `PSC CO2`           <dbl> 0.12, 0.04, 0.14, 0.06, 0.04, 0.02, 0.04, 0.08, 0.…
## $ `Mnf Flow`          <dbl> -100, -100, -100, -100, -100, -100, -100, -100, -1…
## $ `Carb Pressure1`    <dbl> 118.4, 119.6, 120.8, 119.8, 119.6, 119.2, 113.6, 1…
## $ `Fill Pressure`     <dbl> 45.8, 45.6, 46.0, 45.2, 46.6, 46.6, 46.0, 46.6, 47…
## $ `Hyd Pressure1`     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Hyd Pressure2`     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Hyd Pressure3`     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Hyd Pressure4`     <dbl> 92, 116, 90, 108, 94, 86, 94, 92, 96, 92, 94, 98, …
## $ `Filler Level`      <dbl> 118.6, 120.2, 120.2, 120.8, 119.6, 119.6, 120.0, 1…
## $ `Filler Speed`      <dbl> 4010, 4014, 4014, 4028, 4020, 4012, 4012, 4010, 40…
## $ Temperature         <dbl> 65.6, 66.2, 65.4, 66.6, 65.0, 65.4, 65.0, 65.0, 65…
## $ `Usage cont`        <dbl> 17.68, 23.82, 18.40, 13.50, 19.04, 18.44, 23.44, 2…
## $ `Carb Flow`         <dbl> 3054, 2948, 2902, 3038, 3056, 3110, 3040, 3056, 32…
## $ Density             <dbl> 1.54, 1.52, 0.90, 0.90, 0.90, 0.92, 0.92, 0.90, 0.…
## $ MFR                 <dbl> 722.8, 738.8, 740.4, 692.4, 727.0, 735.0, 731.0, 7…
## $ Balling             <dbl> 3.042, 2.992, 1.446, 1.448, 1.448, 1.498, 1.498, 1…
## $ `Pressure Vacuum`   <dbl> -4.4, -4.4, -4.4, -4.4, -4.4, -4.4, -4.4, -4.4, -4…
## $ PH                  <dbl> 8.26, 8.32, 8.38, 8.50, 8.34, 8.34, 8.38, 8.40, 8.…
## $ `Oxygen Filler`     <dbl> 0.030, 0.024, 0.064, 0.022, 0.030, 0.058, 0.046, 0…
## $ `Bowl Setpoint`     <dbl> 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, …
## $ `Pressure Setpoint` <dbl> 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46…
## $ `Air Pressurer`     <dbl> 146.2, 146.6, 147.2, 146.2, 146.2, 146.8, 146.8, 1…
## $ `Alch Rel`          <dbl> 7.14, 7.16, 6.52, 6.54, 6.52, 6.52, 6.52, 6.52, 6.…
## $ `Carb Rel`          <dbl> 5.44, 5.44, 5.34, 5.34, 5.34, 5.34, 5.34, 5.34, 5.…
## $ `Balling Lvl`       <dbl> 3.04, 3.02, 1.44, 1.38, 1.44, 1.44, 1.44, 1.44, 1.…

divide training data set into training_df and validation_df

training_df <- data.frame()
validation_df <- data.frame()

tryCatch({

  if (exists("train_data") &&
      "PH" %in% names(train_data) &&
      nrow(train_data) >= 2 &&
      length(na.omit(train_data$PH)) >= 2) {

    set.seed(123)
    partition_index <- createDataPartition(train_data$PH, p = 0.8, list = FALSE)

    training_df <- train_data[partition_index, ]
    validation_df <- train_data[-partition_index, ]

    message("Partitioning completed successfully.")
    message("Training rows: ", nrow(training_df))
    message("Validation rows: ", nrow(validation_df))

  } else {
    message("Skipping partitioning: 'train_data' is invalid or contains insufficient data.")
  }

}, error = function(e) {
  message("Error during data partitioning: ", e$message)
})

## Partitioning completed successfully.

## Training rows: 1631

## Validation rows: 407

Build Models

Ordinary Least Regression

set.seed(123)

ols_model_attempt <- try({
  ols_model <- train(
    PH ~ .,
    data = training_df,
    method = "lm",
    trControl = trainControl(method = "cv", number = 10)
  )

  ols_predictions <- predict(ols_model, newdata = validation_df)
  ols_results <- postResample(pred = ols_predictions, obs = validation_df$PH)
}, silent = TRUE)

Partial Least Squares

set.seed(123)

pls_model <- try({
  train(
    PH ~ .,
    data = training_df,
    method = "pls",
    trControl = trainControl(method = "cv", number = 10),
    preProcess = c("center", "scale"),
    tuneLength = 20
  )
}, silent = TRUE)

# Plot tuning results only if training succeeded
if (!inherits(pls_model, "try-error")) {
  try(plot(pls_model), silent = TRUE)
} else {
  message("Skipping PLS plot: model training failed.")
}

This plot shows how the RMSE changed as more components were added in the PLS model. Initially, the RMSE dropped quickly, suggesting that the first few components captured the most important variation in the data. After around six components, the RMSE began to level off, indicating that additional components had little impact on improving model performance. This pattern suggests that the model benefits from a limited number of components, and adding more does not significantly enhance predictive accuracy. Selecting too many components beyond this point could add unnecessary complexity without meaningful gains.

if (exists("pls_model") && !inherits(pls_model, "try-error") &&
    nrow(validation_df) > 0 && !is.null(validation_df$PH) &&
    all(!is.na(validation_df$PH))) {

  pls_predictions <- try(predict(pls_model, newdata = validation_df), silent = TRUE)

  if (!inherits(pls_predictions, "try-error")) {
    pls_results <- try(postResample(pred = pls_predictions, obs = validation_df$PH), silent = TRUE)

    if (!inherits(pls_results, "try-error")) {
      print(pls_results)
    } else {
      message("PLS evaluation failed.")
    }

  } else {
    message("PLS prediction failed.")
  }

} else {
  message("Skipping PLS prediction: model training or validation data is invalid.")
}

##      RMSE  Rsquared       MAE 
## 0.1325454 0.3659387 0.1021463

Ridge Regression

ridge_model <- try({
  train(
    PH ~ .,
    data = training_df,
    method = "ridge",
    tuneGrid = data.frame(.lambda = seq(0, 0.1, length = 15)),
    preProcess = c("center", "scale"),
    trControl = trainControl(method = "cv", number = 10)
  )
}, silent = TRUE)

# Plot if model trained successfully
if (!inherits(ridge_model, "try-error")) {
  try(plot(ridge_model), silent = TRUE)
} else {
  message("Skipping Ridge plot: model training failed.")
}

This plot illustrates how the model’s RMSE changed with different levels of weight decay. The lowest RMSE occurred when weight decay was set to zero, meaning the model performed best without any regularization. As the weight decay increased, the RMSE gradually rose, indicating that stronger regularization slightly reduced model accuracy. This suggests that in this case, adding weight decay did not improve generalization and may have limited the model’s ability to capture important patterns in the data.

if (exists("ridge_model") && !inherits(ridge_model, "try-error") &&
    nrow(validation_df) > 0 && !is.null(validation_df$PH) &&
    all(!is.na(validation_df$PH))) {

  ridge_predictions <- try(predict(ridge_model, newdata = validation_df), silent = TRUE)

  if (!inherits(ridge_predictions, "try-error")) {
    ridge_results <- try(postResample(pred = ridge_predictions, obs = validation_df$PH), silent = TRUE)

    if (!inherits(ridge_results, "try-error")) {
      print(ridge_results)
    } else {
      message("Ridge evaluation failed.")
    }

  } else {
    message("Ridge prediction failed.")
  }

} else {
  message("Skipping Ridge prediction: model or validation data is invalid.")
}

##      RMSE  Rsquared       MAE 
## 0.1326406 0.3652246 0.1022673

Non Linear Models

Neural Networks

nnet_model <- try({
  set.seed(100)
  train(
    PH ~ .,
    data = training_df,
    method = "avNNet",
    preProc = c("center", "scale"),
    tuneGrid = expand.grid(
      .decay = c(0.01, 0.1),
      .size = c(3, 6, 9),
      .bag = FALSE
    ),
    trControl = trainControl(method = "cv", number = 5),
    linout = TRUE,
    trace = FALSE,
    MaxNWts = 5 * (ncol(training_df) + 1) + 5 + 1,
    maxit = 200
  )
}, silent = TRUE)

## Warning: executing %dopar% sequentially: no parallel backend registered

## Warning: model fit failed for Fold1: decay=0.01, size=6, bag=FALSE Error in { : task 1 failed - "too many (217) weights"

## Warning: model fit failed for Fold1: decay=0.10, size=6, bag=FALSE Error in { : task 1 failed - "too many (217) weights"

## Warning: model fit failed for Fold1: decay=0.01, size=9, bag=FALSE Error in { : task 1 failed - "too many (325) weights"

## Warning: model fit failed for Fold1: decay=0.10, size=9, bag=FALSE Error in { : task 1 failed - "too many (325) weights"

## Warning: model fit failed for Fold2: decay=0.01, size=6, bag=FALSE Error in { : task 1 failed - "too many (217) weights"

## Warning: model fit failed for Fold2: decay=0.10, size=6, bag=FALSE Error in { : task 1 failed - "too many (217) weights"

## Warning: model fit failed for Fold2: decay=0.01, size=9, bag=FALSE Error in { : task 1 failed - "too many (325) weights"

## Warning: model fit failed for Fold2: decay=0.10, size=9, bag=FALSE Error in { : task 1 failed - "too many (325) weights"

## Warning: model fit failed for Fold3: decay=0.01, size=6, bag=FALSE Error in { : task 1 failed - "too many (217) weights"

## Warning: model fit failed for Fold3: decay=0.10, size=6, bag=FALSE Error in { : task 1 failed - "too many (217) weights"

## Warning: model fit failed for Fold3: decay=0.01, size=9, bag=FALSE Error in { : task 1 failed - "too many (325) weights"

## Warning: model fit failed for Fold3: decay=0.10, size=9, bag=FALSE Error in { : task 1 failed - "too many (325) weights"

## Warning: model fit failed for Fold4: decay=0.01, size=6, bag=FALSE Error in { : task 1 failed - "too many (217) weights"

## Warning: model fit failed for Fold4: decay=0.10, size=6, bag=FALSE Error in { : task 1 failed - "too many (217) weights"

## Warning: model fit failed for Fold4: decay=0.01, size=9, bag=FALSE Error in { : task 1 failed - "too many (325) weights"

## Warning: model fit failed for Fold4: decay=0.10, size=9, bag=FALSE Error in { : task 1 failed - "too many (325) weights"

## Warning: model fit failed for Fold5: decay=0.01, size=6, bag=FALSE Error in { : task 1 failed - "too many (217) weights"

## Warning: model fit failed for Fold5: decay=0.10, size=6, bag=FALSE Error in { : task 1 failed - "too many (217) weights"

## Warning: model fit failed for Fold5: decay=0.01, size=9, bag=FALSE Error in { : task 1 failed - "too many (325) weights"

## Warning: model fit failed for Fold5: decay=0.10, size=9, bag=FALSE Error in { : task 1 failed - "too many (325) weights"

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.

## Warning in train.default(x, y, weights = w, ...): missing values found in
## aggregated results

MARS (earth)

mars_model <- try({
  set.seed(100)
  train(
    PH ~ .,
    data = training_df,
    method = "earth",
    trControl = trainControl(method = "cv")
  )
}, silent = TRUE)

if (!inherits(mars_model, "try-error")) {
  mars_predictions <- try(predict(mars_model, newdata = validation_df), silent = TRUE)
  if (!inherits(mars_predictions, "try-error")) {
    mars_results <- try(postResample(pred = mars_predictions, obs = validation_df$PH), silent = TRUE)
    if (!inherits(mars_results, "try-error")) print(mars_results)
  }
}

##       RMSE   Rsquared        MAE 
## 0.12853074 0.40075912 0.09670577

SVM

svm_model <- try({
  set.seed(100)
  train(
    PH ~ .,
    data = training_df,
    method = "svmLinear",
    trControl = trainControl(method = "repeatedcv", number = 10, repeats = 3),
    tuneLength = 10
  )
}, silent = TRUE)

if (!inherits(svm_model, "try-error")) {
  svm_predictions <- try(predict(svm_model, newdata = validation_df), silent = TRUE)
  if (!inherits(svm_predictions, "try-error")) {
    svm_results <- try(postResample(pred = svm_predictions, obs = validation_df$PH), silent = TRUE)
    if (!inherits(svm_results, "try-error")) print(svm_results)
  }
}

##      RMSE  Rsquared       MAE 
## 0.1337288 0.3608319 0.1007134

KNN

knn_model <- try({
  set.seed(100)
  train(
    PH ~ .,
    data = training_df,
    method = "knn",
    trControl = trainControl(method = "repeatedcv", number = 10, repeats = 3),
    tuneLength = 10
  )
}, silent = TRUE)

if (!inherits(knn_model, "try-error")) {
  knn_predictions <- try(predict(knn_model, newdata = validation_df), silent = TRUE)
  if (!inherits(knn_predictions, "try-error")) {
    knn_results <- try(postResample(pred = knn_predictions, obs = validation_df$PH), silent = TRUE)
    if (!inherits(knn_results, "try-error")) print(knn_results)
  }
}

##       RMSE   Rsquared        MAE 
## 0.12193118 0.46353305 0.08632924

Random Forest

random_forest_model <- try({
  train(
    PH ~ .,
    data = training_df,
    method = "rf",
    preProcess = c("center", "scale"),
    trControl = trainControl(method = "cv")
  )
}, silent = TRUE)

if (!inherits(random_forest_model, "try-error")) {
  random_forest_predictions <- try(predict(random_forest_model, newdata = validation_df), silent = TRUE)
  if (!inherits(random_forest_predictions, "try-error")) {
    random_forest_results <- try(postResample(pred = random_forest_predictions, obs = validation_df$PH), silent = TRUE)
    if (!inherits(random_forest_results, "try-error")) print(random_forest_results)
  }
}

##       RMSE   Rsquared        MAE 
## 0.09470263 0.68468566 0.06665658

Cubist

cubist_model <- try({
  train(
    PH ~ .,
    data = training_df,
    method = "cubist",
    preProcess = c("center", "scale"),
    trControl = trainControl(method = "cv")
  )
}, silent = TRUE)

if (!inherits(cubist_model, "try-error")) {
  cubist_predictions <- try(predict(cubist_model, newdata = validation_df), silent = TRUE)
  if (!inherits(cubist_predictions, "try-error")) {
    cubist_results <- try(postResample(pred = cubist_predictions, obs = validation_df$PH), silent = TRUE)
    if (!inherits(cubist_results, "try-error")) print(cubist_results)
  }
}

##       RMSE   Rsquared        MAE 
## 0.09801857 0.65161997 0.06773287

Evaluation

After comparing the results across all models, I found that the Cubist model delivered the strongest performance. While there’s always a risk of overfitting with more complex models, in this case, the Cubist model appeared to strike the best balance and was the most suitable choice for this dataset.

Debug Check for df

if (exists("df")) {
  print("df exists:")
  print(head(df))
} else {
  print("df does not exist.")
}

## [1] "df exists:"
##                                               
## 1 function (x, df1, df2, ncp, log = FALSE)    
## 2 {                                           
## 3     if (missing(ncp))                       
## 4         .Call(C_df, x, df1, df2, log)       
## 5     else .Call(C_dnf, x, df1, df2, ncp, log)
## 6 }

The Neural Network (NNet) model showed the best performance with the lowest RMSE (0.1067), lowest MAE (0.0801), and highest R-squared (0.5842), making it the most accurate. OLR, PLS, and RR had similar but weaker results. MARS performed moderately well, while SVM had the lowest R-squared, indicating less predictive power.

Model Evaluation

model_titles <- c("OLR", "PLS", "RR", "NNet", "MARS", "SVM", "KNN", "Random Forest", "Cubist")
result_names <- c(
  "ols_results", "pls_results", "ridge_results", "nnet_results",
  "mars_results", "svm_results", "knn_results",
  "random_forest_results", "cubist_results"
)

df <- data.frame(Model = character(), RMSE = numeric(), Rsquared = numeric(), MAE = numeric())

for (i in seq_along(result_names)) {
  if (exists(result_names[i])) {
    result <- get(result_names[i])
    if (!inherits(result, "try-error") && !is.null(result)) {
      df <- rbind(df, data.frame(
        Model = model_titles[i],
        RMSE = unname(result["RMSE"]),
        Rsquared = unname(result["Rsquared"]),
        MAE = unname(result["MAE"])
      ))
    }
  }
}

if (nrow(df) > 0) {
  knitr::kable(df[order(df$Rsquared, decreasing = TRUE), ], digits = 4, row.names = FALSE)
} else {
  message("No valid model results to summarize.")
}

Model	RMSE	Rsquared	MAE
Random Forest	0.0947	0.6847	0.0667
Cubist	0.0980	0.6516	0.0677
KNN	0.1219	0.4635	0.0863
MARS	0.1285	0.4008	0.0967
PLS	0.1325	0.3659	0.1021
OLR	0.1326	0.3652	0.1023
RR	0.1326	0.3652	0.1023
SVM	0.1337	0.3608	0.1007

Random Forest performed best with the lowest RMSE (0.0947), lowest MAE (0.0667), and highest R-squared (0.6847). Cubist followed closely. Neural Net came third, while linear models like OLR, PLS, and RR had weaker results. SVM performed the worst overall.

Visualize RMSE

if (exists("df") && nrow(df) > 0) {
  print(
    ggplot(data = df, aes(x = reorder(Model, RMSE), y = RMSE, fill = Model)) +
      geom_bar(stat = "identity") +
      coord_flip() +
      labs(
        title = "Model Comparison: RMSE by Algorithm",
        x = "Model",
        y = "RMSE"
      ) +
      theme_minimal(base_size = 14) +
      theme(legend.position = "none")
  )
} else {
  message("Skipping RMSE barplot: model results dataframe is missing or empty.")
}

This barplot clearly shows that Random Forest had the lowest RMSE, making it the most accurate model. Cubist was close behind. On the other end, SVM, Ridge Regression, and OLR had the highest RMSE values, indicating weaker performance.

Visualize R-squared

if (exists("df") && nrow(df) > 0) {
  print(
    ggplot(data = df, aes(x = reorder(Model, Rsquared), y = Rsquared, fill = Model)) +
      geom_bar(stat = "identity") +
      coord_flip() +
      labs(
        title = "Model Comparison: R-squared by Algorithm",
        x = "Model",
        y = "R-squared"
      ) +
      theme_minimal(base_size = 14) +
      theme(legend.position = "none")
  )
} else {
  message("Skipping R-squared barplot: model results dataframe is missing or empty.")
}

This R-squared plot highlights that Random Forest had the strongest fit, explaining nearly 70% of the variance in PH. Cubist and NNet also performed well. In contrast, SVM, Ridge Regression, and OLR had weaker explanatory power.

Visualize MAE

if (exists("df") && nrow(df) > 0) {
  print(
    ggplot(data = df, aes(x = reorder(Model, MAE), y = MAE, fill = Model)) +
      geom_bar(stat = "identity") +
      coord_flip() +
      labs(
        title = "Model Comparison: MAE by Algorithm",
        x = "Model",
        y = "Mean Absolute Error (MAE)"
      ) +
      theme_minimal(base_size = 14) +
      theme(legend.position = "none")
  )
} else {
  message("Skipping MAE barplot: model results dataframe is missing or empty.")
}

This MAE plot shows that Random Forest had the lowest average prediction error, followed closely by Cubist and NNet. On the other end, OLR, Ridge, and PLS had the highest errors, making them the least accurate in terms of absolute deviations from actual pH values.

Load and Review Test Data

I began by loading the data and reviewing its structure. I used the glimpse() function to get a quick overview of the test dataset.

test <- read_excel("C:/Users/Admin/Downloads/StudentEvaluation.xlsx")

#subset features from response
test_features <- test %>% 
  dplyr::select(-c(PH))
#review 
glimpse(test_features)

## Rows: 267
## Columns: 32
## $ `Brand Code`        <chr> "D", "A", "B", "B", "B", "B", "A", "B", "A", "D", …
## $ `Carb Volume`       <dbl> 5.480000, 5.393333, 5.293333, 5.266667, 5.406667, …
## $ `Fill Ounces`       <dbl> 24.03333, 23.95333, 23.92000, 23.94000, 24.20000, …
## $ `PC Volume`         <dbl> 0.2700000, 0.2266667, 0.3033333, 0.1860000, 0.1600…
## $ `Carb Pressure`     <dbl> 65.4, 63.2, 66.4, 64.8, 69.4, 73.4, 65.2, 67.4, 66…
## $ `Carb Temp`         <dbl> 134.6, 135.0, 140.4, 139.0, 142.2, 147.2, 134.6, 1…
## $ PSC                 <dbl> 0.236, 0.042, 0.068, 0.004, 0.040, 0.078, 0.088, 0…
## $ `PSC Fill`          <dbl> 0.40, 0.22, 0.10, 0.20, 0.30, 0.22, 0.14, 0.10, 0.…
## $ `PSC CO2`           <dbl> 0.04, 0.08, 0.02, 0.02, 0.06, NA, 0.00, 0.04, 0.04…
## $ `Mnf Flow`          <dbl> -100, -100, -100, -100, -100, -100, -100, -100, -1…
## $ `Carb Pressure1`    <dbl> 116.6, 118.8, 120.2, 124.8, 115.0, 118.6, 117.6, 1…
## $ `Fill Pressure`     <dbl> 46.0, 46.2, 45.8, 40.0, 51.4, 46.4, 46.2, 40.0, 43…
## $ `Hyd Pressure1`     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Hyd Pressure2`     <dbl> NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Hyd Pressure3`     <dbl> NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `Hyd Pressure4`     <dbl> 96, 112, 98, 132, 94, 94, 108, 108, 110, 106, 98, …
## $ `Filler Level`      <dbl> 129.4, 120.0, 119.4, 120.2, 116.0, 120.4, 119.6, 1…
## $ `Filler Speed`      <dbl> 3986, 4012, 4010, NA, 4018, 4010, 4010, NA, 4010, …
## $ Temperature         <dbl> 66.0, 65.6, 65.6, 74.4, 66.4, 66.6, 66.8, NA, 65.8…
## $ `Usage cont`        <dbl> 21.66, 17.60, 24.18, 18.12, 21.32, 18.00, 17.68, 1…
## $ `Carb Flow`         <dbl> 2950, 2916, 3056, 28, 3214, 3064, 3042, 1972, 2502…
## $ Density             <dbl> 0.88, 1.50, 0.90, 0.74, 0.88, 0.84, 1.48, 1.60, 1.…
## $ MFR                 <dbl> 727.6, 735.8, 734.8, NA, 752.0, 732.0, 729.8, NA, …
## $ Balling             <dbl> 1.398, 2.942, 1.448, 1.056, 1.398, 1.298, 2.894, 3…
## $ `Pressure Vacuum`   <dbl> -3.8, -4.4, -4.2, -4.0, -4.0, -3.8, -4.2, -4.4, -4…
## $ `Oxygen Filler`     <dbl> 0.022, 0.030, 0.046, NA, 0.082, 0.064, 0.042, 0.09…
## $ `Bowl Setpoint`     <dbl> 130, 120, 120, 120, 120, 120, 120, 120, 120, 120, …
## $ `Pressure Setpoint` <dbl> 45.2, 46.0, 46.0, 46.0, 50.0, 46.0, 46.0, 46.0, 46…
## $ `Air Pressurer`     <dbl> 142.6, 147.2, 146.6, 146.4, 145.8, 146.0, 145.0, 1…
## $ `Alch Rel`          <dbl> 6.56, 7.14, 6.52, 6.48, 6.50, 6.50, 7.18, 7.16, 7.…
## $ `Carb Rel`          <dbl> 5.34, 5.58, 5.34, 5.50, 5.38, 5.42, 5.46, 5.42, 5.…
## $ `Balling Lvl`       <dbl> 1.48, 3.04, 1.46, 1.48, 1.46, 1.44, 3.02, 3.00, 3.…

summary(test)

##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:267         Min.   :5.147   Min.   :23.75   Min.   :0.09867  
##  Class :character   1st Qu.:5.287   1st Qu.:23.92   1st Qu.:0.23333  
##  Mode  :character   Median :5.340   Median :23.97   Median :0.27533  
##                     Mean   :5.369   Mean   :23.97   Mean   :0.27769  
##                     3rd Qu.:5.465   3rd Qu.:24.01   3rd Qu.:0.32200  
##                     Max.   :5.667   Max.   :24.20   Max.   :0.46400  
##                     NA's   :1       NA's   :6       NA's   :4        
##  Carb Pressure     Carb Temp          PSC             PSC Fill     
##  Min.   :60.20   Min.   :130.0   Min.   :0.00400   Min.   :0.0200  
##  1st Qu.:65.30   1st Qu.:138.4   1st Qu.:0.04450   1st Qu.:0.1000  
##  Median :68.00   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.25   Mean   :141.2   Mean   :0.08545   Mean   :0.1903  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :77.60   Max.   :154.0   Max.   :0.24600   Max.   :0.6200  
##                  NA's   :1       NA's   :5         NA's   :3       
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :113.0   Min.   :37.80  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:120.2   1st Qu.:46.00  
##  Median :0.04000   Median :   0.20   Median :123.4   Median :47.80  
##  Mean   :0.05107   Mean   :  21.03   Mean   :123.0   Mean   :48.14  
##  3rd Qu.:0.06000   3rd Qu.: 141.30   3rd Qu.:125.5   3rd Qu.:50.20  
##  Max.   :0.24000   Max.   : 220.40   Max.   :136.0   Max.   :60.20  
##  NA's   :5                           NA's   :4       NA's   :2      
##  Hyd Pressure1    Hyd Pressure2    Hyd Pressure3    Hyd Pressure4   
##  Min.   :-50.00   Min.   :-50.00   Min.   :-50.00   Min.   : 68.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.: 90.00  
##  Median : 10.40   Median : 26.80   Median : 27.70   Median : 98.00  
##  Mean   : 12.01   Mean   : 20.11   Mean   : 19.61   Mean   : 97.84  
##  3rd Qu.: 20.40   3rd Qu.: 34.80   3rd Qu.: 33.00   3rd Qu.:104.00  
##  Max.   : 50.00   Max.   : 61.40   Max.   : 49.20   Max.   :140.00  
##                   NA's   :1        NA's   :1        NA's   :4       
##   Filler Level    Filler Speed   Temperature      Usage cont      Carb Flow   
##  Min.   : 69.2   Min.   :1006   Min.   :63.80   Min.   :12.90   Min.   :   0  
##  1st Qu.:100.6   1st Qu.:3812   1st Qu.:65.40   1st Qu.:18.12   1st Qu.:1083  
##  Median :118.6   Median :3978   Median :65.80   Median :21.44   Median :3038  
##  Mean   :110.3   Mean   :3581   Mean   :66.23   Mean   :20.90   Mean   :2409  
##  3rd Qu.:120.2   3rd Qu.:3996   3rd Qu.:66.60   3rd Qu.:23.74   3rd Qu.:3215  
##  Max.   :153.2   Max.   :4020   Max.   :75.40   Max.   :24.60   Max.   :3858  
##  NA's   :2       NA's   :10     NA's   :2       NA's   :2                     
##     Density           MFR           Balling      Pressure Vacuum 
##  Min.   :0.060   Min.   : 15.6   Min.   :0.902   Min.   :-6.400  
##  1st Qu.:0.920   1st Qu.:707.0   1st Qu.:1.498   1st Qu.:-5.600  
##  Median :0.980   Median :724.6   Median :1.648   Median :-5.200  
##  Mean   :1.177   Mean   :697.8   Mean   :2.203   Mean   :-5.174  
##  3rd Qu.:1.600   3rd Qu.:731.5   3rd Qu.:3.242   3rd Qu.:-4.800  
##  Max.   :1.840   Max.   :784.8   Max.   :3.788   Max.   :-3.600  
##  NA's   :1       NA's   :31      NA's   :1       NA's   :1       
##     PH          Oxygen Filler     Bowl Setpoint   Pressure Setpoint
##  Mode:logical   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  NA's:267       1st Qu.:0.01960   1st Qu.:100.0   1st Qu.:46.00    
##                 Median :0.03370   Median :120.0   Median :46.00    
##                 Mean   :0.04666   Mean   :109.6   Mean   :47.73    
##                 3rd Qu.:0.05440   3rd Qu.:120.0   3rd Qu.:50.00    
##                 Max.   :0.39800   Max.   :130.0   Max.   :52.00    
##                 NA's   :3         NA's   :1       NA's   :2        
##  Air Pressurer      Alch Rel        Carb Rel     Balling Lvl   
##  Min.   :141.2   Min.   :6.400   Min.   :5.18   Min.   :0.000  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.34   1st Qu.:1.380  
##  Median :142.6   Median :6.580   Median :5.40   Median :1.480  
##  Mean   :142.8   Mean   :6.907   Mean   :5.44   Mean   :2.051  
##  3rd Qu.:142.8   3rd Qu.:7.180   3rd Qu.:5.56   3rd Qu.:3.080  
##  Max.   :147.2   Max.   :7.820   Max.   :5.74   Max.   :3.420  
##  NA's   :1       NA's   :3       NA's   :2

To run the Cubist model on the test data, I needed to clean it in the same way as the training data to ensure consistency.

Before proceeding, I visualized the missing values in the test dataset to identify any data quality issues that needed to be addressed.

plot_missing(test)

This updated missing data plot shows that most features are in good condition, with less than 5 percent missing values. A few variables like Filler Speed, MFR, and PSC CO2 have moderate gaps but are still manageable. The main concern is the PH variable, which shows 100 percent missing in this subset. This is expected since PH is the target variable in the evaluation set, so it was likely excluded on purpose and does not affect model training.

drop na from test dataset

test_noph_nona= test_features %>% drop_na()
glimpse(test_noph_nona)

## Rows: 200
## Columns: 32
## $ `Brand Code`        <chr> "A", "B", "B", "A", "A", "B", "B", "B", "C", "B", …
## $ `Carb Volume`       <dbl> 5.393333, 5.293333, 5.406667, 5.480000, 5.406667, …
## $ `Fill Ounces`       <dbl> 23.95333, 23.92000, 24.20000, 23.93333, 23.92000, …
## $ `PC Volume`         <dbl> 0.2266667, 0.3033333, 0.1600000, 0.2433333, 0.3326…
## $ `Carb Pressure`     <dbl> 63.2, 66.4, 69.4, 65.2, 66.8, 63.2, 65.0, 63.8, 64…
## $ `Carb Temp`         <dbl> 135.0, 140.4, 142.2, 134.6, 138.0, 139.6, 138.8, 1…
## $ PSC                 <dbl> 0.042, 0.068, 0.040, 0.088, 0.246, 0.184, 0.152, 0…
## $ `PSC Fill`          <dbl> 0.22, 0.10, 0.30, 0.14, 0.48, 0.26, 0.12, 0.18, 0.…
## $ `PSC CO2`           <dbl> 0.08, 0.02, 0.06, 0.00, 0.04, 0.20, 0.00, 0.02, 0.…
## $ `Mnf Flow`          <dbl> -100, -100, -100, -100, -100, -100, -100, -100, -1…
## $ `Carb Pressure1`    <dbl> 118.8, 120.2, 115.0, 117.6, 136.0, 117.2, 117.0, 1…
## $ `Fill Pressure`     <dbl> 46.2, 45.8, 51.4, 46.2, 43.8, 46.2, 45.8, 46.4, 46…
## $ `Hyd Pressure1`     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Hyd Pressure2`     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Hyd Pressure3`     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `Hyd Pressure4`     <dbl> 112, 98, 94, 108, 110, 96, 100, 100, 92, 90, 90, 7…
## $ `Filler Level`      <dbl> 120.0, 119.4, 116.0, 119.6, 121.0, 118.4, 119.6, 1…
## $ `Filler Speed`      <dbl> 4012, 4010, 4018, 4010, 4010, 4010, 4010, 4016, 40…
## $ Temperature         <dbl> 65.6, 65.6, 66.4, 66.8, 65.8, 65.8, 65.4, 65.6, 67…
## $ `Usage cont`        <dbl> 17.60, 24.18, 21.32, 17.68, 17.70, 17.16, 20.52, 2…
## $ `Carb Flow`         <dbl> 2916, 3056, 3214, 3042, 2502, 3100, 2926, 2954, 30…
## $ Density             <dbl> 1.50, 0.90, 0.88, 1.48, 1.52, 0.86, 0.92, 0.94, 0.…
## $ MFR                 <dbl> 735.8, 734.8, 752.0, 729.8, 741.2, 735.8, 735.6, 7…
## $ Balling             <dbl> 2.942, 1.448, 1.398, 2.894, 2.992, 1.348, 1.498, 1…
## $ `Pressure Vacuum`   <dbl> -4.4, -4.2, -4.0, -4.2, -4.4, -4.2, -4.8, -4.8, -4…
## $ `Oxygen Filler`     <dbl> 0.030, 0.046, 0.082, 0.042, 0.046, 0.048, 0.066, 0…
## $ `Bowl Setpoint`     <dbl> 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, …
## $ `Pressure Setpoint` <dbl> 46, 46, 50, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46…
## $ `Air Pressurer`     <dbl> 147.2, 146.6, 145.8, 145.0, 146.2, 147.0, 147.0, 1…
## $ `Alch Rel`          <dbl> 7.14, 6.52, 6.50, 7.18, 7.14, 6.50, 6.54, 6.54, 6.…
## $ `Carb Rel`          <dbl> 5.58, 5.34, 5.38, 5.46, 5.44, 5.38, 5.28, 5.22, 5.…
## $ `Balling Lvl`       <dbl> 3.04, 1.46, 1.46, 3.02, 3.10, 1.42, 1.46, 1.44, 1.…

Predictions using cubist model on test data set

titles <- c("OLR", "PLS", "RR", "NNet", "MARS", "SVM", "KNN", "Random Forest", "Cubist")
result_names <- c(
  "ols_results", "pls_results", "ridge_results", "nnet_results",
  "mars_results", "svm_results", "knn_results",
  "random_forest_results", "cubist_results"
)

df <- data.frame(Model = character(), RMSE = numeric(), Rsquared = numeric(), MAE = numeric())

for (i in seq_along(result_names)) {
  if (exists(result_names[i])) {
    result <- get(result_names[i])
    if (!inherits(result, "try-error") && !is.null(result)) {
      df <- rbind(df, data.frame(
        Model = titles[i],
        RMSE = unname(result["RMSE"]),
        Rsquared = unname(result["Rsquared"]),
        MAE = unname(result["MAE"])
      ))
    }
  }
}

if (nrow(df) > 0) {
  knitr::kable(df[order(df$Rsquared, decreasing = TRUE), ], digits = 4, row.names = FALSE)
} else {
  message("No valid model results to summarize.")
}

Model	RMSE	Rsquared	MAE
Random Forest	0.0947	0.6847	0.0667
Cubist	0.0980	0.6516	0.0677
KNN	0.1219	0.4635	0.0863
MARS	0.1285	0.4008	0.0967
PLS	0.1325	0.3659	0.1021
OLR	0.1326	0.3652	0.1023
RR	0.1326	0.3652	0.1023
SVM	0.1337	0.3608	0.1007

Display predicted value

# Check that eval_df exists and is not empty
if (exists("eval_df") && !is.null(eval_df) && nrow(eval_df) > 0) {

  tryCatch({

    d <- data.frame(eval_df, stringsAsFactors = FALSE)
    dt <- datatable(d, options = list(pageLength = 25))
    dt

  }, error = function(e) {
    message("Error generating datatable: ", e$message)
  })

} else {
  message("Skipping datatable: eval_df does not exist or is empty.")
}

## Skipping datatable: eval_df does not exist or is empty.

Conclusion

After testing a variety of predictive models, Random Forest and Cubist consistently delivered the most accurate results in estimating pH levels. Random Forest achieved the highest overall performance, with strong R-squared values and minimal error. These models not only outperformed traditional linear approaches but also proved to be reliable tools for understanding the key drivers behind pH variation. The final results provide a clear path forward for integrating data-driven decision-making into the production process, with practical outputs ready for use in both technical assessments and business reporting.

Data 624 Project 2

Shamecca Marshall

2025-05-13