library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.3.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
library(mice)
## Warning in check_dep_version(): ABI version mismatch: 
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
library(earth)
## Warning: package 'earth' was built under R version 4.3.3
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.3.3
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 4.3.3
library(writexl)

Read in historical data

training_data <- read.xlsx("StudentData.xlsx")
head(training_data)
##   Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp   PSC
## 1          B    5.340000    23.96667 0.2633333          68.2     141.2 0.104
## 2          A    5.426667    24.00667 0.2386667          68.4     139.6 0.124
## 3          B    5.286667    24.06000 0.2633333          70.8     144.8 0.090
## 4          A    5.440000    24.00667 0.2933333          63.0     132.6    NA
## 5          A    5.486667    24.31333 0.1113333          67.2     136.8 0.026
## 6          A    5.380000    23.92667 0.2693333          66.6     138.4 0.090
##   PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure1
## 1     0.26    0.04     -100          118.8          46.0             0
## 2     0.22    0.04     -100          121.6          46.0             0
## 3     0.34    0.16     -100          120.2          46.0             0
## 4     0.42    0.04     -100          115.2          46.4             0
## 5     0.16    0.12     -100          118.4          45.8             0
## 6     0.24    0.04     -100          119.6          45.6             0
##   Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed
## 1            NA            NA           118        121.2         4002
## 2            NA            NA           106        118.6         3986
## 3            NA            NA            82        120.0         4020
## 4             0             0            92        117.8         4012
## 5             0             0            92        118.6         4010
## 6             0             0           116        120.2         4014
##   Temperature Usage.cont Carb.Flow Density   MFR Balling Pressure.Vacuum   PH
## 1        66.0      16.18      2932    0.88 725.0   1.398            -4.0 8.36
## 2        67.6      19.90      3144    0.92 726.8   1.498            -4.0 8.26
## 3        67.0      17.76      2914    1.58 735.0   3.142            -3.8 8.94
## 4        65.6      17.42      3062    1.54 730.6   3.042            -4.4 8.24
## 5        65.6      17.68      3054    1.54 722.8   3.042            -4.4 8.26
## 6        66.2      23.82      2948    1.52 738.8   2.992            -4.4 8.32
##   Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 1         0.022           120              46.4         142.6     6.58     5.32
## 2         0.026           120              46.8         143.0     6.56     5.30
## 3         0.024           120              46.6         142.0     7.66     5.84
## 4         0.030           120              46.0         146.2     7.14     5.42
## 5         0.030           120              46.0         146.2     7.14     5.44
## 6         0.024           120              46.0         146.6     7.16     5.44
##   Balling.Lvl
## 1        1.48
## 2        1.56
## 3        3.28
## 4        3.04
## 5        3.04
## 6        3.02

Read in the test evaluation data

test_data <- read.xlsx("StudentEvaluation.xlsx")
head(test_data)
##   Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp   PSC
## 1          D    5.480000    24.03333 0.2700000          65.4     134.6 0.236
## 2          A    5.393333    23.95333 0.2266667          63.2     135.0 0.042
## 3          B    5.293333    23.92000 0.3033333          66.4     140.4 0.068
## 4          B    5.266667    23.94000 0.1860000          64.8     139.0 0.004
## 5          B    5.406667    24.20000 0.1600000          69.4     142.2 0.040
## 6          B    5.286667    24.10667 0.2120000          73.4     147.2 0.078
##   PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure1
## 1     0.40    0.04     -100          116.6          46.0             0
## 2     0.22    0.08     -100          118.8          46.2             0
## 3     0.10    0.02     -100          120.2          45.8             0
## 4     0.20    0.02     -100          124.8          40.0             0
## 5     0.30    0.06     -100          115.0          51.4             0
## 6     0.22      NA     -100          118.6          46.4             0
##   Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed
## 1            NA            NA            96        129.4         3986
## 2             0             0           112        120.0         4012
## 3             0             0            98        119.4         4010
## 4             0             0           132        120.2           NA
## 5             0             0            94        116.0         4018
## 6             0             0            94        120.4         4010
##   Temperature Usage.cont Carb.Flow Density   MFR Balling Pressure.Vacuum PH
## 1        66.0      21.66      2950    0.88 727.6   1.398            -3.8 NA
## 2        65.6      17.60      2916    1.50 735.8   2.942            -4.4 NA
## 3        65.6      24.18      3056    0.90 734.8   1.448            -4.2 NA
## 4        74.4      18.12        28    0.74    NA   1.056            -4.0 NA
## 5        66.4      21.32      3214    0.88 752.0   1.398            -4.0 NA
## 6        66.6      18.00      3064    0.84 732.0   1.298            -3.8 NA
##   Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 1         0.022           130              45.2         142.6     6.56     5.34
## 2         0.030           120              46.0         147.2     7.14     5.58
## 3         0.046           120              46.0         146.6     6.52     5.34
## 4            NA           120              46.0         146.4     6.48     5.50
## 5         0.082           120              50.0         145.8     6.50     5.38
## 6         0.064           120              46.0         146.0     6.50     5.42
##   Balling.Lvl
## 1        1.48
## 2        3.04
## 3        1.46
## 4        1.48
## 5        1.46
## 6        1.44

Get summary statistics about the training data

summary(training_data)
##   Brand.Code         Carb.Volume     Fill.Ounces      PC.Volume      
##  Length:2571        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23917  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27712  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##                     NA's   :10      NA's   :38      NA's   :39       
##  Carb.Pressure     Carb.Temp          PSC             PSC.Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08457   Mean   :0.1954  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##  NA's   :27      NA's   :26      NA's   :33        NA's   :23      
##     PSC.CO2           Mnf.Flow       Carb.Pressure1  Fill.Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  65.20   Median :123.2   Median :46.40  
##  Mean   :0.05641   Mean   :  24.57   Mean   :122.6   Mean   :47.92  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  NA's   :39        NA's   :2         NA's   :32      NA's   :22     
##  Hyd.Pressure1   Hyd.Pressure2   Hyd.Pressure3   Hyd.Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 52.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.60   Median : 96.00  
##  Mean   :12.44   Mean   :20.96   Mean   :20.46   Mean   : 96.29  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.40   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##  NA's   :11      NA's   :15      NA's   :15      NA's   :30      
##   Filler.Level    Filler.Speed   Temperature      Usage.cont      Carb.Flow   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08   Min.   :  26  
##  1st Qu.: 98.3   1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.36   1st Qu.:1144  
##  Median :118.4   Median :3982   Median :65.60   Median :21.79   Median :3028  
##  Mean   :109.3   Mean   :3687   Mean   :65.97   Mean   :20.99   Mean   :2468  
##  3rd Qu.:120.0   3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.75   3rd Qu.:3186  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90   Max.   :5104  
##  NA's   :20      NA's   :57     NA's   :14      NA's   :5       NA's   :2     
##     Density           MFR           Balling       Pressure.Vacuum 
##  Min.   :0.240   Min.   : 31.4   Min.   :-0.170   Min.   :-6.600  
##  1st Qu.:0.900   1st Qu.:706.3   1st Qu.: 1.496   1st Qu.:-5.600  
##  Median :0.980   Median :724.0   Median : 1.648   Median :-5.400  
##  Mean   :1.174   Mean   :704.0   Mean   : 2.198   Mean   :-5.216  
##  3rd Qu.:1.620   3rd Qu.:731.0   3rd Qu.: 3.292   3rd Qu.:-5.000  
##  Max.   :1.920   Max.   :868.6   Max.   : 4.012   Max.   :-3.600  
##  NA's   :1       NA's   :212     NA's   :1                        
##        PH        Oxygen.Filler     Bowl.Setpoint   Pressure.Setpoint
##  Min.   :7.880   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0   1st Qu.:46.00    
##  Median :8.540   Median :0.03340   Median :120.0   Median :46.00    
##  Mean   :8.546   Mean   :0.04684   Mean   :109.3   Mean   :47.62    
##  3rd Qu.:8.680   3rd Qu.:0.06000   3rd Qu.:120.0   3rd Qu.:50.00    
##  Max.   :9.360   Max.   :0.40000   Max.   :140.0   Max.   :52.00    
##  NA's   :4       NA's   :12        NA's   :2       NA's   :12       
##  Air.Pressurer      Alch.Rel        Carb.Rel      Balling.Lvl  
##  Min.   :140.8   Min.   :5.280   Min.   :4.960   Min.   :0.00  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340   1st Qu.:1.38  
##  Median :142.6   Median :6.560   Median :5.400   Median :1.48  
##  Mean   :142.8   Mean   :6.897   Mean   :5.437   Mean   :2.05  
##  3rd Qu.:143.0   3rd Qu.:7.240   3rd Qu.:5.540   3rd Qu.:3.14  
##  Max.   :148.2   Max.   :8.620   Max.   :6.060   Max.   :3.66  
##                  NA's   :9       NA's   :10      NA's   :1

Above we can see the basic summary statistics for our dataset. There are 2,571 rows, and many columns contain missing values. The predictors are mostly numeric, with one categorical predictor, Brand.Code.
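
For reference, the breakdown of that categorical predictor (including missing codes) can be checked with a quick frequency table; a minimal sketch:

table(training_data$Brand.Code, useNA = "ifany")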

Now plot histograms for each numeric variable to see its distribution.

numeric_data <- training_data %>% select(where(is.numeric))

numeric_long <- numeric_data %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

# Plot histograms for each variable
ggplot(numeric_long, aes(x = Value)) +
  geom_histogram(fill = "steelblue", color = "white", bins = 30) +
  facet_wrap(~ Variable, scales = "free", ncol = 4) +
  theme_minimal() +
  labs(title = "Histograms of Numeric Variables", x = "", y = "Frequency")
## Warning: Removed 724 rows containing non-finite outside the scale range
## (`stat_bin()`).

We have a few predictors with skewed distributions, such as Air.Pressurer, Carb.Pressure, Carb.Pressure1, MFR, Oxygen.Filler, PC.Volume, PSC, PSC.Fill, and Temperature, and a few bi-modal distributions, such as Balling, Balling.Lvl, Carb.Rel, Carb.Volume, and Density. We also have a few roughly normally distributed variables, such as Carb.Temp and Fill.Ounces.

Now let's generate box plots for each numeric variable to detect any columns with outliers.

# Create faceted boxplots
ggplot(numeric_long, aes(x = "", y = Value)) +
  geom_boxplot(outlier.color = "red", fill = "skyblue", width = 0.5) +
  facet_wrap(~ Variable, scales = "free", ncol = 4) +
  labs(title = "Boxplots of Numeric Predictors", y = "Value") +
  theme_minimal() +
  theme(axis.text.x = element_blank())
## Warning: Removed 724 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

From the boxplots above we can see that there are outliers in the data that will need to be dealt with, for example in Air.Pressurer, Alch.Rel, Carb.Pressure, Carb.Pressure1, Carb.Rel, Carb.Temp, Carb.Volume, Fill.Ounces, Fill.Pressure, Filler.Level, Filler.Speed, Hyd.Pressure4, MFR, Oxygen.Filler, PC.Volume, Pressure.Vacuum, PSC, PSC.CO2, PSC.Fill, and Temperature. Some are more extreme than others, so we will aim to remove only the extreme outliers.

Now let's identify and remove any predictors with near-zero variance.

# Identify near-zero variance predictors
nzv <- nearZeroVar(training_data, saveMetrics = TRUE)

# View the summary table
nzv
##                   freqRatio percentUnique zeroVar   nzv
## Brand.Code         2.014634     0.1555815   FALSE FALSE
## Carb.Volume        1.055556     3.9284325   FALSE FALSE
## Fill.Ounces        1.163043     3.5783742   FALSE FALSE
## PC.Volume          1.100000    17.6584986   FALSE FALSE
## Carb.Pressure      1.016393     4.1229094   FALSE FALSE
## Carb.Temp          1.016667     4.7841307   FALSE FALSE
## PSC                1.211538     5.0175029   FALSE FALSE
## PSC.Fill           1.091892     1.2446519   FALSE FALSE
## PSC.CO2            1.078303     0.5056398   FALSE FALSE
## Mnf.Flow           1.048443    18.9420459   FALSE FALSE
## Carb.Pressure1     1.031746     5.4453520   FALSE FALSE
## Fill.Pressure      1.762931     4.2007001   FALSE FALSE
## Hyd.Pressure1     31.111111     9.5293660   FALSE  TRUE
## Hyd.Pressure2      7.271028     8.0513419   FALSE FALSE
## Hyd.Pressure3     11.450704     7.4679113   FALSE FALSE
## Hyd.Pressure4      1.008264     1.5558149   FALSE FALSE
## Filler.Level       1.086957    11.2018670   FALSE FALSE
## Filler.Speed       1.045161     9.4904706   FALSE FALSE
## Temperature        1.040000     2.1781408   FALSE FALSE
## Usage.cont         1.105263    18.7086737   FALSE FALSE
## Carb.Flow          1.586207    20.7312330   FALSE FALSE
## Density            1.108374     3.0338390   FALSE FALSE
## MFR                1.222222    22.8315830   FALSE FALSE
## Balling            1.193182     8.4402956   FALSE FALSE
## Pressure.Vacuum    1.389728     0.6223259   FALSE FALSE
## PH                 1.078125     2.0225593   FALSE FALSE
## Oxygen.Filler      1.266667    13.1466356   FALSE FALSE
## Bowl.Setpoint      2.990847     0.4278491   FALSE FALSE
## Pressure.Setpoint  1.319361     0.3111630   FALSE FALSE
## Air.Pressurer      1.093407     1.2446519   FALSE FALSE
## Alch.Rel           1.098315     2.0614547   FALSE FALSE
## Carb.Rel           1.011583     1.6336056   FALSE FALSE
## Balling.Lvl        1.294872     3.1894205   FALSE FALSE

Only the Hyd.Pressure1 variable has near-zero variance, so we will remove it.

training_data <- training_data %>%
  select(-Hyd.Pressure1)

Let's inspect the exact number of outliers in each column:

find_outliers <- function(x) {
  qnt <- quantile(x, probs = c(.25, .75), na.rm = TRUE)
  H <- 3 * IQR(x, na.rm = TRUE)
  sum(x < (qnt[1] - H) | x > (qnt[2] + H), na.rm = TRUE)
}

outlier_counts <- sapply(training_data %>% select(where(is.numeric)), find_outliers)
sort(outlier_counts, decreasing = TRUE)
##      Filler.Speed     Air.Pressurer               MFR     Oxygen.Filler 
##               271               213               118                56 
##       Temperature       Carb.Volume       Fill.Ounces         PC.Volume 
##                51                 0                 0                 0 
##     Carb.Pressure         Carb.Temp               PSC          PSC.Fill 
##                 0                 0                 0                 0 
##           PSC.CO2          Mnf.Flow    Carb.Pressure1     Fill.Pressure 
##                 0                 0                 0                 0 
##     Hyd.Pressure2     Hyd.Pressure3     Hyd.Pressure4      Filler.Level 
##                 0                 0                 0                 0 
##        Usage.cont         Carb.Flow           Density           Balling 
##                 0                 0                 0                 0 
##   Pressure.Vacuum                PH     Bowl.Setpoint Pressure.Setpoint 
##                 0                 0                 0                 0 
##          Alch.Rel          Carb.Rel       Balling.Lvl 
##                 0                 0                 0

I decided to flag only the more extreme outliers by multiplying the IQR by 3. Because I lack domain knowledge on this subject, I don't want to remove data points that may not really be outliers and could provide valuable information to the models.

So now let's remove those outliers from the data; they will be imputed later.

# Custom outlier removal function: set outliers to NA
remove_outliers <- function(x) {
  qnt <- quantile(x, probs = c(.25, .75), na.rm = TRUE)
  H <- 3 * IQR(x, na.rm = TRUE)
  x[x < (qnt[1] - H) | x > (qnt[2] + H)] <- NA
  return(x)
}

# Get names of numeric columns excluding "PH"
numeric_predictors <- names(training_data)[sapply(training_data, is.numeric) & names(training_data) != "PH"]

# Apply the outlier removal only to those columns
training_data <- training_data %>%
  mutate(across(all_of(numeric_predictors), remove_outliers))

Now let's check how many missing values there are in the dataset.

training_data %>%
  summarise_all(~sum(is.na(.))) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Missing_Count") %>%
  arrange(desc(Missing_Count))
## # A tibble: 32 × 2
##    Variable      Missing_Count
##    <chr>                 <int>
##  1 MFR                     330
##  2 Filler.Speed            328
##  3 Air.Pressurer           213
##  4 Brand.Code              120
##  5 Oxygen.Filler            68
##  6 Temperature              65
##  7 PC.Volume                39
##  8 PSC.CO2                  39
##  9 Fill.Ounces              38
## 10 PSC                      33
## # ℹ 22 more rows

We can see that there are a lot of missing values in the dataset. The MFR column alone is missing 330 values, and Brand.Code is missing 120. These missing values will have to be dealt with, either by imputation or, if a column cannot be imputed reasonably, by dropping it.

We only have 4 missing PH values (the response we are trying to predict), so we can simply remove those rows from the dataset; I don't want to impute the response variable, and losing 4 rows is not significant.
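
The row removal described above is a one-liner; a minimal sketch of that step (shown for clarity, applied before any imputation so the response is never imputed):

# Drop the handful of rows where the response PH is missing
training_data <- training_data %>% dplyr::filter(!is.na(PH))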

Let's check the percentage of missing data in each column and decide what to do.

missing_pct <- sapply(training_data, function(x) mean(is.na(x))) * 100
sort(missing_pct, decreasing = TRUE)
##               MFR      Filler.Speed     Air.Pressurer        Brand.Code 
##       12.83547258       12.75768184        8.28471412        4.66744457 
##     Oxygen.Filler       Temperature         PC.Volume           PSC.CO2 
##        2.64488526        2.52819914        1.51691949        1.51691949 
##       Fill.Ounces               PSC    Carb.Pressure1     Hyd.Pressure4 
##        1.47802412        1.28354726        1.24465189        1.16686114 
##     Carb.Pressure         Carb.Temp          PSC.Fill     Fill.Pressure 
##        1.05017503        1.01127966        0.89459354        0.85569817 
##      Filler.Level     Hyd.Pressure2     Hyd.Pressure3 Pressure.Setpoint 
##        0.77790743        0.58343057        0.58343057        0.46674446 
##       Carb.Volume          Carb.Rel          Alch.Rel        Usage.cont 
##        0.38895371        0.38895371        0.35005834        0.19447686 
##                PH          Mnf.Flow         Carb.Flow     Bowl.Setpoint 
##        0.15558149        0.07779074        0.07779074        0.07779074 
##           Density           Balling       Balling.Lvl   Pressure.Vacuum 
##        0.03889537        0.03889537        0.03889537        0.00000000

No column is missing more than about 13% of its values. I will impute the missing values for the numerical predictors, and for Brand.Code I will fill the missing values with “Unknown” and use that as a category.

Now let's quickly check the test data to see what is missing there.

missing_pct_test <- sapply(test_data, function(x) mean(is.na(x))) * 100
sort(missing_pct_test, decreasing = TRUE)
##                PH               MFR      Filler.Speed        Brand.Code 
##       100.0000000        11.6104869         3.7453184         2.9962547 
##       Fill.Ounces               PSC           PSC.CO2         PC.Volume 
##         2.2471910         1.8726592         1.8726592         1.4981273 
##    Carb.Pressure1     Hyd.Pressure4          PSC.Fill     Oxygen.Filler 
##         1.4981273         1.4981273         1.1235955         1.1235955 
##          Alch.Rel     Fill.Pressure      Filler.Level       Temperature 
##         1.1235955         0.7490637         0.7490637         0.7490637 
##        Usage.cont Pressure.Setpoint          Carb.Rel       Carb.Volume 
##         0.7490637         0.7490637         0.7490637         0.3745318 
##         Carb.Temp     Hyd.Pressure2     Hyd.Pressure3           Density 
##         0.3745318         0.3745318         0.3745318         0.3745318 
##           Balling   Pressure.Vacuum     Bowl.Setpoint     Air.Pressurer 
##         0.3745318         0.3745318         0.3745318         0.3745318 
##     Carb.Pressure          Mnf.Flow     Hyd.Pressure1         Carb.Flow 
##         0.0000000         0.0000000         0.0000000         0.0000000 
##       Balling.Lvl 
##         0.0000000

We will have to perform the same imputations on the test data, since it is missing values just like the training set. We also do not have access to the test PH values, so we will not be able to compute a test error; we will have to rely on error estimates from the training data to pick the best model.

First, let's set all missing Brand.Code values to “Unknown”.

training_data$`Brand.Code` <- as.character(training_data$`Brand.Code`)
training_data$`Brand.Code`[is.na(training_data$`Brand.Code`)] <- "Unknown"
training_data$`Brand.Code` <- as.factor(training_data$`Brand.Code`)
missing_pct <- sapply(training_data, function(x) mean(is.na(x))) * 100
sort(missing_pct, decreasing = TRUE)
##               MFR      Filler.Speed     Air.Pressurer     Oxygen.Filler 
##       12.83547258       12.75768184        8.28471412        2.64488526 
##       Temperature         PC.Volume           PSC.CO2       Fill.Ounces 
##        2.52819914        1.51691949        1.51691949        1.47802412 
##               PSC    Carb.Pressure1     Hyd.Pressure4     Carb.Pressure 
##        1.28354726        1.24465189        1.16686114        1.05017503 
##         Carb.Temp          PSC.Fill     Fill.Pressure      Filler.Level 
##        1.01127966        0.89459354        0.85569817        0.77790743 
##     Hyd.Pressure2     Hyd.Pressure3 Pressure.Setpoint       Carb.Volume 
##        0.58343057        0.58343057        0.46674446        0.38895371 
##          Carb.Rel          Alch.Rel        Usage.cont                PH 
##        0.38895371        0.35005834        0.19447686        0.15558149 
##          Mnf.Flow         Carb.Flow     Bowl.Setpoint           Density 
##        0.07779074        0.07779074        0.07779074        0.03889537 
##           Balling       Balling.Lvl        Brand.Code   Pressure.Vacuum 
##        0.03889537        0.03889537        0.00000000        0.00000000

Now let's impute the numerical variables.

# Apply mice imputation
mice_model <- mice(training_data, m = 1, method = 'pmm', seed = 123)
## 
##  iter imp variable
##   1   1  Carb.Volume  Fill.Ounces  PC.Volume  Carb.Pressure  Carb.Temp  PSC  PSC.Fill  PSC.CO2  Mnf.Flow  Carb.Pressure1  Fill.Pressure  Hyd.Pressure2  Hyd.Pressure3  Hyd.Pressure4  Filler.Level  Filler.Speed  Temperature  Usage.cont  Carb.Flow  Density  MFR  Balling  PH  Oxygen.Filler  Bowl.Setpoint  Pressure.Setpoint  Air.Pressurer  Alch.Rel  Carb.Rel  Balling.Lvl
##   2   1  Carb.Volume  Fill.Ounces  PC.Volume  Carb.Pressure  Carb.Temp  PSC  PSC.Fill  PSC.CO2  Mnf.Flow  Carb.Pressure1  Fill.Pressure  Hyd.Pressure2  Hyd.Pressure3  Hyd.Pressure4  Filler.Level  Filler.Speed  Temperature  Usage.cont  Carb.Flow  Density  MFR  Balling  PH  Oxygen.Filler  Bowl.Setpoint  Pressure.Setpoint  Air.Pressurer  Alch.Rel  Carb.Rel  Balling.Lvl
##   3   1  Carb.Volume  Fill.Ounces  PC.Volume  Carb.Pressure  Carb.Temp  PSC  PSC.Fill  PSC.CO2  Mnf.Flow  Carb.Pressure1  Fill.Pressure  Hyd.Pressure2  Hyd.Pressure3  Hyd.Pressure4  Filler.Level  Filler.Speed  Temperature  Usage.cont  Carb.Flow  Density  MFR  Balling  PH  Oxygen.Filler  Bowl.Setpoint  Pressure.Setpoint  Air.Pressurer  Alch.Rel  Carb.Rel  Balling.Lvl
##   4   1  Carb.Volume  Fill.Ounces  PC.Volume  Carb.Pressure  Carb.Temp  PSC  PSC.Fill  PSC.CO2  Mnf.Flow  Carb.Pressure1  Fill.Pressure  Hyd.Pressure2  Hyd.Pressure3  Hyd.Pressure4  Filler.Level  Filler.Speed  Temperature  Usage.cont  Carb.Flow  Density  MFR  Balling  PH  Oxygen.Filler  Bowl.Setpoint  Pressure.Setpoint  Air.Pressurer  Alch.Rel  Carb.Rel  Balling.Lvl
##   5   1  Carb.Volume  Fill.Ounces  PC.Volume  Carb.Pressure  Carb.Temp  PSC  PSC.Fill  PSC.CO2  Mnf.Flow  Carb.Pressure1  Fill.Pressure  Hyd.Pressure2  Hyd.Pressure3  Hyd.Pressure4  Filler.Level  Filler.Speed  Temperature  Usage.cont  Carb.Flow  Density  MFR  Balling  PH  Oxygen.Filler  Bowl.Setpoint  Pressure.Setpoint  Air.Pressurer  Alch.Rel  Carb.Rel  Balling.Lvl
# Extract completed dataset
train_data_imputed <- complete(mice_model, 1)
missing_pct <- sapply(train_data_imputed, function(x) mean(is.na(x))) * 100
sort(missing_pct, decreasing = TRUE)
##        Brand.Code       Carb.Volume       Fill.Ounces         PC.Volume 
##                 0                 0                 0                 0 
##     Carb.Pressure         Carb.Temp               PSC          PSC.Fill 
##                 0                 0                 0                 0 
##           PSC.CO2          Mnf.Flow    Carb.Pressure1     Fill.Pressure 
##                 0                 0                 0                 0 
##     Hyd.Pressure2     Hyd.Pressure3     Hyd.Pressure4      Filler.Level 
##                 0                 0                 0                 0 
##      Filler.Speed       Temperature        Usage.cont         Carb.Flow 
##                 0                 0                 0                 0 
##           Density               MFR           Balling   Pressure.Vacuum 
##                 0                 0                 0                 0 
##                PH     Oxygen.Filler     Bowl.Setpoint Pressure.Setpoint 
##                 0                 0                 0                 0 
##     Air.Pressurer          Alch.Rel          Carb.Rel       Balling.Lvl 
##                 0                 0                 0                 0

Now we have no missing values in the dataset.

Let's do the same cleaning on the test dataset.

test_data$`Brand.Code` <- as.character(test_data$`Brand.Code`)
test_data$`Brand.Code`[is.na(test_data$`Brand.Code`)] <- "Unknown"
test_data$`Brand.Code` <- as.factor(test_data$`Brand.Code`)
missing_pct <- sapply(test_data, function(x) mean(is.na(x))) * 100
sort(missing_pct, decreasing = TRUE)
##                PH               MFR      Filler.Speed       Fill.Ounces 
##       100.0000000        11.6104869         3.7453184         2.2471910 
##               PSC           PSC.CO2         PC.Volume    Carb.Pressure1 
##         1.8726592         1.8726592         1.4981273         1.4981273 
##     Hyd.Pressure4          PSC.Fill     Oxygen.Filler          Alch.Rel 
##         1.4981273         1.1235955         1.1235955         1.1235955 
##     Fill.Pressure      Filler.Level       Temperature        Usage.cont 
##         0.7490637         0.7490637         0.7490637         0.7490637 
## Pressure.Setpoint          Carb.Rel       Carb.Volume         Carb.Temp 
##         0.7490637         0.7490637         0.3745318         0.3745318 
##     Hyd.Pressure2     Hyd.Pressure3           Density           Balling 
##         0.3745318         0.3745318         0.3745318         0.3745318 
##   Pressure.Vacuum     Bowl.Setpoint     Air.Pressurer        Brand.Code 
##         0.3745318         0.3745318         0.3745318         0.0000000 
##     Carb.Pressure          Mnf.Flow     Hyd.Pressure1         Carb.Flow 
##         0.0000000         0.0000000         0.0000000         0.0000000 
##       Balling.Lvl 
##         0.0000000
# Use the same outlier rule on test data
test_data <- test_data %>%
  mutate(across(all_of(numeric_predictors), remove_outliers))

test_data_mice <- mice(test_data, m = 1, method = 'pmm', seed = 123)
## 
##  iter imp variable
##   1   1  Carb.Volume  Fill.Ounces  PC.Volume  Carb.Temp  PSC  PSC.Fill  PSC.CO2  Carb.Pressure1  Fill.Pressure  Hyd.Pressure2  Hyd.Pressure3  Hyd.Pressure4  Filler.Level  Filler.Speed  Temperature  Usage.cont  Density  MFR  Balling  Pressure.Vacuum  Oxygen.Filler  Bowl.Setpoint  Pressure.Setpoint  Air.Pressurer  Alch.Rel  Carb.Rel
##   2   1  Carb.Volume  Fill.Ounces  PC.Volume  Carb.Temp  PSC  PSC.Fill  PSC.CO2  Carb.Pressure1  Fill.Pressure  Hyd.Pressure2  Hyd.Pressure3  Hyd.Pressure4  Filler.Level  Filler.Speed  Temperature  Usage.cont  Density  MFR  Balling  Pressure.Vacuum  Oxygen.Filler  Bowl.Setpoint  Pressure.Setpoint  Air.Pressurer  Alch.Rel  Carb.Rel
##   3   1  Carb.Volume  Fill.Ounces  PC.Volume  Carb.Temp  PSC  PSC.Fill  PSC.CO2  Carb.Pressure1  Fill.Pressure  Hyd.Pressure2  Hyd.Pressure3  Hyd.Pressure4  Filler.Level  Filler.Speed  Temperature  Usage.cont  Density  MFR  Balling  Pressure.Vacuum  Oxygen.Filler  Bowl.Setpoint  Pressure.Setpoint  Air.Pressurer  Alch.Rel  Carb.Rel
##   4   1  Carb.Volume  Fill.Ounces  PC.Volume  Carb.Temp  PSC  PSC.Fill  PSC.CO2  Carb.Pressure1  Fill.Pressure  Hyd.Pressure2  Hyd.Pressure3  Hyd.Pressure4  Filler.Level  Filler.Speed  Temperature  Usage.cont  Density  MFR  Balling  Pressure.Vacuum  Oxygen.Filler  Bowl.Setpoint  Pressure.Setpoint  Air.Pressurer  Alch.Rel  Carb.Rel
##   5   1  Carb.Volume  Fill.Ounces  PC.Volume  Carb.Temp  PSC  PSC.Fill  PSC.CO2  Carb.Pressure1  Fill.Pressure  Hyd.Pressure2  Hyd.Pressure3  Hyd.Pressure4  Filler.Level  Filler.Speed  Temperature  Usage.cont  Density  MFR  Balling  Pressure.Vacuum  Oxygen.Filler  Bowl.Setpoint  Pressure.Setpoint  Air.Pressurer  Alch.Rel  Carb.Rel
## Warning: Number of logged events: 1
test_data_imputed <- complete(test_data_mice, 1)
missing_pct <- sapply(test_data_imputed, function(x) mean(is.na(x))) * 100
sort(missing_pct, decreasing = TRUE)
##                PH        Brand.Code       Carb.Volume       Fill.Ounces 
##               100                 0                 0                 0 
##         PC.Volume     Carb.Pressure         Carb.Temp               PSC 
##                 0                 0                 0                 0 
##          PSC.Fill           PSC.CO2          Mnf.Flow    Carb.Pressure1 
##                 0                 0                 0                 0 
##     Fill.Pressure     Hyd.Pressure1     Hyd.Pressure2     Hyd.Pressure3 
##                 0                 0                 0                 0 
##     Hyd.Pressure4      Filler.Level      Filler.Speed       Temperature 
##                 0                 0                 0                 0 
##        Usage.cont         Carb.Flow           Density               MFR 
##                 0                 0                 0                 0 
##           Balling   Pressure.Vacuum     Oxygen.Filler     Bowl.Setpoint 
##                 0                 0                 0                 0 
## Pressure.Setpoint     Air.Pressurer          Alch.Rel          Carb.Rel 
##                 0                 0                 0                 0 
##       Balling.Lvl 
##                 0

Now the data is all cleaned and ready for some EDA.

# Select only numeric columns
numeric_data <- train_data_imputed %>% select(where(is.numeric))

# Compute correlation matrix (use complete cases to avoid NA problems)
cor_matrix <- cor(numeric_data, use = "complete.obs")

# Plot correlation matrix
corrplot(cor_matrix, method = "color", type = "upper", 
         tl.cex = 0.7, tl.col = "black", addCoef.col = "black", number.cex = 0.6,
         title = "Correlation Matrix", mar = c(0,0,1,0))

cor_long <- as.data.frame(as.table(cor_matrix)) %>%
  filter(Var1 != Var2) %>%                        
  mutate(abs_corr = abs(Freq)) %>%                
  arrange(desc(abs_corr))                         

cor_long_unique <- cor_long %>%
  rowwise() %>%
  mutate(pair = paste(sort(c(Var1, Var2)), collapse = "_")) %>%
  distinct(pair, .keep_all = TRUE) %>%
  select(Var1, Var2, Correlation = Freq) %>%
  arrange(desc(abs(Correlation)))

cor_long_unique
## # A tibble: 465 × 3
## # Rowwise: 
##    Var1          Var2          Correlation
##    <fct>         <fct>               <dbl>
##  1 Balling.Lvl   Balling             0.978
##  2 Balling       Density             0.955
##  3 Bowl.Setpoint Filler.Level        0.949
##  4 Balling.Lvl   Density             0.948
##  5 Hyd.Pressure3 Hyd.Pressure2       0.925
##  6 Alch.Rel      Balling             0.923
##  7 Balling.Lvl   Alch.Rel            0.920
##  8 Alch.Rel      Density             0.901
##  9 MFR           Filler.Speed        0.869
## 10 Balling.Lvl   Carb.Rel            0.843
## # ℹ 455 more rows

We can see that quite a few variables are highly correlated with each other. We will have to keep this in mind when creating models, as some models, like linear regression, are sensitive to multicollinearity.
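
As a quick check on that concern, variance inflation factors from a plain linear fit make the redundancy concrete; a minimal sketch, assuming the car package is installed (it is not loaded above):

# Fit an ordinary least squares model on the imputed data and inspect (G)VIFs;
# values well above 10 indicate predictors largely explained by the others
vif_fit <- lm(PH ~ ., data = train_data_imputed)
car::vif(vif_fit)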

top_corr <- cor_long_unique %>%
  mutate(abs_corr = abs(Correlation)) %>%
  arrange(desc(abs_corr)) %>%
  filter(abs_corr > .45)

top_corr
## # A tibble: 35 × 4
## # Rowwise: 
##    Var1          Var2          Correlation abs_corr
##    <fct>         <fct>               <dbl>    <dbl>
##  1 Balling.Lvl   Balling             0.978    0.978
##  2 Balling       Density             0.955    0.955
##  3 Bowl.Setpoint Filler.Level        0.949    0.949
##  4 Balling.Lvl   Density             0.948    0.948
##  5 Hyd.Pressure3 Hyd.Pressure2       0.925    0.925
##  6 Alch.Rel      Balling             0.923    0.923
##  7 Balling.Lvl   Alch.Rel            0.920    0.920
##  8 Alch.Rel      Density             0.901    0.901
##  9 MFR           Filler.Speed        0.869    0.869
## 10 Balling.Lvl   Carb.Rel            0.843    0.843
## # ℹ 25 more rows
# Create label for each pair
top_corr <- top_corr %>%
  mutate(label = paste(Var1, Var2, sep = " vs. "))

# Plot
ggplot(top_corr, aes(x = reorder(label, Correlation), y = Correlation, fill = Correlation > 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_manual(values = c("TRUE" = "steelblue", "FALSE" = "firebrick")) +
  labs(title = "Top Predictor Correlations",
       x = "Variable Pair",
       y = "Correlation Coefficient") +
  theme_minimal()

cor_with_ph <- cor(numeric_data, use = "complete.obs")[, "PH"] %>%
  sort(decreasing = TRUE)
cor_with_ph
##                PH     Bowl.Setpoint      Filler.Level   Pressure.Vacuum 
##       1.000000000       0.349140795       0.325460967       0.219346419 
##     Oxygen.Filler          Carb.Rel         Carb.Flow          Alch.Rel 
##       0.195435096       0.158278785       0.154507074       0.146630953 
##       Balling.Lvl           Density           Balling       Carb.Volume 
##       0.100252347       0.077326556       0.064670253       0.063359843 
##     Carb.Pressure         PC.Volume               MFR      Filler.Speed 
##       0.056669020       0.039785554       0.035685860       0.022381373 
##         Carb.Temp     Air.Pressurer          PSC.Fill           PSC.CO2 
##       0.019350762      -0.005610081      -0.037947081      -0.071714809 
##    Carb.Pressure1               PSC       Fill.Ounces     Hyd.Pressure4 
##      -0.073353773      -0.088388469      -0.094171264      -0.130425683 
##     Hyd.Pressure2     Fill.Pressure       Temperature     Hyd.Pressure3 
##      -0.200700383      -0.217378467      -0.217382477      -0.239141118 
## Pressure.Setpoint        Usage.cont          Mnf.Flow 
##      -0.307068276      -0.318853705      -0.446993440

We have a ton of correlated predictors, so we will run PCA on the training data to get uncorrelated components. The PCA data will then be used to train and test models that are affected by correlated predictors, like linear regression, ridge and lasso regression, KNN, SVMs, and neural networks. Tree-based models handle correlated predictors well, so for those we can use the full training set.

cor_df <- data.frame(
  Variable = names(cor_with_ph),
  Correlation = as.numeric(cor_with_ph)
)

# Plot all in one diverging bar chart
ggplot(cor_df, aes(x = reorder(Variable, Correlation), y = Correlation, fill = Correlation > 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_manual(values = c("TRUE" = "steelblue", "FALSE" = "firebrick")) +
  labs(title = "Correlation of Predictors with pH",
       x = "Predictor",
       y = "Correlation with pH") +
  theme_minimal(base_size = 13) +
  theme(axis.text.y = element_text(size = 10))

Now split the training data into a train and a test set so that we can produce held-out RMSE, R-squared, and other metrics to choose the best model. First, one-hot encode the factor variable.

set.seed(123)
split <- createDataPartition(train_data_imputed$PH, p = 0.8, list = FALSE)
train_split <- train_data_imputed[split, ]
test_split <- train_data_imputed[-split, ]

# Split into predictors and response
X_train <- train_split %>% select(-PH)
y_train <- train_split$PH

X_test <- test_split %>% select(-PH)
y_test <- test_split$PH

# one hot encode factor variable
dummies <- dummyVars(~ ., data = train_data_imputed %>% select(-PH))

# Apply to training and test
X_train <- predict(dummies, newdata = X_train) %>% as.data.frame()
X_test <- predict(dummies, newdata = X_test)  %>% as.data.frame()
# Select numeric predictors (excluding PH)
numeric_train <- X_train %>% select(where(is.numeric)) 
numeric_test <- X_test %>% select(all_of(colnames(numeric_train)))

Now we will create the PCA model and the PCA train and test data sets.

# Create a PCA preProcess model
pca_model <- preProcess(numeric_train, method = c("center", "scale", "pca"), thresh = 0.95)
# Apply PCA transformation
X_train_pca <- predict(pca_model, newdata = numeric_train)
X_test_pca <- predict(pca_model, newdata = numeric_test)

head(X_train_pca)
head(X_test_pca)

Now we are ready to train and test some models; let's start with linear regression.

# Set up 5-fold CV
ctrl <- trainControl(method = "cv", number = 5)
lm_model_pca <- train(x = X_train_pca, y = y_train, method = "lm", trControl = ctrl, preProcess = c("center", "scale"))
lm_pred_pca <- predict(lm_model_pca, X_test_pca)
lm_results_pca <- postResample(pred = lm_pred_pca, obs = y_test)
lm_results_pca
##      RMSE  Rsquared       MAE 
## 0.1353794 0.3840276 0.1051837
lm_model <- train(x = X_train, y = y_train, method = "lm", trControl = ctrl, preProcess = c("center", "scale"))
lm_pred <- predict(lm_model, X_test)
lm_results <- postResample(pred = lm_pred, obs = y_test)
lm_results
##       RMSE   Rsquared        MAE 
## 0.12992941 0.43280596 0.09788884

Since the PCA model actually did worse, let's try simply removing highly correlated predictors and retraining a linear model. This might be better anyway, as the goal of the assignment is to identify relationships with PH, and with PCA we lose interpretability.

# Identify correlated columns
high_cor <- findCorrelation(cor(numeric_train), cutoff = 0.9)
drop_vars <- names(numeric_train)[high_cor]

# Drop from train and test
X_train_uncorrelated <- X_train %>% select(-all_of(drop_vars))
X_test_uncorrelated <- X_test %>% select(-all_of(drop_vars))
lm_model_uncorrelated <- train(x = X_train_uncorrelated, y = y_train, method = "lm", trControl = ctrl, preProcess = c("center", "scale"))
lm_pred_uncorrelated <- predict(lm_model_uncorrelated, X_test_uncorrelated)
lm_results_uncorrelated <- postResample(pred = lm_pred_uncorrelated, obs = y_test)
lm_results_uncorrelated
##      RMSE  Rsquared       MAE 
## 0.1324853 0.4108472 0.1001905

The model with all the predictors is still performing the best. Let's try lasso regression now.

# Define lambda grid for tuning
lasso_grid <- expand.grid(alpha = 1, lambda = 10^seq(-4, 1, length = 100))

# Train Lasso model
lasso_model <- train(x = X_train,
                     y = y_train,
                     method = "glmnet",
                     trControl = ctrl,
                     tuneGrid = lasso_grid,
                     preProcess = c("center", "scale")
                     )
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Predict on test set
lasso_pred <- predict(lasso_model, newdata = X_test)

# Evaluate performance
lasso_results <- postResample(pred = lasso_pred, obs = y_test)
lasso_results
##       RMSE   Rsquared        MAE 
## 0.12978990 0.43440049 0.09778531

Now let's try lasso on the PCA and uncorrelated data.

# Train Lasso model
lasso_model_pca <- train(x = X_train_pca,
                     y = y_train,
                     method = "glmnet",
                     trControl = ctrl,
                     tuneGrid = lasso_grid,
                     preProcess = c("center", "scale"))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Predict on test set
lasso_pred_pca <- predict(lasso_model_pca, newdata = X_test_pca)

# Evaluate performance
lasso_results_pca <- postResample(pred = lasso_pred_pca, obs = y_test)
lasso_results_pca
##      RMSE  Rsquared       MAE 
## 0.1353648 0.3844326 0.1052155
# Train Lasso model
lasso_model_uncorrelated <- train(x = X_train_uncorrelated,
                     y = y_train,
                     method = "glmnet",
                     trControl = ctrl,
                     tuneGrid = lasso_grid,
                     preProcess = c("center", "scale"))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Predict on test set
lasso_pred_uncorrelated <- predict(lasso_model_uncorrelated, newdata = X_test_uncorrelated)

# Evaluate performance
lasso_results_uncorrelated <- postResample(pred = lasso_pred_uncorrelated, obs = y_test)
lasso_results_uncorrelated
##      RMSE  Rsquared       MAE 
## 0.1327583 0.4090422 0.1004522

As with the linear model, the best lasso model was the one trained on the original data with all the predictors.

Now let's try ridge regression.

# Define tuning grid (alpha = 0 = Ridge)
ridge_grid <- expand.grid(alpha = 0, lambda = 10^seq(-4, 1, length = 100))

# Train Ridge Regression with scaling
ridge_model <- train(x = X_train,
                     y = y_train,
                     method = "glmnet",
                     preProcess = c("center", "scale"),
                     trControl = ctrl,
                     tuneGrid = ridge_grid)

# Predict on test data
ridge_pred <- predict(ridge_model, newdata = X_test)

# Evaluate performance
ridge_results <- postResample(pred = ridge_pred, obs = y_test)
ridge_results
##       RMSE   Rsquared        MAE 
## 0.13050735 0.42987866 0.09875385
# Train Ridge Regression with scaling
ridge_model_pca <- train(x = X_train_pca,
                     y = y_train,
                     method = "glmnet",
                     preProcess = c("center", "scale"),
                     trControl = ctrl,
                     tuneGrid = ridge_grid)

# Predict on test data
ridge_pred_pca <- predict(ridge_model_pca, newdata = X_test_pca)

# Evaluate performance
ridge_results_pca <- postResample(pred = ridge_pred_pca, obs = y_test)
ridge_results_pca
##      RMSE  Rsquared       MAE 
## 0.1354793 0.3840276 0.1055451
# Train Ridge Regression with scaling
ridge_model_uncorrelated <- train(x = X_train_uncorrelated,
                     y = y_train,
                     method = "glmnet",
                     preProcess = c("center", "scale"),
                     trControl = ctrl,
                     tuneGrid = ridge_grid)

# Predict on test data
ridge_pred_uncorrelated <- predict(ridge_model_uncorrelated, newdata = X_test_uncorrelated)

# Evaluate performance
ridge_results_uncorrelated <- postResample(pred = ridge_pred_uncorrelated, obs = y_test)
ridge_results_uncorrelated
##      RMSE  Rsquared       MAE 
## 0.1328217 0.4090734 0.1007041

Again, the best RMSE and R-squared come from the data with all the predictors.

Now let's try an SVM model, since none of the linear, lasso, or ridge models gave usable results; their R-squared values were all below 0.5, meaning they explained less than half of the variance in PH. We can do better.

# Train SVM with radial kernel
svm_model <- train(x = X_train,
                   y = y_train,
                   method = "svmRadial",
                   preProcess = c("center", "scale"),  
                   trControl = ctrl,
                   tuneLength = 10) 

# Predict on test set
svm_pred <- predict(svm_model, newdata = X_test)

svm_results <- postResample(pred = svm_pred, obs = y_test)
svm_results
##       RMSE   Rsquared        MAE 
## 0.11501519 0.56039208 0.08034975
svm_model_pca <- train(x = X_train_pca,
                   y = y_train,
                   method = "svmRadial",
                   preProcess = c("center", "scale"),  
                   trControl = ctrl,
                   tuneLength = 10) 

# Predict on test set
svm_pred_pca <- predict(svm_model_pca, newdata = X_test_pca)

svm_results_pca <- postResample(pred = svm_pred_pca, obs = y_test)
svm_results_pca
##       RMSE   Rsquared        MAE 
## 0.12290105 0.50023349 0.08729625
svm_model_uncorrelated <- train(x = X_train_uncorrelated,
                   y = y_train,
                   method = "svmRadial",
                   preProcess = c("center", "scale"),  
                   trControl = ctrl,
                   tuneLength = 10) 

# Predict on test set
svm_pred_uncorrelated <- predict(svm_model_uncorrelated, newdata = X_test_uncorrelated)

svm_results_uncorrelated <- postResample(pred = svm_pred_uncorrelated, obs = y_test)
svm_results_uncorrelated
##       RMSE   Rsquared        MAE 
## 0.11740846 0.54272620 0.08258958

The best R-squared and RMSE so far came from the SVM trained on all the predictors. An R-squared of .56 is still not great, though; we can do better.

Let's try PLS because it does well when the predictors are correlated.

# Train PLS model
pls_model <- train(x = X_train,
                   y = y_train,
                   method = "pls",
                   preProcess = c("center", "scale"),
                   trControl = ctrl,
                   tuneLength = 15)  

# Predict on test set
pls_pred <- predict(pls_model, newdata = X_test)

# Evaluate performance
pls_results <- postResample(pred = pls_pred, obs = y_test)
pls_results
##      RMSE  Rsquared       MAE 
## 0.1297504 0.4342823 0.0978889

The PLS model does not outperform the SVM we just made, so we will move on to some more complex models.

Now let's try some of the more complex models: MARS, tree-based models like random forests, neural networks, and Cubist.

Let's try a MARS model first. We will train on the full set of predictors and also on the pruned dataset with the correlated predictors removed.

# Train MARS model
mars_model <- train(x = X_train,
                    y = y_train,
                    method = "earth",
                    preProcess = c("center", "scale"),
                    trControl = ctrl,
                    tuneLength = 10)  

# Predict on test set
mars_pred <- predict(mars_model, newdata = X_test)

# Evaluate performance
mars_results <- postResample(pred = mars_pred, obs = y_test)
mars_results
##       RMSE   Rsquared        MAE 
## 0.12511611 0.47327790 0.09362943
# Train MARS model
mars_model_uncorrelated <- train(x = X_train_uncorrelated ,
                    y = y_train,
                    method = "earth",
                    preProcess = c("center", "scale"),
                    trControl = ctrl,
                    tuneLength = 10)  

# Predict on test set
mars_pred_uncorrelated  <- predict(mars_model_uncorrelated , newdata = X_test_uncorrelated )

# Evaluate performance
mars_results_uncorrelated  <- postResample(pred = mars_pred_uncorrelated , obs = y_test)
mars_results_uncorrelated 
##       RMSE   Rsquared        MAE 
## 0.13333186 0.40549489 0.09879608

The best MARS model is the one on the non-pruned dataset and gives an R-squared of .47. Not the best, but not the worst either.

Now let's try a random forest model. We will train it on the full set of predictors, as random forests handle correlated predictors well.

# Train Random Forest
rf_model <- train(x = X_train,
                  y = y_train,
                  method = "rf",
                  trControl = ctrl,
                  tuneLength = 5,  
                  importance = TRUE,
                  ntree = 1000
                  )

# Predict and evaluate
rf_pred <- predict(rf_model, newdata = X_test)
rf_results <- postResample(pred = rf_pred, obs = y_test)
rf_results
##       RMSE   Rsquared        MAE 
## 0.10316270 0.64276756 0.06863096

We will analyze the variable importance later, but for now this random forest gives the best RMSE yet and an R-squared of .64.
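
As a placeholder for that later analysis, caret already exposes the importance scores from this fit (importance = TRUE was set above); a minimal sketch:

# Variable importance from the tuned random forest, plotting the top 10 predictors
rf_importance <- varImp(rf_model)
plot(rf_importance, top = 10)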

Now let's try some boosted trees.

gbm_model <- train(x = X_train,
                   y = y_train,
                   method = "gbm",
                   trControl = ctrl,
                   verbose = FALSE,
                   tuneLength = 10)  

gbm_pred <- predict(gbm_model, newdata = X_test)
gbm_results <- postResample(pred = gbm_pred, obs = y_test)
gbm_results
##       RMSE   Rsquared        MAE 
## 0.10842098 0.60600878 0.07788096

For the boosted trees, our best model gives an R-squared of about .61: not amazing, but the second-best model we have been able to create so far.

Now let's try a bagged tree model.

bag_model <- train(x = X_train,
                   y = y_train,
                   method = "treebag",
                   trControl = ctrl)

bag_pred <- predict(bag_model, newdata = X_test)
bag_results <- postResample(pred = bag_pred, obs = y_test)
bag_results
##       RMSE   Rsquared        MAE 
## 0.11934530 0.52478580 0.08973709

The bagged tree model gives an R-squared of about .52, which is not great.

Now let's try a Cubist model.

cubist_model <- train(x = X_train,
                      y = y_train,
                      method = "cubist",
                      trControl = ctrl,
                      tuneLength = 10)  # tunes number of committees and neighbors

# Predict and evaluate
cubist_pred <- predict(cubist_model, newdata = X_test)
cubist_results <- postResample(pred = cubist_pred, obs = y_test)
cubist_results
##      RMSE  Rsquared       MAE 
## 0.1138548 0.5795800 0.0721388

The Cubist model gives an R-squared of .58: better than most of the models we have tried, but not the best.

Finally, let's try a neural network model.

# Train neural network (nnet = single hidden layer)
nn_model <- train(x = X_train,
                  y = y_train,
                  method = "nnet",
                  trControl = ctrl,
                  preProcess = c("center", "scale"),
                  tuneLength = 10,
                  trace = FALSE,
                  linout = TRUE)  # for regression

# Predict and evaluate
nn_pred <- predict(nn_model, newdata = X_test)
nn_results <- postResample(pred = nn_pred, obs = y_test)
nn_results
##       RMSE   Rsquared        MAE 
## 0.12762710 0.47965944 0.09400249

The neural network gives an R-squared of about .48, which again leaves most of the variance in PH unexplained.
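
Before engineering new features, it helps to line up the held-out metrics gathered so far in one place; a minimal sketch that binds the postResample vectors from the chunks above into a single table sorted by RMSE:

# Collect the held-out metrics for each model (all computed on the same test split)
model_comparison <- rbind(
  Linear       = lm_results,
  Lasso        = lasso_results,
  Ridge        = ridge_results,
  PLS          = pls_results,
  SVM          = svm_results,
  MARS         = mars_results,
  RandomForest = rf_results,
  GBM          = gbm_results,
  BaggedTree   = bag_results,
  Cubist       = cubist_results,
  NeuralNet    = nn_results
)
# Best (lowest RMSE) models first
model_comparison[order(model_comparison[, "RMSE"]), ]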

I want to try adding and transforming some of the predictors to see if we can improve the R-squared.

So let's add some interaction terms, transform some predictors, and then retrain and test some models to see if the R-squared improves at all.

After some research, I believe an interaction term for flow pressure, Mnf.Flow times Fill.Pressure, could be useful, along with an interaction term between oxygen and carbonation, since that relationship can affect PH. I will also add an interaction term for Bowl.Setpoint and Pressure.Vacuum, and a ratio of carbonation temperature to carbonation pressure to capture that relationship.

Finally, I will transform some of the variables using log and square roots. I applied log transformations to predictors that were right-skewed with long tails, such as flow rate and oxygen levels, to reduce the influence of large values and stabilize variance. Square root transformations were used for variables that were non-negative with moderate skew or wide variance, helping to linearize relationships and smooth out disproportionate influence from larger values.

training_data_transformed <- train_data_imputed %>%
  mutate(
    Flow_Pressure = Mnf.Flow * Fill.Pressure,
    Oxygen_Rel = Oxygen.Filler * Carb.Rel,
    BowlVacuum = Bowl.Setpoint * Pressure.Vacuum,
    Temp_to_Pressure = Carb.Temp / (Carb.Pressure + 1),
    log_Flow = log(Mnf.Flow + abs(min(Mnf.Flow)) + 1),
    log_OxygenFiller = log(Oxygen.Filler + 1),
    log_MFR = log(MFR + 1),
    log_UsageCont = log(Usage.cont + 1),
    log_PSC_CO2 = log(PSC.CO2 + 1),
    sqrt_PSC_Fill = sqrt(PSC.Fill),
    sqrt_Vacuum = sqrt(abs(Pressure.Vacuum))
  )
head(training_data_transformed)
##   Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp   PSC
## 1          B    5.340000    23.96667 0.2633333          68.2     141.2 0.104
## 2          A    5.426667    24.00667 0.2386667          68.4     139.6 0.124
## 3          B    5.286667    24.06000 0.2633333          70.8     144.8 0.090
## 4          A    5.440000    24.00667 0.2933333          63.0     132.6 0.008
## 5          A    5.486667    24.31333 0.1113333          67.2     136.8 0.026
## 6          A    5.380000    23.92667 0.2693333          66.6     138.4 0.090
##   PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure2
## 1     0.26    0.04     -100          118.8          46.0            37
## 2     0.22    0.04     -100          121.6          46.0             0
## 3     0.34    0.16     -100          120.2          46.0             0
## 4     0.42    0.04     -100          115.2          46.4             0
## 5     0.16    0.12     -100          118.4          45.8             0
## 6     0.24    0.04     -100          119.6          45.6             0
##   Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont
## 1          20.8           118        121.2         4002        66.0      16.18
## 2           0.0           106        118.6         3986        67.6      19.90
## 3           0.0            82        120.0         4020        67.0      17.76
## 4           0.0            92        117.8         4012        65.6      17.42
## 5           0.0            92        118.6         4010        65.6      17.68
## 6           0.0           116        120.2         4014        66.2      23.82
##   Carb.Flow Density   MFR Balling Pressure.Vacuum   PH Oxygen.Filler
## 1      2932    0.88 725.0   1.398            -4.0 8.36         0.022
## 2      3144    0.92 726.8   1.498            -4.0 8.26         0.026
## 3      2914    1.58 735.0   3.142            -3.8 8.94         0.024
## 4      3062    1.54 730.6   3.042            -4.4 8.24         0.030
## 5      3054    1.54 722.8   3.042            -4.4 8.26         0.030
## 6      2948    1.52 738.8   2.992            -4.4 8.32         0.024
##   Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
## 1           120              46.4         142.6     6.58     5.32        1.48
## 2           120              46.8         143.0     6.56     5.30        1.56
## 3           120              46.6         142.0     7.66     5.84        3.28
## 4           120              46.0         142.0     7.14     5.42        3.04
## 5           120              46.0         142.0     7.14     5.44        3.04
## 6           120              46.0         142.2     7.16     5.44        3.02
##   Flow_Pressure Oxygen_Rel BowlVacuum Temp_to_Pressure  log_Flow
## 1         -4600    0.11704       -480         2.040462 0.1823216
## 2         -4600    0.13780       -480         2.011527 0.1823216
## 3         -4600    0.14016       -456         2.016713 0.1823216
## 4         -4640    0.16260       -528         2.071875 0.1823216
## 5         -4580    0.16320       -528         2.005865 0.1823216
## 6         -4560    0.13056       -528         2.047337 0.1823216
##   log_OxygenFiller  log_MFR log_UsageCont log_PSC_CO2 sqrt_PSC_Fill sqrt_Vacuum
## 1       0.02176149 6.587550      2.843746  0.03922071     0.5099020    2.000000
## 2       0.02566775 6.590026      3.039749  0.03922071     0.4690416    2.000000
## 3       0.02371653 6.601230      2.931727  0.14842001     0.5830952    1.949359
## 4       0.02955880 6.595234      2.913437  0.03922071     0.6480741    2.097618
## 5       0.02955880 6.584515      2.927453  0.11332869     0.4000000    2.097618
## 6       0.02371653 6.606380      3.211650  0.03922071     0.4898979    2.097618
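
As a quick sanity check (a minimal sketch; it assumes the e1071 package is installed, which caret commonly pulls in), we can compare the skewness of a few raw predictors against their transformed versions to confirm the transformations actually reduced skew.

library(e1071)  # for skewness(); assumed to be installed

# Compare skewness of selected raw predictors and their transformed versions
skew_check <- sapply(
  training_data_transformed[, c("Oxygen.Filler", "log_OxygenFiller",
                                "Usage.cont", "log_UsageCont",
                                "PSC.Fill", "sqrt_PSC_Fill")],
  skewness, na.rm = TRUE
)
round(skew_check, 2)
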
split <- createDataPartition(training_data_transformed$PH, p = 0.8, list = FALSE)
train_split <- training_data_transformed[split, ]
test_split <- training_data_transformed[-split, ]

# Split into predictors and response
X_train_trans <- train_split %>% select(-PH)
y_train_trans <- train_split$PH

X_test_trans <- test_split %>% select(-PH)
y_test_trans <- test_split$PH
# Train Random Forest
rf_model_trans <- train(x = X_train_trans,
                  y = y_train_trans,
                  method = "rf",
                  trControl = ctrl,
                  tuneLength = 5,  
                  importance = TRUE,
                  ntree = 1000
                  )

# Predict and evaluate
rf_pred_trans <- predict(rf_model_trans, newdata = X_test_trans)
rf_results_trans <- postResample(pred = rf_pred_trans, obs = y_test_trans)
rf_results_trans
##       RMSE   Rsquared        MAE 
## 0.09910888 0.67101049 0.07157536

The random forest with the interaction terms and transformed predictors gives the best R-squared and lowest RMSE so far. Let's check which variables it relies on most.

varImp(rf_model_trans)
## rf variable importance
## 
##   only 20 most important variables shown (out of 42)
## 
##                 Overall
## Brand.Code       100.00
## BowlVacuum        61.49
## Temperature       58.36
## Hyd.Pressure3     57.38
## Alch.Rel          49.73
## Carb.Rel          49.64
## Balling.Lvl       49.12
## Pressure.Vacuum   48.46
## sqrt_Vacuum       48.31
## Flow_Pressure     46.75
## Carb.Flow         45.16
## Filler.Speed      44.16
## Balling           39.85
## log_UsageCont     38.53
## Density           38.11
## Bowl.Setpoint     37.03
## Carb.Pressure1    36.99
## Oxygen_Rel        36.93
## Usage.cont        35.05
## Fill.Pressure     33.49

The GBM model was the second best on the non-transformed data, so let's try it on the transformed data as well.

gbm_model_trans <- train(x = X_train_trans,
                   y = y_train_trans,
                   method = "gbm",
                   trControl = ctrl,
                   verbose = FALSE,
                   tuneLength = 10)  

gbm_pred_trans <- predict(gbm_model_trans, newdata = X_test_trans)
gbm_results_trans <- postResample(pred = gbm_pred_trans, obs = y_test_trans)
gbm_results_trans
##       RMSE   Rsquared        MAE 
## 0.10696484 0.61397390 0.07822457
bag_model_trans <- train(x = X_train_trans,
                   y = y_train_trans,
                   method = "treebag",
                   trControl = ctrl)

bag_pred_trans <- predict(bag_model_trans, newdata = X_test_trans)
bag_results_trans <- postResample(pred = bag_pred_trans, obs = y_test_trans)
bag_results_trans
##       RMSE   Rsquared        MAE 
## 0.12378319 0.48074321 0.09682214
cubist_model_trans <- train(x = X_train_trans,
                      y = y_train_trans,
                      method = "cubist",
                      trControl = ctrl,
                      tuneLength = 10)  # tunes number of committees and neighbors

# Predict and evaluate
cubist_pred_trans <- predict(cubist_model_trans, newdata = X_test_trans)
cubist_results_trans <- postResample(pred = cubist_pred_trans, obs = y_test_trans)
cubist_results_trans
##       RMSE   Rsquared        MAE 
## 0.10069510 0.65904861 0.07095879
# Train MARS model
mars_model_trans <- train(x = X_train_trans,
                    y = y_train_trans,
                    method = "earth",
                    preProcess = c("center", "scale"),
                    trControl = ctrl,
                    tuneLength = 10)  

# Predict on test set
mars_pred_trans <- predict(mars_model_trans, newdata = X_test_trans)

# Evaluate performance
mars_results_trans <- postResample(pred = mars_pred_trans, obs = y_test_trans)
mars_results_trans
##       RMSE   Rsquared        MAE 
## 0.12958290 0.43357315 0.09983316
# Train neural network (nnet = single hidden layer)
nn_model_trans <- train(x = X_train_trans,
                  y = y_train_trans,
                  method = "nnet",
                  trControl = ctrl,
                  preProcess = c("center", "scale"),
                  tuneLength = 10,
                  trace = FALSE,
                  linout = TRUE)  

# Predict and evaluate
nn_pred_trans <- predict(nn_model_trans, newdata = X_test_trans)
nn_results_trans <- postResample(pred = nn_pred_trans, obs = y_test_trans)
nn_results_trans
##       RMSE   Rsquared        MAE 
## 0.12344111 0.49547764 0.08996243
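
Before settling on a final model, the test-set metrics for the transformed-data models can be gathered into a single table for easier comparison (a minimal sketch that reuses the result objects created above).

# Collect the test-set metrics for the transformed-data models into one table
results_summary <- rbind(
  RandomForest  = rf_results_trans,
  GBM           = gbm_results_trans,
  BaggedTrees   = bag_results_trans,
  Cubist        = cubist_results_trans,
  MARS          = mars_results_trans,
  NeuralNetwork = nn_results_trans
)
round(results_summary, 4)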

The random forest is still the best model, so I will use it on the transformed data. Let's look at the variable importance again to see how the variables relate to the pH prediction.

varImp(rf_model_trans)
## rf variable importance
## 
##   only 20 most important variables shown (out of 42)
## 
##                 Overall
## Brand.Code       100.00
## BowlVacuum        61.49
## Temperature       58.36
## Hyd.Pressure3     57.38
## Alch.Rel          49.73
## Carb.Rel          49.64
## Balling.Lvl       49.12
## Pressure.Vacuum   48.46
## sqrt_Vacuum       48.31
## Flow_Pressure     46.75
## Carb.Flow         45.16
## Filler.Speed      44.16
## Balling           39.85
## log_UsageCont     38.53
## Density           38.11
## Bowl.Setpoint     37.03
## Carb.Pressure1    36.99
## Oxygen_Rel        36.93
## Usage.cont        35.05
## Fill.Pressure     33.49

We can see that Brand.Code is the most important variable to the model, with BowlVacuum, Temperature, and Hyd.Pressure3 next at importance scores in the high 50s to low 60s, followed by Alch.Rel, Carb.Rel, Balling.Lvl, and Pressure.Vacuum in the high 40s. These are the variables the model leans on most when predicting pH. BowlVacuum is one of the interaction terms I added (Bowl.Setpoint times Pressure.Vacuum), so those two underlying variables should also be noted as important drivers of pH.
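
For the written report, the same importance scores can also be pulled into a plain, sortable data frame (a minimal sketch using the importance slot returned by varImp).

# Turn the caret varImp object into a sorted data frame
imp_df <- varImp(rf_model_trans)$importance
imp_df <- data.frame(Variable = rownames(imp_df), Overall = imp_df$Overall) %>%
  arrange(desc(Overall))
head(imp_df, 10)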

Now let's produce a bar chart of the variable importances for the non-technical report.

plot(varImp(rf_model_trans), top = 20, main = "Top 20 Predictors of pH (Random Forest)")

Now let's generate the pH predictions on the test set. We first have to add the same transformed predictors and interaction terms.

test_data_transformed <- test_data_imputed %>%
  mutate(
    Flow_Pressure = Mnf.Flow * Fill.Pressure,
    Oxygen_Rel = Oxygen.Filler * Carb.Rel,
    BowlVacuum = Bowl.Setpoint * Pressure.Vacuum,
    Temp_to_Pressure = Carb.Temp / (Carb.Pressure + 1),
    log_Flow = log(Mnf.Flow + abs(min(Mnf.Flow)) + 1),
    log_OxygenFiller = log(Oxygen.Filler + 1),
    log_MFR = log(MFR + 1),
    log_UsageCont = log(Usage.cont + 1),
    log_PSC_CO2 = log(PSC.CO2 + 1),
    sqrt_PSC_Fill = sqrt(PSC.Fill),
    sqrt_Vacuum = sqrt(abs(Pressure.Vacuum))
  )
head(test_data_transformed)
##   Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp   PSC
## 1          D    5.480000    24.03333 0.2700000          65.4     134.6 0.236
## 2          A    5.393333    23.95333 0.2266667          63.2     135.0 0.042
## 3          B    5.293333    23.92000 0.3033333          66.4     140.4 0.068
## 4          B    5.266667    23.94000 0.1860000          64.8     139.0 0.004
## 5          B    5.406667    24.20000 0.1600000          69.4     142.2 0.040
## 6          B    5.286667    24.10667 0.2120000          73.4     147.2 0.078
##   PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure1
## 1     0.40    0.04     -100          116.6          46.0             0
## 2     0.22    0.08     -100          118.8          46.2             0
## 3     0.10    0.02     -100          120.2          45.8             0
## 4     0.20    0.02     -100          124.8          40.0             0
## 5     0.30    0.06     -100          115.0          51.4             0
## 6     0.22    0.08     -100          118.6          46.4             0
##   Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed
## 1             0             0            96        129.4         3986
## 2             0             0           112        120.0         4012
## 3             0             0            98        119.4         4010
## 4             0             0           132        120.2         4008
## 5             0             0            94        116.0         4018
## 6             0             0            94        120.4         4010
##   Temperature Usage.cont Carb.Flow Density   MFR Balling Pressure.Vacuum PH
## 1        66.0      21.66      2950    0.88 727.6   1.398            -3.8 NA
## 2        65.6      17.60      2916    1.50 735.8   2.942            -4.4 NA
## 3        65.6      24.18      3056    0.90 734.8   1.448            -4.2 NA
## 4        66.2      18.12        28    0.74 718.8   1.056            -4.0 NA
## 5        66.4      21.32      3214    0.88 752.0   1.398            -4.0 NA
## 6        66.6      18.00      3064    0.84 732.0   1.298            -3.8 NA
##   Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 1        0.0220           130              45.2         142.6     6.56     5.34
## 2        0.0300           120              46.0         142.0     7.14     5.58
## 3        0.0460           120              46.0         142.4     6.52     5.34
## 4        0.0158           120              46.0         142.4     6.48     5.50
## 5        0.0820           120              50.0         142.2     6.50     5.38
## 6        0.0640           120              46.0         143.0     6.50     5.42
##   Balling.Lvl Flow_Pressure Oxygen_Rel BowlVacuum Temp_to_Pressure  log_Flow
## 1        1.48         -4600    0.11748       -494         2.027108 0.1823216
## 2        3.04         -4620    0.16740       -528         2.102804 0.1823216
## 3        1.46         -4580    0.24564       -504         2.083086 0.1823216
## 4        1.48         -4000    0.08690       -480         2.112462 0.1823216
## 5        1.46         -5140    0.44116       -480         2.019886 0.1823216
## 6        1.44         -4640    0.34688       -456         1.978495 0.1823216
##   log_OxygenFiller  log_MFR log_UsageCont log_PSC_CO2 sqrt_PSC_Fill sqrt_Vacuum
## 1       0.02176149 6.591125      3.120601  0.03922071     0.6324555    1.949359
## 2       0.02955880 6.602316      2.923162  0.07696104     0.4690416    2.097618
## 3       0.04497337 6.600958      3.226050  0.01980263     0.3162278    2.049390
## 4       0.01567648 6.578973      2.950735  0.01980263     0.4472136    2.000000
## 5       0.07881118 6.624065      3.105483  0.05826891     0.5477226    2.000000
## 6       0.06203539 6.597146      2.944439  0.07696104     0.4690416    1.949359
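
One caveat with the transformation above: log_Flow recomputes its shift from the test data's own minimum of Mnf.Flow, so it is not guaranteed to match the transform applied during training. A minimal sketch of a fix, assuming train_data_imputed is still in memory and no test value of Mnf.Flow falls below the training minimum:

# Reuse the training-set shift so the log transform is identical on both sets
flow_shift <- abs(min(train_data_imputed$Mnf.Flow)) + 1
test_data_transformed <- test_data_transformed %>%
  mutate(log_Flow = log(Mnf.Flow + flow_shift))
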
predicted_ph <- predict(rf_model_trans, newdata =test_data_transformed)
predictions_df <- data.frame(ID = 1:length(predicted_ph), Predicted_PH = predicted_ph)
write_xlsx(predictions_df, "pH_predictions.xlsx")
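
As a final sanity check (a minimal sketch), the predicted pH values can be summarized and compared to the training distribution before the file is shared.

# Confirm the predictions fall within a plausible pH range
summary(predicted_ph)
summary(train_data_imputed$PH)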