library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.3.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
library(mice)
## Warning in check_dep_version(): ABI version mismatch:
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
library(earth)
## Warning: package 'earth' was built under R version 4.3.3
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.3.3
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 4.3.3
library(writexl)
Read in the historical training data.
training_data <- read.xlsx("StudentData.xlsx")
head(training_data)
## Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC
## 1 B 5.340000 23.96667 0.2633333 68.2 141.2 0.104
## 2 A 5.426667 24.00667 0.2386667 68.4 139.6 0.124
## 3 B 5.286667 24.06000 0.2633333 70.8 144.8 0.090
## 4 A 5.440000 24.00667 0.2933333 63.0 132.6 NA
## 5 A 5.486667 24.31333 0.1113333 67.2 136.8 0.026
## 6 A 5.380000 23.92667 0.2693333 66.6 138.4 0.090
## PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure1
## 1 0.26 0.04 -100 118.8 46.0 0
## 2 0.22 0.04 -100 121.6 46.0 0
## 3 0.34 0.16 -100 120.2 46.0 0
## 4 0.42 0.04 -100 115.2 46.4 0
## 5 0.16 0.12 -100 118.4 45.8 0
## 6 0.24 0.04 -100 119.6 45.6 0
## Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed
## 1 NA NA 118 121.2 4002
## 2 NA NA 106 118.6 3986
## 3 NA NA 82 120.0 4020
## 4 0 0 92 117.8 4012
## 5 0 0 92 118.6 4010
## 6 0 0 116 120.2 4014
## Temperature Usage.cont Carb.Flow Density MFR Balling Pressure.Vacuum PH
## 1 66.0 16.18 2932 0.88 725.0 1.398 -4.0 8.36
## 2 67.6 19.90 3144 0.92 726.8 1.498 -4.0 8.26
## 3 67.0 17.76 2914 1.58 735.0 3.142 -3.8 8.94
## 4 65.6 17.42 3062 1.54 730.6 3.042 -4.4 8.24
## 5 65.6 17.68 3054 1.54 722.8 3.042 -4.4 8.26
## 6 66.2 23.82 2948 1.52 738.8 2.992 -4.4 8.32
## Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 1 0.022 120 46.4 142.6 6.58 5.32
## 2 0.026 120 46.8 143.0 6.56 5.30
## 3 0.024 120 46.6 142.0 7.66 5.84
## 4 0.030 120 46.0 146.2 7.14 5.42
## 5 0.030 120 46.0 146.2 7.14 5.44
## 6 0.024 120 46.0 146.6 7.16 5.44
## Balling.Lvl
## 1 1.48
## 2 1.56
## 3 3.28
## 4 3.04
## 5 3.04
## 6 3.02
Read in the test evaluation data.
test_data <- read.xlsx("StudentEvaluation.xlsx")
head(test_data)
## Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC
## 1 D 5.480000 24.03333 0.2700000 65.4 134.6 0.236
## 2 A 5.393333 23.95333 0.2266667 63.2 135.0 0.042
## 3 B 5.293333 23.92000 0.3033333 66.4 140.4 0.068
## 4 B 5.266667 23.94000 0.1860000 64.8 139.0 0.004
## 5 B 5.406667 24.20000 0.1600000 69.4 142.2 0.040
## 6 B 5.286667 24.10667 0.2120000 73.4 147.2 0.078
## PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure1
## 1 0.40 0.04 -100 116.6 46.0 0
## 2 0.22 0.08 -100 118.8 46.2 0
## 3 0.10 0.02 -100 120.2 45.8 0
## 4 0.20 0.02 -100 124.8 40.0 0
## 5 0.30 0.06 -100 115.0 51.4 0
## 6 0.22 NA -100 118.6 46.4 0
## Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed
## 1 NA NA 96 129.4 3986
## 2 0 0 112 120.0 4012
## 3 0 0 98 119.4 4010
## 4 0 0 132 120.2 NA
## 5 0 0 94 116.0 4018
## 6 0 0 94 120.4 4010
## Temperature Usage.cont Carb.Flow Density MFR Balling Pressure.Vacuum PH
## 1 66.0 21.66 2950 0.88 727.6 1.398 -3.8 NA
## 2 65.6 17.60 2916 1.50 735.8 2.942 -4.4 NA
## 3 65.6 24.18 3056 0.90 734.8 1.448 -4.2 NA
## 4 74.4 18.12 28 0.74 NA 1.056 -4.0 NA
## 5 66.4 21.32 3214 0.88 752.0 1.398 -4.0 NA
## 6 66.6 18.00 3064 0.84 732.0 1.298 -3.8 NA
## Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 1 0.022 130 45.2 142.6 6.56 5.34
## 2 0.030 120 46.0 147.2 7.14 5.58
## 3 0.046 120 46.0 146.6 6.52 5.34
## 4 NA 120 46.0 146.4 6.48 5.50
## 5 0.082 120 50.0 145.8 6.50 5.38
## 6 0.064 120 46.0 146.0 6.50 5.42
## Balling.Lvl
## 1 1.48
## 2 3.04
## 3 1.46
## 4 1.48
## 5 1.46
## 6 1.44
Get summary statistics for the training data.
summary(training_data)
## Brand.Code Carb.Volume Fill.Ounces PC.Volume
## Length:2571 Min. :5.040 Min. :23.63 Min. :0.07933
## Class :character 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23917
## Mode :character Median :5.347 Median :23.97 Median :0.27133
## Mean :5.370 Mean :23.97 Mean :0.27712
## 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31200
## Max. :5.700 Max. :24.32 Max. :0.47800
## NA's :10 NA's :38 NA's :39
## Carb.Pressure Carb.Temp PSC PSC.Fill
## Min. :57.00 Min. :128.6 Min. :0.00200 Min. :0.0000
## 1st Qu.:65.60 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.1000
## Median :68.20 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.19 Mean :141.1 Mean :0.08457 Mean :0.1954
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :79.40 Max. :154.0 Max. :0.27000 Max. :0.6200
## NA's :27 NA's :26 NA's :33 NA's :23
## PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure
## Min. :0.00000 Min. :-100.20 Min. :105.6 Min. :34.60
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.00
## Median :0.04000 Median : 65.20 Median :123.2 Median :46.40
## Mean :0.05641 Mean : 24.57 Mean :122.6 Mean :47.92
## 3rd Qu.:0.08000 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00
## Max. :0.24000 Max. : 229.40 Max. :140.2 Max. :60.40
## NA's :39 NA's :2 NA's :32 NA's :22
## Hyd.Pressure1 Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4
## Min. :-0.80 Min. : 0.00 Min. :-1.20 Min. : 52.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00
## Median :11.40 Median :28.60 Median :27.60 Median : 96.00
## Mean :12.44 Mean :20.96 Mean :20.46 Mean : 96.29
## 3rd Qu.:20.20 3rd Qu.:34.60 3rd Qu.:33.40 3rd Qu.:102.00
## Max. :58.00 Max. :59.40 Max. :50.00 Max. :142.00
## NA's :11 NA's :15 NA's :15 NA's :30
## Filler.Level Filler.Speed Temperature Usage.cont Carb.Flow
## Min. : 55.8 Min. : 998 Min. :63.60 Min. :12.08 Min. : 26
## 1st Qu.: 98.3 1st Qu.:3888 1st Qu.:65.20 1st Qu.:18.36 1st Qu.:1144
## Median :118.4 Median :3982 Median :65.60 Median :21.79 Median :3028
## Mean :109.3 Mean :3687 Mean :65.97 Mean :20.99 Mean :2468
## 3rd Qu.:120.0 3rd Qu.:3998 3rd Qu.:66.40 3rd Qu.:23.75 3rd Qu.:3186
## Max. :161.2 Max. :4030 Max. :76.20 Max. :25.90 Max. :5104
## NA's :20 NA's :57 NA's :14 NA's :5 NA's :2
## Density MFR Balling Pressure.Vacuum
## Min. :0.240 Min. : 31.4 Min. :-0.170 Min. :-6.600
## 1st Qu.:0.900 1st Qu.:706.3 1st Qu.: 1.496 1st Qu.:-5.600
## Median :0.980 Median :724.0 Median : 1.648 Median :-5.400
## Mean :1.174 Mean :704.0 Mean : 2.198 Mean :-5.216
## 3rd Qu.:1.620 3rd Qu.:731.0 3rd Qu.: 3.292 3rd Qu.:-5.000
## Max. :1.920 Max. :868.6 Max. : 4.012 Max. :-3.600
## NA's :1 NA's :212 NA's :1
## PH Oxygen.Filler Bowl.Setpoint Pressure.Setpoint
## Min. :7.880 Min. :0.00240 Min. : 70.0 Min. :44.00
## 1st Qu.:8.440 1st Qu.:0.02200 1st Qu.:100.0 1st Qu.:46.00
## Median :8.540 Median :0.03340 Median :120.0 Median :46.00
## Mean :8.546 Mean :0.04684 Mean :109.3 Mean :47.62
## 3rd Qu.:8.680 3rd Qu.:0.06000 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :9.360 Max. :0.40000 Max. :140.0 Max. :52.00
## NA's :4 NA's :12 NA's :2 NA's :12
## Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
## Min. :140.8 Min. :5.280 Min. :4.960 Min. :0.00
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.340 1st Qu.:1.38
## Median :142.6 Median :6.560 Median :5.400 Median :1.48
## Mean :142.8 Mean :6.897 Mean :5.437 Mean :2.05
## 3rd Qu.:143.0 3rd Qu.:7.240 3rd Qu.:5.540 3rd Qu.:3.14
## Max. :148.2 Max. :8.620 Max. :6.060 Max. :3.66
## NA's :9 NA's :10 NA's :1
Above we can see the basic summary statistics for our dataset: there are 2571 rows, and many of them contain missing values. The predictors are mostly numeric, with one categorical predictor, Brand.Code.
Now plot histograms for each numeric variable to see its distribution.
numeric_data <- training_data %>% select(where(is.numeric))
numeric_long <- numeric_data %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
# Plot histograms for each variable
ggplot(numeric_long, aes(x = Value)) +
geom_histogram(fill = "steelblue", color = "white", bins = 30) +
facet_wrap(~ Variable, scales = "free", ncol = 4) +
theme_minimal() +
labs(title = "Histograms of Numeric Variables", x = "", y = "Frequency")
## Warning: Removed 724 rows containing non-finite outside the scale range
## (`stat_bin()`).
We have a few predictors with skewed distributions, like Air.Pressurer, Carb.Pressure, Carb.Pressure1, MFR, Oxygen.Filler, PC.Volume, PSC, PSC.Fill, and Temperature; a few bimodal distributions, like Balling, Balling.Lvl, Carb.Rel, Carb.Volume, and Density; and a few approximately normal variables, like Carb.Temp and Fill.Ounces.
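To back up this visual read with numbers, we can compute the skewness of each column. This is an optional check and assumes the e1071 package (not loaded above) is installed:
# Optional: quantify the skew seen in the histograms above
library(e1071)
skew_vals <- sapply(numeric_data, skewness, na.rm = TRUE)
sort(skew_vals, decreasing = TRUE)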
Now let's generate box plots for each numeric variable to detect columns with outliers.
# Create faceted boxplots
ggplot(numeric_long, aes(x = "", y = Value)) +
geom_boxplot(outlier.color = "red", fill = "skyblue", width = 0.5) +
facet_wrap(~ Variable, scales = "free", ncol = 4) +
labs(title = "Boxplots of Numeric Predictors", y = "Value") +
theme_minimal() +
theme(axis.text.x = element_blank())
## Warning: Removed 724 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
From the boxplots above we can see outliers in several columns that will need to be dealt with: Air.Pressurer, Alch.Rel, Carb.Pressure, Carb.Pressure1, Carb.Rel, Carb.Temp, Carb.Volume, Fill.Ounces, Fill.Pressure, Filler.Level, Filler.Speed, Hyd.Pressure4, MFR, Oxygen.Filler, PC.Volume, Pressure.Vacuum, PSC, PSC.CO2, PSC.Fill, and Temperature. Some are more extreme than others; we will aim to remove only the extreme outliers.
Now let's remove any predictors with near-zero variance.
# Identify near-zero variance predictors
nzv <- nearZeroVar(training_data, saveMetrics = TRUE)
# View the summary table
nzv
## freqRatio percentUnique zeroVar nzv
## Brand.Code 2.014634 0.1555815 FALSE FALSE
## Carb.Volume 1.055556 3.9284325 FALSE FALSE
## Fill.Ounces 1.163043 3.5783742 FALSE FALSE
## PC.Volume 1.100000 17.6584986 FALSE FALSE
## Carb.Pressure 1.016393 4.1229094 FALSE FALSE
## Carb.Temp 1.016667 4.7841307 FALSE FALSE
## PSC 1.211538 5.0175029 FALSE FALSE
## PSC.Fill 1.091892 1.2446519 FALSE FALSE
## PSC.CO2 1.078303 0.5056398 FALSE FALSE
## Mnf.Flow 1.048443 18.9420459 FALSE FALSE
## Carb.Pressure1 1.031746 5.4453520 FALSE FALSE
## Fill.Pressure 1.762931 4.2007001 FALSE FALSE
## Hyd.Pressure1 31.111111 9.5293660 FALSE TRUE
## Hyd.Pressure2 7.271028 8.0513419 FALSE FALSE
## Hyd.Pressure3 11.450704 7.4679113 FALSE FALSE
## Hyd.Pressure4 1.008264 1.5558149 FALSE FALSE
## Filler.Level 1.086957 11.2018670 FALSE FALSE
## Filler.Speed 1.045161 9.4904706 FALSE FALSE
## Temperature 1.040000 2.1781408 FALSE FALSE
## Usage.cont 1.105263 18.7086737 FALSE FALSE
## Carb.Flow 1.586207 20.7312330 FALSE FALSE
## Density 1.108374 3.0338390 FALSE FALSE
## MFR 1.222222 22.8315830 FALSE FALSE
## Balling 1.193182 8.4402956 FALSE FALSE
## Pressure.Vacuum 1.389728 0.6223259 FALSE FALSE
## PH 1.078125 2.0225593 FALSE FALSE
## Oxygen.Filler 1.266667 13.1466356 FALSE FALSE
## Bowl.Setpoint 2.990847 0.4278491 FALSE FALSE
## Pressure.Setpoint 1.319361 0.3111630 FALSE FALSE
## Air.Pressurer 1.093407 1.2446519 FALSE FALSE
## Alch.Rel 1.098315 2.0614547 FALSE FALSE
## Carb.Rel 1.011583 1.6336056 FALSE FALSE
## Balling.Lvl 1.294872 3.1894205 FALSE FALSE
Only the Hyd.Pressure1 variable is flagged as near-zero variance; we will remove it.
training_data <- training_data %>%
select(-Hyd.Pressure1)
Let's inspect the exact number of outliers in each column:
find_outliers <- function(x) {
qnt <- quantile(x, probs = c(.25, .75), na.rm = TRUE)
H <- 3 * IQR(x, na.rm = TRUE)
sum(x < (qnt[1] - H) | x > (qnt[2] + H), na.rm = TRUE)
}
outlier_counts <- sapply(training_data %>% select(where(is.numeric)), find_outliers)
sort(outlier_counts, decreasing = TRUE)
## Filler.Speed Air.Pressurer MFR Oxygen.Filler
## 271 213 118 56
## Temperature Carb.Volume Fill.Ounces PC.Volume
## 51 0 0 0
## Carb.Pressure Carb.Temp PSC PSC.Fill
## 0 0 0 0
## PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure
## 0 0 0 0
## Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level
## 0 0 0 0
## Usage.cont Carb.Flow Density Balling
## 0 0 0 0
## Pressure.Vacuum PH Bowl.Setpoint Pressure.Setpoint
## 0 0 0 0
## Alch.Rel Carb.Rel Balling.Lvl
## 0 0 0
I decided to flag only the more extreme outliers by using 3 times the IQR instead of the usual 1.5. Because I lack domain knowledge on this subject, I don't want to remove data points that may not truly be outliers and could provide valuable information to my models.
So now let's remove those outliers from the data; they will be imputed later.
# Custom outlier removal function: set outliers to NA
remove_outliers <- function(x) {
qnt <- quantile(x, probs = c(.25, .75), na.rm = TRUE)
H <- 3 * IQR(x, na.rm = TRUE)
x[x < (qnt[1] - H) | x > (qnt[2] + H)] <- NA
return(x)
}
# Get names of numeric columns excluding "PH"
numeric_predictors <- names(training_data)[sapply(training_data, is.numeric) & names(training_data) != "PH"]
# Apply the outlier removal only to those columns
training_data <- training_data %>%
mutate(across(all_of(numeric_predictors), remove_outliers))
Now let's check how many missing values there are in the dataset.
training_data %>%
summarise_all(~sum(is.na(.))) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "Missing_Count") %>%
arrange(desc(Missing_Count))
## # A tibble: 32 × 2
## Variable Missing_Count
## <chr> <int>
## 1 MFR 330
## 2 Filler.Speed 328
## 3 Air.Pressurer 213
## 4 Brand.Code 120
## 5 Oxygen.Filler 68
## 6 Temperature 65
## 7 PC.Volume 39
## 8 PSC.CO2 39
## 9 Fill.Ounces 38
## 10 PSC 33
## # ℹ 22 more rows
We can see that there are a lot of missing values in the dataset; the MFR column alone is missing 330 values, and Brand.Code is missing 120. These will have to be dealt with by imputation, or by dropping the problem columns if we are unable to impute them.
We only have 4 missing PH values (the response we are trying to predict), so we can simply remove those rows: I don't want to impute the response variable, and losing 4 rows is not significant.
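A minimal sketch of that step, using base subsetting (which sidesteps the filter() masking between dplyr and mice noted in the startup messages):
# Drop the handful of rows where the response PH is missing
training_data <- training_data[!is.na(training_data$PH), ]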
Let's check the percentage of missing data in each column and decide what to do.
missing_pct <- sapply(training_data, function(x) mean(is.na(x))) * 100
sort(missing_pct, decreasing = TRUE)
## MFR Filler.Speed Air.Pressurer Brand.Code
## 12.83547258 12.75768184 8.28471412 4.66744457
## Oxygen.Filler Temperature PC.Volume PSC.CO2
## 2.64488526 2.52819914 1.51691949 1.51691949
## Fill.Ounces PSC Carb.Pressure1 Hyd.Pressure4
## 1.47802412 1.28354726 1.24465189 1.16686114
## Carb.Pressure Carb.Temp PSC.Fill Fill.Pressure
## 1.05017503 1.01127966 0.89459354 0.85569817
## Filler.Level Hyd.Pressure2 Hyd.Pressure3 Pressure.Setpoint
## 0.77790743 0.58343057 0.58343057 0.46674446
## Carb.Volume Carb.Rel Alch.Rel Usage.cont
## 0.38895371 0.38895371 0.35005834 0.19447686
## PH Mnf.Flow Carb.Flow Bowl.Setpoint
## 0.15558149 0.07779074 0.07779074 0.07779074
## Density Balling Balling.Lvl Pressure.Vacuum
## 0.03889537 0.03889537 0.03889537 0.00000000
We are missing at most about 13% of any single column. I will impute the missing values for all numeric predictors, and for Brand.Code I will fill the missing values with "Unknown" and use that as its own category.
Now let's quickly check the test data for missing values.
missing_pct_test <- sapply(test_data, function(x) mean(is.na(x))) * 100
sort(missing_pct_test, decreasing = TRUE)
## PH MFR Filler.Speed Brand.Code
## 100.0000000 11.6104869 3.7453184 2.9962547
## Fill.Ounces PSC PSC.CO2 PC.Volume
## 2.2471910 1.8726592 1.8726592 1.4981273
## Carb.Pressure1 Hyd.Pressure4 PSC.Fill Oxygen.Filler
## 1.4981273 1.4981273 1.1235955 1.1235955
## Alch.Rel Fill.Pressure Filler.Level Temperature
## 1.1235955 0.7490637 0.7490637 0.7490637
## Usage.cont Pressure.Setpoint Carb.Rel Carb.Volume
## 0.7490637 0.7490637 0.7490637 0.3745318
## Carb.Temp Hyd.Pressure2 Hyd.Pressure3 Density
## 0.3745318 0.3745318 0.3745318 0.3745318
## Balling Pressure.Vacuum Bowl.Setpoint Air.Pressurer
## 0.3745318 0.3745318 0.3745318 0.3745318
## Carb.Pressure Mnf.Flow Hyd.Pressure1 Carb.Flow
## 0.0000000 0.0000000 0.0000000 0.0000000
## Balling.Lvl
## 0.0000000
We will have to perform the same imputations on the test data, since it has missing values just like the training set. We also do not have access to the test PH values, so we cannot compute a true test error; we will have to rely on error estimated from the training data to pick the best model.
First, let's set all missing Brand.Code values to "Unknown".
training_data$`Brand.Code` <- as.character(training_data$`Brand.Code`)
training_data$`Brand.Code`[is.na(training_data$`Brand.Code`)] <- "Unknown"
training_data$`Brand.Code` <- as.factor(training_data$`Brand.Code`)
missing_pct <- sapply(training_data, function(x) mean(is.na(x))) * 100
sort(missing_pct, decreasing = TRUE)
## MFR Filler.Speed Air.Pressurer Oxygen.Filler
## 12.83547258 12.75768184 8.28471412 2.64488526
## Temperature PC.Volume PSC.CO2 Fill.Ounces
## 2.52819914 1.51691949 1.51691949 1.47802412
## PSC Carb.Pressure1 Hyd.Pressure4 Carb.Pressure
## 1.28354726 1.24465189 1.16686114 1.05017503
## Carb.Temp PSC.Fill Fill.Pressure Filler.Level
## 1.01127966 0.89459354 0.85569817 0.77790743
## Hyd.Pressure2 Hyd.Pressure3 Pressure.Setpoint Carb.Volume
## 0.58343057 0.58343057 0.46674446 0.38895371
## Carb.Rel Alch.Rel Usage.cont PH
## 0.38895371 0.35005834 0.19447686 0.15558149
## Mnf.Flow Carb.Flow Bowl.Setpoint Density
## 0.07779074 0.07779074 0.07779074 0.03889537
## Balling Balling.Lvl Brand.Code Pressure.Vacuum
## 0.03889537 0.03889537 0.00000000 0.00000000
Now let's impute the numeric variables.
# Apply mice imputation
mice_model <- mice(training_data, m = 1, method = 'pmm', seed = 123)
##
## iter imp variable
## 1 1 Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont Carb.Flow Density MFR Balling PH Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
## 2 1 Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont Carb.Flow Density MFR Balling PH Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
## 3 1 Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont Carb.Flow Density MFR Balling PH Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
## 4 1 Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont Carb.Flow Density MFR Balling PH Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
## 5 1 Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont Carb.Flow Density MFR Balling PH Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
# Extract completed dataset
train_data_imputed <- complete(mice_model, 1)
missing_pct <- sapply(train_data_imputed, function(x) mean(is.na(x))) * 100
sort(missing_pct, decreasing = TRUE)
## Brand.Code Carb.Volume Fill.Ounces PC.Volume
## 0 0 0 0
## Carb.Pressure Carb.Temp PSC PSC.Fill
## 0 0 0 0
## PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure
## 0 0 0 0
## Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level
## 0 0 0 0
## Filler.Speed Temperature Usage.cont Carb.Flow
## 0 0 0 0
## Density MFR Balling Pressure.Vacuum
## 0 0 0 0
## PH Oxygen.Filler Bowl.Setpoint Pressure.Setpoint
## 0 0 0 0
## Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
## 0 0 0 0
Now we have no missing values in the dataset.
Let's do the same cleaning for the test dataset.
test_data$`Brand.Code` <- as.character(test_data$`Brand.Code`)
test_data$`Brand.Code`[is.na(test_data$`Brand.Code`)] <- "Unknown"
test_data$`Brand.Code` <- as.factor(test_data$`Brand.Code`)
missing_pct <- sapply(test_data, function(x) mean(is.na(x))) * 100
sort(missing_pct, decreasing = TRUE)
## PH MFR Filler.Speed Fill.Ounces
## 100.0000000 11.6104869 3.7453184 2.2471910
## PSC PSC.CO2 PC.Volume Carb.Pressure1
## 1.8726592 1.8726592 1.4981273 1.4981273
## Hyd.Pressure4 PSC.Fill Oxygen.Filler Alch.Rel
## 1.4981273 1.1235955 1.1235955 1.1235955
## Fill.Pressure Filler.Level Temperature Usage.cont
## 0.7490637 0.7490637 0.7490637 0.7490637
## Pressure.Setpoint Carb.Rel Carb.Volume Carb.Temp
## 0.7490637 0.7490637 0.3745318 0.3745318
## Hyd.Pressure2 Hyd.Pressure3 Density Balling
## 0.3745318 0.3745318 0.3745318 0.3745318
## Pressure.Vacuum Bowl.Setpoint Air.Pressurer Brand.Code
## 0.3745318 0.3745318 0.3745318 0.0000000
## Carb.Pressure Mnf.Flow Hyd.Pressure1 Carb.Flow
## 0.0000000 0.0000000 0.0000000 0.0000000
## Balling.Lvl
## 0.0000000
# Use the same outlier rule on test data
test_data <- test_data %>%
mutate(across(all_of(numeric_predictors), remove_outliers))
test_data_mice <- mice(test_data, m = 1, method = 'pmm', seed = 123)
##
## iter imp variable
## 1 1 Carb.Volume Fill.Ounces PC.Volume Carb.Temp PSC PSC.Fill PSC.CO2 Carb.Pressure1 Fill.Pressure Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont Density MFR Balling Pressure.Vacuum Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 2 1 Carb.Volume Fill.Ounces PC.Volume Carb.Temp PSC PSC.Fill PSC.CO2 Carb.Pressure1 Fill.Pressure Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont Density MFR Balling Pressure.Vacuum Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 3 1 Carb.Volume Fill.Ounces PC.Volume Carb.Temp PSC PSC.Fill PSC.CO2 Carb.Pressure1 Fill.Pressure Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont Density MFR Balling Pressure.Vacuum Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 4 1 Carb.Volume Fill.Ounces PC.Volume Carb.Temp PSC PSC.Fill PSC.CO2 Carb.Pressure1 Fill.Pressure Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont Density MFR Balling Pressure.Vacuum Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 5 1 Carb.Volume Fill.Ounces PC.Volume Carb.Temp PSC PSC.Fill PSC.CO2 Carb.Pressure1 Fill.Pressure Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont Density MFR Balling Pressure.Vacuum Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## Warning: Number of logged events: 1
test_data_imputed <- complete(test_data_mice, 1)
missing_pct <- sapply(test_data_imputed, function(x) mean(is.na(x))) * 100
sort(missing_pct, decreasing = TRUE)
## PH Brand.Code Carb.Volume Fill.Ounces
## 100 0 0 0
## PC.Volume Carb.Pressure Carb.Temp PSC
## 0 0 0 0
## PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1
## 0 0 0 0
## Fill.Pressure Hyd.Pressure1 Hyd.Pressure2 Hyd.Pressure3
## 0 0 0 0
## Hyd.Pressure4 Filler.Level Filler.Speed Temperature
## 0 0 0 0
## Usage.cont Carb.Flow Density MFR
## 0 0 0 0
## Balling Pressure.Vacuum Oxygen.Filler Bowl.Setpoint
## 0 0 0 0
## Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 0 0 0 0
## Balling.Lvl
## 0
Now the data is all cleaned and ready for some EDA.
# Select only numeric columns
numeric_data <- train_data_imputed %>% select(where(is.numeric))
# Compute correlation matrix (use complete cases to avoid NA problems)
cor_matrix <- cor(numeric_data, use = "complete.obs")
# Plot correlation matrix
corrplot(cor_matrix, method = "color", type = "upper",
tl.cex = 0.7, tl.col = "black", addCoef.col = "black", number.cex = 0.6,
title = "Correlation Matrix", mar = c(0,0,1,0))
cor_long <- as.data.frame(as.table(cor_matrix)) %>%
filter(Var1 != Var2) %>%
mutate(abs_corr = abs(Freq)) %>%
arrange(desc(abs_corr))
cor_long_unique <- cor_long %>%
rowwise() %>%
mutate(pair = paste(sort(c(Var1, Var2)), collapse = "_")) %>%
distinct(pair, .keep_all = TRUE) %>%
select(Var1, Var2, Correlation = Freq) %>%
arrange(desc(abs(Correlation)))
cor_long_unique
## # A tibble: 465 × 3
## # Rowwise:
## Var1 Var2 Correlation
## <fct> <fct> <dbl>
## 1 Balling.Lvl Balling 0.978
## 2 Balling Density 0.955
## 3 Bowl.Setpoint Filler.Level 0.949
## 4 Balling.Lvl Density 0.948
## 5 Hyd.Pressure3 Hyd.Pressure2 0.925
## 6 Alch.Rel Balling 0.923
## 7 Balling.Lvl Alch.Rel 0.920
## 8 Alch.Rel Density 0.901
## 9 MFR Filler.Speed 0.869
## 10 Balling.Lvl Carb.Rel 0.843
## # ℹ 455 more rows
We can see that quite a few variables are highly correlated with each other. We will have to keep this in mind when creating models, as some, like linear regression, are sensitive to multicollinearity.
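One way to put a number on the multicollinearity is with variance inflation factors. A quick sketch, assuming the car package is installed; note that vif() will error if any predictors are perfectly collinear:
# VIFs above ~10 are a common flag for problematic multicollinearity
library(car)
vif_fit <- lm(PH ~ ., data = numeric_data)
sort(vif(vif_fit), decreasing = TRUE)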
top_corr <- cor_long_unique %>%
mutate(abs_corr = abs(Correlation)) %>%
arrange(desc(abs_corr)) %>%
filter(abs_corr > .45)
top_corr
## # A tibble: 35 × 4
## # Rowwise:
## Var1 Var2 Correlation abs_corr
## <fct> <fct> <dbl> <dbl>
## 1 Balling.Lvl Balling 0.978 0.978
## 2 Balling Density 0.955 0.955
## 3 Bowl.Setpoint Filler.Level 0.949 0.949
## 4 Balling.Lvl Density 0.948 0.948
## 5 Hyd.Pressure3 Hyd.Pressure2 0.925 0.925
## 6 Alch.Rel Balling 0.923 0.923
## 7 Balling.Lvl Alch.Rel 0.920 0.920
## 8 Alch.Rel Density 0.901 0.901
## 9 MFR Filler.Speed 0.869 0.869
## 10 Balling.Lvl Carb.Rel 0.843 0.843
## # ℹ 25 more rows
# Create label for each pair
top_corr <- top_corr %>%
mutate(label = paste(Var1, Var2, sep = " vs. "))
# Plot
ggplot(top_corr, aes(x = reorder(label, Correlation), y = Correlation, fill = Correlation > 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_fill_manual(values = c("TRUE" = "steelblue", "FALSE" = "firebrick")) +
labs(title = "Top Predictor Correlations",
x = "Variable Pair",
y = "Correlation Coefficient") +
theme_minimal()
cor_with_ph <- cor(numeric_data, use = "complete.obs")[, "PH"] %>%
sort(decreasing = TRUE)
cor_with_ph
## PH Bowl.Setpoint Filler.Level Pressure.Vacuum
## 1.000000000 0.349140795 0.325460967 0.219346419
## Oxygen.Filler Carb.Rel Carb.Flow Alch.Rel
## 0.195435096 0.158278785 0.154507074 0.146630953
## Balling.Lvl Density Balling Carb.Volume
## 0.100252347 0.077326556 0.064670253 0.063359843
## Carb.Pressure PC.Volume MFR Filler.Speed
## 0.056669020 0.039785554 0.035685860 0.022381373
## Carb.Temp Air.Pressurer PSC.Fill PSC.CO2
## 0.019350762 -0.005610081 -0.037947081 -0.071714809
## Carb.Pressure1 PSC Fill.Ounces Hyd.Pressure4
## -0.073353773 -0.088388469 -0.094171264 -0.130425683
## Hyd.Pressure2 Fill.Pressure Temperature Hyd.Pressure3
## -0.200700383 -0.217378467 -0.217382477 -0.239141118
## Pressure.Setpoint Usage.cont Mnf.Flow
## -0.307068276 -0.318853705 -0.446993440
We have a lot of correlated predictors, so we will run PCA on the training data to get uncorrelated components. This data will then be used to train and test the models that are affected by correlated predictors, like linear regression, ridge and lasso regression, KNN, SVMs, and neural networks. Tree-based models handle correlated predictors well, so for those we can use the full training set.
cor_df <- data.frame(
Variable = names(cor_with_ph),
Correlation = as.numeric(cor_with_ph)
)
# Plot all in one diverging bar chart
ggplot(cor_df, aes(x = reorder(Variable, Correlation), y = Correlation, fill = Correlation > 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_fill_manual(values = c("TRUE" = "steelblue", "FALSE" = "firebrick")) +
labs(title = "Correlation of Predictors with pH",
x = "Predictor",
y = "Correlation with pH") +
theme_minimal(base_size = 13) +
theme(axis.text.y = element_text(size = 10))
Now split the training data into a train and test set so that we can produce test RMSE, R-squared, and other metrics for choosing the best model. First, one-hot encode the factor variable.
set.seed(123)
split <- createDataPartition(train_data_imputed$PH, p = 0.8, list = FALSE)
train_split <- train_data_imputed[split, ]
test_split <- train_data_imputed[-split, ]
# Split into predictors and response
X_train <- train_split %>% select(-PH)
y_train <- train_split$PH
X_test <- test_split %>% select(-PH)
y_test <- test_split$PH
# one hot encode factor variable
dummies <- dummyVars(~ ., data = train_data_imputed %>% select(-PH))
# Apply to training and test
X_train <- predict(dummies, newdata = X_train) %>% as.data.frame()
X_test <- predict(dummies, newdata = X_test) %>% as.data.frame()
# Select numeric predictors (excluding PH)
numeric_train <- X_train %>% select(where(is.numeric))
numeric_test <- X_test %>% select(all_of(colnames(numeric_train)))
Now we will create the PCA model and the PCA train and test datasets.
# Create a PCA preProcess model
pca_model <- preProcess(numeric_train, method = c("center", "scale", "pca"), thresh = 0.95)
# Apply PCA transformation
X_train_pca <- predict(pca_model, newdata = numeric_train)
X_test_pca <- predict(pca_model, newdata = numeric_test)
head(X_train_pca)
head(X_test_pca)
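It is also worth checking how many components caret kept to reach the 95% variance threshold; the count is stored on the preProcess object:
# Number of principal components retained at thresh = 0.95
pca_model$numComp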
Now we are ready to train and test some models. Let's start with linear regression.
# Set up 5-fold CV
ctrl <- trainControl(method = "cv", number = 5)
lm_model_pca <- train(x = X_train_pca, y = y_train, method = "lm", trControl = ctrl, preProcess = c("center", "scale"))
lm_pred_pca <- predict(lm_model_pca, X_test_pca)
lm_results_pca <- postResample(pred = lm_pred_pca, obs = y_test)
lm_results_pca
## RMSE Rsquared MAE
## 0.1353794 0.3840276 0.1051837
lm_model <- train(x = X_train, y = y_train, method = "lm", trControl = ctrl, preProcess = c("center", "scale"))
lm_pred <- predict(lm_model, X_test)
lm_results <- postResample(pred = lm_pred, obs = y_test)
lm_results
## RMSE Rsquared MAE
## 0.12992941 0.43280596 0.09788884
Since the PCA model actually did worse, let's try simply removing correlated predictors and retraining a linear model. This might be better anyway: the goal of the assignment is to identify relationships with PH, and with PCA we lose that interpretability.
# Identify correlated columns
high_cor <- findCorrelation(cor(numeric_train), cutoff = 0.9)
drop_vars <- names(numeric_train)[high_cor]
# Drop from train and test
X_train_uncorrelated <- X_train %>% select(-all_of(drop_vars))
X_test_uncorrelated <- X_test %>% select(-all_of(drop_vars))
lm_model_uncorrelated <- train(x = X_train_uncorrelated , y = y_train , method = "lm", trControl = ctrl, preProcess = c("center", "scale"))
lm_pred_uncorrelated <- predict(lm_model_uncorrelated, X_test_uncorrelated)
lm_results_uncorrelated <- postResample(pred = lm_pred_uncorrelated , obs = y_test)
lm_results_uncorrelated
## RMSE Rsquared MAE
## 0.1324853 0.4108472 0.1001905
The model with all the predictors is still performing best. Let's try lasso regression now.
# Define lambda grid for tuning
lasso_grid <- expand.grid(alpha = 1, lambda = 10^seq(-4, 1, length = 100))
# Train Lasso model
lasso_model <- train(x = X_train,
y = y_train,
method = "glmnet",
trControl = ctrl,
tuneGrid = lasso_grid,
preProcess = c("center", "scale")
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Predict on test set
lasso_pred <- predict(lasso_model, newdata = X_test)
# Evaluate performance
lasso_results <- postResample(pred = lasso_pred, obs = y_test)
lasso_results
## RMSE Rsquared MAE
## 0.12978990 0.43440049 0.09778531
Now let's try lasso on the PCA and uncorrelated data.
# Train Lasso model
lasso_model_pca <- train(x = X_train_pca,
y = y_train,
method = "glmnet",
trControl = ctrl,
tuneGrid = lasso_grid,
preProcess = c("center", "scale"))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Predict on test set
lasso_pred_pca <- predict(lasso_model_pca, newdata = X_test_pca)
# Evaluate performance
lasso_results_pca <- postResample(pred = lasso_pred_pca, obs = y_test)
lasso_results_pca
## RMSE Rsquared MAE
## 0.1353648 0.3844326 0.1052155
# Train Lasso model
lasso_model_uncorrelated <- train(x = X_train_uncorrelated,
y = y_train,
method = "glmnet",
trControl = ctrl,
tuneGrid = lasso_grid,
preProcess = c("center", "scale"))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Predict on test set
lasso_pred_uncorrelated <- predict(lasso_model_uncorrelated, newdata = X_test_uncorrelated)
# Evaluate performance
lasso_results_uncorrelated <- postResample(pred = lasso_pred_uncorrelated, obs = y_test)
lasso_results_uncorrelated
## RMSE Rsquared MAE
## 0.1327583 0.4090422 0.1004522
As with the linear model, the best lasso fit was the one on the original data with all the predictors.
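To see what the lasso actually did, we can pull the cross-validated penalty and the coefficients it produced; entries shrunk exactly to zero are the predictors it dropped:
# CV-selected penalty and the sparse coefficient vector at that lambda
lasso_model$bestTune
coef(lasso_model$finalModel, s = lasso_model$bestTune$lambda)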
Now let's try some ridge regression.
# Define tuning grid (alpha = 0 = Ridge)
ridge_grid <- expand.grid(alpha = 0, lambda = 10^seq(-4, 1, length = 100))
# Train Ridge Regression with scaling
ridge_model <- train(x = X_train,
y = y_train,
method = "glmnet",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneGrid = ridge_grid)
# Predict on test data
ridge_pred <- predict(ridge_model, newdata = X_test)
# Evaluate performance
ridge_results <- postResample(pred = ridge_pred, obs = y_test)
ridge_results
## RMSE Rsquared MAE
## 0.13050735 0.42987866 0.09875385
# Train Ridge Regression with scaling
ridge_model_pca <- train(x = X_train_pca,
y = y_train,
method = "glmnet",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneGrid = ridge_grid)
# Predict on test data
ridge_pred_pca <- predict(ridge_model_pca, newdata = X_test_pca)
# Evaluate performance
ridge_results_pca <- postResample(pred = ridge_pred_pca, obs = y_test)
ridge_results_pca
## RMSE Rsquared MAE
## 0.1354793 0.3840276 0.1055451
# Train Ridge Regression with scaling
ridge_model_uncorrelated <- train(x = X_train_uncorrelated,
y = y_train,
method = "glmnet",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneGrid = ridge_grid)
# Predict on test data
ridge_pred_uncorrelated <- predict(ridge_model_uncorrelated, newdata = X_test_uncorrelated)
# Evaluate performance
ridge_results_uncorrelated <- postResample(pred = ridge_pred_uncorrelated, obs = y_test)
ridge_results_uncorrelated
## RMSE Rsquared MAE
## 0.1328217 0.4090734 0.1007041
Again, the best RMSE and R-squared come from the data with all the predictors.
Now let's try an SVM model, as none of the linear, lasso, or ridge models gave usable results: their R-squared values were all below .5, meaning they explained less than half of the variance in PH. We can do better.
# Train SVM with radial kernel
svm_model <- train(x = X_train,
y = y_train,
method = "svmRadial",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneLength = 10)
# Predict on test set
svm_pred <- predict(svm_model, newdata = X_test)
svm_results <- postResample(pred = svm_pred, obs = y_test)
svm_results
## RMSE Rsquared MAE
## 0.11501519 0.56039208 0.08034975
svm_model_pca <- train(x = X_train_pca,
y = y_train,
method = "svmRadial",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneLength = 10)
# Predict on test set
svm_pred_pca <- predict(svm_model_pca, newdata = X_test_pca)
svm_results_pca <- postResample(pred = svm_pred_pca, obs = y_test)
svm_results_pca
## RMSE Rsquared MAE
## 0.12290105 0.50023349 0.08729625
svm_model_uncorrelated <- train(x = X_train_uncorrelated,
y = y_train,
method = "svmRadial",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneLength = 10)
# Predict on test set
svm_pred_uncorrelated <- predict(svm_model_uncorrelated, newdata = X_test_uncorrelated)
svm_results_uncorrelated <- postResample(pred = svm_pred_uncorrelated, obs = y_test)
svm_results_uncorrelated
## RMSE Rsquared MAE
## 0.11740846 0.54272620 0.08258958
The best R-squared and RMSE so far come from the SVM trained on all the predictors, but an R-squared of .56 still leaves a lot of variance unexplained; we can do better.
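For reference, the kernel width and cost that cross-validation selected can be read off the fitted object:
# Radial-kernel hyperparameters (sigma, C) chosen by 5-fold CV
svm_model$bestTune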
Let's try PLS, because it does well when the predictors are correlated.
# Train PLS model
pls_model <- train(x = X_train,
y = y_train,
method = "pls",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneLength = 15)
# Predict on test set
pls_pred <- predict(pls_model, newdata = X_test)
# Evaluate performance
pls_results <- postResample(pred = pls_pred, obs = y_test)
pls_results
## RMSE Rsquared MAE
## 0.1297504 0.4342823 0.0978889
The PLS model does not outperform the SVM, so we will move on to more complex models.
Now let's try some of the more complex models: MARS, tree-based models like random forests, neural networks, and a Cubist model.
Let's try a MARS model first. We will train on the full set of predictors and then also on the pruned dataset with correlated predictors removed.
# Train MARS model
mars_model <- train(x = X_train,
y = y_train,
method = "earth",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneLength = 10)
# Predict on test set
mars_pred <- predict(mars_model, newdata = X_test)
# Evaluate performance
mars_results <- postResample(pred = mars_pred, obs = y_test)
mars_results
## RMSE Rsquared MAE
## 0.12511611 0.47327790 0.09362943
# Train MARS model
mars_model_uncorrelated <- train(x = X_train_uncorrelated ,
y = y_train,
method = "earth",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneLength = 10)
# Predict on test set
mars_pred_uncorrelated <- predict(mars_model_uncorrelated , newdata = X_test_uncorrelated )
# Evaluate performance
mars_results_uncorrelated <- postResample(pred = mars_pred_uncorrelated , obs = y_test)
mars_results_uncorrelated
## RMSE Rsquared MAE
## 0.13333186 0.40549489 0.09879608
The best MARS model is the one on the non-pruned dataset, with an R-squared of .47: not the best, but not the worst either.
Now let's try a random forest model. We will train it on the whole training set, since random forests handle correlated predictors well.
# Train Random Forest
rf_model <- train(x = X_train,
y = y_train,
method = "rf",
trControl = ctrl,
tuneLength = 5,
importance = TRUE,
ntree = 1000
)
# Predict and evaluate
rf_pred <- predict(rf_model, newdata = X_test)
rf_results <- postResample(pred = rf_pred, obs = y_test)
rf_results
## RMSE Rsquared MAE
## 0.10316270 0.64276756 0.06863096
We will analyze the variable importance later, but for now this random forest gives the best RMSE and an R-squared of .64.
Now let's try some boosted trees.
gbm_model <- train(x = X_train,
y = y_train,
method = "gbm",
trControl = ctrl,
verbose = FALSE,
tuneLength = 10)
gbm_pred <- predict(gbm_model, newdata = X_test)
gbm_results <- postResample(pred = gbm_pred, obs = y_test)
gbm_results
## RMSE Rsquared MAE
## 0.10842098 0.60600878 0.07788096
Our best boosted-tree model gives an R-squared of .60: not amazing, but the second-best model we have built so far.
Now let's try a bagged tree model.
bag_model <- train(x = X_train,
y = y_train,
method = "treebag",
trControl = ctrl)
bag_pred <- predict(bag_model, newdata = X_test)
bag_results <- postResample(pred = bag_pred, obs = y_test)
bag_results
## RMSE Rsquared MAE
## 0.11934530 0.52478580 0.08973709
The bagged tree model gives an R-squared of .52, which is not great.
Now let's try a Cubist model.
cubist_model <- train(x = X_train,
y = y_train,
method = "cubist",
trControl = ctrl,
tuneLength = 10) # tunes number of committees and neighbors
# Predict and evaluate
cubist_pred <- predict(cubist_model, newdata = X_test)
cubist_results <- postResample(pred = cubist_pred, obs = y_test)
cubist_results
## RMSE Rsquared MAE
## 0.1138548 0.5795800 0.0721388
The Cubist model gives an R-squared of .58: better than most of the models we have tried, but not the best.
Finally, let's try a neural network model.
# Train neural network (nnet = single hidden layer)
nn_model <- train(x = X_train,
y = y_train,
method = "nnet",
trControl = ctrl,
preProcess = c("center", "scale"),
tuneLength = 10,
trace = FALSE,
linout = TRUE) # for regression
# Predict and evaluate
nn_pred <- predict(nn_model, newdata = X_test)
nn_results <- postResample(pred = nn_pred, obs = y_test)
nn_results
## RMSE Rsquared MAE
## 0.12762710 0.47965944 0.09400249
The neural network gives an R-squared of about .48, again leaving more than half of the variance in PH unexplained.
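Before engineering new features, it helps to line up all the candidates so far in one table. A sketch using caret's resamples(); since each train() call above generated its own folds, this comparison is approximate unless a common index is passed to trainControl():
# Cross-validated RMSE / R-squared for every model trained so far
all_models <- resamples(list(LM = lm_model, Lasso = lasso_model,
                             Ridge = ridge_model, PLS = pls_model,
                             SVM = svm_model, MARS = mars_model,
                             RF = rf_model, GBM = gbm_model,
                             Bagged = bag_model, Cubist = cubist_model,
                             NNet = nn_model))
summary(all_models)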
I want to try adding and transforming some of the predictors to see if we can improve the R-squared.
So let's add some interaction terms, transform some predictors, and then retrain and test a few models to see if the R-squared improves at all.
I did some research, and I believe a flow-pressure interaction term, Mnf.Flow times Fill.Pressure, could help. I will also add an interaction term between oxygen and carbonation (Oxygen.Filler times Carb.Rel), since this relationship can affect PH, another between Bowl.Setpoint and Pressure.Vacuum, and a ratio of Carb.Temp to Carb.Pressure to capture that relationship.
Finally, I will transform some of the variables using logs and square roots. I applied log transformations to predictors that were right-skewed with long tails, such as flow rate and oxygen levels, to reduce the influence of large values and stabilize variance. Square root transformations were used for non-negative variables with moderate skew or wide variance, helping to linearize relationships and smooth out the disproportionate influence of larger values.
training_data_transformed <- train_data_imputed %>%
mutate(
Flow_Pressure = Mnf.Flow * Fill.Pressure,
Oxygen_Rel = Oxygen.Filler * Carb.Rel,
BowlVacuum = Bowl.Setpoint * Pressure.Vacuum,
Temp_to_Pressure = Carb.Temp / (Carb.Pressure + 1),
log_Flow = log(Mnf.Flow + abs(min(Mnf.Flow)) + 1),
log_OxygenFiller = log(Oxygen.Filler + 1),
log_MFR = log(MFR + 1),
log_UsageCont = log(Usage.cont + 1),
log_PSC_CO2 = log(PSC.CO2 + 1),
sqrt_PSC_Fill = sqrt(PSC.Fill),
sqrt_Vacuum = sqrt(abs(Pressure.Vacuum))
)
head(training_data_transformed)
## Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC
## 1 B 5.340000 23.96667 0.2633333 68.2 141.2 0.104
## 2 A 5.426667 24.00667 0.2386667 68.4 139.6 0.124
## 3 B 5.286667 24.06000 0.2633333 70.8 144.8 0.090
## 4 A 5.440000 24.00667 0.2933333 63.0 132.6 0.008
## 5 A 5.486667 24.31333 0.1113333 67.2 136.8 0.026
## 6 A 5.380000 23.92667 0.2693333 66.6 138.4 0.090
## PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure2
## 1 0.26 0.04 -100 118.8 46.0 37
## 2 0.22 0.04 -100 121.6 46.0 0
## 3 0.34 0.16 -100 120.2 46.0 0
## 4 0.42 0.04 -100 115.2 46.4 0
## 5 0.16 0.12 -100 118.4 45.8 0
## 6 0.24 0.04 -100 119.6 45.6 0
## Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed Temperature Usage.cont
## 1 20.8 118 121.2 4002 66.0 16.18
## 2 0.0 106 118.6 3986 67.6 19.90
## 3 0.0 82 120.0 4020 67.0 17.76
## 4 0.0 92 117.8 4012 65.6 17.42
## 5 0.0 92 118.6 4010 65.6 17.68
## 6 0.0 116 120.2 4014 66.2 23.82
## Carb.Flow Density MFR Balling Pressure.Vacuum PH Oxygen.Filler
## 1 2932 0.88 725.0 1.398 -4.0 8.36 0.022
## 2 3144 0.92 726.8 1.498 -4.0 8.26 0.026
## 3 2914 1.58 735.0 3.142 -3.8 8.94 0.024
## 4 3062 1.54 730.6 3.042 -4.4 8.24 0.030
## 5 3054 1.54 722.8 3.042 -4.4 8.26 0.030
## 6 2948 1.52 738.8 2.992 -4.4 8.32 0.024
## Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
## 1 120 46.4 142.6 6.58 5.32 1.48
## 2 120 46.8 143.0 6.56 5.30 1.56
## 3 120 46.6 142.0 7.66 5.84 3.28
## 4 120 46.0 142.0 7.14 5.42 3.04
## 5 120 46.0 142.0 7.14 5.44 3.04
## 6 120 46.0 142.2 7.16 5.44 3.02
## Flow_Pressure Oxygen_Rel BowlVacuum Temp_to_Pressure log_Flow
## 1 -4600 0.11704 -480 2.040462 0.1823216
## 2 -4600 0.13780 -480 2.011527 0.1823216
## 3 -4600 0.14016 -456 2.016713 0.1823216
## 4 -4640 0.16260 -528 2.071875 0.1823216
## 5 -4580 0.16320 -528 2.005865 0.1823216
## 6 -4560 0.13056 -528 2.047337 0.1823216
## log_OxygenFiller log_MFR log_UsageCont log_PSC_CO2 sqrt_PSC_Fill sqrt_Vacuum
## 1 0.02176149 6.587550 2.843746 0.03922071 0.5099020 2.000000
## 2 0.02566775 6.590026 3.039749 0.03922071 0.4690416 2.000000
## 3 0.02371653 6.601230 2.931727 0.14842001 0.5830952 1.949359
## 4 0.02955880 6.595234 2.913437 0.03922071 0.6480741 2.097618
## 5 0.02955880 6.584515 2.927453 0.11332869 0.4000000 2.097618
## 6 0.02371653 6.606380 3.211650 0.03922071 0.4898979 2.097618
split <- createDataPartition(training_data_transformed$PH, p = 0.8, list = FALSE)
train_split <- training_data_transformed[split, ]
test_split <- training_data_transformed[-split, ]
# Split into predictors and response
X_train_trans <- train_split %>% select(-PH)
y_train_trans <- train_split$PH
X_test_trans <- test_split %>% select(-PH)
y_test_trans <- test_split$PH
# Train Random Forest
rf_model_trans <- train(x = X_train_trans,
y = y_train_trans,
method = "rf",
trControl = ctrl,
tuneLength = 5,
importance = TRUE,
ntree = 1000
)
# Predict and evaluate
rf_pred_trans <- predict(rf_model_trans, newdata = X_test_trans)
rf_results_trans <- postResample(pred = rf_pred_trans, obs = y_test_trans)
rf_results_trans
## RMSE Rsquared MAE
## 0.09910888 0.67101049 0.07157536
The random forest with the interaction terms and transformed predictors gives the best R-squared and the lowest RMSE so far.
The GBM model was the second best on the non-transformed data, so let's try it on this data.
gbm_model_trans <- train(x = X_train_trans,
y = y_train_trans,
method = "gbm",
trControl = ctrl,
verbose = FALSE,
tuneLength = 10)
gbm_pred_trans <- predict(gbm_model_trans, newdata = X_test_trans)
gbm_results_trans <- postResample(pred = gbm_pred_trans, obs = y_test_trans)
gbm_results_trans
## RMSE Rsquared MAE
## 0.10696484 0.61397390 0.07822457
bag_model_trans <- train(x = X_train_trans,
y = y_train_trans,
method = "treebag",
trControl = ctrl)
bag_pred_trans <- predict(bag_model_trans, newdata = X_test_trans)
bag_results_trans <- postResample(pred = bag_pred_trans, obs = y_test_trans)
bag_results_trans
## RMSE Rsquared MAE
## 0.12378319 0.48074321 0.09682214
cubist_model_trans <- train(x = X_train_trans,
y = y_train_trans,
method = "cubist",
trControl = ctrl,
tuneLength = 10) # tunes number of committees and neighbors
# Predict and evaluate
cubist_pred_trans <- predict(cubist_model_trans, newdata = X_test_trans)
cubist_results_trans <- postResample(pred = cubist_pred_trans, obs = y_test_trans)
cubist_results_trans
## RMSE Rsquared MAE
## 0.10069510 0.65904861 0.07095879
# Train MARS model
mars_model_trans <- train(x = X_train_trans,
y = y_train_trans,
method = "earth",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneLength = 10)
# Predict on test set
mars_pred_trans <- predict(mars_model_trans, newdata = X_test_trans)
# Evaluate performance
mars_results_trans <- postResample(pred = mars_pred_trans, obs = y_test_trans)
mars_results_trans
## RMSE Rsquared MAE
## 0.12958290 0.43357315 0.09983316
# Train neural network (nnet = single hidden layer)
nn_model_trans <- train(x = X_train_trans,
y = y_train_trans,
method = "nnet",
trControl = ctrl,
preProcess = c("center", "scale"),
tuneLength = 10,
trace = FALSE,
linout = TRUE)
# Predict and evaluate
nn_pred_trans <- predict(nn_model_trans, newdata = X_test_trans)
nn_results_trans <- postResample(pred = nn_pred_trans, obs = y_test_trans)
nn_results_trans
## RMSE Rsquared MAE
## 0.12344111 0.49547764 0.08996243
The random forest is still the best model, so I will use it on the transformed data. Let's look at the variable importance to see how the variables relate to the PH prediction.
varImp(rf_model_trans)
## rf variable importance
##
## only 20 most important variables shown (out of 42)
##
## Overall
## Brand.Code 100.00
## BowlVacuum 61.49
## Temperature 58.36
## Hyd.Pressure3 57.38
## Alch.Rel 49.73
## Carb.Rel 49.64
## Balling.Lvl 49.12
## Pressure.Vacuum 48.46
## sqrt_Vacuum 48.31
## Flow_Pressure 46.75
## Carb.Flow 45.16
## Filler.Speed 44.16
## Balling 39.85
## log_UsageCont 38.53
## Density 38.11
## Bowl.Setpoint 37.03
## Carb.Pressure1 36.99
## Oxygen_Rel 36.93
## Usage.cont 35.05
## Fill.Pressure 33.49
We can see that Brand.Code is the most important variable to the model, with BowlVacuum, Temperature, and Hyd.Pressure3 close to 60 in importance, and Alch.Rel, Carb.Rel, Balling.Lvl, Pressure.Vacuum, and sqrt_Vacuum clustered near 50. These are the variables the model relies on most to predict PH. BowlVacuum is an interaction term I added, Bowl.Setpoint * Pressure.Vacuum, so those two underlying variables should also be noted as important to PH.
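To see the direction of these effects rather than just their magnitude, a partial dependence plot is a quick follow-up. A sketch assuming the pdp package, which works directly on caret train objects:
# How predicted PH moves as one top predictor varies, other inputs held fixed
library(pdp)
partial(rf_model_trans, pred.var = "Temperature", plot = TRUE)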
Now let's produce a bar chart of the variable importances for the non-technical report.
plot(varImp(rf_model_trans), top = 20, main = "Top 20 Predictors of pH (Random Forest)")
Now let's generate the PH predictions on the test set. First we have to add in the transformed predictors and interaction terms.
test_data_transformed <- test_data_imputed %>%
mutate(
Flow_Pressure = Mnf.Flow * Fill.Pressure,
Oxygen_Rel = Oxygen.Filler * Carb.Rel,
BowlVacuum = Bowl.Setpoint * Pressure.Vacuum,
Temp_to_Pressure = Carb.Temp / (Carb.Pressure + 1),
log_Flow = log(Mnf.Flow + abs(min(Mnf.Flow)) + 1),
log_OxygenFiller = log(Oxygen.Filler + 1),
log_MFR = log(MFR + 1),
log_UsageCont = log(Usage.cont + 1),
log_PSC_CO2 = log(PSC.CO2 + 1),
sqrt_PSC_Fill = sqrt(PSC.Fill),
sqrt_Vacuum = sqrt(abs(Pressure.Vacuum))
)
head(test_data_transformed)
## Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC
## 1 D 5.480000 24.03333 0.2700000 65.4 134.6 0.236
## 2 A 5.393333 23.95333 0.2266667 63.2 135.0 0.042
## 3 B 5.293333 23.92000 0.3033333 66.4 140.4 0.068
## 4 B 5.266667 23.94000 0.1860000 64.8 139.0 0.004
## 5 B 5.406667 24.20000 0.1600000 69.4 142.2 0.040
## 6 B 5.286667 24.10667 0.2120000 73.4 147.2 0.078
## PSC.Fill PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure1
## 1 0.40 0.04 -100 116.6 46.0 0
## 2 0.22 0.08 -100 118.8 46.2 0
## 3 0.10 0.02 -100 120.2 45.8 0
## 4 0.20 0.02 -100 124.8 40.0 0
## 5 0.30 0.06 -100 115.0 51.4 0
## 6 0.22 0.08 -100 118.6 46.4 0
## Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed
## 1 0 0 96 129.4 3986
## 2 0 0 112 120.0 4012
## 3 0 0 98 119.4 4010
## 4 0 0 132 120.2 4008
## 5 0 0 94 116.0 4018
## 6 0 0 94 120.4 4010
## Temperature Usage.cont Carb.Flow Density MFR Balling Pressure.Vacuum PH
## 1 66.0 21.66 2950 0.88 727.6 1.398 -3.8 NA
## 2 65.6 17.60 2916 1.50 735.8 2.942 -4.4 NA
## 3 65.6 24.18 3056 0.90 734.8 1.448 -4.2 NA
## 4 66.2 18.12 28 0.74 718.8 1.056 -4.0 NA
## 5 66.4 21.32 3214 0.88 752.0 1.398 -4.0 NA
## 6 66.6 18.00 3064 0.84 732.0 1.298 -3.8 NA
## Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 1 0.0220 130 45.2 142.6 6.56 5.34
## 2 0.0300 120 46.0 142.0 7.14 5.58
## 3 0.0460 120 46.0 142.4 6.52 5.34
## 4 0.0158 120 46.0 142.4 6.48 5.50
## 5 0.0820 120 50.0 142.2 6.50 5.38
## 6 0.0640 120 46.0 143.0 6.50 5.42
## Balling.Lvl Flow_Pressure Oxygen_Rel BowlVacuum Temp_to_Pressure log_Flow
## 1 1.48 -4600 0.11748 -494 2.027108 0.1823216
## 2 3.04 -4620 0.16740 -528 2.102804 0.1823216
## 3 1.46 -4580 0.24564 -504 2.083086 0.1823216
## 4 1.48 -4000 0.08690 -480 2.112462 0.1823216
## 5 1.46 -5140 0.44116 -480 2.019886 0.1823216
## 6 1.44 -4640 0.34688 -456 1.978495 0.1823216
## log_OxygenFiller log_MFR log_UsageCont log_PSC_CO2 sqrt_PSC_Fill sqrt_Vacuum
## 1 0.02176149 6.591125 3.120601 0.03922071 0.6324555 1.949359
## 2 0.02955880 6.602316 2.923162 0.07696104 0.4690416 2.097618
## 3 0.04497337 6.600958 3.226050 0.01980263 0.3162278 2.049390
## 4 0.01567648 6.578973 2.950735 0.01980263 0.4472136 2.000000
## 5 0.07881118 6.624065 3.105483 0.05826891 0.5477226 2.000000
## 6 0.06203539 6.597146 2.944439 0.07696104 0.4690416 1.949359
predicted_ph <- predict(rf_model_trans, newdata =test_data_transformed)
predictions_df <- data.frame(ID = 1:length(predicted_ph), Predicted_PH = predicted_ph)
write_xlsx(predictions_df, "pH_predictions.xlsx")
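As a final sanity check, the exported predictions should fall within the PH range seen in training (roughly 7.88 to 9.36 per the summary statistics above):
# Distribution of the exported PH predictions
summary(predicted_ph)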