library(readr)
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
library(magrittr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## The following object is masked from 'package:pls':
## 
##     R2
library(rpart)

Response Variable.: pH

imputed_data <- read_csv("imputed_test_data.csv")
## Rows: 2447 Columns: 36
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (36): Carb.Volume, Fill.Ounces, PC.Volume, Carb.Pressure, Carb.Temp, PSC...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(imputed_data)
## # A tibble: 6 × 36
##   Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp   PSC PSC.Fill
##         <dbl>       <dbl>     <dbl>         <dbl>     <dbl> <dbl>    <dbl>
## 1        5.34        24.0     0.263          68.2      141. 0.104     0.26
## 2        5.43        24.0     0.239          68.4      140. 0.124     0.22
## 3        5.29        24.1     0.263          70.8      145. 0.09      0.34
## 4        5.44        24.0     0.293          63        133. 0.129     0.42
## 5        5.49        24.3     0.111          67.2      137. 0.026     0.16
## 6        5.38        23.9     0.269          66.6      138. 0.09      0.24
## # ℹ 29 more variables: PSC.CO2 <dbl>, Mnf.Flow <dbl>, Carb.Pressure1 <dbl>,
## #   Fill.Pressure <dbl>, Hyd.Pressure1 <dbl>, Hyd.Pressure2 <dbl>,
## #   Hyd.Pressure3 <dbl>, Hyd.Pressure4 <dbl>, Filler.Level <dbl>,
## #   Filler.Speed <dbl>, Temperature <dbl>, Usage.cont <dbl>, Carb.Flow <dbl>,
## #   Density <dbl>, MFR <dbl>, Balling <dbl>, Pressure.Vacuum <dbl>, PH <dbl>,
## #   Oxygen.Filler <dbl>, Bowl.Setpoint <dbl>, Pressure.Setpoint <dbl>,
## #   Air.Pressurer <dbl>, Alch.Rel <dbl>, Carb.Rel <dbl>, Balling.Lvl <dbl>, …
# Split

train_index <- createDataPartition(imputed_data$PH, p = 0.75, list = FALSE)
train_data <- imputed_data[train_index, ]
test_data <- imputed_data[-train_index, ]
# pls can handle multicollinearity but just checking
correlation <- cor(train_data$PH, train_data[, -which(names(train_data) == "PH")])
print(correlation)
##      Carb.Volume Fill.Ounces PC.Volume Carb.Pressure  Carb.Temp         PSC
## [1,]  0.07218182 -0.09250504 0.0264231    0.05558605 0.01134131 -0.08631458
##        PSC.Fill     PSC.CO2   Mnf.Flow Carb.Pressure1 Fill.Pressure
## [1,] -0.0242662 -0.09720644 -0.4418839    -0.06036832    -0.2197768
##      Hyd.Pressure1 Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level
## [1,]   -0.08367108    -0.1987897    -0.2379096    -0.1315832    0.3180917
##      Filler.Speed Temperature Usage.cont Carb.Flow    Density        MFR
## [1,]  -0.06206353  -0.1488801 -0.3119293 0.1366277 0.08483373 -0.0723247
##        Balling Pressure.Vacuum Oxygen.Filler Bowl.Setpoint Pressure.Setpoint
## [1,] 0.0657022        0.219004     0.1726586     0.3389863        -0.3155336
##      Air.Pressurer  Alch.Rel  Carb.Rel Balling.Lvl Brand.CodeA Brand.CodeB
## [1,]   -0.01561297 0.1511893 0.1583297   0.1024894 -0.08977359   0.1001531
##      Brand.CodeC Brand.CodeD
## [1,]  -0.2940141   0.1818833
trainX <- train_data[, -which(names(train_data) == "PH")]  
trainY <- train_data$PH  

trainX <- as.data.frame(trainX)
trainY <- as.numeric(trainY)
plsTune_model <- train(
  x = trainX, 
  y = trainY,
  method = "pls", 
  tuneLength = 10, 
  trControl = trainControl(method = "cv"),  
)
# removed - preProc = c("center", "scale"); was already applied in imputation code
print(plsTune_model)
## Partial Least Squares 
## 
## 1837 samples
##   35 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1654, 1654, 1654, 1654, 1653, 1654, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE       Rsquared    MAE      
##    1     0.1698915  0.03889131  0.1360835
##    2     0.1682757  0.05418183  0.1351559
##    3     0.1549418  0.19924680  0.1217946
##    4     0.1512552  0.23834257  0.1176840
##    5     0.1495613  0.25227642  0.1165574
##    6     0.1481872  0.26633955  0.1156638
##    7     0.1461027  0.28651904  0.1137971
##    8     0.1455007  0.29293936  0.1130008
##    9     0.1444273  0.30358775  0.1123713
##   10     0.1418244  0.32805595  0.1101039
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 10.
#suppress or address warning?
optimal_components <- 7
pls_final_model <- plsr(PH ~ ., data = train_data, ncomp = optimal_components)
predictions <- predict(pls_final_model, newdata = test_data, ncomp = optimal_components)
rmse <- sqrt(mean((test_data$PH - predictions)^2))
r_squared <- cor(test_data$PH, predictions)^2
mae <- mean(abs(test_data$PH - predictions))

cat("Test RMSE:", rmse, "\n")
## Test RMSE: 0.1446069
cat("Test R²:", r_squared, "\n")
## Test R²: 0.2718792
cat("Test MAE:", mae, "\n")
## Test MAE: 0.1145993

CART: Regression: response variable is numerical and continuous.

cart_model <- train(
  PH ~ ., 
  data = train_data,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 3, verboseIter = TRUE), 
  tuneLength = 10
)
## + Fold1: cp=0.01515 
## - Fold1: cp=0.01515 
## + Fold2: cp=0.01515 
## - Fold2: cp=0.01515 
## + Fold3: cp=0.01515 
## - Fold3: cp=0.01515
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0159 on full training set
print(cart_model)
## CART 
## 
## 1837 samples
##   35 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 1226, 1224, 1224 
## Resampling results across tuning parameters:
## 
##   cp          RMSE       Rsquared   MAE      
##   0.01515350  0.1352945  0.3921165  0.1041521
##   0.01587580  0.1352070  0.3921712  0.1042564
##   0.01644801  0.1359497  0.3853779  0.1053982
##   0.01727906  0.1366106  0.3790389  0.1055609
##   0.01930733  0.1373128  0.3726317  0.1058595
##   0.02246855  0.1405136  0.3424462  0.1090925
##   0.03362470  0.1424540  0.3234390  0.1117618
##   0.04199066  0.1466342  0.2843337  0.1149674
##   0.06305480  0.1523962  0.2253521  0.1191724
##   0.21056463  0.1613791  0.1875065  0.1271038
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.0158758.
print(cart_model$bestTune)  
##          cp
## 2 0.0158758
best_cp <- cart_model$bestTune$cp

cart_final_model <- rpart(PH ~ ., data = train_data, method = "anova", control = rpart.control(cp = best_cp))

cart_predictions <- predict(cart_final_model, newdata = test_data)

cart_rmse <- sqrt(mean((test_data$PH - cart_predictions)^2))
cart_r_squared <- cor(test_data$PH, cart_predictions)^2
cart_mae <- mean(abs(test_data$PH - cart_predictions))

cat("CART Test RMSE:", cart_rmse, "\n")
## CART Test RMSE: 0.1323511
cat("CART Test R²:", cart_r_squared, "\n")
## CART Test R²: 0.3903304
cat("CART Test MAE:", cart_mae, "\n")
## CART Test MAE: 0.1055604

Question: We wait to predict on the best model?