Libraries Used:

library(tidyverse)
library(fpp3)
library(randomForest)
library(ggplot2)
library(caret)
library(AppliedPredictiveModeling)
library(e1071)
library(caTools)
library(gbm)
library(mlbench)
library(ipred)
library(class)
library(kernlab)
library(partykit)
library(rpart)
library(rpart.plot)
library(readxl)
library(corrplot)

Load Data:

train <- read.csv('https://raw.githubusercontent.com/Jlok17/2022MSDS/main/Source/Data%20624/StudentData%20(1).xlsx%20-%20Subset.csv')
test <- read.csv('https://raw.githubusercontent.com/Jlok17/2022MSDS/main/Source/Data%20624/StudentEvaluation.xlsx%20-%20Subset%20(2).csv')

EDA:

Missing values in the training data:
# Formatting Names to be more readable
colnames(train) <- sub("\\.", "_", colnames(train))
colnames(test) <- sub("\\.", "_", colnames(train))


print(colSums(is.na(train)))
##        Brand_Code       Carb_Volume       Fill_Ounces         PC_Volume 
##                 0                10                38                39 
##     Carb_Pressure         Carb_Temp               PSC          PSC_Fill 
##                27                26                33                23 
##           PSC_CO2          Mnf_Flow    Carb_Pressure1     Fill_Pressure 
##                39                 2                32                22 
##     Hyd_Pressure1     Hyd_Pressure2     Hyd_Pressure3     Hyd_Pressure4 
##                11                15                15                30 
##      Filler_Level      Filler_Speed       Temperature        Usage_cont 
##                20                57                14                 5 
##         Carb_Flow           Density               MFR           Balling 
##                 2                 1               212                 1 
##   Pressure_Vacuum                PH     Oxygen_Filler     Bowl_Setpoint 
##                 0                 4                12                 2 
## Pressure_Setpoint     Air_Pressurer          Alch_Rel          Carb_Rel 
##                12                 0                 9                10 
##       Balling_Lvl 
##                 1
paste("There are:", sum(is.na(train)), "total missing values")
## [1] "There are: 724 total missing values"
Missing values in test data:
print(colSums(is.na(test)))
##        Brand_Code       Carb_Volume       Fill_Ounces         PC_Volume 
##                 0                 1                 6                 4 
##     Carb_Pressure         Carb_Temp               PSC          PSC_Fill 
##                 0                 1                 5                 3 
##           PSC_CO2          Mnf_Flow    Carb_Pressure1     Fill_Pressure 
##                 5                 0                 4                 2 
##     Hyd_Pressure1     Hyd_Pressure2     Hyd_Pressure3     Hyd_Pressure4 
##                 0                 1                 1                 4 
##      Filler_Level      Filler_Speed       Temperature        Usage_cont 
##                 2                10                 2                 2 
##         Carb_Flow           Density               MFR           Balling 
##                 0                 1                31                 1 
##   Pressure_Vacuum                PH     Oxygen_Filler     Bowl_Setpoint 
##                 1               267                 3                 1 
## Pressure_Setpoint     Air_Pressurer          Alch_Rel          Carb_Rel 
##                 2                 1                 3                 2 
##       Balling_Lvl 
##                 0
paste("There are:", sum(is.na(test)), "total missing values")
## [1] "There are: 366 total missing values"

Summary Statistics:

longer_train <- train %>%
                pivot_longer(!`Brand_Code`,names_to = "col_names", values_to = "value")

summary_df <- as.data.frame(longer_train) %>% 
  group_by(col_names) %>%
    summarise(min = min(value, na.rm = TRUE),
                                max = max(value, na.rm = TRUE),
                                mean = mean(value, na.rm = TRUE),
                                median = median(value, na.rm = TRUE),
                                std = sd(value, na.rm = TRUE),
                                var = var(value, na.rm = TRUE))
  

print(summary_df,n = 32)
## # A tibble: 32 × 7
##    col_names               min      max      mean    median       std        var
##    <chr>                 <dbl>    <dbl>     <dbl>     <dbl>     <dbl>      <dbl>
##  1 Air_Pressurer      141.      148.     143.      143.        1.21      1.47e+0
##  2 Alch_Rel             5.28      8.62     6.90      6.56      0.505     2.55e-1
##  3 Balling             -0.17      4.01     2.20      1.65      0.931     8.67e-1
##  4 Balling_Lvl          0         3.66     2.05      1.48      0.870     7.57e-1
##  5 Bowl_Setpoint       70       140      109.      120        15.3       2.34e+2
##  6 Carb_Flow           26      5104     2468.     3028      1074.        1.15e+6
##  7 Carb_Pressure       57        79.4     68.2      68.2       3.54      1.25e+1
##  8 Carb_Pressure1     106.      140.     123.      123.        4.74      2.25e+1
##  9 Carb_Rel             4.96      6.06     5.44      5.4       0.129     1.66e-2
## 10 Carb_Temp          129.      154      141.      141.        4.04      1.63e+1
## 11 Carb_Volume          5.04      5.7      5.37      5.35      0.106     1.13e-2
## 12 Density              0.24      1.92     1.17      0.98      0.378     1.43e-1
## 13 Fill_Ounces         23.6      24.3     24.0      24.0       0.0875    7.66e-3
## 14 Fill_Pressure       34.6      60.4     47.9      46.4       3.18      1.01e+1
## 15 Filler_Level        55.8     161.     109.      118.       15.7       2.46e+2
## 16 Filler_Speed       998      4030     3687.     3982       771.        5.94e+5
## 17 Hyd_Pressure1       -0.8      58       12.4      11.4      12.4       1.55e+2
## 18 Hyd_Pressure2        0        59.4     21.0      28.6      16.4       2.69e+2
## 19 Hyd_Pressure3       -1.2      50       20.5      27.6      16.0       2.55e+2
## 20 Hyd_Pressure4       52       142       96.3      96        13.1       1.72e+2
## 21 MFR                 31.4     869.     704.      724        73.9       5.46e+3
## 22 Mnf_Flow          -100.      229.      24.6      65.2     119.        1.43e+4
## 23 Oxygen_Filler        0.0024    0.4      0.0468    0.0334    0.0466    2.18e-3
## 24 PC_Volume            0.0793    0.478    0.277     0.271     0.0607    3.68e-3
## 25 PH                   7.88      9.36     8.55      8.54      0.173     2.98e-2
## 26 PSC                  0.002     0.27     0.0846    0.076     0.0493    2.43e-3
## 27 PSC_CO2              0         0.24     0.0564    0.04      0.0430    1.85e-3
## 28 PSC_Fill             0         0.62     0.195     0.18      0.118     1.39e-2
## 29 Pressure_Setpoint   44        52       47.6      46         2.04      4.16e+0
## 30 Pressure_Vacuum     -6.6      -3.6     -5.22     -5.4       0.570     3.25e-1
## 31 Temperature         63.6      76.2     66.0      65.6       1.38      1.91e+0
## 32 Usage_cont          12.1      25.9     21.0      21.8       2.98      8.87e+0

Identify if there are near zero variance predictors:

nearZeroVar(train)
## [1] 13

Preprocess Data:

First, we can make dummy variables from the brand codes so that the model can interpret it easier:

train$code_a <- ifelse(train$Brand_Code == "A", 1,0)
train$code_b <- ifelse(train$Brand_Code == "B", 1,0)
train$code_c <- ifelse(train$Brand_Code == "C", 1,0)
train$code_d <- ifelse(train$Brand_Code == "D", 1,0)
train$code_a <-replace(train$code_a, is.na(train$code_a), 0)
train$code_b <-replace(train$code_b, is.na(train$code_b), 0)
train$code_c <-replace(train$code_c, is.na(train$code_c), 0)
train$code_d <-replace(train$code_d, is.na(train$code_d), 0)

train <- subset(train, select =-`Brand_Code`)
train <- train[complete.cases(train$PH),]

Now values can be imputed:

# Training Data
X_train <- subset(train,select = -`PH`)
X_train_df <- as.data.frame(X_train)

y_train <- train$PH


## Bag Imputation
preProc <- preProcess(X_train_df, method = c("bagImpute"))
X_train_imputed <- predict(preProc, X_train_df)

## KNN Imputation
preProc1 <- preProcess(X_train_df, method = c("knnImpute"))
X_train_imputed_2 <- predict(preProc1, X_train_df)

We can also apply PCA:

pcaObject <- prcomp(X_train_imputed, center =TRUE, scale. = TRUE)

train_pca <- as.data.frame(pcaObject$x)
train_pca$y <- y_train
plot(pcaObject, type= "lines")

Correlation between the predictors can be identified:

windows.options(width = 20, height = 20)
cor_matrix <- cor(X_train_imputed)
corrplot(cor_matrix,title = "Correlation Plot", method = "square", outline = T, addgrid.col = "darkgray", order="hclust", mar = c(4,0,4,0), addrect = 4, rect.col = "black", rect.lwd = 5,cl.pos = "b", tl.col = "indianred4", tl.cex = 1, cl.cex = 1, type = "lower")

Split Data into Train and dev test set:

set.seed(4614)

## split imputed data into train/test


X_train_imputed$y <-  y_train
X_train_imputed_2$y <- y_train

sample <- sample.split(X_train_imputed, SplitRatio = 0.8)
train_final <- subset(X_train_imputed, sample == TRUE)
dev_test <- subset(X_train_imputed, sample == FALSE)

sample_2 <- sample.split(X_train_imputed_2, SplitRatio = 0.8)
train_final_2 <- subset(X_train_imputed_2, sample == TRUE)
dev_test_2 <- subset(X_train_imputed_2, sample == FALSE)



## Split pca data into train/test
sample <- sample.split(train_pca, SplitRatio =0.8)
train_pca_final <- subset(train_pca, sample == TRUE)
dev_test_pca <- subset(train_pca, sample == FALSE)

Modeling:

We can start with just our default models to determine which one is the best one to tune and use for the prediction.

## KNN Imputation

## NN
# nnetgrid <- expand.grid(.decay = c(0,0.01, .1),
#                         .size =c(1:10),
#                         .bag = FALSE)
set.seed(200)
# nnettune <- train(subset(train_final_2, select =-y), train_final_2$y,
#                   method ="avNNet",
#                   tuneGrid = nnetgrid, 
#                   trControl = trainControl(method = "cv"),
#                   linout =TRUE,
#                   trace = FALSE,
#                   MaxNWts = 10 * (ncol(subset(train_final_2,select = -y)) + 1) + 10 + 1, maxit = 500)
# 
# nnPred <- predict(nnettune, newdata = subset(dev_test_2,select = -y))
# nn_acc <- postResample(pred = nnPred, obs = dev_test_2$y)

# KNN
knnModel <- train(x = subset(train_final_2, select = -y),
                  y = train_final_2$y ,
                  method = "knn",
                  tuneLength = 10)

knnModel
## k-Nearest Neighbors 
## 
## 1995 samples
##   35 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1995, 1995, 1995, 1995, 1995, 1995, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE       
##    5  0.1365406  0.3966159  0.10031994
##    7  0.1343486  0.4065833  0.09981788
##    9  0.1330580  0.4124451  0.09942077
##   11  0.1319647  0.4192450  0.09905220
##   13  0.1317771  0.4194974  0.09935688
##   15  0.1316028  0.4206059  0.09968979
##   17  0.1317075  0.4191918  0.10008653
##   19  0.1321407  0.4150569  0.10069760
##   21  0.1321943  0.4144326  0.10092571
##   23  0.1322961  0.4134216  0.10124625
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred <- predict(knnModel, newdata= subset(dev_test_2, select = -y))

knn_acc <- postResample(pred = knnPred, obs = dev_test_2$y)
knn_acc
##       RMSE   Rsquared        MAE 
## 0.11762673 0.56819775 0.09190545
## GBM
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                       n.trees = seq(100, 1000, by = 50),
                       shrinkage = c(0.01, 0.1),
                       n.minobsinnode = 10)

set.seed(12430)

gbmTune <- train(y ~ ., data = train_final_2,
                 method = "gbm", 
                 tuneGrid = gbmGrid,
                 verbose = FALSE)
gbm_predict <- predict(gbmTune, subset(dev_test_2, select = -y))
gbm <- postResample(gbm_predict, dev_test_2$y)
gbm
##       RMSE   Rsquared        MAE 
## 0.10013493 0.67665417 0.07510196
## Random Forest
set.seed(5565)
rforest <- randomForest(subset(train_final_2, select = -`y`), train_final_2$y)

rforest
## 
## Call:
##  randomForest(x = subset(train_final_2, select = -y), y = train_final_2$y) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 11
## 
##           Mean of squared residuals: 0.01003134
##                     % Var explained: 65.86
rfpredict <- predict(rforest, subset(dev_test_2, select = -`y`))
random_forest <- postResample(rfpredict, dev_test_2$y)
random_forest
##       RMSE   Rsquared        MAE 
## 0.09313631 0.74646728 0.06935459
## SVM
set.seed(5565)
svm_model <- svm(y ~ ., data = train_final_2)
summary(svm_model)
## 
## Call:
## svm(formula = y ~ ., data = train_final_2)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.02857143 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  1692
predictions <- predict(svm_model, subset(dev_test_2, select = -`y`))
postResample(predictions, dev_test_2$y)
##       RMSE   Rsquared        MAE 
## 0.11279850 0.59679936 0.08387256
#### Bag Imputation
### NN
# nnetgrid <- expand.grid(.decay = c(0,0.01, .1),
#                         .size =c(1:10),
#                         .bag = FALSE)
set.seed(200)
# nnettune <- train(subset(train_final, select =-y), train_final$y,
#                   method ="avNNet",
#                   tuneGrid = nnetgrid, 
#                   trControl = trainControl(method = "cv"),
#                   linout =TRUE,
#                   trace = FALSE,
#                   MaxNWts = 10 * (ncol(subset(train_final,select = -y)) + 1) + 10 + 1, maxit = 500)
# 
# nnPred <- predict(nnettune, newdata = subset(dev_test,select = -y))
# nn_acc <- postResample(pred = nnPred, obs = dev_test$y)

#KNN
knnModel <- train(x = subset(train_final, select = -y),
                  y = train_final$y ,
                  method = "knn",
                  tuneLength = 10)

knnModel
## k-Nearest Neighbors 
## 
## 1995 samples
##   35 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1995, 1995, 1995, 1995, 1995, 1995, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE      
##    5  0.1495128  0.2985551  0.1084448
##    7  0.1472102  0.3007709  0.1086207
##    9  0.1463087  0.2994945  0.1091538
##   11  0.1458936  0.2971724  0.1096763
##   13  0.1455404  0.2956602  0.1101091
##   15  0.1455021  0.2925118  0.1105685
##   17  0.1454998  0.2898893  0.1109625
##   19  0.1457045  0.2862148  0.1113735
##   21  0.1461408  0.2806574  0.1119742
##   23  0.1464372  0.2764863  0.1124619
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata= subset(dev_test, select = -y))
knn_acc <- postResample(pred = knnPred, obs = dev_test$y)
knn_acc
##      RMSE  Rsquared       MAE 
## 0.1375904 0.3907158 0.1068326
## GBM

gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                       n.trees = seq(100, 1000, by = 50),
                       shrinkage = c(0.01, 0.1),
                       n.minobsinnode = 10)

set.seed(12430)

gbmTune <- train(y ~ ., data = train_final,
                 method = "gbm", 
                tuneGrid = gbmGrid,
                 verbose = FALSE)

gbmTune
## Stochastic Gradient Boosting 
## 
## 1995 samples
##   35 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1995, 1995, 1995, 1995, 1995, 1995, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  RMSE       Rsquared   MAE       
##   0.01       1                   100     0.1536103  0.2870859  0.12127110
##   0.01       1                   150     0.1492559  0.3149823  0.11743435
##   0.01       1                   200     0.1461038  0.3328192  0.11467516
##   0.01       1                   250     0.1437010  0.3463119  0.11266653
##   0.01       1                   300     0.1419221  0.3556510  0.11123774
##   0.01       1                   350     0.1405578  0.3629803  0.11015515
##   0.01       1                   400     0.1394849  0.3687671  0.10930643
##   0.01       1                   450     0.1386130  0.3736609  0.10861602
##   0.01       1                   500     0.1378439  0.3784042  0.10800447
##   0.01       1                   550     0.1372236  0.3822076  0.10752297
##   0.01       1                   600     0.1366566  0.3860043  0.10707406
##   0.01       1                   650     0.1361840  0.3890626  0.10669230
##   0.01       1                   700     0.1357313  0.3921841  0.10633872
##   0.01       1                   750     0.1353696  0.3943444  0.10604537
##   0.01       1                   800     0.1350108  0.3969491  0.10574452
##   0.01       1                   850     0.1346818  0.3991869  0.10546437
##   0.01       1                   900     0.1344117  0.4008470  0.10523770
##   0.01       1                   950     0.1341652  0.4024450  0.10502064
##   0.01       1                  1000     0.1339238  0.4040544  0.10481467
##   0.01       3                   100     0.1441418  0.3899512  0.11351760
##   0.01       3                   150     0.1382723  0.4123878  0.10846463
##   0.01       3                   200     0.1346676  0.4282070  0.10537279
##   0.01       3                   250     0.1323341  0.4398217  0.10336265
##   0.01       3                   300     0.1307183  0.4483016  0.10193011
##   0.01       3                   350     0.1294949  0.4553967  0.10084441
##   0.01       3                   400     0.1285158  0.4611682  0.09993271
##   0.01       3                   450     0.1276689  0.4662837  0.09914058
##   0.01       3                   500     0.1269823  0.4703995  0.09846318
##   0.01       3                   550     0.1263194  0.4746848  0.09781105
##   0.01       3                   600     0.1257485  0.4782525  0.09725429
##   0.01       3                   650     0.1252661  0.4812599  0.09677521
##   0.01       3                   700     0.1248324  0.4839584  0.09632768
##   0.01       3                   750     0.1244073  0.4866984  0.09588579
##   0.01       3                   800     0.1240399  0.4890355  0.09549763
##   0.01       3                   850     0.1237184  0.4910884  0.09513798
##   0.01       3                   900     0.1234230  0.4929655  0.09481732
##   0.01       3                   950     0.1231419  0.4948093  0.09451924
##   0.01       3                  1000     0.1228553  0.4966881  0.09420607
##   0.01       5                   100     0.1400518  0.4343543  0.11001775
##   0.01       5                   150     0.1337314  0.4551864  0.10453484
##   0.01       5                   200     0.1298680  0.4702690  0.10115817
##   0.01       5                   250     0.1274144  0.4812154  0.09895391
##   0.01       5                   300     0.1257513  0.4892355  0.09736453
##   0.01       5                   350     0.1244898  0.4959684  0.09615441
##   0.01       5                   400     0.1234749  0.5014972  0.09517368
##   0.01       5                   450     0.1226247  0.5063875  0.09432368
##   0.01       5                   500     0.1219232  0.5104684  0.09360354
##   0.01       5                   550     0.1212960  0.5142927  0.09298549
##   0.01       5                   600     0.1207740  0.5174028  0.09243484
##   0.01       5                   650     0.1202481  0.5208046  0.09190578
##   0.01       5                   700     0.1197832  0.5236796  0.09141677
##   0.01       5                   750     0.1193422  0.5265255  0.09096310
##   0.01       5                   800     0.1189801  0.5288033  0.09057983
##   0.01       5                   850     0.1186551  0.5308685  0.09023062
##   0.01       5                   900     0.1183435  0.5329087  0.08989429
##   0.01       5                   950     0.1180248  0.5350531  0.08956733
##   0.01       5                  1000     0.1177496  0.5368746  0.08928001
##   0.01       7                   100     0.1373942  0.4635271  0.10777090
##   0.01       7                   150     0.1306976  0.4833794  0.10185562
##   0.01       7                   200     0.1266678  0.4974445  0.09824387
##   0.01       7                   250     0.1240596  0.5085269  0.09583147
##   0.01       7                   300     0.1223480  0.5164939  0.09417842
##   0.01       7                   350     0.1211010  0.5225879  0.09293323
##   0.01       7                   400     0.1200759  0.5279613  0.09188101
##   0.01       7                   450     0.1192485  0.5324767  0.09104246
##   0.01       7                   500     0.1185343  0.5366174  0.09033234
##   0.01       7                   550     0.1179097  0.5403660  0.08970426
##   0.01       7                   600     0.1173634  0.5435683  0.08916338
##   0.01       7                   650     0.1168803  0.5465046  0.08867193
##   0.01       7                   700     0.1164611  0.5491055  0.08823954
##   0.01       7                   750     0.1160385  0.5518002  0.08780289
##   0.01       7                   800     0.1156816  0.5540876  0.08744364
##   0.01       7                   850     0.1153600  0.5561144  0.08710560
##   0.01       7                   900     0.1150618  0.5580905  0.08681053
##   0.01       7                   950     0.1147670  0.5600212  0.08650827
##   0.01       7                  1000     0.1144821  0.5619199  0.08622592
##   0.10       1                   100     0.1340565  0.4017954  0.10481060
##   0.10       1                   150     0.1326089  0.4108366  0.10333608
##   0.10       1                   200     0.1317614  0.4161629  0.10227676
##   0.10       1                   250     0.1313068  0.4190583  0.10156286
##   0.10       1                   300     0.1309932  0.4211211  0.10104602
##   0.10       1                   350     0.1308115  0.4223615  0.10064869
##   0.10       1                   400     0.1307645  0.4227533  0.10047514
##   0.10       1                   450     0.1307586  0.4228361  0.10025153
##   0.10       1                   500     0.1307810  0.4228650  0.10007924
##   0.10       1                   550     0.1306732  0.4239574  0.09995068
##   0.10       1                   600     0.1306347  0.4244325  0.09977288
##   0.10       1                   650     0.1306317  0.4248236  0.09964244
##   0.10       1                   700     0.1305682  0.4256217  0.09956138
##   0.10       1                   750     0.1306633  0.4251580  0.09955335
##   0.10       1                   800     0.1306872  0.4250753  0.09954620
##   0.10       1                   850     0.1307000  0.4254282  0.09944884
##   0.10       1                   900     0.1305989  0.4264349  0.09930551
##   0.10       1                   950     0.1306098  0.4267148  0.09931352
##   0.10       1                  1000     0.1306210  0.4268572  0.09931576
##   0.10       3                   100     0.1237004  0.4876400  0.09484227
##   0.10       3                   150     0.1218085  0.5006596  0.09270540
##   0.10       3                   200     0.1207281  0.5084098  0.09149936
##   0.10       3                   250     0.1197287  0.5161076  0.09054129
##   0.10       3                   300     0.1192438  0.5198689  0.08994274
##   0.10       3                   350     0.1188008  0.5233656  0.08940819
##   0.10       3                   400     0.1183355  0.5270352  0.08896095
##   0.10       3                   450     0.1179928  0.5299044  0.08864451
##   0.10       3                   500     0.1177943  0.5316518  0.08837790
##   0.10       3                   550     0.1174792  0.5342892  0.08807793
##   0.10       3                   600     0.1173005  0.5358117  0.08785658
##   0.10       3                   650     0.1171032  0.5374128  0.08770612
##   0.10       3                   700     0.1169462  0.5388723  0.08755418
##   0.10       3                   750     0.1167907  0.5401742  0.08742338
##   0.10       3                   800     0.1167117  0.5409748  0.08738554
##   0.10       3                   850     0.1166310  0.5417867  0.08729569
##   0.10       3                   900     0.1164433  0.5433093  0.08711014
##   0.10       3                   950     0.1163838  0.5439157  0.08705689
##   0.10       3                  1000     0.1163277  0.5444781  0.08697017
##   0.10       5                   100     0.1191208  0.5231502  0.09042409
##   0.10       5                   150     0.1174064  0.5352762  0.08848273
##   0.10       5                   200     0.1163261  0.5433933  0.08734742
##   0.10       5                   250     0.1156447  0.5484001  0.08667833
##   0.10       5                   300     0.1152657  0.5512900  0.08623685
##   0.10       5                   350     0.1147782  0.5551043  0.08577897
##   0.10       5                   400     0.1145082  0.5572710  0.08557336
##   0.10       5                   450     0.1142815  0.5590955  0.08530422
##   0.10       5                   500     0.1140707  0.5608200  0.08515894
##   0.10       5                   550     0.1139322  0.5620230  0.08499480
##   0.10       5                   600     0.1138320  0.5628937  0.08488888
##   0.10       5                   650     0.1137258  0.5638388  0.08479956
##   0.10       5                   700     0.1136177  0.5646940  0.08471355
##   0.10       5                   750     0.1135797  0.5650971  0.08463624
##   0.10       5                   800     0.1135291  0.5655815  0.08458170
##   0.10       5                   850     0.1134852  0.5660030  0.08452965
##   0.10       5                   900     0.1134644  0.5662224  0.08452265
##   0.10       5                   950     0.1134490  0.5664317  0.08447877
##   0.10       5                  1000     0.1134479  0.5665083  0.08445055
##   0.10       7                   100     0.1162020  0.5459411  0.08745502
##   0.10       7                   150     0.1148252  0.5553557  0.08609229
##   0.10       7                   200     0.1141102  0.5605495  0.08531496
##   0.10       7                   250     0.1134235  0.5656713  0.08458548
##   0.10       7                   300     0.1130927  0.5681010  0.08422083
##   0.10       7                   350     0.1129117  0.5695599  0.08399238
##   0.10       7                   400     0.1127421  0.5709328  0.08376244
##   0.10       7                   450     0.1125991  0.5720665  0.08360934
##   0.10       7                   500     0.1125000  0.5729175  0.08350768
##   0.10       7                   550     0.1124844  0.5730972  0.08346215
##   0.10       7                   600     0.1123874  0.5739184  0.08341364
##   0.10       7                   650     0.1123777  0.5740942  0.08337921
##   0.10       7                   700     0.1123235  0.5745588  0.08333510
##   0.10       7                   750     0.1123136  0.5746434  0.08329575
##   0.10       7                   800     0.1122906  0.5749137  0.08325012
##   0.10       7                   850     0.1122691  0.5751003  0.08324954
##   0.10       7                   900     0.1122359  0.5754014  0.08320756
##   0.10       7                   950     0.1122178  0.5755598  0.08319135
##   0.10       7                  1000     0.1122070  0.5756670  0.08317221
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 1000, interaction.depth =
##  7, shrinkage = 0.1 and n.minobsinnode = 10.
gbm_predict <- predict(gbmTune, subset(dev_test, select = -y))
gbm <- postResample(gbm_predict, dev_test$y)
gbm
##       RMSE   Rsquared        MAE 
## 0.09710108 0.69594338 0.07302651
## Random Forest
set.seed(5565)
rforest <- randomForest(subset(train_final, select = -`y`), train_final$y)

rforest
## 
## Call:
##  randomForest(x = subset(train_final, select = -y), y = train_final$y) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 11
## 
##           Mean of squared residuals: 0.009828275
##                     % Var explained: 66.55
rfpredict <- predict(rforest, subset(dev_test, select = -`y`))
random_forest <- postResample(rfpredict, dev_test$y)
random_forest
##      RMSE  Rsquared       MAE 
## 0.0927263 0.7482227 0.0687377
## SVM
set.seed(5565)
svm_model <- svm(y ~ ., data = train_final)
summary(svm_model)
## 
## Call:
## svm(formula = y ~ ., data = train_final)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.02857143 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  1690
predictions <- predict(svm_model, subset(dev_test, select = -`y`))
postResample(predictions, dev_test$y)
##       RMSE   Rsquared        MAE 
## 0.11276683 0.59724074 0.08380731

Random Forest with PCA

Now we can try our model with PCA applied to the predictors:

set.seed(1765)
rforest_pca <- randomForest(subset(train_pca_final, select = -`y`), train_pca_final$y)
rfpredict_pca <- predict(rforest_pca, subset(dev_test_pca, select = -`y`))

random_forest_pca <- postResample(rfpredict_pca, dev_test_pca$y)
random_forest_pca
##       RMSE   Rsquared        MAE 
## 0.11910405 0.59744944 0.09108898
# Top Predictors for PCA
arrange(varImp(rforest_pca), desc(Overall))
##        Overall
## PC2  8.4026854
## PC6  3.6705671
## PC3  3.4579474
## PC28 2.9760652
## PC9  2.4093058
## PC1  2.3025874
## PC27 1.8519788
## PC24 1.7090079
## PC21 1.6192835
## PC18 1.6109004
## PC16 1.5337504
## PC26 1.5188717
## PC15 1.5172884
## PC29 1.4178345
## PC20 1.4139694
## PC7  1.3535801
## PC13 1.3033223
## PC11 1.2760424
## PC35 1.2648512
## PC17 1.2082592
## PC19 1.1632458
## PC32 1.0933308
## PC23 1.0540734
## PC4  1.0239133
## PC22 1.0171593
## PC30 0.9730041
## PC25 0.9213585
## PC8  0.8685614
## PC31 0.8570031
## PC12 0.8053055
## PC34 0.7647792
## PC10 0.7490747
## PC14 0.6838966
## PC5  0.6728685
## PC33 0.6330814

PCA has not helped our model in its prediction. We can now try to adjust hyperparameters for slightly more accurate results:

Random Forest Tuned

set.seed(417)
rforest <- randomForest(subset(train_final, select = -`y`), train_final$y, ntree = 500, nodesize =1, mtry = 28)
rfpredict <- predict(rforest, subset(dev_test, select = -`y`))

random_forest <- postResample(rfpredict, dev_test$y)
random_forest
##       RMSE   Rsquared        MAE 
## 0.09070254 0.74837240 0.06629692
#Top Predictor for Tuned Random Forest
arrange(varImp(rforest), desc(Overall))
##                      Overall
## Mnf_Flow          11.7614246
## code_c             3.8367865
## Usage_cont         3.3899002
## Oxygen_Filler      3.2061761
## Alch_Rel           2.8471010
## Pressure_Vacuum    2.5994326
## Air_Pressurer      2.5841833
## Temperature        2.2562869
## Carb_Pressure1     1.9759164
## Carb_Rel           1.9595060
## Balling_Lvl        1.8956393
## Bowl_Setpoint      1.5382882
## Carb_Flow          1.4229279
## Balling            1.4011867
## Filler_Speed       1.3383844
## PC_Volume          1.2174458
## Hyd_Pressure3      1.1926622
## Density            1.1854274
## Filler_Level       1.1242689
## MFR                1.0284804
## Hyd_Pressure2      0.9667075
## Fill_Pressure      0.8783686
## Carb_Volume        0.8649111
## Fill_Ounces        0.8115930
## Hyd_Pressure4      0.7896093
## Hyd_Pressure1      0.7251549
## PSC                0.6772046
## PSC_Fill           0.6130646
## Carb_Pressure      0.5158117
## Carb_Temp          0.5141398
## code_b             0.3748545
## PSC_CO2            0.3727733
## code_d             0.2814658
## code_a             0.2470346
## Pressure_Setpoint  0.2445142

We can visualize an example decision tree from the forest for this model:

set.seed(2304)
tree <- train(subset(train_final, select= -`y`),train_final$y,method = "rpart",
              tuneLength= 10,
              trControl  = trainControl(method = "cv"))
windows.options(width = 20, height = 20)
rpart.plot(tree$finalModel)

This will not be an entirely accurate representation of the model decisions as random forest is an ensemble model, having multiple model results combined.

our second most accurate model from testing was the SVM model:

svmTuned <- train(x = subset(train_final, select = -`y`),
                  y = train_final$y ,
                  method = "svmRadial",
                  tuneLength = 14,
                  trControl = trainControl(method = "cv"))
svmTuned$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 4 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.0205681371602738 
## 
## Number of Support Vectors : 1703 
## 
## Objective Function Value : -2153.002 
## Training error : 0.209426
svmPred <-predict(svmTuned, newdata = subset(dev_test, select = -`y`))
svm_acc <- postResample(pred = svmPred, obs = dev_test$y)
svm_acc
##       RMSE   Rsquared        MAE 
## 0.10760932 0.62910166 0.07960835

Prediction of Test Data

First we need to transform the test data similar to the training data:

# Dummy Variable
test$code_a <- ifelse(test$Brand_Code == "A", 1,0)
test$code_b <- ifelse(test$Brand_Code == "B", 1,0)
test$code_c <- ifelse(test$Brand_Code == "C", 1,0)
test$code_d <- ifelse(test$Brand_Code == "D", 1,0)
test$code_a <-replace(test$code_a, is.na(test$code_a), 0)
test$code_b <-replace(test$code_b, is.na(test$code_b), 0)
test$code_c <-replace(test$code_c, is.na(test$code_c), 0)
test$code_d <-replace(test$code_d, is.na(test$code_d), 0)
test <- subset(test, select = -`Brand_Code`)

Apply Bagged imputation model to test data:

X_test_df <- as.data.frame(subset(test, select = -`PH`))
X_test_imputed <- predict(preProc, X_test_df)
X_test_imputed$PH <- test$PH

Now Feed our model the test data:

set.seed(417)
rforest_final <- randomForest(subset(X_train_imputed, select = -`y`), X_train_imputed$y, ntree = 500, nodesize =1, mtry = 28)
rfpredict_final <- predict(rforest_final, subset(X_test_imputed, select = -`PH`))
X_test_imputed$PH <- rfpredict_final 

Exporting Result as Excel

# writexl::write_xlsx(X_test_imputed, file= "project2_PH_prediction.xlsx")