library(tidyverse)
library(fpp3)
library(randomForest)
library(ggplot2)
library(caret)
library(AppliedPredictiveModeling)
library(e1071)
library(caTools)
library(gbm)
library(mlbench)
library(ipred)
library(class)
library(kernlab)
library(partykit)
library(rpart)
library(rpart.plot)
library(readxl)
library(corrplot)
train <- read.csv('https://raw.githubusercontent.com/Jlok17/2022MSDS/main/Source/Data%20624/StudentData%20(1).xlsx%20-%20Subset.csv')
test <- read.csv('https://raw.githubusercontent.com/Jlok17/2022MSDS/main/Source/Data%20624/StudentEvaluation.xlsx%20-%20Subset%20(2).csv')
# Formatting Names to be more readable
colnames(train) <- sub("\\.", "_", colnames(train))
colnames(test) <- sub("\\.", "_", colnames(train))
print(colSums(is.na(train)))
## Brand_Code Carb_Volume Fill_Ounces PC_Volume
## 0 10 38 39
## Carb_Pressure Carb_Temp PSC PSC_Fill
## 27 26 33 23
## PSC_CO2 Mnf_Flow Carb_Pressure1 Fill_Pressure
## 39 2 32 22
## Hyd_Pressure1 Hyd_Pressure2 Hyd_Pressure3 Hyd_Pressure4
## 11 15 15 30
## Filler_Level Filler_Speed Temperature Usage_cont
## 20 57 14 5
## Carb_Flow Density MFR Balling
## 2 1 212 1
## Pressure_Vacuum PH Oxygen_Filler Bowl_Setpoint
## 0 4 12 2
## Pressure_Setpoint Air_Pressurer Alch_Rel Carb_Rel
## 12 0 9 10
## Balling_Lvl
## 1
paste("There are:", sum(is.na(train)), "total missing values")
## [1] "There are: 724 total missing values"
print(colSums(is.na(test)))
## Brand_Code Carb_Volume Fill_Ounces PC_Volume
## 0 1 6 4
## Carb_Pressure Carb_Temp PSC PSC_Fill
## 0 1 5 3
## PSC_CO2 Mnf_Flow Carb_Pressure1 Fill_Pressure
## 5 0 4 2
## Hyd_Pressure1 Hyd_Pressure2 Hyd_Pressure3 Hyd_Pressure4
## 0 1 1 4
## Filler_Level Filler_Speed Temperature Usage_cont
## 2 10 2 2
## Carb_Flow Density MFR Balling
## 0 1 31 1
## Pressure_Vacuum PH Oxygen_Filler Bowl_Setpoint
## 1 267 3 1
## Pressure_Setpoint Air_Pressurer Alch_Rel Carb_Rel
## 2 1 3 2
## Balling_Lvl
## 0
paste("There are:", sum(is.na(test)), "total missing values")
## [1] "There are: 366 total missing values"
Summary Statistics:
longer_train <- train %>%
pivot_longer(!`Brand_Code`,names_to = "col_names", values_to = "value")
summary_df <- as.data.frame(longer_train) %>%
group_by(col_names) %>%
summarise(min = min(value, na.rm = TRUE),
max = max(value, na.rm = TRUE),
mean = mean(value, na.rm = TRUE),
median = median(value, na.rm = TRUE),
std = sd(value, na.rm = TRUE),
var = var(value, na.rm = TRUE))
print(summary_df,n = 32)
## # A tibble: 32 × 7
## col_names min max mean median std var
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Air_Pressurer 141. 148. 143. 143. 1.21 1.47e+0
## 2 Alch_Rel 5.28 8.62 6.90 6.56 0.505 2.55e-1
## 3 Balling -0.17 4.01 2.20 1.65 0.931 8.67e-1
## 4 Balling_Lvl 0 3.66 2.05 1.48 0.870 7.57e-1
## 5 Bowl_Setpoint 70 140 109. 120 15.3 2.34e+2
## 6 Carb_Flow 26 5104 2468. 3028 1074. 1.15e+6
## 7 Carb_Pressure 57 79.4 68.2 68.2 3.54 1.25e+1
## 8 Carb_Pressure1 106. 140. 123. 123. 4.74 2.25e+1
## 9 Carb_Rel 4.96 6.06 5.44 5.4 0.129 1.66e-2
## 10 Carb_Temp 129. 154 141. 141. 4.04 1.63e+1
## 11 Carb_Volume 5.04 5.7 5.37 5.35 0.106 1.13e-2
## 12 Density 0.24 1.92 1.17 0.98 0.378 1.43e-1
## 13 Fill_Ounces 23.6 24.3 24.0 24.0 0.0875 7.66e-3
## 14 Fill_Pressure 34.6 60.4 47.9 46.4 3.18 1.01e+1
## 15 Filler_Level 55.8 161. 109. 118. 15.7 2.46e+2
## 16 Filler_Speed 998 4030 3687. 3982 771. 5.94e+5
## 17 Hyd_Pressure1 -0.8 58 12.4 11.4 12.4 1.55e+2
## 18 Hyd_Pressure2 0 59.4 21.0 28.6 16.4 2.69e+2
## 19 Hyd_Pressure3 -1.2 50 20.5 27.6 16.0 2.55e+2
## 20 Hyd_Pressure4 52 142 96.3 96 13.1 1.72e+2
## 21 MFR 31.4 869. 704. 724 73.9 5.46e+3
## 22 Mnf_Flow -100. 229. 24.6 65.2 119. 1.43e+4
## 23 Oxygen_Filler 0.0024 0.4 0.0468 0.0334 0.0466 2.18e-3
## 24 PC_Volume 0.0793 0.478 0.277 0.271 0.0607 3.68e-3
## 25 PH 7.88 9.36 8.55 8.54 0.173 2.98e-2
## 26 PSC 0.002 0.27 0.0846 0.076 0.0493 2.43e-3
## 27 PSC_CO2 0 0.24 0.0564 0.04 0.0430 1.85e-3
## 28 PSC_Fill 0 0.62 0.195 0.18 0.118 1.39e-2
## 29 Pressure_Setpoint 44 52 47.6 46 2.04 4.16e+0
## 30 Pressure_Vacuum -6.6 -3.6 -5.22 -5.4 0.570 3.25e-1
## 31 Temperature 63.6 76.2 66.0 65.6 1.38 1.91e+0
## 32 Usage_cont 12.1 25.9 21.0 21.8 2.98 8.87e+0
Identify if there are near zero variance predictors:
nearZeroVar(train)
## [1] 13
Preprocess Data:
First, we can make dummy variables from the brand codes so that the model can interpret it easier:
train$code_a <- ifelse(train$Brand_Code == "A", 1,0)
train$code_b <- ifelse(train$Brand_Code == "B", 1,0)
train$code_c <- ifelse(train$Brand_Code == "C", 1,0)
train$code_d <- ifelse(train$Brand_Code == "D", 1,0)
train$code_a <-replace(train$code_a, is.na(train$code_a), 0)
train$code_b <-replace(train$code_b, is.na(train$code_b), 0)
train$code_c <-replace(train$code_c, is.na(train$code_c), 0)
train$code_d <-replace(train$code_d, is.na(train$code_d), 0)
train <- subset(train, select =-`Brand_Code`)
train <- train[complete.cases(train$PH),]
Now values can be imputed:
# Training Data
X_train <- subset(train,select = -`PH`)
X_train_df <- as.data.frame(X_train)
y_train <- train$PH
## Bag Imputation
preProc <- preProcess(X_train_df, method = c("bagImpute"))
X_train_imputed <- predict(preProc, X_train_df)
## KNN Imputation
preProc1 <- preProcess(X_train_df, method = c("knnImpute"))
X_train_imputed_2 <- predict(preProc1, X_train_df)
We can also apply PCA:
pcaObject <- prcomp(X_train_imputed, center =TRUE, scale. = TRUE)
train_pca <- as.data.frame(pcaObject$x)
train_pca$y <- y_train
plot(pcaObject, type= "lines")
Correlation between the predictors can be identified:
windows.options(width = 20, height = 20)
cor_matrix <- cor(X_train_imputed)
corrplot(cor_matrix,title = "Correlation Plot", method = "square", outline = T, addgrid.col = "darkgray", order="hclust", mar = c(4,0,4,0), addrect = 4, rect.col = "black", rect.lwd = 5,cl.pos = "b", tl.col = "indianred4", tl.cex = 1, cl.cex = 1, type = "lower")
Split Data into Train and dev test set:
set.seed(4614)
## split imputed data into train/test
X_train_imputed$y <- y_train
X_train_imputed_2$y <- y_train
sample <- sample.split(X_train_imputed, SplitRatio = 0.8)
train_final <- subset(X_train_imputed, sample == TRUE)
dev_test <- subset(X_train_imputed, sample == FALSE)
sample_2 <- sample.split(X_train_imputed_2, SplitRatio = 0.8)
train_final_2 <- subset(X_train_imputed_2, sample == TRUE)
dev_test_2 <- subset(X_train_imputed_2, sample == FALSE)
## Split pca data into train/test
sample <- sample.split(train_pca, SplitRatio =0.8)
train_pca_final <- subset(train_pca, sample == TRUE)
dev_test_pca <- subset(train_pca, sample == FALSE)
We can start with just our default models to determine which one is the best one to tune and use for the prediction.
## KNN Imputation
## NN
# nnetgrid <- expand.grid(.decay = c(0,0.01, .1),
# .size =c(1:10),
# .bag = FALSE)
set.seed(200)
# nnettune <- train(subset(train_final_2, select =-y), train_final_2$y,
# method ="avNNet",
# tuneGrid = nnetgrid,
# trControl = trainControl(method = "cv"),
# linout =TRUE,
# trace = FALSE,
# MaxNWts = 10 * (ncol(subset(train_final_2,select = -y)) + 1) + 10 + 1, maxit = 500)
#
# nnPred <- predict(nnettune, newdata = subset(dev_test_2,select = -y))
# nn_acc <- postResample(pred = nnPred, obs = dev_test_2$y)
# KNN
knnModel <- train(x = subset(train_final_2, select = -y),
y = train_final_2$y ,
method = "knn",
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 1995 samples
## 35 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 1995, 1995, 1995, 1995, 1995, 1995, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.1365406 0.3966159 0.10031994
## 7 0.1343486 0.4065833 0.09981788
## 9 0.1330580 0.4124451 0.09942077
## 11 0.1319647 0.4192450 0.09905220
## 13 0.1317771 0.4194974 0.09935688
## 15 0.1316028 0.4206059 0.09968979
## 17 0.1317075 0.4191918 0.10008653
## 19 0.1321407 0.4150569 0.10069760
## 21 0.1321943 0.4144326 0.10092571
## 23 0.1322961 0.4134216 0.10124625
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred <- predict(knnModel, newdata= subset(dev_test_2, select = -y))
knn_acc <- postResample(pred = knnPred, obs = dev_test_2$y)
knn_acc
## RMSE Rsquared MAE
## 0.11762673 0.56819775 0.09190545
## GBM
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
n.trees = seq(100, 1000, by = 50),
shrinkage = c(0.01, 0.1),
n.minobsinnode = 10)
set.seed(12430)
gbmTune <- train(y ~ ., data = train_final_2,
method = "gbm",
tuneGrid = gbmGrid,
verbose = FALSE)
gbm_predict <- predict(gbmTune, subset(dev_test_2, select = -y))
gbm <- postResample(gbm_predict, dev_test_2$y)
gbm
## RMSE Rsquared MAE
## 0.10013493 0.67665417 0.07510196
## Random Forest
set.seed(5565)
rforest <- randomForest(subset(train_final_2, select = -`y`), train_final_2$y)
rforest
##
## Call:
## randomForest(x = subset(train_final_2, select = -y), y = train_final_2$y)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 11
##
## Mean of squared residuals: 0.01003134
## % Var explained: 65.86
rfpredict <- predict(rforest, subset(dev_test_2, select = -`y`))
random_forest <- postResample(rfpredict, dev_test_2$y)
random_forest
## RMSE Rsquared MAE
## 0.09313631 0.74646728 0.06935459
## SVM
set.seed(5565)
svm_model <- svm(y ~ ., data = train_final_2)
summary(svm_model)
##
## Call:
## svm(formula = y ~ ., data = train_final_2)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.02857143
## epsilon: 0.1
##
##
## Number of Support Vectors: 1692
predictions <- predict(svm_model, subset(dev_test_2, select = -`y`))
postResample(predictions, dev_test_2$y)
## RMSE Rsquared MAE
## 0.11279850 0.59679936 0.08387256
#### Bag Imputation
### NN
# nnetgrid <- expand.grid(.decay = c(0,0.01, .1),
# .size =c(1:10),
# .bag = FALSE)
set.seed(200)
# nnettune <- train(subset(train_final, select =-y), train_final$y,
# method ="avNNet",
# tuneGrid = nnetgrid,
# trControl = trainControl(method = "cv"),
# linout =TRUE,
# trace = FALSE,
# MaxNWts = 10 * (ncol(subset(train_final,select = -y)) + 1) + 10 + 1, maxit = 500)
#
# nnPred <- predict(nnettune, newdata = subset(dev_test,select = -y))
# nn_acc <- postResample(pred = nnPred, obs = dev_test$y)
#KNN
knnModel <- train(x = subset(train_final, select = -y),
y = train_final$y ,
method = "knn",
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 1995 samples
## 35 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 1995, 1995, 1995, 1995, 1995, 1995, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.1495128 0.2985551 0.1084448
## 7 0.1472102 0.3007709 0.1086207
## 9 0.1463087 0.2994945 0.1091538
## 11 0.1458936 0.2971724 0.1096763
## 13 0.1455404 0.2956602 0.1101091
## 15 0.1455021 0.2925118 0.1105685
## 17 0.1454998 0.2898893 0.1109625
## 19 0.1457045 0.2862148 0.1113735
## 21 0.1461408 0.2806574 0.1119742
## 23 0.1464372 0.2764863 0.1124619
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata= subset(dev_test, select = -y))
knn_acc <- postResample(pred = knnPred, obs = dev_test$y)
knn_acc
## RMSE Rsquared MAE
## 0.1375904 0.3907158 0.1068326
## GBM
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
n.trees = seq(100, 1000, by = 50),
shrinkage = c(0.01, 0.1),
n.minobsinnode = 10)
set.seed(12430)
gbmTune <- train(y ~ ., data = train_final,
method = "gbm",
tuneGrid = gbmGrid,
verbose = FALSE)
gbmTune
## Stochastic Gradient Boosting
##
## 1995 samples
## 35 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 1995, 1995, 1995, 1995, 1995, 1995, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees RMSE Rsquared MAE
## 0.01 1 100 0.1536103 0.2870859 0.12127110
## 0.01 1 150 0.1492559 0.3149823 0.11743435
## 0.01 1 200 0.1461038 0.3328192 0.11467516
## 0.01 1 250 0.1437010 0.3463119 0.11266653
## 0.01 1 300 0.1419221 0.3556510 0.11123774
## 0.01 1 350 0.1405578 0.3629803 0.11015515
## 0.01 1 400 0.1394849 0.3687671 0.10930643
## 0.01 1 450 0.1386130 0.3736609 0.10861602
## 0.01 1 500 0.1378439 0.3784042 0.10800447
## 0.01 1 550 0.1372236 0.3822076 0.10752297
## 0.01 1 600 0.1366566 0.3860043 0.10707406
## 0.01 1 650 0.1361840 0.3890626 0.10669230
## 0.01 1 700 0.1357313 0.3921841 0.10633872
## 0.01 1 750 0.1353696 0.3943444 0.10604537
## 0.01 1 800 0.1350108 0.3969491 0.10574452
## 0.01 1 850 0.1346818 0.3991869 0.10546437
## 0.01 1 900 0.1344117 0.4008470 0.10523770
## 0.01 1 950 0.1341652 0.4024450 0.10502064
## 0.01 1 1000 0.1339238 0.4040544 0.10481467
## 0.01 3 100 0.1441418 0.3899512 0.11351760
## 0.01 3 150 0.1382723 0.4123878 0.10846463
## 0.01 3 200 0.1346676 0.4282070 0.10537279
## 0.01 3 250 0.1323341 0.4398217 0.10336265
## 0.01 3 300 0.1307183 0.4483016 0.10193011
## 0.01 3 350 0.1294949 0.4553967 0.10084441
## 0.01 3 400 0.1285158 0.4611682 0.09993271
## 0.01 3 450 0.1276689 0.4662837 0.09914058
## 0.01 3 500 0.1269823 0.4703995 0.09846318
## 0.01 3 550 0.1263194 0.4746848 0.09781105
## 0.01 3 600 0.1257485 0.4782525 0.09725429
## 0.01 3 650 0.1252661 0.4812599 0.09677521
## 0.01 3 700 0.1248324 0.4839584 0.09632768
## 0.01 3 750 0.1244073 0.4866984 0.09588579
## 0.01 3 800 0.1240399 0.4890355 0.09549763
## 0.01 3 850 0.1237184 0.4910884 0.09513798
## 0.01 3 900 0.1234230 0.4929655 0.09481732
## 0.01 3 950 0.1231419 0.4948093 0.09451924
## 0.01 3 1000 0.1228553 0.4966881 0.09420607
## 0.01 5 100 0.1400518 0.4343543 0.11001775
## 0.01 5 150 0.1337314 0.4551864 0.10453484
## 0.01 5 200 0.1298680 0.4702690 0.10115817
## 0.01 5 250 0.1274144 0.4812154 0.09895391
## 0.01 5 300 0.1257513 0.4892355 0.09736453
## 0.01 5 350 0.1244898 0.4959684 0.09615441
## 0.01 5 400 0.1234749 0.5014972 0.09517368
## 0.01 5 450 0.1226247 0.5063875 0.09432368
## 0.01 5 500 0.1219232 0.5104684 0.09360354
## 0.01 5 550 0.1212960 0.5142927 0.09298549
## 0.01 5 600 0.1207740 0.5174028 0.09243484
## 0.01 5 650 0.1202481 0.5208046 0.09190578
## 0.01 5 700 0.1197832 0.5236796 0.09141677
## 0.01 5 750 0.1193422 0.5265255 0.09096310
## 0.01 5 800 0.1189801 0.5288033 0.09057983
## 0.01 5 850 0.1186551 0.5308685 0.09023062
## 0.01 5 900 0.1183435 0.5329087 0.08989429
## 0.01 5 950 0.1180248 0.5350531 0.08956733
## 0.01 5 1000 0.1177496 0.5368746 0.08928001
## 0.01 7 100 0.1373942 0.4635271 0.10777090
## 0.01 7 150 0.1306976 0.4833794 0.10185562
## 0.01 7 200 0.1266678 0.4974445 0.09824387
## 0.01 7 250 0.1240596 0.5085269 0.09583147
## 0.01 7 300 0.1223480 0.5164939 0.09417842
## 0.01 7 350 0.1211010 0.5225879 0.09293323
## 0.01 7 400 0.1200759 0.5279613 0.09188101
## 0.01 7 450 0.1192485 0.5324767 0.09104246
## 0.01 7 500 0.1185343 0.5366174 0.09033234
## 0.01 7 550 0.1179097 0.5403660 0.08970426
## 0.01 7 600 0.1173634 0.5435683 0.08916338
## 0.01 7 650 0.1168803 0.5465046 0.08867193
## 0.01 7 700 0.1164611 0.5491055 0.08823954
## 0.01 7 750 0.1160385 0.5518002 0.08780289
## 0.01 7 800 0.1156816 0.5540876 0.08744364
## 0.01 7 850 0.1153600 0.5561144 0.08710560
## 0.01 7 900 0.1150618 0.5580905 0.08681053
## 0.01 7 950 0.1147670 0.5600212 0.08650827
## 0.01 7 1000 0.1144821 0.5619199 0.08622592
## 0.10 1 100 0.1340565 0.4017954 0.10481060
## 0.10 1 150 0.1326089 0.4108366 0.10333608
## 0.10 1 200 0.1317614 0.4161629 0.10227676
## 0.10 1 250 0.1313068 0.4190583 0.10156286
## 0.10 1 300 0.1309932 0.4211211 0.10104602
## 0.10 1 350 0.1308115 0.4223615 0.10064869
## 0.10 1 400 0.1307645 0.4227533 0.10047514
## 0.10 1 450 0.1307586 0.4228361 0.10025153
## 0.10 1 500 0.1307810 0.4228650 0.10007924
## 0.10 1 550 0.1306732 0.4239574 0.09995068
## 0.10 1 600 0.1306347 0.4244325 0.09977288
## 0.10 1 650 0.1306317 0.4248236 0.09964244
## 0.10 1 700 0.1305682 0.4256217 0.09956138
## 0.10 1 750 0.1306633 0.4251580 0.09955335
## 0.10 1 800 0.1306872 0.4250753 0.09954620
## 0.10 1 850 0.1307000 0.4254282 0.09944884
## 0.10 1 900 0.1305989 0.4264349 0.09930551
## 0.10 1 950 0.1306098 0.4267148 0.09931352
## 0.10 1 1000 0.1306210 0.4268572 0.09931576
## 0.10 3 100 0.1237004 0.4876400 0.09484227
## 0.10 3 150 0.1218085 0.5006596 0.09270540
## 0.10 3 200 0.1207281 0.5084098 0.09149936
## 0.10 3 250 0.1197287 0.5161076 0.09054129
## 0.10 3 300 0.1192438 0.5198689 0.08994274
## 0.10 3 350 0.1188008 0.5233656 0.08940819
## 0.10 3 400 0.1183355 0.5270352 0.08896095
## 0.10 3 450 0.1179928 0.5299044 0.08864451
## 0.10 3 500 0.1177943 0.5316518 0.08837790
## 0.10 3 550 0.1174792 0.5342892 0.08807793
## 0.10 3 600 0.1173005 0.5358117 0.08785658
## 0.10 3 650 0.1171032 0.5374128 0.08770612
## 0.10 3 700 0.1169462 0.5388723 0.08755418
## 0.10 3 750 0.1167907 0.5401742 0.08742338
## 0.10 3 800 0.1167117 0.5409748 0.08738554
## 0.10 3 850 0.1166310 0.5417867 0.08729569
## 0.10 3 900 0.1164433 0.5433093 0.08711014
## 0.10 3 950 0.1163838 0.5439157 0.08705689
## 0.10 3 1000 0.1163277 0.5444781 0.08697017
## 0.10 5 100 0.1191208 0.5231502 0.09042409
## 0.10 5 150 0.1174064 0.5352762 0.08848273
## 0.10 5 200 0.1163261 0.5433933 0.08734742
## 0.10 5 250 0.1156447 0.5484001 0.08667833
## 0.10 5 300 0.1152657 0.5512900 0.08623685
## 0.10 5 350 0.1147782 0.5551043 0.08577897
## 0.10 5 400 0.1145082 0.5572710 0.08557336
## 0.10 5 450 0.1142815 0.5590955 0.08530422
## 0.10 5 500 0.1140707 0.5608200 0.08515894
## 0.10 5 550 0.1139322 0.5620230 0.08499480
## 0.10 5 600 0.1138320 0.5628937 0.08488888
## 0.10 5 650 0.1137258 0.5638388 0.08479956
## 0.10 5 700 0.1136177 0.5646940 0.08471355
## 0.10 5 750 0.1135797 0.5650971 0.08463624
## 0.10 5 800 0.1135291 0.5655815 0.08458170
## 0.10 5 850 0.1134852 0.5660030 0.08452965
## 0.10 5 900 0.1134644 0.5662224 0.08452265
## 0.10 5 950 0.1134490 0.5664317 0.08447877
## 0.10 5 1000 0.1134479 0.5665083 0.08445055
## 0.10 7 100 0.1162020 0.5459411 0.08745502
## 0.10 7 150 0.1148252 0.5553557 0.08609229
## 0.10 7 200 0.1141102 0.5605495 0.08531496
## 0.10 7 250 0.1134235 0.5656713 0.08458548
## 0.10 7 300 0.1130927 0.5681010 0.08422083
## 0.10 7 350 0.1129117 0.5695599 0.08399238
## 0.10 7 400 0.1127421 0.5709328 0.08376244
## 0.10 7 450 0.1125991 0.5720665 0.08360934
## 0.10 7 500 0.1125000 0.5729175 0.08350768
## 0.10 7 550 0.1124844 0.5730972 0.08346215
## 0.10 7 600 0.1123874 0.5739184 0.08341364
## 0.10 7 650 0.1123777 0.5740942 0.08337921
## 0.10 7 700 0.1123235 0.5745588 0.08333510
## 0.10 7 750 0.1123136 0.5746434 0.08329575
## 0.10 7 800 0.1122906 0.5749137 0.08325012
## 0.10 7 850 0.1122691 0.5751003 0.08324954
## 0.10 7 900 0.1122359 0.5754014 0.08320756
## 0.10 7 950 0.1122178 0.5755598 0.08319135
## 0.10 7 1000 0.1122070 0.5756670 0.08317221
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 1000, interaction.depth =
## 7, shrinkage = 0.1 and n.minobsinnode = 10.
gbm_predict <- predict(gbmTune, subset(dev_test, select = -y))
gbm <- postResample(gbm_predict, dev_test$y)
gbm
## RMSE Rsquared MAE
## 0.09710108 0.69594338 0.07302651
## Random Forest
set.seed(5565)
rforest <- randomForest(subset(train_final, select = -`y`), train_final$y)
rforest
##
## Call:
## randomForest(x = subset(train_final, select = -y), y = train_final$y)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 11
##
## Mean of squared residuals: 0.009828275
## % Var explained: 66.55
rfpredict <- predict(rforest, subset(dev_test, select = -`y`))
random_forest <- postResample(rfpredict, dev_test$y)
random_forest
## RMSE Rsquared MAE
## 0.0927263 0.7482227 0.0687377
## SVM
set.seed(5565)
svm_model <- svm(y ~ ., data = train_final)
summary(svm_model)
##
## Call:
## svm(formula = y ~ ., data = train_final)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.02857143
## epsilon: 0.1
##
##
## Number of Support Vectors: 1690
predictions <- predict(svm_model, subset(dev_test, select = -`y`))
postResample(predictions, dev_test$y)
## RMSE Rsquared MAE
## 0.11276683 0.59724074 0.08380731
Now we can try our model with PCA applied to the predictors:
set.seed(1765)
rforest_pca <- randomForest(subset(train_pca_final, select = -`y`), train_pca_final$y)
rfpredict_pca <- predict(rforest_pca, subset(dev_test_pca, select = -`y`))
random_forest_pca <- postResample(rfpredict_pca, dev_test_pca$y)
random_forest_pca
## RMSE Rsquared MAE
## 0.11910405 0.59744944 0.09108898
# Top Predictors for PCA
arrange(varImp(rforest_pca), desc(Overall))
## Overall
## PC2 8.4026854
## PC6 3.6705671
## PC3 3.4579474
## PC28 2.9760652
## PC9 2.4093058
## PC1 2.3025874
## PC27 1.8519788
## PC24 1.7090079
## PC21 1.6192835
## PC18 1.6109004
## PC16 1.5337504
## PC26 1.5188717
## PC15 1.5172884
## PC29 1.4178345
## PC20 1.4139694
## PC7 1.3535801
## PC13 1.3033223
## PC11 1.2760424
## PC35 1.2648512
## PC17 1.2082592
## PC19 1.1632458
## PC32 1.0933308
## PC23 1.0540734
## PC4 1.0239133
## PC22 1.0171593
## PC30 0.9730041
## PC25 0.9213585
## PC8 0.8685614
## PC31 0.8570031
## PC12 0.8053055
## PC34 0.7647792
## PC10 0.7490747
## PC14 0.6838966
## PC5 0.6728685
## PC33 0.6330814
PCA has not helped our model in its prediction. We can now try to adjust hyperparameters for slightly more accurate results:
set.seed(417)
rforest <- randomForest(subset(train_final, select = -`y`), train_final$y, ntree = 500, nodesize =1, mtry = 28)
rfpredict <- predict(rforest, subset(dev_test, select = -`y`))
random_forest <- postResample(rfpredict, dev_test$y)
random_forest
## RMSE Rsquared MAE
## 0.09070254 0.74837240 0.06629692
#Top Predictor for Tuned Random Forest
arrange(varImp(rforest), desc(Overall))
## Overall
## Mnf_Flow 11.7614246
## code_c 3.8367865
## Usage_cont 3.3899002
## Oxygen_Filler 3.2061761
## Alch_Rel 2.8471010
## Pressure_Vacuum 2.5994326
## Air_Pressurer 2.5841833
## Temperature 2.2562869
## Carb_Pressure1 1.9759164
## Carb_Rel 1.9595060
## Balling_Lvl 1.8956393
## Bowl_Setpoint 1.5382882
## Carb_Flow 1.4229279
## Balling 1.4011867
## Filler_Speed 1.3383844
## PC_Volume 1.2174458
## Hyd_Pressure3 1.1926622
## Density 1.1854274
## Filler_Level 1.1242689
## MFR 1.0284804
## Hyd_Pressure2 0.9667075
## Fill_Pressure 0.8783686
## Carb_Volume 0.8649111
## Fill_Ounces 0.8115930
## Hyd_Pressure4 0.7896093
## Hyd_Pressure1 0.7251549
## PSC 0.6772046
## PSC_Fill 0.6130646
## Carb_Pressure 0.5158117
## Carb_Temp 0.5141398
## code_b 0.3748545
## PSC_CO2 0.3727733
## code_d 0.2814658
## code_a 0.2470346
## Pressure_Setpoint 0.2445142
We can visualize an example decision tree from the forest for this model:
set.seed(2304)
tree <- train(subset(train_final, select= -`y`),train_final$y,method = "rpart",
tuneLength= 10,
trControl = trainControl(method = "cv"))
windows.options(width = 20, height = 20)
rpart.plot(tree$finalModel)
This will not be an entirely accurate representation of the model decisions as random forest is an ensemble model, having multiple model results combined.
our second most accurate model from testing was the SVM model:
svmTuned <- train(x = subset(train_final, select = -`y`),
y = train_final$y ,
method = "svmRadial",
tuneLength = 14,
trControl = trainControl(method = "cv"))
svmTuned$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 4
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0205681371602738
##
## Number of Support Vectors : 1703
##
## Objective Function Value : -2153.002
## Training error : 0.209426
svmPred <-predict(svmTuned, newdata = subset(dev_test, select = -`y`))
svm_acc <- postResample(pred = svmPred, obs = dev_test$y)
svm_acc
## RMSE Rsquared MAE
## 0.10760932 0.62910166 0.07960835
First we need to transform the test data similar to the training data:
# Dummy Variable
test$code_a <- ifelse(test$Brand_Code == "A", 1,0)
test$code_b <- ifelse(test$Brand_Code == "B", 1,0)
test$code_c <- ifelse(test$Brand_Code == "C", 1,0)
test$code_d <- ifelse(test$Brand_Code == "D", 1,0)
test$code_a <-replace(test$code_a, is.na(test$code_a), 0)
test$code_b <-replace(test$code_b, is.na(test$code_b), 0)
test$code_c <-replace(test$code_c, is.na(test$code_c), 0)
test$code_d <-replace(test$code_d, is.na(test$code_d), 0)
test <- subset(test, select = -`Brand_Code`)
Apply Bagged imputation model to test data:
X_test_df <- as.data.frame(subset(test, select = -`PH`))
X_test_imputed <- predict(preProc, X_test_df)
X_test_imputed$PH <- test$PH
Now Feed our model the test data:
set.seed(417)
rforest_final <- randomForest(subset(X_train_imputed, select = -`y`), X_train_imputed$y, ntree = 500, nodesize =1, mtry = 28)
rfpredict_final <- predict(rforest_final, subset(X_test_imputed, select = -`PH`))
X_test_imputed$PH <- rfpredict_final
# writexl::write_xlsx(X_test_imputed, file= "project2_PH_prediction.xlsx")