##Project 2
require(impute)
require(caret)
require(corrplot)
require(ggplot2)
require(tidyr)
require(purrr)
require(randomForest)
require(gbm)
require(Cubist)
require(nnet)
require(earth)
require(kernlab)
require(rbenchmark)
data <- readxl::read_excel("StudentData.xlsx")
final <- readxl::read_excel("StudentEvaluation.xlsx")
summary(data)
## Brand Code Carb Volume Fill Ounces PC Volume
## Length:2571 Min. :5.040 Min. :23.63 Min. :0.07933
## Class :character 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23917
## Mode :character Median :5.347 Median :23.97 Median :0.27133
## Mean :5.370 Mean :23.97 Mean :0.27712
## 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31200
## Max. :5.700 Max. :24.32 Max. :0.47800
## NA's :10 NA's :38 NA's :39
## Carb Pressure Carb Temp PSC PSC Fill
## Min. :57.00 Min. :128.6 Min. :0.00200 Min. :0.0000
## 1st Qu.:65.60 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.1000
## Median :68.20 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.19 Mean :141.1 Mean :0.08457 Mean :0.1954
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :79.40 Max. :154.0 Max. :0.27000 Max. :0.6200
## NA's :27 NA's :26 NA's :33 NA's :23
## PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure
## Min. :0.00000 Min. :-100.20 Min. :105.6 Min. :34.60
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.00
## Median :0.04000 Median : 65.20 Median :123.2 Median :46.40
## Mean :0.05641 Mean : 24.57 Mean :122.6 Mean :47.92
## 3rd Qu.:0.08000 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00
## Max. :0.24000 Max. : 229.40 Max. :140.2 Max. :60.40
## NA's :39 NA's :2 NA's :32 NA's :22
## Hyd Pressure1 Hyd Pressure2 Hyd Pressure3 Hyd Pressure4
## Min. :-0.80 Min. : 0.00 Min. :-1.20 Min. : 52.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00
## Median :11.40 Median :28.60 Median :27.60 Median : 96.00
## Mean :12.44 Mean :20.96 Mean :20.46 Mean : 96.29
## 3rd Qu.:20.20 3rd Qu.:34.60 3rd Qu.:33.40 3rd Qu.:102.00
## Max. :58.00 Max. :59.40 Max. :50.00 Max. :142.00
## NA's :11 NA's :15 NA's :15 NA's :30
## Filler Level Filler Speed Temperature Usage cont
## Min. : 55.8 Min. : 998 Min. :63.60 Min. :12.08
## 1st Qu.: 98.3 1st Qu.:3888 1st Qu.:65.20 1st Qu.:18.36
## Median :118.4 Median :3982 Median :65.60 Median :21.79
## Mean :109.3 Mean :3687 Mean :65.97 Mean :20.99
## 3rd Qu.:120.0 3rd Qu.:3998 3rd Qu.:66.40 3rd Qu.:23.75
## Max. :161.2 Max. :4030 Max. :76.20 Max. :25.90
## NA's :20 NA's :57 NA's :14 NA's :5
## Carb Flow Density MFR Balling
## Min. : 26 Min. :0.240 Min. : 31.4 Min. :-0.170
## 1st Qu.:1144 1st Qu.:0.900 1st Qu.:706.3 1st Qu.: 1.496
## Median :3028 Median :0.980 Median :724.0 Median : 1.648
## Mean :2468 Mean :1.174 Mean :704.0 Mean : 2.198
## 3rd Qu.:3186 3rd Qu.:1.620 3rd Qu.:731.0 3rd Qu.: 3.292
## Max. :5104 Max. :1.920 Max. :868.6 Max. : 4.012
## NA's :2 NA's :1 NA's :212 NA's :1
## Pressure Vacuum PH Oxygen Filler Bowl Setpoint
## Min. :-6.600 Min. :7.880 Min. :0.00240 Min. : 70.0
## 1st Qu.:-5.600 1st Qu.:8.440 1st Qu.:0.02200 1st Qu.:100.0
## Median :-5.400 Median :8.540 Median :0.03340 Median :120.0
## Mean :-5.216 Mean :8.546 Mean :0.04684 Mean :109.3
## 3rd Qu.:-5.000 3rd Qu.:8.680 3rd Qu.:0.06000 3rd Qu.:120.0
## Max. :-3.600 Max. :9.360 Max. :0.40000 Max. :140.0
## NA's :4 NA's :12 NA's :2
## Pressure Setpoint Air Pressurer Alch Rel Carb Rel
## Min. :44.00 Min. :140.8 Min. :5.280 Min. :4.960
## 1st Qu.:46.00 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.340
## Median :46.00 Median :142.6 Median :6.560 Median :5.400
## Mean :47.62 Mean :142.8 Mean :6.897 Mean :5.437
## 3rd Qu.:50.00 3rd Qu.:143.0 3rd Qu.:7.240 3rd Qu.:5.540
## Max. :52.00 Max. :148.2 Max. :8.620 Max. :6.060
## NA's :12 NA's :9 NA's :10
## Balling Lvl
## Min. :0.00
## 1st Qu.:1.38
## Median :1.48
## Mean :2.05
## 3rd Qu.:3.14
## Max. :3.66
## NA's :1
We see that every predictor has missing values, including the variable we’ll be predicting (PH). So to begin, we will attempt to impute the missing values via the impute library.
imp <- impute.knn(as.matrix(data[,-1]))
## Cluster size 2571 broken into 249 2322
## Done cluster 249
## Cluster size 2322 broken into 1781 541
## Cluster size 1781 broken into 1731 50
## Cluster size 1731 broken into 707 1024
## Done cluster 707
## Done cluster 1024
## Done cluster 1731
## Done cluster 50
## Done cluster 1781
## Done cluster 541
## Done cluster 2322
data[,-1] <- as.data.frame(imp$data)
data <- as.data.frame(data)
summary(data)
## Brand Code Carb Volume Fill Ounces PC Volume
## Length:2571 Min. :5.040 Min. :23.63 Min. :0.07933
## Class :character 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23933
## Mode :character Median :5.347 Median :23.97 Median :0.27133
## Mean :5.370 Mean :23.97 Mean :0.27762
## 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31267
## Max. :5.700 Max. :24.32 Max. :0.47800
## Carb Pressure Carb Temp PSC PSC Fill
## Min. :57.0 Min. :128.6 Min. :0.00200 Min. :0.0000
## 1st Qu.:65.6 1st Qu.:138.4 1st Qu.:0.05000 1st Qu.:0.1000
## Median :68.2 Median :140.8 Median :0.07800 Median :0.1800
## Mean :68.2 Mean :141.1 Mean :0.08466 Mean :0.1953
## 3rd Qu.:70.6 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :79.4 Max. :154.0 Max. :0.27000 Max. :0.6200
## PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure
## Min. :0.00000 Min. :-100.20 Min. :105.6 Min. :34.6
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:118.8 1st Qu.:46.0
## Median :0.04000 Median : 64.80 Median :123.0 Median :46.4
## Mean :0.05649 Mean : 24.53 Mean :122.5 Mean :47.9
## 3rd Qu.:0.08000 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.0
## Max. :0.24000 Max. : 229.40 Max. :140.2 Max. :60.4
## Hyd Pressure1 Hyd Pressure2 Hyd Pressure3 Hyd Pressure4
## Min. :-0.80 Min. : 0.00 Min. :-1.20 Min. : 52.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00
## Median :11.40 Median :28.60 Median :27.40 Median : 96.00
## Mean :12.39 Mean :20.86 Mean :20.36 Mean : 96.51
## 3rd Qu.:20.20 3rd Qu.:34.60 3rd Qu.:33.20 3rd Qu.:102.00
## Max. :58.00 Max. :59.40 Max. :50.00 Max. :142.00
## Filler Level Filler Speed Temperature Usage cont
## Min. : 55.8 Min. : 998 Min. :63.60 Min. :12.08
## 1st Qu.: 98.5 1st Qu.:3815 1st Qu.:65.20 1st Qu.:18.36
## Median :118.2 Median :3980 Median :65.60 Median :21.78
## Mean :109.2 Mean :3631 Mean :65.98 Mean :20.99
## 3rd Qu.:120.0 3rd Qu.:3996 3rd Qu.:66.40 3rd Qu.:23.75
## Max. :161.2 Max. :4030 Max. :76.20 Max. :25.90
## Carb Flow Density MFR Balling
## Min. : 26 Min. :0.240 Min. : 31.4 Min. :-0.170
## 1st Qu.:1133 1st Qu.:0.900 1st Qu.:704.0 1st Qu.: 1.496
## Median :3028 Median :0.980 Median :721.4 Median : 1.648
## Mean :2467 Mean :1.174 Mean :692.4 Mean : 2.198
## 3rd Qu.:3186 3rd Qu.:1.620 3rd Qu.:730.4 3rd Qu.: 3.292
## Max. :5104 Max. :1.920 Max. :868.6 Max. : 4.012
## Pressure Vacuum PH Oxygen Filler Bowl Setpoint
## Min. :-6.600 Min. :7.880 Min. :0.00240 Min. : 70.0
## 1st Qu.:-5.600 1st Qu.:8.440 1st Qu.:0.02200 1st Qu.:100.0
## Median :-5.400 Median :8.540 Median :0.03340 Median :120.0
## Mean :-5.216 Mean :8.546 Mean :0.04704 Mean :109.3
## 3rd Qu.:-5.000 3rd Qu.:8.680 3rd Qu.:0.06000 3rd Qu.:120.0
## Max. :-3.600 Max. :9.360 Max. :0.40000 Max. :140.0
## Pressure Setpoint Air Pressurer Alch Rel Carb Rel
## Min. :44.00 Min. :140.8 Min. :5.280 Min. :4.960
## 1st Qu.:46.00 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.340
## Median :46.00 Median :142.6 Median :6.571 Median :5.400
## Mean :47.61 Mean :142.8 Mean :6.897 Mean :5.437
## 3rd Qu.:50.00 3rd Qu.:143.0 3rd Qu.:7.230 3rd Qu.:5.540
## Max. :52.00 Max. :148.2 Max. :8.620 Max. :6.060
## Balling Lvl
## Min. :0.00
## 1st Qu.:1.38
## Median :1.48
## Mean :2.05
## 3rd Qu.:3.14
## Max. :3.66
# Impute test set as well
imp2 <- impute.knn(as.matrix(final[,-c(1, 26)]))
final[, -c(1, 26)] <- as.data.frame(imp2$data)
final <- as.data.frame(final)
This does not solve our entire problem, as there are some NA's for the Brand Code predictor. This cannot be easily solved via the impute.knn function as it is a string, not a numeric. We have two options: we can run a model to predict what brand based on the rest of the data set, or we can simply introduce a new brand.
So as to not introduce too much uncertainty to our final model, we will instead introduce a new brand for all the NA values. This new brand will be E. From there, as it is our only variable that is not numeric, we will simply swap the letters with numbers. A, B, C, D, E will become 1, 2, 3, 4, 5.
data[is.na(data$`Brand Code`), 1] <- "E"
data[data$`Brand Code` == "A", 1] <- 1
data[data$`Brand Code` == "B", 1] <- 2
data[data$`Brand Code` == "C", 1] <- 3
data[data$`Brand Code` == "D", 1] <- 4
data[data$`Brand Code` == "E", 1] <- 5
data$`Brand Code` <- as.numeric(data$`Brand Code`)
# Repeat for final data
final[is.na(final$`Brand Code`), 1] <- "E"
final[final$`Brand Code` == "A", 1] <- 1
final[final$`Brand Code` == "B", 1] <- 2
final[final$`Brand Code` == "C", 1] <- 3
final[final$`Brand Code` == "D", 1] <- 4
final[final$`Brand Code` == "E", 1] <- 5
final$`Brand Code` <- as.numeric(final$`Brand Code`)
res <- cor(data)
corrplot(res)
In examining the correlation between the predictors, we find that AlchRel, CarbRel, and BailingLvl have a strong relation between themselves along with Carb Volume, Density and Bailing. There is also another strong correlation among Mnf Flow, Carb Pressure1, and Hyd Pressure1-3. Curiously, Hyd Pressure4 has a negative to neutral correlation with its siblings.
Meanwhile, the group of Mnf Flow, Carb Pressure1, and Hyd Pressure1-3 has a negative correlation with the group Bailing, PressureVacuum, OxygenFiller, Bowl Setpoint, and Filler Level.
data %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram()
We can see that some predictors, like Carb Pressure, Carb Temp, Fill Ounces, and Pressure Vacuum have a mostly uniform distribution. The Hyd Pressure predictors all have a large outlier at 0, except for Hyd Pressure4.
We will fit multiple models on the data in order to determine the best one to predict PH. We will use one linear regression option, two nonlinear regression options, and two tree options.
set.seed(42)
smp <- floor(0.75 * nrow(data))
x <- sample(seq_len(nrow(data)), size = smp)
data.train <- data[x,]
data.test <- data[-x,]
ctrl <- trainControl(method = "cv", number = 10)
y.Train <- as.numeric(unlist(data.train$PH))
lmTune <- train(x = data.train[, -26], y = y.Train,
method = "lm", trControl = ctrl)
lmTune
## Linear Regression
##
## 1928 samples
## 32 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1736, 1735, 1736, 1733, 1736, 1736, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.139141 0.350397 0.107491
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
svmTune <- train(x = data.train[, -26], y = y.Train,
method = "svmRadial", preProcess = c("center", "scale"),
tuneLength = 7, trControl = trainControl(method = "cv"))
svmTune
## Support Vector Machines with Radial Basis Function Kernel
##
## 1928 samples
## 32 predictor
##
## Pre-processing: centered (32), scaled (32)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1736, 1735, 1736, 1736, 1735, 1735, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.1308730 0.4327814 0.09785334
## 0.50 0.1275809 0.4571032 0.09442606
## 1.00 0.1245028 0.4820125 0.09134833
## 2.00 0.1222071 0.4997631 0.08929255
## 4.00 0.1209551 0.5096069 0.08840260
## 8.00 0.1197119 0.5198609 0.08793968
## 16.00 0.1201749 0.5202414 0.08917480
##
## Tuning parameter 'sigma' was held constant at a value of 0.02187251
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.02187251 and C = 8.
knnTune <- train(x = data.train[, -26], y = y.Train,
method = "knn", preProcess = c("center", "scale"),
tuneGrid = data.frame(.k = 1:20),
trControl = ctrl)
knnTune
## k-Nearest Neighbors
##
## 1928 samples
## 32 predictor
##
## Pre-processing: centered (32), scaled (32)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1737, 1735, 1736, 1735, 1735, 1736, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 0.1610351 0.3111176 0.11094528
## 2 0.1378888 0.4034637 0.09947871
## 3 0.1315954 0.4350573 0.09643845
## 4 0.1299938 0.4415301 0.09590263
## 5 0.1270482 0.4628362 0.09454153
## 6 0.1276065 0.4580981 0.09529587
## 7 0.1281533 0.4544022 0.09639231
## 8 0.1286770 0.4498707 0.09703808
## 9 0.1291120 0.4457619 0.09756921
## 10 0.1290912 0.4463053 0.09783570
## 11 0.1296644 0.4416287 0.09850934
## 12 0.1300148 0.4387404 0.09875985
## 13 0.1301492 0.4378827 0.09905308
## 14 0.1303585 0.4359829 0.09963209
## 15 0.1308627 0.4309830 0.09982990
## 16 0.1312716 0.4276592 0.10011806
## 17 0.1319652 0.4213623 0.10084915
## 18 0.1325788 0.4156245 0.10110165
## 19 0.1329907 0.4119743 0.10162671
## 20 0.1333980 0.4084653 0.10201087
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
rfTune <- train(x = data.train[, -26], y = y.Train,
method = "rf", trControl = ctrl)
rfTune
## Random Forest
##
## 1928 samples
## 32 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1734, 1735, 1735, 1736, 1735, 1735, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 0.1141801 0.6103874 0.08671527
## 17 0.1038971 0.6539130 0.07566827
## 32 0.1036121 0.6466262 0.07467261
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 32.
cubeTune <- train(x = data.train[, -26], y = y.Train,
method = "cubist", trControl = ctrl)
cubeTune
## Cubist
##
## 1928 samples
## 32 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1736, 1735, 1735, 1735, 1735, 1736, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 0.12516007 0.4954063 0.08722466
## 1 5 0.11839004 0.5496073 0.08242316
## 1 9 0.11910018 0.5404883 0.08304176
## 10 0 0.10802472 0.6065938 0.07813629
## 10 5 0.10097931 0.6541460 0.07244367
## 10 9 0.10189313 0.6476358 0.07306617
## 20 0 0.10639203 0.6207436 0.07718113
## 20 5 0.09861353 0.6702306 0.07098461
## 20 9 0.09971065 0.6633484 0.07180035
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
scores1 <- data.frame(Model = character(),
RMSE = double(),
RSquared = double(),
stringsAsFactors = FALSE)
scores1[1, ] <- c("LM", lmTune$results$RMSE, lmTune$results$Rsquared)
scores1[2, ] <- c("SVM", svmTune$results$RMSE[6], svmTune$results$Rsquared[6])
scores1[3, ] <- c("KNN", knnTune$results$RMSE[5], knnTune$results$Rsquared[5])
scores1[4, ] <- c("Random Forest", rfTune$results$RMSE[3], rfTune$results$Rsquared[3])
scores1[5, ] <- c("Cubist", cubeTune$results$RMSE[8], cubeTune$results$Rsquared[8])
scores1
## Model RMSE RSquared
## 1 LM 0.139140988005172 0.350397004226692
## 2 SVM 0.119711877363228 0.519860903694499
## 3 KNN 0.127048183694861 0.462836242300806
## 4 Random Forest 0.103612113733804 0.64662622055254
## 5 Cubist 0.0986135338431062 0.67023058708877
In general, the smaller RMSE value is better, while the larger \(R^2\) value is better. This is not meant to be a hard rule, however, as these scores are merely guidelines to help assess our models. Based on our observations of the best versions of each models, we can surmise that Cubist and Random Forest will perform about the same.
lmPred <- predict(lmTune$finalModel, newdata = data.test[, -26])
svmPred <- predict(svmTune$finalModel, newdata = data.test[, -26])
knnPred <- predict(knnTune$finalModel, newdata = data.test[, -26])
rfPred <- predict(rfTune$finalModel, newdata = data.test[, -26])
cubePred <- predict(cubeTune$finalModel, newdata = data.test[, -26])
lm1 <- postResample(lmPred, obs = data.test[, 26])
svm1 <- postResample(svmPred, obs = data.test[, 26])
knn1 <- postResample(knnPred, obs = data.test[, 26])
rf1 <- postResample(rfPred, obs = data.test[, 26])
cube1 <- postResample(cubePred, obs = data.test[, 26])
scores2 <- data.frame(Model = character(),
RMSE = double(),
RSquared = double(),
stringsAsFactors = FALSE)
scores2[1, ] <- c("LM", lm1[1], lm1[2])
scores2[2, ] <- c("SVM", svm1[1], svm1[2])
scores2[3, ] <- c("KNN", knn1[1], knn1[2])
scores2[4, ] <- c("Random Forest", rf1[1], rf1[2])
scores2[5, ] <- c("Cubist", cube1[1], cube1[2])
scores2
## Model RMSE RSquared
## 1 LM 0.13679839570933 0.381966342421801
## 2 SVM 0.174478200744446 <NA>
## 3 KNN 0.169276409337091 0.139674600713041
## 4 Random Forest 0.0986596270471993 0.687590947085506
## 5 Cubist 0.0982457202171831 0.685313907728689
Upon evaluating our test data for the models, we find that our assumptions hold true. Random Forest and Cubist are virtually tied for being the best.
plot(rfTune, main = "Random Forest")
plot(cubeTune, main = "Cubist")
We can see how both models improve as they grew more computationally complex. In the case of Random Forest, as more predictors are selected, the better the model gets. There is a slight difference between an mtry of 17 vs an mtry of 32. In truth, we could probably get by just fine by using the model with mtry of 17, but as the train function bases its results solely on the lowest RMSE, it decides to go with 32.
For the Cubist algorithm, we can see how more committees contributes to a lower RMSE score. Each committee is built of separate models. The first is generated without any input from the rest of the committee, while each subsequent member works on improving the previous iteration. In the meanwhile, we see that more neighbors, or instances, in the model does not always lead to an improvement. The 5-instance model performs slightly better than the 9-instance one.
qqnorm(rfPred, main = "Random Forest")
qqline(rfPred)
qqnorm(cubePred, main = "Cubist")
qqline(cubePred)
qqnorm(data.test[, 26], main = "Observed")
qqline(data.test[, 26])
If we plot out our predicted values in a Quantile-Quantile plot, we can see how our models best fit the data. The fit for both of our top performing models are roughly the same. Cubist may have a slight edge, although we would have to worry about overfitting the data.
benchmark(train(x = data.train[, -26], y = y.Train,
method = "rf", trControl = ctrl),
replications = 1)
## test
## 1 train(x = data.train[, -26], y = y.Train, method = "rf", trControl = ctrl)
## replications elapsed relative user.self sys.self user.child sys.child
## 1 1 325.42 1 318.91 2.33 NA NA
benchmark(train(x = data.train[, -26], y = y.Train,
method = "cubist", trControl = ctrl),
replications = 1)
## test
## 1 train(x = data.train[, -26], y = y.Train, method = "cubist", trControl = ctrl)
## replications elapsed relative user.self sys.self user.child sys.child
## 1 1 62.41 1 61.53 0.11 NA NA
As the scores are equal, we’ll need a different method of choosing which to prefer going forward. If we run a benchmark on both models’ tuning, we can see that Random Forest is more intensive. It takes 5 to 6 minutes to complete, whereas the Cubist model takes only 1 minute. With all other things being equal, the Cubist model is better based on this.
cubeTune$finalModel$usage
## Conditions Model Variable
## 1 85 77 Flow
## 2 60 55 Code
## 3 23 58 Vacuum
## 4 23 79 Lvl
## 5 23 40 Setpoint
## 6 18 31 Speed
## 7 18 71 Balling
## 8 18 72 Rel
## 9 15 51 Flow
## 10 15 61 Pressurer
## 11 13 54 Filler
## 12 10 61 Temperature
## 13 10 77 Density
## 14 8 31 Level
## 15 8 52 Rel
## 16 7 61 Pressure1
## 17 7 60 Pressure3
## 18 6 39 cont
## 19 2 20 Volume
## 20 2 40 Pressure1
## 21 2 47 Pressure2
## 22 2 12 Pressure
## 23 1 8 MFR
## 24 0 27 Pressure4
## 25 0 5 Ounces
## 26 0 12 Volume
## 27 0 17 Pressure
## 28 0 24 Setpoint
## 29 0 15 Temp
## 30 0 15 Fill
## 31 0 9 CO2
## 32 0 4 PSC
Once we examine the relationship between predictors as evaluated by the model, we see that the most important variables are Mnf Flow, Brand Code, Pressure Vacuum, Balling Lvl, and Bowl Setpoint.
In order to predict PH for our final evaluation set, we’ll take the opportunity to retrain the model on our entire practice data set.
cubeTuneFinal <- train(x = data[, -26], y = data[, 26],
method = "cubist", trControl = ctrl)
cubeTuneFinal
## Cubist
##
## 2571 samples
## 32 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2314, 2314, 2313, 2314, 2313, 2315, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 0.11405394 0.5808545 0.07984392
## 1 5 0.10882863 0.6259578 0.07518089
## 1 9 0.10947724 0.6171687 0.07610701
## 10 0 0.09828038 0.6793905 0.07081404
## 10 5 0.09263168 0.7124090 0.06634390
## 10 9 0.09335915 0.7078461 0.06688831
## 20 0 0.09708059 0.6888173 0.07004094
## 20 5 0.09113704 0.7212263 0.06524261
## 20 9 0.09199001 0.7163900 0.06585124
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
There’s a noticable increase in model performance by using the full data set.
ctfPred <- predict(cubeTuneFinal$finalModel, newdata = final[, -26])
final$PH <- ctfPred
final[final$`Brand Code` == 1, 1] <- "A"
final[final$`Brand Code` == 2, 1] <- "B"
final[final$`Brand Code` == 3, 1] <- "C"
final[final$`Brand Code` == 4, 1] <- "D"
final[final$`Brand Code` == 5, 1] <- "E"
xlsx::write.xlsx(final, "StudentEvaluationFinal.xlsx")