##Project 2

Data Exploration

require(impute)
require(caret)
require(corrplot)
require(ggplot2)
require(tidyr)
require(purrr)
require(randomForest)
require(gbm)
require(Cubist)
require(nnet)
require(earth)
require(kernlab)
require(rbenchmark)
data <- readxl::read_excel("StudentData.xlsx")
final <- readxl::read_excel("StudentEvaluation.xlsx")

summary(data)
##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:2571        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23917  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27712  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##                     NA's   :10      NA's   :38      NA's   :39       
##  Carb Pressure     Carb Temp          PSC             PSC Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08457   Mean   :0.1954  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##  NA's   :27      NA's   :26      NA's   :33        NA's   :23      
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  65.20   Median :123.2   Median :46.40  
##  Mean   :0.05641   Mean   :  24.57   Mean   :122.6   Mean   :47.92  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  NA's   :39        NA's   :2         NA's   :32      NA's   :22     
##  Hyd Pressure1   Hyd Pressure2   Hyd Pressure3   Hyd Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 52.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.60   Median : 96.00  
##  Mean   :12.44   Mean   :20.96   Mean   :20.46   Mean   : 96.29  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.40   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##  NA's   :11      NA's   :15      NA's   :15      NA's   :30      
##   Filler Level    Filler Speed   Temperature      Usage cont   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08  
##  1st Qu.: 98.3   1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.36  
##  Median :118.4   Median :3982   Median :65.60   Median :21.79  
##  Mean   :109.3   Mean   :3687   Mean   :65.97   Mean   :20.99  
##  3rd Qu.:120.0   3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.75  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90  
##  NA's   :20      NA's   :57     NA's   :14      NA's   :5      
##    Carb Flow       Density           MFR           Balling      
##  Min.   :  26   Min.   :0.240   Min.   : 31.4   Min.   :-0.170  
##  1st Qu.:1144   1st Qu.:0.900   1st Qu.:706.3   1st Qu.: 1.496  
##  Median :3028   Median :0.980   Median :724.0   Median : 1.648  
##  Mean   :2468   Mean   :1.174   Mean   :704.0   Mean   : 2.198  
##  3rd Qu.:3186   3rd Qu.:1.620   3rd Qu.:731.0   3rd Qu.: 3.292  
##  Max.   :5104   Max.   :1.920   Max.   :868.6   Max.   : 4.012  
##  NA's   :2      NA's   :1       NA's   :212     NA's   :1       
##  Pressure Vacuum        PH        Oxygen Filler     Bowl Setpoint  
##  Min.   :-6.600   Min.   :7.880   Min.   :0.00240   Min.   : 70.0  
##  1st Qu.:-5.600   1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0  
##  Median :-5.400   Median :8.540   Median :0.03340   Median :120.0  
##  Mean   :-5.216   Mean   :8.546   Mean   :0.04684   Mean   :109.3  
##  3rd Qu.:-5.000   3rd Qu.:8.680   3rd Qu.:0.06000   3rd Qu.:120.0  
##  Max.   :-3.600   Max.   :9.360   Max.   :0.40000   Max.   :140.0  
##                   NA's   :4       NA's   :12        NA's   :2      
##  Pressure Setpoint Air Pressurer      Alch Rel        Carb Rel    
##  Min.   :44.00     Min.   :140.8   Min.   :5.280   Min.   :4.960  
##  1st Qu.:46.00     1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340  
##  Median :46.00     Median :142.6   Median :6.560   Median :5.400  
##  Mean   :47.62     Mean   :142.8   Mean   :6.897   Mean   :5.437  
##  3rd Qu.:50.00     3rd Qu.:143.0   3rd Qu.:7.240   3rd Qu.:5.540  
##  Max.   :52.00     Max.   :148.2   Max.   :8.620   Max.   :6.060  
##  NA's   :12                        NA's   :9       NA's   :10     
##   Balling Lvl  
##  Min.   :0.00  
##  1st Qu.:1.38  
##  Median :1.48  
##  Mean   :2.05  
##  3rd Qu.:3.14  
##  Max.   :3.66  
##  NA's   :1

We see that every predictor has missing values, including the variable we’ll be predicting (PH). So to begin, we will attempt to impute the missing values via the impute library.

imp <- impute.knn(as.matrix(data[,-1]))
## Cluster size 2571 broken into 249 2322 
## Done cluster 249 
## Cluster size 2322 broken into 1781 541 
## Cluster size 1781 broken into 1731 50 
## Cluster size 1731 broken into 707 1024 
## Done cluster 707 
## Done cluster 1024 
## Done cluster 1731 
## Done cluster 50 
## Done cluster 1781 
## Done cluster 541 
## Done cluster 2322
data[,-1] <- as.data.frame(imp$data)
data <- as.data.frame(data)

summary(data)
##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:2571        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23933  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27762  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31267  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##  Carb Pressure    Carb Temp          PSC             PSC Fill     
##  Min.   :57.0   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.6   1st Qu.:138.4   1st Qu.:0.05000   1st Qu.:0.1000  
##  Median :68.2   Median :140.8   Median :0.07800   Median :0.1800  
##  Mean   :68.2   Mean   :141.1   Mean   :0.08466   Mean   :0.1953  
##  3rd Qu.:70.6   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.4   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure 
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.6  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:118.8   1st Qu.:46.0  
##  Median :0.04000   Median :  64.80   Median :123.0   Median :46.4  
##  Mean   :0.05649   Mean   :  24.53   Mean   :122.5   Mean   :47.9  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.0  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.4  
##  Hyd Pressure1   Hyd Pressure2   Hyd Pressure3   Hyd Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 52.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.40   Median : 96.00  
##  Mean   :12.39   Mean   :20.86   Mean   :20.36   Mean   : 96.51  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.20   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##   Filler Level    Filler Speed   Temperature      Usage cont   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08  
##  1st Qu.: 98.5   1st Qu.:3815   1st Qu.:65.20   1st Qu.:18.36  
##  Median :118.2   Median :3980   Median :65.60   Median :21.78  
##  Mean   :109.2   Mean   :3631   Mean   :65.98   Mean   :20.99  
##  3rd Qu.:120.0   3rd Qu.:3996   3rd Qu.:66.40   3rd Qu.:23.75  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90  
##    Carb Flow       Density           MFR           Balling      
##  Min.   :  26   Min.   :0.240   Min.   : 31.4   Min.   :-0.170  
##  1st Qu.:1133   1st Qu.:0.900   1st Qu.:704.0   1st Qu.: 1.496  
##  Median :3028   Median :0.980   Median :721.4   Median : 1.648  
##  Mean   :2467   Mean   :1.174   Mean   :692.4   Mean   : 2.198  
##  3rd Qu.:3186   3rd Qu.:1.620   3rd Qu.:730.4   3rd Qu.: 3.292  
##  Max.   :5104   Max.   :1.920   Max.   :868.6   Max.   : 4.012  
##  Pressure Vacuum        PH        Oxygen Filler     Bowl Setpoint  
##  Min.   :-6.600   Min.   :7.880   Min.   :0.00240   Min.   : 70.0  
##  1st Qu.:-5.600   1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0  
##  Median :-5.400   Median :8.540   Median :0.03340   Median :120.0  
##  Mean   :-5.216   Mean   :8.546   Mean   :0.04704   Mean   :109.3  
##  3rd Qu.:-5.000   3rd Qu.:8.680   3rd Qu.:0.06000   3rd Qu.:120.0  
##  Max.   :-3.600   Max.   :9.360   Max.   :0.40000   Max.   :140.0  
##  Pressure Setpoint Air Pressurer      Alch Rel        Carb Rel    
##  Min.   :44.00     Min.   :140.8   Min.   :5.280   Min.   :4.960  
##  1st Qu.:46.00     1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340  
##  Median :46.00     Median :142.6   Median :6.571   Median :5.400  
##  Mean   :47.61     Mean   :142.8   Mean   :6.897   Mean   :5.437  
##  3rd Qu.:50.00     3rd Qu.:143.0   3rd Qu.:7.230   3rd Qu.:5.540  
##  Max.   :52.00     Max.   :148.2   Max.   :8.620   Max.   :6.060  
##   Balling Lvl  
##  Min.   :0.00  
##  1st Qu.:1.38  
##  Median :1.48  
##  Mean   :2.05  
##  3rd Qu.:3.14  
##  Max.   :3.66
# Impute test set as well
imp2 <- impute.knn(as.matrix(final[,-c(1, 26)]))
final[, -c(1, 26)] <- as.data.frame(imp2$data)
final <- as.data.frame(final)

This does not solve our entire problem, as there are some NA's for the Brand Code predictor. This cannot be easily solved via the impute.knn function as it is a string, not a numeric. We have two options: we can run a model to predict what brand based on the rest of the data set, or we can simply introduce a new brand.

So as to not introduce too much uncertainty to our final model, we will instead introduce a new brand for all the NA values. This new brand will be E. From there, as it is our only variable that is not numeric, we will simply swap the letters with numbers. A, B, C, D, E will become 1, 2, 3, 4, 5.

data[is.na(data$`Brand Code`), 1] <- "E"
data[data$`Brand Code` == "A", 1] <- 1
data[data$`Brand Code` == "B", 1] <- 2
data[data$`Brand Code` == "C", 1] <- 3
data[data$`Brand Code` == "D", 1] <- 4
data[data$`Brand Code` == "E", 1] <- 5
data$`Brand Code` <- as.numeric(data$`Brand Code`)

# Repeat for final data
final[is.na(final$`Brand Code`), 1] <- "E"
final[final$`Brand Code` == "A", 1] <- 1
final[final$`Brand Code` == "B", 1] <- 2
final[final$`Brand Code` == "C", 1] <- 3
final[final$`Brand Code` == "D", 1] <- 4
final[final$`Brand Code` == "E", 1] <- 5
final$`Brand Code` <- as.numeric(final$`Brand Code`)

Data Visualization

res <- cor(data)
corrplot(res)

In examining the correlation between the predictors, we find that AlchRel, CarbRel, and BailingLvl have a strong relation between themselves along with Carb Volume, Density and Bailing. There is also another strong correlation among Mnf Flow, Carb Pressure1, and Hyd Pressure1-3. Curiously, Hyd Pressure4 has a negative to neutral correlation with its siblings.

Meanwhile, the group of Mnf Flow, Carb Pressure1, and Hyd Pressure1-3 has a negative correlation with the group Bailing, PressureVacuum, OxygenFiller, Bowl Setpoint, and Filler Level.

data %>%
  keep(is.numeric) %>%
  gather() %>%
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram()

We can see that some predictors, like Carb Pressure, Carb Temp, Fill Ounces, and Pressure Vacuum have a mostly uniform distribution. The Hyd Pressure predictors all have a large outlier at 0, except for Hyd Pressure4.

Modeling

We will fit multiple models on the data in order to determine the best one to predict PH. We will use one linear regression option, two nonlinear regression options, and two tree options.

set.seed(42)

smp <- floor(0.75 * nrow(data))
x <- sample(seq_len(nrow(data)), size = smp)
data.train <- data[x,]
data.test <- data[-x,]

ctrl <- trainControl(method = "cv", number = 10)
y.Train <- as.numeric(unlist(data.train$PH))
lmTune <- train(x = data.train[, -26], y = y.Train,
               method = "lm", trControl = ctrl)

lmTune
## Linear Regression 
## 
## 1928 samples
##   32 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1736, 1735, 1736, 1733, 1736, 1736, ... 
## Resampling results:
## 
##   RMSE      Rsquared  MAE     
##   0.139141  0.350397  0.107491
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
svmTune <- train(x = data.train[, -26], y = y.Train,
                 method = "svmRadial", preProcess = c("center", "scale"),
                 tuneLength = 7, trControl = trainControl(method = "cv"))

svmTune
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 1928 samples
##   32 predictor
## 
## Pre-processing: centered (32), scaled (32) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1736, 1735, 1736, 1736, 1735, 1735, ... 
## Resampling results across tuning parameters:
## 
##   C      RMSE       Rsquared   MAE       
##    0.25  0.1308730  0.4327814  0.09785334
##    0.50  0.1275809  0.4571032  0.09442606
##    1.00  0.1245028  0.4820125  0.09134833
##    2.00  0.1222071  0.4997631  0.08929255
##    4.00  0.1209551  0.5096069  0.08840260
##    8.00  0.1197119  0.5198609  0.08793968
##   16.00  0.1201749  0.5202414  0.08917480
## 
## Tuning parameter 'sigma' was held constant at a value of 0.02187251
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.02187251 and C = 8.
knnTune <- train(x = data.train[, -26], y = y.Train,
                 method = "knn", preProcess = c("center", "scale"),
                 tuneGrid = data.frame(.k = 1:20),
                 trControl = ctrl)

knnTune
## k-Nearest Neighbors 
## 
## 1928 samples
##   32 predictor
## 
## Pre-processing: centered (32), scaled (32) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1737, 1735, 1736, 1735, 1735, 1736, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE       
##    1  0.1610351  0.3111176  0.11094528
##    2  0.1378888  0.4034637  0.09947871
##    3  0.1315954  0.4350573  0.09643845
##    4  0.1299938  0.4415301  0.09590263
##    5  0.1270482  0.4628362  0.09454153
##    6  0.1276065  0.4580981  0.09529587
##    7  0.1281533  0.4544022  0.09639231
##    8  0.1286770  0.4498707  0.09703808
##    9  0.1291120  0.4457619  0.09756921
##   10  0.1290912  0.4463053  0.09783570
##   11  0.1296644  0.4416287  0.09850934
##   12  0.1300148  0.4387404  0.09875985
##   13  0.1301492  0.4378827  0.09905308
##   14  0.1303585  0.4359829  0.09963209
##   15  0.1308627  0.4309830  0.09982990
##   16  0.1312716  0.4276592  0.10011806
##   17  0.1319652  0.4213623  0.10084915
##   18  0.1325788  0.4156245  0.10110165
##   19  0.1329907  0.4119743  0.10162671
##   20  0.1333980  0.4084653  0.10201087
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
rfTune <- train(x = data.train[, -26], y = y.Train,
                method = "rf", trControl = ctrl)

rfTune
## Random Forest 
## 
## 1928 samples
##   32 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1734, 1735, 1735, 1736, 1735, 1735, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE       
##    2    0.1141801  0.6103874  0.08671527
##   17    0.1038971  0.6539130  0.07566827
##   32    0.1036121  0.6466262  0.07467261
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 32.
cubeTune <- train(x = data.train[, -26], y = y.Train,
                  method = "cubist", trControl = ctrl)

cubeTune
## Cubist 
## 
## 1928 samples
##   32 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1736, 1735, 1735, 1735, 1735, 1736, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE        Rsquared   MAE       
##    1          0          0.12516007  0.4954063  0.08722466
##    1          5          0.11839004  0.5496073  0.08242316
##    1          9          0.11910018  0.5404883  0.08304176
##   10          0          0.10802472  0.6065938  0.07813629
##   10          5          0.10097931  0.6541460  0.07244367
##   10          9          0.10189313  0.6476358  0.07306617
##   20          0          0.10639203  0.6207436  0.07718113
##   20          5          0.09861353  0.6702306  0.07098461
##   20          9          0.09971065  0.6633484  0.07180035
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
scores1 <- data.frame(Model = character(),
                      RMSE = double(),
                      RSquared = double(),
                      stringsAsFactors = FALSE)

scores1[1, ] <- c("LM", lmTune$results$RMSE, lmTune$results$Rsquared)
scores1[2, ] <- c("SVM", svmTune$results$RMSE[6], svmTune$results$Rsquared[6])
scores1[3, ] <- c("KNN", knnTune$results$RMSE[5], knnTune$results$Rsquared[5])
scores1[4, ] <- c("Random Forest", rfTune$results$RMSE[3], rfTune$results$Rsquared[3])
scores1[5, ] <- c("Cubist", cubeTune$results$RMSE[8], cubeTune$results$Rsquared[8])

scores1
##           Model               RMSE          RSquared
## 1            LM  0.139140988005172 0.350397004226692
## 2           SVM  0.119711877363228 0.519860903694499
## 3           KNN  0.127048183694861 0.462836242300806
## 4 Random Forest  0.103612113733804  0.64662622055254
## 5        Cubist 0.0986135338431062  0.67023058708877

In general, the smaller RMSE value is better, while the larger \(R^2\) value is better. This is not meant to be a hard rule, however, as these scores are merely guidelines to help assess our models. Based on our observations of the best versions of each models, we can surmise that Cubist and Random Forest will perform about the same.

Model Evaluation

lmPred <- predict(lmTune$finalModel, newdata = data.test[, -26])
svmPred <- predict(svmTune$finalModel, newdata = data.test[, -26])
knnPred <- predict(knnTune$finalModel, newdata = data.test[, -26])
rfPred <- predict(rfTune$finalModel, newdata = data.test[, -26])
cubePred <- predict(cubeTune$finalModel, newdata = data.test[, -26])

lm1 <- postResample(lmPred, obs = data.test[, 26])
svm1 <- postResample(svmPred, obs = data.test[, 26])
knn1 <- postResample(knnPred, obs = data.test[, 26])
rf1 <- postResample(rfPred, obs = data.test[, 26])
cube1 <- postResample(cubePred, obs = data.test[, 26])

scores2 <- data.frame(Model = character(),
                      RMSE = double(),
                      RSquared = double(),
                      stringsAsFactors = FALSE)

scores2[1, ] <- c("LM", lm1[1], lm1[2])
scores2[2, ] <- c("SVM", svm1[1], svm1[2])
scores2[3, ] <- c("KNN", knn1[1], knn1[2])
scores2[4, ] <- c("Random Forest", rf1[1], rf1[2])
scores2[5, ] <- c("Cubist", cube1[1], cube1[2])

scores2
##           Model               RMSE          RSquared
## 1            LM   0.13679839570933 0.381966342421801
## 2           SVM  0.174478200744446              <NA>
## 3           KNN  0.169276409337091 0.139674600713041
## 4 Random Forest 0.0986596270471993 0.687590947085506
## 5        Cubist 0.0982457202171831 0.685313907728689

Upon evaluating our test data for the models, we find that our assumptions hold true. Random Forest and Cubist are virtually tied for being the best.

plot(rfTune, main = "Random Forest")

plot(cubeTune, main = "Cubist")

We can see how both models improve as they grew more computationally complex. In the case of Random Forest, as more predictors are selected, the better the model gets. There is a slight difference between an mtry of 17 vs an mtry of 32. In truth, we could probably get by just fine by using the model with mtry of 17, but as the train function bases its results solely on the lowest RMSE, it decides to go with 32.

For the Cubist algorithm, we can see how more committees contributes to a lower RMSE score. Each committee is built of separate models. The first is generated without any input from the rest of the committee, while each subsequent member works on improving the previous iteration. In the meanwhile, we see that more neighbors, or instances, in the model does not always lead to an improvement. The 5-instance model performs slightly better than the 9-instance one.

qqnorm(rfPred, main = "Random Forest")
qqline(rfPred)

qqnorm(cubePred, main = "Cubist")
qqline(cubePred)

qqnorm(data.test[, 26], main = "Observed")
qqline(data.test[, 26])

If we plot out our predicted values in a Quantile-Quantile plot, we can see how our models best fit the data. The fit for both of our top performing models are roughly the same. Cubist may have a slight edge, although we would have to worry about overfitting the data.

benchmark(train(x = data.train[, -26], y = y.Train,
                method = "rf", trControl = ctrl),
          replications = 1)
##                                                                         test
## 1 train(x = data.train[, -26], y = y.Train, method = "rf", trControl = ctrl)
##   replications elapsed relative user.self sys.self user.child sys.child
## 1            1  325.42        1    318.91     2.33         NA        NA
benchmark(train(x = data.train[, -26], y = y.Train,
                method = "cubist", trControl = ctrl),
          replications = 1)
##                                                                             test
## 1 train(x = data.train[, -26], y = y.Train, method = "cubist", trControl = ctrl)
##   replications elapsed relative user.self sys.self user.child sys.child
## 1            1   62.41        1     61.53     0.11         NA        NA

As the scores are equal, we’ll need a different method of choosing which to prefer going forward. If we run a benchmark on both models’ tuning, we can see that Random Forest is more intensive. It takes 5 to 6 minutes to complete, whereas the Cubist model takes only 1 minute. With all other things being equal, the Cubist model is better based on this.

Most Important Predictors

cubeTune$finalModel$usage
##    Conditions Model    Variable
## 1          85    77        Flow
## 2          60    55        Code
## 3          23    58      Vacuum
## 4          23    79         Lvl
## 5          23    40    Setpoint
## 6          18    31       Speed
## 7          18    71     Balling
## 8          18    72         Rel
## 9          15    51        Flow
## 10         15    61   Pressurer
## 11         13    54      Filler
## 12         10    61 Temperature
## 13         10    77     Density
## 14          8    31       Level
## 15          8    52         Rel
## 16          7    61   Pressure1
## 17          7    60   Pressure3
## 18          6    39        cont
## 19          2    20      Volume
## 20          2    40   Pressure1
## 21          2    47   Pressure2
## 22          2    12    Pressure
## 23          1     8         MFR
## 24          0    27   Pressure4
## 25          0     5      Ounces
## 26          0    12      Volume
## 27          0    17    Pressure
## 28          0    24    Setpoint
## 29          0    15        Temp
## 30          0    15        Fill
## 31          0     9         CO2
## 32          0     4         PSC

Once we examine the relationship between predictors as evaluated by the model, we see that the most important variables are Mnf Flow, Brand Code, Pressure Vacuum, Balling Lvl, and Bowl Setpoint.

Write to Excel

In order to predict PH for our final evaluation set, we’ll take the opportunity to retrain the model on our entire practice data set.

cubeTuneFinal <- train(x = data[, -26], y = data[, 26],
                       method = "cubist", trControl = ctrl)

cubeTuneFinal
## Cubist 
## 
## 2571 samples
##   32 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 2314, 2314, 2313, 2314, 2313, 2315, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE        Rsquared   MAE       
##    1          0          0.11405394  0.5808545  0.07984392
##    1          5          0.10882863  0.6259578  0.07518089
##    1          9          0.10947724  0.6171687  0.07610701
##   10          0          0.09828038  0.6793905  0.07081404
##   10          5          0.09263168  0.7124090  0.06634390
##   10          9          0.09335915  0.7078461  0.06688831
##   20          0          0.09708059  0.6888173  0.07004094
##   20          5          0.09113704  0.7212263  0.06524261
##   20          9          0.09199001  0.7163900  0.06585124
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.

There’s a noticable increase in model performance by using the full data set.

ctfPred <- predict(cubeTuneFinal$finalModel, newdata = final[, -26])
final$PH <- ctfPred

final[final$`Brand Code` == 1, 1] <- "A"
final[final$`Brand Code` == 2, 1] <- "B"
final[final$`Brand Code` == 3, 1] <- "C"
final[final$`Brand Code` == 4, 1] <- "D"
final[final$`Brand Code` == 5, 1] <- "E"

xlsx::write.xlsx(final, "StudentEvaluationFinal.xlsx")