Overview

This is role playing. I am your new boss. I am in charge of production at ABC Beverage and you are a team of data scientists reporting to me. My leadership has told me that new regulations are requiring us to understand our manufacturing process, the predictive factors and be able to report to them our predictive model of PH. Please use the historical data set I am providing. Build and report the factors in BOTH a technical and non-technical report. I like to use Word and Excel. Please provide your non-technical report in a business friendly readable document and your predictions in an Excel readable format. The technical report should show clearly the models you tested and how you selected your final approach.

Load The Data

#create a temp file
temp_file <- tempfile(fileext = ".xlsx")
temp_file2 <- tempfile(fileext = ".xlsx")
#grab a copy of the xl file from github, save to temp create above
download.file(url = "https://github.com/plb2018/DATA624/raw/master/project2/StudentData.xlsx", 
              destfile = temp_file, 
              mode = "wb", 
              quiet = TRUE)
#load xl from temp
student_train <- readxl::read_excel(temp_file,skip=0) #read in as tibble
student_train <- data.frame(student_train) # convert to dataframe
#load test data
download.file(url = "https://github.com/javernw/DATA624-Predictive-Analytics/blob/master/StudentEvaluation.xlsx?raw=true", 
              destfile = temp_file2, 
              mode = "wb", 
              quiet = TRUE)
student_test <- readxl::read_excel(temp_file2,skip=0)
student_test <- data.frame(student_test)

Going forward we will explore the data using the training set.

Structure and Summary of Training Set
str(student_train)
## 'data.frame':    2571 obs. of  33 variables:
##  $ Brand.Code       : chr  "B" "A" "B" "A" ...
##  $ Carb.Volume      : num  5.34 5.43 5.29 5.44 5.49 ...
##  $ Fill.Ounces      : num  24 24 24.1 24 24.3 ...
##  $ PC.Volume        : num  0.263 0.239 0.263 0.293 0.111 ...
##  $ Carb.Pressure    : num  68.2 68.4 70.8 63 67.2 66.6 64.2 67.6 64.2 72 ...
##  $ Carb.Temp        : num  141 140 145 133 137 ...
##  $ PSC              : num  0.104 0.124 0.09 NA 0.026 0.09 0.128 0.154 0.132 0.014 ...
##  $ PSC.Fill         : num  0.26 0.22 0.34 0.42 0.16 ...
##  $ PSC.CO2          : num  0.04 0.04 0.16 0.04 0.12 ...
##  $ Mnf.Flow         : num  -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 ...
##  $ Carb.Pressure1   : num  119 122 120 115 118 ...
##  $ Fill.Pressure    : num  46 46 46 46.4 45.8 45.6 51.8 46.8 46 45.2 ...
##  $ Hyd.Pressure1    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Hyd.Pressure2    : num  NA NA NA 0 0 0 0 0 0 0 ...
##  $ Hyd.Pressure3    : num  NA NA NA 0 0 0 0 0 0 0 ...
##  $ Hyd.Pressure4    : num  118 106 82 92 92 116 124 132 90 108 ...
##  $ Filler.Level     : num  121 119 120 118 119 ...
##  $ Filler.Speed     : num  4002 3986 4020 4012 4010 ...
##  $ Temperature      : num  66 67.6 67 65.6 65.6 66.2 65.8 65.2 65.4 66.6 ...
##  $ Usage.cont       : num  16.2 19.9 17.8 17.4 17.7 ...
##  $ Carb.Flow        : num  2932 3144 2914 3062 3054 ...
##  $ Density          : num  0.88 0.92 1.58 1.54 1.54 1.52 0.84 0.84 0.9 0.9 ...
##  $ MFR              : num  725 727 735 731 723 ...
##  $ Balling          : num  1.4 1.5 3.14 3.04 3.04 ...
##  $ Pressure.Vacuum  : num  -4 -4 -3.8 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 ...
##  $ PH               : num  8.36 8.26 8.94 8.24 8.26 8.32 8.4 8.38 8.38 8.5 ...
##  $ Oxygen.Filler    : num  0.022 0.026 0.024 0.03 0.03 0.024 0.066 0.046 0.064 0.022 ...
##  $ Bowl.Setpoint    : num  120 120 120 120 120 120 120 120 120 120 ...
##  $ Pressure.Setpoint: num  46.4 46.8 46.6 46 46 46 46 46 46 46 ...
##  $ Air.Pressurer    : num  143 143 142 146 146 ...
##  $ Alch.Rel         : num  6.58 6.56 7.66 7.14 7.14 7.16 6.54 6.52 6.52 6.54 ...
##  $ Carb.Rel         : num  5.32 5.3 5.84 5.42 5.44 5.44 5.38 5.34 5.34 5.34 ...
##  $ Balling.Lvl      : num  1.48 1.56 3.28 3.04 3.04 3.02 1.44 1.44 1.44 1.38 ...
summary(student_train)
##   Brand.Code         Carb.Volume     Fill.Ounces      PC.Volume      
##  Length:2571        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23917  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27712  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##                     NA's   :10      NA's   :38      NA's   :39       
##  Carb.Pressure     Carb.Temp          PSC             PSC.Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08457   Mean   :0.1954  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##  NA's   :27      NA's   :26      NA's   :33        NA's   :23      
##     PSC.CO2           Mnf.Flow       Carb.Pressure1  Fill.Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  65.20   Median :123.2   Median :46.40  
##  Mean   :0.05641   Mean   :  24.57   Mean   :122.6   Mean   :47.92  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  NA's   :39        NA's   :2         NA's   :32      NA's   :22     
##  Hyd.Pressure1   Hyd.Pressure2   Hyd.Pressure3   Hyd.Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 52.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.60   Median : 96.00  
##  Mean   :12.44   Mean   :20.96   Mean   :20.46   Mean   : 96.29  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.40   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##  NA's   :11      NA's   :15      NA's   :15      NA's   :30      
##   Filler.Level    Filler.Speed   Temperature      Usage.cont      Carb.Flow   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08   Min.   :  26  
##  1st Qu.: 98.3   1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.36   1st Qu.:1144  
##  Median :118.4   Median :3982   Median :65.60   Median :21.79   Median :3028  
##  Mean   :109.3   Mean   :3687   Mean   :65.97   Mean   :20.99   Mean   :2468  
##  3rd Qu.:120.0   3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.75   3rd Qu.:3186  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90   Max.   :5104  
##  NA's   :20      NA's   :57     NA's   :14      NA's   :5       NA's   :2     
##     Density           MFR           Balling       Pressure.Vacuum 
##  Min.   :0.240   Min.   : 31.4   Min.   :-0.170   Min.   :-6.600  
##  1st Qu.:0.900   1st Qu.:706.3   1st Qu.: 1.496   1st Qu.:-5.600  
##  Median :0.980   Median :724.0   Median : 1.648   Median :-5.400  
##  Mean   :1.174   Mean   :704.0   Mean   : 2.198   Mean   :-5.216  
##  3rd Qu.:1.620   3rd Qu.:731.0   3rd Qu.: 3.292   3rd Qu.:-5.000  
##  Max.   :1.920   Max.   :868.6   Max.   : 4.012   Max.   :-3.600  
##  NA's   :1       NA's   :212     NA's   :1                        
##        PH        Oxygen.Filler     Bowl.Setpoint   Pressure.Setpoint
##  Min.   :7.880   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0   1st Qu.:46.00    
##  Median :8.540   Median :0.03340   Median :120.0   Median :46.00    
##  Mean   :8.546   Mean   :0.04684   Mean   :109.3   Mean   :47.62    
##  3rd Qu.:8.680   3rd Qu.:0.06000   3rd Qu.:120.0   3rd Qu.:50.00    
##  Max.   :9.360   Max.   :0.40000   Max.   :140.0   Max.   :52.00    
##  NA's   :4       NA's   :12        NA's   :2       NA's   :12       
##  Air.Pressurer      Alch.Rel        Carb.Rel      Balling.Lvl  
##  Min.   :140.8   Min.   :5.280   Min.   :4.960   Min.   :0.00  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340   1st Qu.:1.38  
##  Median :142.6   Median :6.560   Median :5.400   Median :1.48  
##  Mean   :142.8   Mean   :6.897   Mean   :5.437   Mean   :2.05  
##  3rd Qu.:143.0   3rd Qu.:7.240   3rd Qu.:5.540   3rd Qu.:3.14  
##  Max.   :148.2   Max.   :8.620   Max.   :6.060   Max.   :3.66  
##                  NA's   :9       NA's   :10      NA's   :1

Explore The Data

Skewness in data

plot_histogram(student_train)

We see various differences among the variables: Many appear to be bi-modal or tri-modal, while some appear to be near-normal. Some are skewed whereas others are not. There are also several which appear to be categorical (ex: preassure.setpoint)

Boxplot/Outliers

ggplot(data = reshape2::melt(student_train) , aes(x=variable, y=value)) + 
  geom_boxplot(outlier.colour="blue", outlier.shape=3, outlier.size=5,aes(fill=variable)) +
  coord_flip() + theme(legend.position = "none")
## Using Brand.Code as id variables
## Warning: Removed 724 rows containing non-finite values (stat_boxplot).

We can see a dramatic difference in the level/values of the variables. Carb flow and filler speed are “off the charts” as compared to some of the other values. This may necessitate some scaling/ normalization.

Correlation

corrplot(cor(student_train[,-1], use = "na.or.complete"), type="lower", order="alphabet",
         col=brewer.pal(n=10, name="PiYG"))

Generally speaking, correlations are low, however there are a few areas of concentration. For example, there might be some multi-colinearity between Alch.Rel, Balling, Balling.Lvl and some of the Carb related variables.

Check For Missing Values

missmap(student_train, col = c("#CCCC00", "#660033"))

df.missing <- sort(colSums(is.na(student_train[!complete.cases(student_train),]))/nrow(student_train),decreasing = T)
kable(df.missing)
x
MFR 0.0824582
Brand.Code 0.0466744
Filler.Speed 0.0221704
PC.Volume 0.0151692
PSC.CO2 0.0151692
Fill.Ounces 0.0147802
PSC 0.0128355
Carb.Pressure1 0.0124465
Hyd.Pressure4 0.0116686
Carb.Pressure 0.0105018
Carb.Temp 0.0101128
PSC.Fill 0.0089459
Fill.Pressure 0.0085570
Filler.Level 0.0077791
Hyd.Pressure2 0.0058343
Hyd.Pressure3 0.0058343
Temperature 0.0054454
Oxygen.Filler 0.0046674
Pressure.Setpoint 0.0046674
Hyd.Pressure1 0.0042785
Carb.Volume 0.0038895
Carb.Rel 0.0038895
Alch.Rel 0.0035006
Usage.cont 0.0019448
PH 0.0015558
Mnf.Flow 0.0007779
Carb.Flow 0.0007779
Bowl.Setpoint 0.0007779
Density 0.0003890
Balling 0.0003890
Balling.Lvl 0.0003890
Pressure.Vacuum 0.0000000
Air.Pressurer 0.0000000
# to create safe names
colnames(student_train) <- make.names(names(student_train))
## checking missing values by colunms
colSums(is.na(student_train))
##        Brand.Code       Carb.Volume       Fill.Ounces         PC.Volume 
##               120                10                38                39 
##     Carb.Pressure         Carb.Temp               PSC          PSC.Fill 
##                27                26                33                23 
##           PSC.CO2          Mnf.Flow    Carb.Pressure1     Fill.Pressure 
##                39                 2                32                22 
##     Hyd.Pressure1     Hyd.Pressure2     Hyd.Pressure3     Hyd.Pressure4 
##                11                15                15                30 
##      Filler.Level      Filler.Speed       Temperature        Usage.cont 
##                20                57                14                 5 
##         Carb.Flow           Density               MFR           Balling 
##                 2                 1               212                 1 
##   Pressure.Vacuum                PH     Oxygen.Filler     Bowl.Setpoint 
##                 0                 4                12                 2 
## Pressure.Setpoint     Air.Pressurer          Alch.Rel          Carb.Rel 
##                12                 0                 9                10 
##       Balling.Lvl 
##                 1

About 1% of the values are missing. The absent values are unevenly distributed across variables whith some missing numerous values and others missing none.

Data Preprocessing

Impute, nearzero, high correlation, scale, center
temp_df <- data.matrix(student_train[,-1]) #convert to mumeric matix. Exlude character column (brand code)
preprocessing <- preProcess(temp_df, method = c("center", "scale", "knnImpute", "corr", "nzv")) 
#cleaned training set
student_train_preprocess <-  predict(preprocessing, temp_df) 
missmap(as.data.frame(student_train_preprocess), col = c("#CCCC00", "#660033"))

To address some of the observations above, we scale and center the data. KNN imputation is used to fill in missing values and highly correlated predictors are filtered out.

Data Splitting 80/20

ph <- data.matrix(student_train_preprocess[,24]) #ph --target
set.seed(789)
split <- ph %>%
  createDataPartition(p = 0.8, list = FALSE, times = 1)
Xtrain.data  <- student_train_preprocess[split, ] #student train
Xtrain.data <- Xtrain.data[,-24] # exlude target column
xtest.data <- student_train_preprocess[-split, ] #student validation set
xtest.data <- xtest.data[,-24] #exclude target column
Ytrain.data  <- ph[split, ] # ph 
ytest.data <- ph[-split] # ph

The data is split using 80% for training and holding the remaining 20% for validation. Also note that a random seed is set for reproducibility.

Modeling

Here, numerous models of various families are considered.

ctrl <- trainControl(method = "cv", number = 10)

Linear Models

Model 1

Ordinary Linear Regression
lmmod <- train(Xtrain.data, Ytrain.data, method = "lm", trControl = ctrl) 
lmmod
## Linear Regression 
## 
## 2058 samples
##   30 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1854, 1852, 1851, 1852, 1851, 1851, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.8250735  0.3330247  0.6327658
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
lmmodPred <- predict(lmmod,newdata = xtest.data)

Model 2

Partial Least Squares
plsmod <- train(
  Xtrain.data, Ytrain.data, method = "pls",
  trControl = ctrl,
  center = T,
  tuneLength = 20,
  metrics = "Rsquared"
  )
plot(plsmod)

varImp(plsmod)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:corrplot':
## 
##     corrplot
## The following object is masked from 'package:stats':
## 
##     loadings
## pls variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                   Overall
## Mnf.Flow           100.00
## Temperature         67.00
## Usage.cont          63.22
## Hyd.Pressure3       57.04
## Bowl.Setpoint       55.46
## Hyd.Pressure2       48.63
## Pressure.Setpoint   47.93
## Carb.Pressure1      47.53
## Filler.Level        45.57
## Fill.Pressure       39.81
## Carb.Flow           38.32
## PSC                 36.69
## Pressure.Vacuum     35.09
## Oxygen.Filler       31.01
## Fill.Ounces         30.07
## Hyd.Pressure4       28.79
## Density             28.61
## Alch.Rel            23.85
## Carb.Rel            23.73
## Balling.Lvl         23.07
plsmodPred <- predict(plsmod,newdata = xtest.data)

Model 3

RIDGE
ridgeGrid <- data.frame(.lambda = seq(0, .1, length = 15)) 
set.seed(100) 
ridgeRegFit <- train(Xtrain.data, Ytrain.data, method = "ridge", tuneGrid = ridgeGrid, trControl = ctrl)
ridgeRegFit
## Ridge Regression 
## 
## 2058 samples
##   30 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1851, 1853, 1851, 1852, 1852, 1854, ... 
## Resampling results across tuning parameters:
## 
##   lambda       RMSE       Rsquared   MAE      
##   0.000000000  0.8226105  0.3367637  0.6339799
##   0.007142857  0.8223294  0.3367078  0.6347917
##   0.014285714  0.8226664  0.3359843  0.6355218
##   0.021428571  0.8230849  0.3352097  0.6363363
##   0.028571429  0.8235373  0.3344165  0.6371187
##   0.035714286  0.8240104  0.3336101  0.6379087
##   0.042857143  0.8244964  0.3327954  0.6386428
##   0.050000000  0.8249900  0.3319773  0.6393331
##   0.057142857  0.8254872  0.3311599  0.6400148
##   0.064285714  0.8259854  0.3303464  0.6406694
##   0.071428571  0.8264825  0.3295393  0.6413239
##   0.078571429  0.8269774  0.3287404  0.6419650
##   0.085714286  0.8274692  0.3279509  0.6425803
##   0.092857143  0.8279575  0.3271716  0.6431858
##   0.100000000  0.8284419  0.3264030  0.6437925
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.007142857.
plot(ridgeRegFit)

varImp(ridgeRegFit)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                   Overall
## Mnf.Flow          100.000
## Usage.cont         74.119
## Bowl.Setpoint      54.286
## Filler.Level       47.015
## Pressure.Setpoint  45.677
## Carb.Flow          42.209
## Hyd.Pressure3      29.014
## Fill.Pressure      25.452
## Pressure.Vacuum    22.760
## Hyd.Pressure2      20.536
## Oxygen.Filler      12.106
## Carb.Rel           10.512
## Temperature         9.388
## Alch.Rel            8.642
## Hyd.Pressure4       7.704
## PSC                 4.831
## MFR                 3.781
## Balling.Lvl         3.649
## Fill.Ounces         3.609
## PC.Volume           3.041
ridgePred <- predict(ridgeRegFit,newdata = xtest.data)

Non-Linear Models

Model 4

Neural Networks
nneG <- expand.grid( .decay = c(0, 0.01, .1), .size = c(1:10), .bag= F )
set.seed(250)
nneModel <- train(Xtrain.data, Ytrain.data,
                  method = "avNNet",
                  preProc = c("center", "scale"),
                  tuneGrid = nneG,
                  trControl = trainControl(method = "cv",number = 10),
                  linout = T,
                  trace= F,
                  MaxNWts = 5 * (ncol(student_train) + 1) + 5 + 1,
                  maxit = 500)
nneModel
## Model Averaged Neural Network 
## 
## 2058 samples
##   30 predictor
## 
## Pre-processing: centered (30), scaled (30) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1852, 1852, 1851, 1852, 1852, 1852, ... 
## Resampling results across tuning parameters:
## 
##   decay  size  RMSE       Rsquared   MAE      
##   0.00    1    0.8115568  0.3551603  0.6180988
##   0.00    2    0.7719167  0.4186169  0.5834965
##   0.00    3    0.7548273  0.4433772  0.5664919
##   0.00    4    0.7503062  0.4493545  0.5627205
##   0.00    5    0.7311943  0.4774811  0.5476832
##   0.00    6          NaN        NaN        NaN
##   0.00    7          NaN        NaN        NaN
##   0.00    8          NaN        NaN        NaN
##   0.00    9          NaN        NaN        NaN
##   0.00   10          NaN        NaN        NaN
##   0.01    1    0.8150205  0.3495562  0.6219529
##   0.01    2    0.7668575  0.4250208  0.5793516
##   0.01    3    0.7362777  0.4700447  0.5532819
##   0.01    4    0.7253351  0.4849325  0.5438711
##   0.01    5    0.7163841  0.4964243  0.5349458
##   0.01    6          NaN        NaN        NaN
##   0.01    7          NaN        NaN        NaN
##   0.01    8          NaN        NaN        NaN
##   0.01    9          NaN        NaN        NaN
##   0.01   10          NaN        NaN        NaN
##   0.10    1    0.8145981  0.3496552  0.6209891
##   0.10    2    0.7596740  0.4351082  0.5722176
##   0.10    3    0.7391604  0.4654301  0.5530272
##   0.10    4    0.7132932  0.5017798  0.5317318
##   0.10    5    0.7043526  0.5138828  0.5247246
##   0.10    6          NaN        NaN        NaN
##   0.10    7          NaN        NaN        NaN
##   0.10    8          NaN        NaN        NaN
##   0.10    9          NaN        NaN        NaN
##   0.10   10          NaN        NaN        NaN
## 
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 5, decay = 0.1 and bag = FALSE.
nnePred <- predict(nneModel,newdata = xtest.data)
varImp(nneModel)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                   Overall
## Mnf.Flow          100.000
## Usage.cont         74.119
## Bowl.Setpoint      54.286
## Filler.Level       47.015
## Pressure.Setpoint  45.677
## Carb.Flow          42.209
## Hyd.Pressure3      29.014
## Fill.Pressure      25.452
## Pressure.Vacuum    22.760
## Hyd.Pressure2      20.536
## Oxygen.Filler      12.106
## Carb.Rel           10.512
## Temperature         9.388
## Alch.Rel            8.642
## Hyd.Pressure4       7.704
## PSC                 4.831
## MFR                 3.781
## Balling.Lvl         3.649
## Fill.Ounces         3.609
## PC.Volume           3.041

Model 5

Multivariate Adaptive Regression Splines
marsG <- expand.grid(.degree = 1:2 ,.nprune = 2:38)
set.seed(100)
marsModel <- train(Xtrain.data,Ytrain.data,
                   method= 'earth',
                   tuneGrid = marsG,
                   trControl = trainControl(method = "cv"))
marsModel
## Multivariate Adaptive Regression Spline 
## 
## 2058 samples
##   30 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1851, 1853, 1851, 1852, 1852, 1854, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE       Rsquared   MAE      
##   1        2      0.8922496  0.2187445  0.6972312
##   1        3      0.8790537  0.2411293  0.6844030
##   1        4      0.8698468  0.2571698  0.6772735
##   1        5      0.8594722  0.2756679  0.6657695
##   1        6      0.8404443  0.3073969  0.6499135
##   1        7      0.8338151  0.3184674  0.6379175
##   1        8      0.8215241  0.3378356  0.6252452
##   1        9      0.8108990  0.3551977  0.6168258
##   1       10      0.8140049  0.3509126  0.6170770
##   1       11      0.8077717  0.3598131  0.6108898
##   1       12      0.8054597  0.3642021  0.6062871
##   1       13      0.7989398  0.3741236  0.6043957
##   1       14      0.7964451  0.3778179  0.6004804
##   1       15      0.7972674  0.3766167  0.6008554
##   1       16      0.7938074  0.3815677  0.5971617
##   1       17      0.7943813  0.3808367  0.5976664
##   1       18      0.7939353  0.3816055  0.5974441
##   1       19      0.7939353  0.3816055  0.5974441
##   1       20      0.7939353  0.3816055  0.5974441
##   1       21      0.7939353  0.3816055  0.5974441
##   1       22      0.7939353  0.3816055  0.5974441
##   1       23      0.7939353  0.3816055  0.5974441
##   1       24      0.7939353  0.3816055  0.5974441
##   1       25      0.7939353  0.3816055  0.5974441
##   1       26      0.7939353  0.3816055  0.5974441
##   1       27      0.7939353  0.3816055  0.5974441
##   1       28      0.7939353  0.3816055  0.5974441
##   1       29      0.7939353  0.3816055  0.5974441
##   1       30      0.7939353  0.3816055  0.5974441
##   1       31      0.7939353  0.3816055  0.5974441
##   1       32      0.7939353  0.3816055  0.5974441
##   1       33      0.7939353  0.3816055  0.5974441
##   1       34      0.7939353  0.3816055  0.5974441
##   1       35      0.7939353  0.3816055  0.5974441
##   1       36      0.7939353  0.3816055  0.5974441
##   1       37      0.7939353  0.3816055  0.5974441
##   1       38      0.7939353  0.3816055  0.5974441
##   2        2      0.8901936  0.2212799  0.6956816
##   2        3      0.8770857  0.2439119  0.6847101
##   2        4      0.8511997  0.2889424  0.6601161
##   2        5      0.8374225  0.3111721  0.6462103
##   2        6      0.8314741  0.3205159  0.6422580
##   2        7      0.8237433  0.3333495  0.6318220
##   2        8      0.8154329  0.3463189  0.6242093
##   2        9      0.8054603  0.3621659  0.6149982
##   2       10      0.7950876  0.3787718  0.6053170
##   2       11      0.7906597  0.3861747  0.5997923
##   2       12      0.7861846  0.3932484  0.5956349
##   2       13      0.7765600  0.4078538  0.5892821
##   2       14      0.7708630  0.4165754  0.5828036
##   2       15      0.7681840  0.4204826  0.5804505
##   2       16      0.7663106  0.4234137  0.5797583
##   2       17      0.7656002  0.4250337  0.5796446
##   2       18      0.7659245  0.4250396  0.5783900
##   2       19      0.7661954  0.4249690  0.5783068
##   2       20      0.7628732  0.4297127  0.5758283
##   2       21      0.7598679  0.4341202  0.5726390
##   2       22      0.7572845  0.4381652  0.5706511
##   2       23      0.7546733  0.4416777  0.5684839
##   2       24      0.7557143  0.4401483  0.5681690
##   2       25      0.7565188  0.4391748  0.5676458
##   2       26      0.7597674  0.4352092  0.5682783
##   2       27      0.7608140  0.4338837  0.5692230
##   2       28      0.7605221  0.4343965  0.5686232
##   2       29      0.7594522  0.4358020  0.5670487
##   2       30      0.7594917  0.4357774  0.5669080
##   2       31      0.7587422  0.4369360  0.5664895
##   2       32      0.7585685  0.4371624  0.5664432
##   2       33      0.7585908  0.4370868  0.5664008
##   2       34      0.7585908  0.4370868  0.5664008
##   2       35      0.7585908  0.4370868  0.5664008
##   2       36      0.7585908  0.4370868  0.5664008
##   2       37      0.7585908  0.4370868  0.5664008
##   2       38      0.7585908  0.4370868  0.5664008
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 23 and degree = 2.
marsPred <- predict(marsModel,newdata = xtest.data)
varImp(marsModel)
## earth variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                 Overall
## Mnf.Flow         100.00
## Temperature      100.00
## Carb.Pressure1    71.51
## Alch.Rel          65.06
## Bowl.Setpoint     61.76
## Air.Pressurer     56.99
## Hyd.Pressure3     55.96
## Balling           52.15
## Filler.Speed      48.85
## Pressure.Vacuum   45.55
## Usage.cont        39.94
## Oxygen.Filler     30.07
## PSC                0.00
## Carb.Pressure      0.00
## Hyd.Pressure2      0.00
## Balling.Lvl        0.00
## Hyd.Pressure4      0.00
## Carb.Rel           0.00
## Carb.Flow          0.00
## Fill.Ounces        0.00

Model 6

Support Vector Machines
svmModel <- train(Xtrain.data, Ytrain.data,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 14,
                  trControl = trainControl(method = "cv"))
svmModel
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 2058 samples
##   30 predictor
## 
## Pre-processing: centered (30), scaled (30) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1853, 1852, 1852, 1853, 1851, 1851, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE       Rsquared   MAE      
##      0.25  0.7737204  0.4227767  0.5728534
##      0.50  0.7570774  0.4445559  0.5556947
##      1.00  0.7418883  0.4645116  0.5398248
##      2.00  0.7306552  0.4792351  0.5300028
##      4.00  0.7314143  0.4792432  0.5313302
##      8.00  0.7331777  0.4802037  0.5346123
##     16.00  0.7444344  0.4724008  0.5444353
##     32.00  0.7699405  0.4529211  0.5615142
##     64.00  0.8095923  0.4240054  0.5909120
##    128.00  0.8570381  0.3942119  0.6274075
##    256.00  0.9145579  0.3664939  0.6754401
##    512.00  0.9604613  0.3454271  0.7073745
##   1024.00  0.9965897  0.3235362  0.7319442
##   2048.00  1.0002638  0.3220589  0.7347925
## 
## Tuning parameter 'sigma' was held constant at a value of 0.0223802
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.0223802 and C = 2.
svmPred <- predict(svmModel,newdata = xtest.data)
varImp(svmModel)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                   Overall
## Mnf.Flow          100.000
## Usage.cont         74.119
## Bowl.Setpoint      54.286
## Filler.Level       47.015
## Pressure.Setpoint  45.677
## Carb.Flow          42.209
## Hyd.Pressure3      29.014
## Fill.Pressure      25.452
## Pressure.Vacuum    22.760
## Hyd.Pressure2      20.536
## Oxygen.Filler      12.106
## Carb.Rel           10.512
## Temperature         9.388
## Alch.Rel            8.642
## Hyd.Pressure4       7.704
## PSC                 4.831
## MFR                 3.781
## Balling.Lvl         3.649
## Fill.Ounces         3.609
## PC.Volume           3.041

Model 7

K-Nearest Neighbors
knnModel <- train(Xtrain.data,Ytrain.data,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 2058 samples
##   30 predictor
## 
## Pre-processing: centered (30), scaled (30) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 2058, 2058, 2058, 2058, 2058, 2058, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE      
##    5  0.8358733  0.3544170  0.6111521
##    7  0.8159249  0.3693323  0.6010353
##    9  0.8055430  0.3788969  0.5967540
##   11  0.8007136  0.3836559  0.5960581
##   13  0.7991137  0.3851043  0.5971429
##   15  0.7972352  0.3876560  0.5975327
##   17  0.7973038  0.3877164  0.5986463
##   19  0.7995615  0.3843455  0.6007272
##   21  0.7997813  0.3845688  0.6015806
##   23  0.8014657  0.3823067  0.6035646
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred <- predict(knnModel,newdata = xtest.data)
varImp(knnModel)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                   Overall
## Mnf.Flow          100.000
## Usage.cont         74.119
## Bowl.Setpoint      54.286
## Filler.Level       47.015
## Pressure.Setpoint  45.677
## Carb.Flow          42.209
## Hyd.Pressure3      29.014
## Fill.Pressure      25.452
## Pressure.Vacuum    22.760
## Hyd.Pressure2      20.536
## Oxygen.Filler      12.106
## Carb.Rel           10.512
## Temperature         9.388
## Alch.Rel            8.642
## Hyd.Pressure4       7.704
## PSC                 4.831
## MFR                 3.781
## Balling.Lvl         3.649
## Fill.Ounces         3.609
## PC.Volume           3.041

Trees

Model 8

Random Forest
randomForestModel <- randomForest(Xtrain.data,Ytrain.data,
                       importance = T,
                       ntree=1000)
                       
rfImp2 <- varImp(randomForestModel,scale = F)
rfImp2
randomForestModelPred <- predict(randomForestModel,newdata = xtest.data)

Model 9

Bagged Trees
baggedTree <- bagging(Ytrain.data ~ ., data = as.data.frame(Xtrain.data))
baggedTreePred <- predict(baggedTree,newdata = xtest.data)
summary(baggedTreePred)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.48973 -0.46496 -0.04710  0.02797  0.56593  1.07404

Model 10

Boosted Trees
gbmG <- expand.grid(.interaction.depth = seq(1, 7, by = 2),
                        .n.trees = seq(100, 1000, by = 100),
                        .shrinkage = c(0.01, 0.1),
                        .n.minobsinnode = 8)
set.seed(100)
gbmModel <- train(Xtrain.data,Ytrain.data,
                  method = "gbm",
                  tuneGrid =gbmG,
                  verbose=F
                 )
varImp(gbmModel)
## gbm variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                 Overall
## Mnf.Flow         100.00
## Usage.cont        62.74
## Alch.Rel          43.55
## Temperature       40.59
## Oxygen.Filler     37.40
## Carb.Pressure1    33.46
## Carb.Flow         32.08
## Carb.Rel          29.58
## Filler.Speed      28.38
## Pressure.Vacuum   26.57
## Balling.Lvl       26.33
## MFR               23.86
## Air.Pressurer     23.73
## Density           23.46
## PC.Volume         19.49
## Balling           18.99
## PSC               17.18
## Hyd.Pressure2     16.49
## Carb.Volume       16.19
## Fill.Ounces       15.71
gbmPred <- predict(gbmModel,newdata = xtest.data)

Model 11

Cubist
cubistmodel <- train(Xtrain.data,Ytrain.data,method ="cubist")
varImp(cubistmodel)
## cubist variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                   Overall
## Mnf.Flow           100.00
## Balling             79.29
## Alch.Rel            70.00
## Density             65.71
## Balling.Lvl         61.43
## Pressure.Vacuum     60.71
## Temperature         55.00
## Air.Pressurer       54.29
## Carb.Rel            52.86
## Oxygen.Filler       47.86
## Bowl.Setpoint       47.86
## Carb.Flow           44.29
## Carb.Pressure1      43.57
## Hyd.Pressure2       42.14
## Hyd.Pressure3       40.71
## Filler.Speed        35.71
## Usage.cont          32.14
## Filler.Level        24.29
## Carb.Pressure       22.86
## Pressure.Setpoint   22.14
cubistPred <- predict(cubistmodel,newdata = xtest.data)

Model 12

Single Trees
student_rpart <- train(Xtrain.data, Ytrain.data, 
                          method = "rpart2", 
                          tuneLength = 10, 
                          trControl = trainControl(method = "cv"))
student_rpart
## CART 
## 
## 2058 samples
##   30 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1852, 1851, 1852, 1852, 1852, 1852, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  RMSE       Rsquared   MAE      
##    1        0.8941146  0.2153712  0.6975586
##    2        0.8709518  0.2546228  0.6845994
##    3        0.8640900  0.2672142  0.6734989
##    4        0.8584583  0.2784105  0.6685867
##    5        0.8475151  0.2975902  0.6568840
##    6        0.8402002  0.3098024  0.6503733
##    7        0.8336375  0.3204723  0.6433812
##    8        0.8218249  0.3391182  0.6296012
##    9        0.8096661  0.3591669  0.6180014
##   10        0.8008399  0.3747669  0.6084247
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 10.
plot(student_rpart)

varImp(student_rpart)
## rpart2 variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                   Overall
## Bowl.Setpoint     100.000
## Usage.cont         95.568
## Mnf.Flow           90.856
## Temperature        86.204
## Filler.Speed       78.659
## Air.Pressurer      72.210
## Balling.Lvl        70.676
## Carb.Rel           68.645
## Filler.Level       67.420
## Fill.Pressure      62.329
## Hyd.Pressure3      51.442
## Balling            48.538
## Carb.Flow          47.540
## Oxygen.Filler      44.718
## Pressure.Setpoint  39.593
## Pressure.Vacuum    34.251
## Alch.Rel           23.864
## Carb.Pressure1     20.664
## Carb.Volume         9.776
## PC.Volume           0.000
rpartPred <- predict(student_rpart,newdata = xtest.data)

Model 13

Model Trees
library(RWeka)
student_modelt <- M5P(Ytrain.data ~ ., data = as.data.frame(Xtrain.data))
summary(student_modelt)
## 
## === Summary ===
## 
## Correlation coefficient                  0.7596
## Mean absolute error                      0.4782
## Root mean squared error                  0.6572
## Relative absolute error                 59.2166 %
## Root relative squared error             65.1801 %
## Total Number of Instances             2058
m5Pred <- predict(student_modelt,newdata = as.data.frame(xtest.data))

Other Models

Model 14 GLMNET

glmnetModel <- train(
  Xtrain.data, Ytrain.data, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  center = T,
  tuneLength = 20,
  metrics = "Rsquared"
  )
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
plot(glmnetModel)

varImp(glmnetModel)
## glmnet variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                   Overall
## Mnf.Flow          100.000
## Balling            76.796
## Alch.Rel           58.885
## Balling.Lvl        55.631
## Density            52.456
## Hyd.Pressure3      51.716
## Bowl.Setpoint      43.008
## Carb.Pressure1     40.061
## Temperature        33.590
## Carb.Rel           23.302
## Usage.cont         22.099
## Pressure.Setpoint  19.587
## Oxygen.Filler      16.838
## Carb.Flow          15.955
## Carb.Temp          13.766
## MFR                12.594
## PC.Volume          11.464
## Filler.Level       10.679
## Hyd.Pressure2       9.232
## Fill.Ounces         8.810
glmnetPred <- predict(glmnetModel,newdata = xtest.data)

Model 15

Principal Components Regression
pcrModel <- train(
  Xtrain.data, Ytrain.data, method = "pcr",
  trControl = trainControl("cv", number = 10),
  center = T,
  tuneLength = 20,
  metrics = "Rsquared"
  )
plot(pcrModel)

pcrPred <- predict(pcrModel,newdata = xtest.data)

Overall Variable Importance

importance.df <- data.frame(varImp(lmmod)[1])

importance.df <- cbind(importance.df,
                       varImp(plsmod)[1],
                       varImp(ridgeRegFit)[1],
                       varImp(nneModel)[1],
                       varImp(marsModel)[1],
                       varImp(svmModel)[1],
                       varImp(knnModel)[1],
                       varImp(randomForestModel,scale = F)[1],
                       varImp(baggedTree)[1],
                       varImp(gbmModel)[1],
                       varImp(cubistmodel)[1],
                       varImp(student_rpart)[1],
                       varImp(glmnetModel)[1])



colnames(importance.df) <- c(   "lmmod",
                       "plsmod",
                       "ridgeRegFit",
                       "nneModel",
                       "marsModel",
                       "svmModel",
                       "knnModel",
                       "randomForestModel,scale = F",
                       "baggedTree",
                       "gbmModel",
                       "cubistmodel",
                       "student_rpart",
                       "glmnetModel")


importance.df <- data.frame(apply(importance.df, 2, function(x) x/max(x)))
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
imp.tidy <- importance.df
imp.tidy$names <- row.names(importance.df)

imp.tidy <- melt(imp.tidy,c("names"))

ggplot(imp.tidy,aes(x=names,y=value))+
  geom_bar(position = "dodge", stat = "summary", fun.y = "mean")+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  ggtitle("Overall Mean Variable Importance")+
  xlab("Variable") +
  xlab("Relative Importance")
## Warning: Ignoring unknown parameters: fun.y
## No summary function supplied, defaulting to `mean_se()`

imp.tree <- importance.df[,unique(imp.tidy$variable)[8:13]]
imp.tree$names <- row.names(importance.df)
imp.tree <- melt(imp.tree,c("names"))

ggplot(imp.tree,aes(x=names,y=value))+
  geom_bar(position = "dodge", stat = "summary", fun.y = "mean")+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  ggtitle("Tree Model Mean Variable Importance")+
  xlab("Variable") +
  xlab("Relative Importance")
## Warning: Ignoring unknown parameters: fun.y
## No summary function supplied, defaulting to `mean_se()`

We look at overall variable importance across all models as well as importance across just the tree-based models. We find that generally, the variable importance is the same with Mnf.Flow and Usage.cont being the key predictors in both cases.

Evaluation

## results
# with all the models
table <- data.frame(rbind(lm = postResample(pred = lmmodPred , obs = ytest.data),
                          PLS = postResample(pred = plsmodPred , obs = ytest.data),
                          Ridge = postResample(pred = ridgePred , obs = ytest.data),
                          NNE = postResample(pred = nnePred , obs = ytest.data),
                          MARS =postResample(pred = marsPred , obs = ytest.data),
                          SVM = postResample(pred = svmPred , obs = ytest.data),
                          KNN = postResample(pred = knnPred , obs = ytest.data),
                          Random.Forest = postResample(pred = randomForestModelPred , obs = ytest.data),
                          BaggedTrees = postResample(pred = baggedTreePred, obs = ytest.data),
                          GBM = postResample(pred = gbmPred , obs = ytest.data),
                          Cubist = postResample(pred = cubistPred , obs = ytest.data) ,
                          Rpart = postResample(pred = rpartPred , obs = ytest.data),
                          M5 = postResample(pred = m5Pred , obs = ytest.data), 
                          glmnet = postResample(pred = glmnetPred , obs = ytest.data),
                          PCR = postResample(pred = pcrPred , obs = ytest.data)))


round(table,4) %>% kable() %>% kable_styling(bootstrap_options = c("hover", "striped"))
RMSE Rsquared MAE
lm 0.7508 0.3924 0.5827
PLS 0.7507 0.3926 0.5832
Ridge 0.7494 0.3945 0.5831
NNE 0.6543 0.5392 0.5068
MARS 0.7066 0.4669 0.5447
SVM 0.6571 0.5395 0.4915
KNN 0.7012 0.4813 0.5455
Random.Forest 0.5567 0.6850 0.4278
BaggedTrees 0.6777 0.5123 0.5344
GBM 0.5736 0.6473 0.4360
Cubist 0.5371 0.6908 0.4055
Rpart 0.7196 0.4423 0.5602
M5 0.6796 0.5073 0.5143
glmnet 0.7495 0.3943 0.5827
PCR 0.7798 0.3444 0.6178
table$model.family <- as.factor(c(rep("linear", 3),
                  rep("non-linear", 4),
                  rep("Tree", 6),
                  rep("Other", 2)))


rmse.plot <- ggplot(table,aes(x=factor(row.names(table),levels = row.names(table)),
                 y=RMSE,
                 fill=model.family))+
  geom_bar(stat="identity") +
  coord_cartesian(ylim = c(0.5, 0.8))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  ggtitle("RMSE (Lower is Better)")+
  xlab("Model")

  

r2.plot <- ggplot(table,aes(x=factor(row.names(table),levels = row.names(table)),
                 y=Rsquared ,
                 fill=model.family))+
  geom_bar(stat="identity") +
    coord_cartesian(ylim = c(0.3, 0.75))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  ggtitle("R-Squared (Higher is Better)")+
  xlab("Model")
  
 

mae.plot <- ggplot(table,aes(x=factor(row.names(table),levels = row.names(table)),
                 y=MAE  ,
                 fill=model.family))+
  geom_bar(stat="identity") +
    coord_cartesian(ylim = c(0.35, 0.65))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  ggtitle("MAE (Lower is Better)")+
  xlab("Model")
 
rmse.plot

r2.plot

mae.plot

In evaluating the models, we can see that the tree family performs best and that of the tree-based models the Cubist model is preferred.

Predict PH Using Evaluation Set

# center and scale excluding brand and PH
student_eval <- scale(student_test[c(-1,-26)], center = TRUE, scale = TRUE)
# Use Cubist model
evalPH.pred <- predict(cubistmodel,newdata = student_eval)
head(evalPH.pred)
## [1]  0.726157367 -1.003607869  0.280460060 -0.003204162 -0.101075508
## [6]  0.900708497
tail(evalPH.pred)
## [1]  0.14852044  0.18342827  0.03890022  0.05699486  0.07052865 -0.20899773
# backtransform to use original scale
scaled.PH <- scale(student_train$PH, center= TRUE, scale=TRUE)
# scale
attr(scaled.PH, 'scaled:scale')
## [1] 0.1725162
# center
attr(scaled.PH, 'scaled:center')
## [1] 8.545649
evalPH.pred_orig_units <- evalPH.pred *
  attr(scaled.PH, 'scaled:scale') +
  attr(scaled.PH, 'scaled:center')
head(evalPH.pred_orig_units)
## [1] 8.670923 8.372510 8.594033 8.545096 8.528211 8.701035
tail(evalPH.pred_orig_units)
## [1] 8.571271 8.577293 8.552360 8.555481 8.557816 8.509593
write.csv(evalPH.pred_orig_units, "StudentEvaluation_Predictions.csv")

In the final step we apply the same transformations as were used for the training data and output predictions to an excel readable format using the testing data.