Libraries

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(memisc)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## 
## Attaching package: 'memisc'
## The following objects are masked from 'package:plotly':
## 
##     rename, style
## The following object is masked from 'package:ggplot2':
## 
##     syms
## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
## 
##     as.array
library(rmdformats)
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ✓ purrr   0.3.4
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x purrr::%@%()     masks memisc::%@%()
## x dplyr::collect() masks memisc::collect()
## x dplyr::filter()  masks plotly::filter(), stats::filter()
## x dplyr::lag()     masks stats::lag()
## x purrr::lift()    masks caret::lift()
## x dplyr::recode()  masks memisc::recode()
## x dplyr::rename()  masks memisc::rename(), plotly::rename()
## x dplyr::select()  masks MASS::select(), plotly::select()
## x dplyr::syms()    masks memisc::syms(), ggplot2::syms()
## x tibble::view()   masks memisc::view()
library(corrplot)
## corrplot 0.84 loaded
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(mlbench)
library(dplyr)
library(readxl)
library(VIM)
## Loading required package: colorspace
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
## The following object is masked from 'package:plotly':
## 
##     slice
library(Matrix)
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(writexl)
# Single seed reused by every set.seed() call below so runs are repeatable.
seed <- 12345

Exploratory Data Analysis and PreProcessing

Data Loading

# Read the two workbooks: the modelling data and the evaluation set.
train <- readxl::read_excel(path = "StudentData.xlsx")
test <- readxl::read_excel(path = "StudentEvaluation.xlsx")

Data Statistics

# Rows x columns of the training workbook.
dim(train)
## [1] 2571   33
# Rows x columns of the evaluation workbook.
dim(test)
## [1] 267  33

Observations

# Number of training rows with no missing value in any column.
sum(complete.cases(train))
## [1] 2038

Predictors

# Per-column summary of the training data (quartiles and NA counts).
summary(train)
##   Brand.Code         Carb.Volume     Fill.Ounces      PC.Volume      
##  Length:2571        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23917  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27712  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##                     NA's   :10      NA's   :38      NA's   :39       
##  Carb.Pressure     Carb.Temp          PSC             PSC.Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08457   Mean   :0.1954  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##  NA's   :27      NA's   :26      NA's   :33        NA's   :23      
##     PSC.CO2           Mnf.Flow       Carb.Pressure1  Fill.Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  65.20   Median :123.2   Median :46.40  
##  Mean   :0.05641   Mean   :  24.57   Mean   :122.6   Mean   :47.92  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  NA's   :39        NA's   :2         NA's   :32      NA's   :22     
##  Hyd.Pressure1   Hyd.Pressure2   Hyd.Pressure3   Hyd.Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 52.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.60   Median : 96.00  
##  Mean   :12.44   Mean   :20.96   Mean   :20.46   Mean   : 96.29  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.40   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##  NA's   :11      NA's   :15      NA's   :15      NA's   :30      
##   Filler.Level    Filler.Speed   Temperature      Usage.cont      Carb.Flow   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08   Min.   :  26  
##  1st Qu.: 98.3   1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.36   1st Qu.:1144  
##  Median :118.4   Median :3982   Median :65.60   Median :21.79   Median :3028  
##  Mean   :109.3   Mean   :3687   Mean   :65.97   Mean   :20.99   Mean   :2468  
##  3rd Qu.:120.0   3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.75   3rd Qu.:3186  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90   Max.   :5104  
##  NA's   :20      NA's   :57     NA's   :14      NA's   :5       NA's   :2     
##     Density           MFR           Balling       Pressure.Vacuum 
##  Min.   :0.240   Min.   : 31.4   Min.   :-0.170   Min.   :-6.600  
##  1st Qu.:0.900   1st Qu.:706.3   1st Qu.: 1.496   1st Qu.:-5.600  
##  Median :0.980   Median :724.0   Median : 1.648   Median :-5.400  
##  Mean   :1.174   Mean   :704.0   Mean   : 2.198   Mean   :-5.216  
##  3rd Qu.:1.620   3rd Qu.:731.0   3rd Qu.: 3.292   3rd Qu.:-5.000  
##  Max.   :1.920   Max.   :868.6   Max.   : 4.012   Max.   :-3.600  
##  NA's   :1       NA's   :212     NA's   :1                        
##        PH        Oxygen.Filler     Bowl.Setpoint   Pressure.Setpoint
##  Min.   :7.880   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0   1st Qu.:46.00    
##  Median :8.540   Median :0.03340   Median :120.0   Median :46.00    
##  Mean   :8.546   Mean   :0.04684   Mean   :109.3   Mean   :47.62    
##  3rd Qu.:8.680   3rd Qu.:0.06000   3rd Qu.:120.0   3rd Qu.:50.00    
##  Max.   :9.360   Max.   :0.40000   Max.   :140.0   Max.   :52.00    
##  NA's   :4       NA's   :12        NA's   :2       NA's   :12       
##  Air.Pressurer      Alch.Rel        Carb.Rel      Balling.Lvl  
##  Min.   :140.8   Min.   :5.280   Min.   :4.960   Min.   :0.00  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340   1st Qu.:1.38  
##  Median :142.6   Median :6.560   Median :5.400   Median :1.48  
##  Mean   :142.8   Mean   :6.897   Mean   :5.437   Mean   :2.05  
##  3rd Qu.:143.0   3rd Qu.:7.240   3rd Qu.:5.540   3rd Qu.:3.14  
##  Max.   :148.2   Max.   :8.620   Max.   :6.060   Max.   :3.66  
##                  NA's   :9       NA's   :10      NA's   :1
# Same summary for the evaluation data. Note that PH is entirely NA here and
# that the column names still contain spaces at this point.
summary(test)
##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:267         Min.   :5.147   Min.   :23.75   Min.   :0.09867  
##  Class :character   1st Qu.:5.287   1st Qu.:23.92   1st Qu.:0.23333  
##  Mode  :character   Median :5.340   Median :23.97   Median :0.27533  
##                     Mean   :5.369   Mean   :23.97   Mean   :0.27769  
##                     3rd Qu.:5.465   3rd Qu.:24.01   3rd Qu.:0.32200  
##                     Max.   :5.667   Max.   :24.20   Max.   :0.46400  
##                     NA's   :1       NA's   :6       NA's   :4        
##  Carb Pressure     Carb Temp          PSC             PSC Fill     
##  Min.   :60.20   Min.   :130.0   Min.   :0.00400   Min.   :0.0200  
##  1st Qu.:65.30   1st Qu.:138.4   1st Qu.:0.04450   1st Qu.:0.1000  
##  Median :68.00   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.25   Mean   :141.2   Mean   :0.08545   Mean   :0.1903  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :77.60   Max.   :154.0   Max.   :0.24600   Max.   :0.6200  
##                  NA's   :1       NA's   :5         NA's   :3       
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :113.0   Min.   :37.80  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:120.2   1st Qu.:46.00  
##  Median :0.04000   Median :   0.20   Median :123.4   Median :47.80  
##  Mean   :0.05107   Mean   :  21.03   Mean   :123.0   Mean   :48.14  
##  3rd Qu.:0.06000   3rd Qu.: 141.30   3rd Qu.:125.5   3rd Qu.:50.20  
##  Max.   :0.24000   Max.   : 220.40   Max.   :136.0   Max.   :60.20  
##  NA's   :5                           NA's   :4       NA's   :2      
##  Hyd Pressure1    Hyd Pressure2    Hyd Pressure3    Hyd Pressure4   
##  Min.   :-50.00   Min.   :-50.00   Min.   :-50.00   Min.   : 68.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.: 90.00  
##  Median : 10.40   Median : 26.80   Median : 27.70   Median : 98.00  
##  Mean   : 12.01   Mean   : 20.11   Mean   : 19.61   Mean   : 97.84  
##  3rd Qu.: 20.40   3rd Qu.: 34.80   3rd Qu.: 33.00   3rd Qu.:104.00  
##  Max.   : 50.00   Max.   : 61.40   Max.   : 49.20   Max.   :140.00  
##                   NA's   :1        NA's   :1        NA's   :4       
##   Filler Level    Filler Speed   Temperature      Usage cont      Carb Flow   
##  Min.   : 69.2   Min.   :1006   Min.   :63.80   Min.   :12.90   Min.   :   0  
##  1st Qu.:100.6   1st Qu.:3812   1st Qu.:65.40   1st Qu.:18.12   1st Qu.:1083  
##  Median :118.6   Median :3978   Median :65.80   Median :21.44   Median :3038  
##  Mean   :110.3   Mean   :3581   Mean   :66.23   Mean   :20.90   Mean   :2409  
##  3rd Qu.:120.2   3rd Qu.:3996   3rd Qu.:66.60   3rd Qu.:23.74   3rd Qu.:3215  
##  Max.   :153.2   Max.   :4020   Max.   :75.40   Max.   :24.60   Max.   :3858  
##  NA's   :2       NA's   :10     NA's   :2       NA's   :2                     
##     Density           MFR           Balling      Pressure Vacuum 
##  Min.   :0.060   Min.   : 15.6   Min.   :0.902   Min.   :-6.400  
##  1st Qu.:0.920   1st Qu.:707.0   1st Qu.:1.498   1st Qu.:-5.600  
##  Median :0.980   Median :724.6   Median :1.648   Median :-5.200  
##  Mean   :1.177   Mean   :697.8   Mean   :2.203   Mean   :-5.174  
##  3rd Qu.:1.600   3rd Qu.:731.5   3rd Qu.:3.242   3rd Qu.:-4.800  
##  Max.   :1.840   Max.   :784.8   Max.   :3.788   Max.   :-3.600  
##  NA's   :1       NA's   :31      NA's   :1       NA's   :1       
##     PH          Oxygen Filler     Bowl Setpoint   Pressure Setpoint
##  Mode:logical   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  NA's:267       1st Qu.:0.01960   1st Qu.:100.0   1st Qu.:46.00    
##                 Median :0.03370   Median :120.0   Median :46.00    
##                 Mean   :0.04666   Mean   :109.6   Mean   :47.73    
##                 3rd Qu.:0.05440   3rd Qu.:120.0   3rd Qu.:50.00    
##                 Max.   :0.39800   Max.   :130.0   Max.   :52.00    
##                 NA's   :3         NA's   :1       NA's   :2        
##  Air Pressurer      Alch Rel        Carb Rel     Balling Lvl   
##  Min.   :141.2   Min.   :6.400   Min.   :5.18   Min.   :0.000  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.34   1st Qu.:1.380  
##  Median :142.6   Median :6.580   Median :5.40   Median :1.480  
##  Mean   :142.8   Mean   :6.907   Mean   :5.44   Mean   :2.051  
##  3rd Qu.:142.8   3rd Qu.:7.180   3rd Qu.:5.56   3rd Qu.:3.080  
##  Max.   :147.2   Max.   :7.820   Max.   :5.74   Max.   :3.420  
##  NA's   :1       NA's   :3       NA's   :2

Missing values for Train and Test sets

# Response: discard rows where PH itself is missing.
train <- tidyr::drop_na(train, PH)

# Predictors: fill the remaining gaps with VIM's k-nearest-neighbour
# imputation; imp_var = FALSE is passed through unchanged.
train <- VIM::kNN(train, imp_var = FALSE)

Correlation Analysis

# Correlations with response variable

# Every column name except PH (column 26); reused by the box plots further down.
names <- colnames(train)[-26]

# One scatterplot matrix of PH against each group of columns.
for (cols in list(1:8, 9:17, 18:26, 27:32)) {
  pairs.panels(train[, c("PH", names[cols])])
}

The features most strongly correlated with PH are Mnf Flow (-0.45), Bowl Setpoint (0.35), Filler Level (0.32), Usage Cont (-0.32), Pressure Setpoint (-0.31), Hyd Pressure3 (-0.24), Pressure Vacuum (0.22), and Hyd Pressure2 (-0.20).

# Spearman rank correlations between all columns except the character
# Brand.Code column, drawn as a correlation plot.
train2 <- dplyr::select(train, -"Brand.Code")

mydata.cor <- cor(train2, method = "spearman")
corrplot(mydata.cor, diag = TRUE, tl.cex = 0.7, cl.cex = 0.7)

Correlated Predictors

# Pairwise correlations among the numeric predictors only: column 1
# (Brand.Code) and column 26 (PH) are left out.
corr <- cor(train[, -c(1, 26)], use = "complete.obs")

# findCorrelation() returns column positions *within `corr`*, not within
# `train`. Because two columns were removed before computing `corr`, indexing
# `train` with these positions reports the wrong (shifted) columns, so the
# names are looked up on `corr` itself.
topcorr <- findCorrelation(corr) # top predictors to be removed to improve modeling
colnames(corr)[topcorr]
# NOTE: the two output lines below were captured with the old
# colnames(train[, topcorr]) lookup and are therefore shifted; re-knit to
# refresh them.
## [1] "MFR"           "Hyd.Pressure2" "Carb.Flow"     "Alch.Rel"     
## [5] "Hyd.Pressure4"
# Reuse the matrix already computed instead of re-deriving it from `train`.
corrplot(corr[topcorr, topcorr, drop = FALSE])

Near Zero Variance Predictors

# Column positions of predictors flagged as having (near) zero variance.
nzv <- nearZeroVar(train)
# Look the names up on the name vector: train[, nzv] collapses to a bare
# vector when exactly one column is flagged, and colnames() of a vector is
# NULL, which hides the flagged column.
colnames(train)[nzv]
# NOTE: the NULL below was produced by the old colnames(train[, nzv]) call;
# re-knit to refresh it.
## NULL

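The variable “Hyd Pressure1” will be removed because it has near-zero variance; it is not strictly constant (the summary above shows values ranging from -0.8 to 58).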

Data Distribution & Variability

# Box plots of the predictors, drawn in six groups of columns
# (names[1], Brand.Code, is skipped).
for (cols in list(2:6, 7:11, 12:15, 16:20, 21:26, 27:32)) {
  boxplot(train[, names[cols]])
}

# Decision Trees
# Runs helper code fetched from GitHub; RF_with_Nulls(), called further down,
# is presumably defined there (it is not defined in this file). As the log
# below shows, the script also attaches several packages (rattle, party,
# data.table, randomForest, ...), and party masks partykit's ctree/cforest.
# NOTE(review): this executes unpinned remote code on every run -- consider
# vendoring the script or pinning the URL to a commit hash.
source("https://raw.githubusercontent.com/IsARam/CUNY_SPS/master/DATA624/RandomForestNulls_testing.R")
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## The following object is masked from 'package:xgboost':
## 
##     xgboost
## The following object is masked from 'package:VIM':
## 
##     wine
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## The following objects are masked from 'package:memisc':
## 
##     Lapply, relabel
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## Attaching package: 'party'
## The following objects are masked from 'package:partykit':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
##     node_terminal, varimp
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
## 
##     importance
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
 # Make the column names syntactically valid and unique; for the evaluation
 # set this turns names such as "Brand Code" into "Brand.Code".
 colnames(train)<- make.names(colnames(train), unique=TRUE)
 colnames(test)<- make.names(colnames(test), unique=TRUE)
# Convert both tables to plain data frames before calling the helper.
train <- as.data.frame(train)
test <- as.data.frame(test)

# NOTE(review): RF_with_Nulls() comes from the sourced script. Presumably it
# fills the missing predictor values in `test` using models trained on
# `train` -- confirm this and the meaning of the positional tuning arguments.
# `test` is overwritten with whatever the helper returns.
test <- RF_with_Nulls(train,test,"PH",.5,5,10,.01,5,1)

Data Split

# Hold out 30% of the rows of `train` for validation (p = .7 goes to training).
set.seed(seed)
train_index <- createDataPartition(train$PH, p = .7, list = FALSE, times = 1)
training <- train[train_index, ]
testing <- train[-train_index, ]

# Validation for test set
# Predictors of the hold-out rows. PH is dropped by exact name: the previous
# grep("PH", ...) was a regex match that would also drop any other column
# whose name contains "PH", and negative indexing with an empty match would
# have dropped every column.
Xtest <- testing[, colnames(testing) != "PH", drop = FALSE]

Modeling & Evaluation

Linear Regression

# Baseline: ordinary least squares of PH on every other column of `training`.
# NOTE(review): the fitted object is named `lm`, shadowing the function name.
# R still resolves lm(...) calls to the function, but a distinct name would
# be clearer.
set.seed(seed)
lm <- lm(PH~.,data = training)
summary(lm)
## 
## Call:
## lm(formula = PH ~ ., data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50464 -0.08073  0.01061  0.08836  0.43181 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.034e+01  1.140e+00   9.072  < 2e-16 ***
## Brand.CodeB        5.934e-02  2.695e-02   2.202 0.027820 *  
## Brand.CodeC       -7.816e-02  2.679e-02  -2.917 0.003575 ** 
## Brand.CodeD        7.196e-02  1.788e-02   4.024 5.96e-05 ***
## Carb.Volume       -3.674e-02  7.637e-02  -0.481 0.630530    
## Fill.Ounces       -8.759e-02  3.798e-02  -2.306 0.021199 *  
## PC.Volume         -1.178e-01  6.419e-02  -1.835 0.066608 .  
## Carb.Pressure     -1.184e-03  3.009e-03  -0.393 0.694079    
## Carb.Temp          2.293e-03  2.400e-03   0.955 0.339584    
## PSC               -7.539e-02  6.729e-02  -1.120 0.262698    
## PSC.Fill          -3.331e-02  2.787e-02  -1.196 0.232047    
## PSC.CO2           -8.567e-02  7.722e-02  -1.109 0.267433    
## Mnf.Flow          -6.774e-04  5.534e-05 -12.240  < 2e-16 ***
## Carb.Pressure1     7.120e-03  8.429e-04   8.447  < 2e-16 ***
## Fill.Pressure      1.564e-03  1.461e-03   1.071 0.284469    
## Hyd.Pressure1      2.585e-04  4.486e-04   0.576 0.564572    
## Hyd.Pressure2     -1.242e-03  6.404e-04  -1.939 0.052624 .  
## Hyd.Pressure3      3.096e-03  7.159e-04   4.324 1.62e-05 ***
## Hyd.Pressure4      3.208e-04  3.917e-04   0.819 0.412928    
## Filler.Level      -1.239e-03  6.854e-04  -1.807 0.070898 .  
## Filler.Speed       8.026e-06  7.748e-06   1.036 0.300413    
## Temperature       -1.594e-02  2.750e-03  -5.794 8.12e-09 ***
## Usage.cont        -8.749e-03  1.368e-03  -6.395 2.06e-10 ***
## Carb.Flow          9.696e-06  4.538e-06   2.137 0.032772 *  
## Density           -1.158e-01  3.412e-02  -3.393 0.000706 ***
## MFR                1.329e-05  4.886e-05   0.272 0.785657    
## Balling           -1.229e-01  2.989e-02  -4.110 4.14e-05 ***
## Pressure.Vacuum   -3.337e-02  9.456e-03  -3.529 0.000427 ***
## Oxygen.Filler     -3.459e-01  8.805e-02  -3.929 8.87e-05 ***
## Bowl.Setpoint      3.480e-03  7.301e-04   4.767 2.03e-06 ***
## Pressure.Setpoint -8.166e-03  2.366e-03  -3.451 0.000571 ***
## Air.Pressurer     -9.530e-04  2.779e-03  -0.343 0.731674    
## Alch.Rel           5.443e-02  2.429e-02   2.241 0.025157 *  
## Carb.Rel           5.520e-02  5.562e-02   0.992 0.321170    
## Balling.Lvl        1.459e-01  2.715e-02   5.375 8.68e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.133 on 1764 degrees of freedom
## Multiple R-squared:  0.4223, Adjusted R-squared:  0.4112 
## F-statistic: 37.93 on 34 and 1764 DF,  p-value: < 2.2e-16

Bagged Tree

set.seed(seed)
# Bagged conditional-inference trees through caret's generic "bag" method,
# using the fit/predict/aggregate functions bundled in ctreeBag.
bagControl <- bagControl(
  fit = ctreeBag$fit,
  predict = ctreeBag$pred,
  aggregate = ctreeBag$aggregate
)
# The former `center = TRUE` / `scale = TRUE` arguments are not train()
# pre-processing options (the fit reported "No pre-processing"), and
# `tuneLength = 25` had nothing to vary because the only tuning parameter,
# `vars`, is held constant. Both were no-ops and have been dropped.
bag_model <- train(
  PH ~ .,
  data = training,
  method = "bag",
  bagControl = bagControl,
  trControl = trainControl("cv", number = 5)
)
bag_model
## Bagged Model 
## 
## 1799 samples
##   32 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1438, 1440, 1440, 1439, 1439 
## Resampling results:
## 
##   RMSE       Rsquared   MAE       
##   0.1174488  0.5433736  0.08911483
## 
## Tuning parameter 'vars' was held constant at a value of 34
# Score the bagged model on the hold-out rows (RMSE, R-squared, MAE).
bag_pred <- predict(bag_model, newdata = Xtest)
postResample(pred = bag_pred, obs = testing$PH)
##       RMSE   Rsquared        MAE 
## 0.11337276 0.56067003 0.08018872
# Variable importance for the bagged model. The printed header reads
# "loess r-squared variable importance", and the table is identical to the
# one printed for the SVM fit, so these scores are not specific to this model.
varImp(bag_model)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 32)
## 
##                   Overall
## Oxygen.Filler      100.00
## Filler.Level        77.41
## Mnf.Flow            57.53
## Filler.Speed        57.14
## Balling             56.65
## Hyd.Pressure3       53.30
## Hyd.Pressure2       50.01
## Bowl.Setpoint       49.18
## Hyd.Pressure1       48.92
## Fill.Pressure       47.44
## Usage.cont          45.06
## Pressure.Setpoint   38.38
## Balling.Lvl         32.55
## Brand.Code          32.21
## Density             30.99
## Carb.Pressure1      30.58
## Carb.Rel            29.98
## Alch.Rel            26.45
## PSC                 24.22
## Pressure.Vacuum     22.63

XGBoost

# Converting data to matrix
# model.matrix() expands Brand.Code into indicator columns; "+ 0" omits the
# intercept column. PH is excluded from the features and passed as the label.
training2 <- training %>% drop_na(`Brand.Code`)
testing2 <- testing %>% drop_na(`Brand.Code`)
trainingmx <- model.matrix(~ . + 0, data = training2[, names(training2) != "PH"])
testingmx <- model.matrix(~ . + 0, data = testing2[, names(testing2) != "PH"])
trainingdmx <- xgb.DMatrix(data = trainingmx, label = training2$PH)
testingdmx <- xgb.DMatrix(data = testingmx, label = testing2$PH)
# Default parameters
# "reg:linear" is deprecated (see the warnings in the captured log);
# "reg:squarederror" is the replacement named in that warning.
params <- list(
  booster = "gbtree",
  objective = "reg:squarederror",
  eta = 0.3,
  gamma = 0,
  max_depth = 6,
  min_child_weight = 1,
  subsample = 1,
  colsample_bytree = 1
)
# nround parameter
# Seed first so the random fold assignment is reproducible. The argument is
# spelled `early_stopping_rounds`; the previous `early_stop_rounds` was not
# recognised, so all 300 rounds ran (as the captured log below shows).
set.seed(seed)
xgbcv <- xgb.cv(
  params = params,
  data = trainingdmx,
  nrounds = 300,
  nfold = 5,
  showsd = TRUE,
  stratified = TRUE,
  print_every_n = 10,
  early_stopping_rounds = 20,
  maximize = FALSE
)
## [03:34:08] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
## [03:34:08] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
## [03:34:08] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
## [03:34:09] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
## [03:34:09] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
## [1]  train-rmse:5.636829+0.001723    test-rmse:5.636822+0.010651 
## [11] train-rmse:0.192335+0.000457    test-rmse:0.206971+0.005313 
## [21] train-rmse:0.062187+0.002178    test-rmse:0.113529+0.006683 
## [31] train-rmse:0.045677+0.000585    test-rmse:0.111656+0.006910 
## [41] train-rmse:0.034947+0.000515    test-rmse:0.111015+0.006828 
## [51] train-rmse:0.026761+0.000283    test-rmse:0.110441+0.007108 
## [61] train-rmse:0.019877+0.001281    test-rmse:0.109963+0.007370 
## [71] train-rmse:0.015932+0.001660    test-rmse:0.109905+0.007427 
## [81] train-rmse:0.012154+0.001150    test-rmse:0.109764+0.007490 
## [91] train-rmse:0.009684+0.000790    test-rmse:0.109655+0.007524 
## [101]    train-rmse:0.007590+0.000682    test-rmse:0.109519+0.007569 
## [111]    train-rmse:0.006150+0.000628    test-rmse:0.109456+0.007552 
## [121]    train-rmse:0.004868+0.000552    test-rmse:0.109478+0.007555 
## [131]    train-rmse:0.003860+0.000509    test-rmse:0.109463+0.007553 
## [141]    train-rmse:0.003021+0.000383    test-rmse:0.109398+0.007527 
## [151]    train-rmse:0.002387+0.000292    test-rmse:0.109374+0.007541 
## [161]    train-rmse:0.001976+0.000228    test-rmse:0.109342+0.007531 
## [171]    train-rmse:0.001517+0.000140    test-rmse:0.109334+0.007507 
## [181]    train-rmse:0.001229+0.000151    test-rmse:0.109332+0.007506 
## [191]    train-rmse:0.001060+0.000098    test-rmse:0.109332+0.007514 
## [201]    train-rmse:0.001001+0.000064    test-rmse:0.109338+0.007524 
## [211]    train-rmse:0.000990+0.000076    test-rmse:0.109337+0.007522 
## [221]    train-rmse:0.000990+0.000076    test-rmse:0.109337+0.007522 
## [231]    train-rmse:0.000990+0.000076    test-rmse:0.109337+0.007522 
## [241]    train-rmse:0.000990+0.000076    test-rmse:0.109337+0.007522 
## [251]    train-rmse:0.000990+0.000076    test-rmse:0.109337+0.007522 
## [261]    train-rmse:0.000990+0.000076    test-rmse:0.109337+0.007522 
## [271]    train-rmse:0.000990+0.000076    test-rmse:0.109337+0.007522 
## [281]    train-rmse:0.000990+0.000076    test-rmse:0.109337+0.007522 
## [291]    train-rmse:0.000990+0.000076    test-rmse:0.109337+0.007522 
## [300]    train-rmse:0.000990+0.000076    test-rmse:0.109337+0.007522
set.seed(seed)
# Final booster. Two fixes relative to the earlier call:
#  * the argument is `early_stopping_rounds`; the misspelled
#    `early_stop_round` was forwarded to the XGBoost core as an unknown
#    parameter (see the "might not be used" warning in the captured log), so
#    training never stopped early;
#  * early stopping monitors the last watchlist entry by default, so the
#    validation set is listed last instead of the training set.
xgb_model1 <- xgb.train(
  params = params,
  data = trainingdmx,
  nrounds = 260,
  watchlist = list(train = trainingdmx, val = testingdmx),
  print_every_n = 10,
  early_stopping_rounds = 10,
  maximize = FALSE
)
## [03:44:21] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
## [03:44:21] WARNING: amalgamation/../src/learner.cc:516: 
## Parameters: { early_stop_round } might not be used.
## 
##   This may not be accurate due to some parameters are only used in language bindings but
##   passed down to XGBoost core.  Or some parameters are not used but slip through this
##   verification. Please open an issue if you find above cases.
## 
## 
## [1]  val-rmse:5.633858   train-rmse:5.636496 
## [11] val-rmse:0.201779   train-rmse:0.192384 
## [21] val-rmse:0.107885   train-rmse:0.063078 
## [31] val-rmse:0.106375   train-rmse:0.050422 
## [41] val-rmse:0.105411   train-rmse:0.040096 
## [51] val-rmse:0.105364   train-rmse:0.031308 
## [61] val-rmse:0.105399   train-rmse:0.025489 
## [71] val-rmse:0.106176   train-rmse:0.020108 
## [81] val-rmse:0.105993   train-rmse:0.016881 
## [91] val-rmse:0.105840   train-rmse:0.014073 
## [101]    val-rmse:0.105786   train-rmse:0.012095 
## [111]    val-rmse:0.105836   train-rmse:0.009654 
## [121]    val-rmse:0.105658   train-rmse:0.007734 
## [131]    val-rmse:0.105724   train-rmse:0.006184 
## [141]    val-rmse:0.105739   train-rmse:0.005214 
## [151]    val-rmse:0.105758   train-rmse:0.003922 
## [161]    val-rmse:0.105785   train-rmse:0.003175 
## [171]    val-rmse:0.105773   train-rmse:0.002706 
## [181]    val-rmse:0.105754   train-rmse:0.002244 
## [191]    val-rmse:0.105754   train-rmse:0.001716 
## [201]    val-rmse:0.105738   train-rmse:0.001361 
## [211]    val-rmse:0.105731   train-rmse:0.001107 
## [221]    val-rmse:0.105729   train-rmse:0.001093 
## [231]    val-rmse:0.105729   train-rmse:0.001093 
## [241]    val-rmse:0.105729   train-rmse:0.001093 
## [251]    val-rmse:0.105728   train-rmse:0.001093 
## [260]    val-rmse:0.105729   train-rmse:0.001093
# Feature-importance table of the fitted booster, labelled with the design
# matrix column names, followed by its plot.
mat <- xgb.importance(
  feature_names = colnames(trainingmx),
  model = xgb_model1
)
xgb.plot.importance(mat)

SVM

# 10-fold cross-validation used for the SVM fit.
ctrl <- trainControl(method = "cv", number = 10)
set.seed(seed)
# Radial-basis SVM on centred and scaled predictors, tuned over 14 cost values.
svmRad <- train(
  PH ~ .,
  data = training,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneLength = 14,
  trControl = ctrl
)
svmRad
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 1799 samples
##   32 predictor
## 
## Pre-processing: centered (34), scaled (34) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1619, 1618, 1619, 1618, 1619, 1620, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE       Rsquared   MAE       
##      0.25  0.1274342  0.4711883  0.09611690
##      0.50  0.1235253  0.4985630  0.09238116
##      1.00  0.1200843  0.5239995  0.08937250
##      2.00  0.1173177  0.5435048  0.08720979
##      4.00  0.1155158  0.5556946  0.08631336
##      8.00  0.1163903  0.5512329  0.08752803
##     16.00  0.1190036  0.5391637  0.08906792
##     32.00  0.1239753  0.5165933  0.09252462
##     64.00  0.1312589  0.4823998  0.09775870
##    128.00  0.1386002  0.4498103  0.10266553
##    256.00  0.1454519  0.4245452  0.10758489
##    512.00  0.1496935  0.4103727  0.11093162
##   1024.00  0.1513000  0.4038688  0.11217682
##   2048.00  0.1513000  0.4038688  0.11217682
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01959052
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01959052 and C = 4.
# Score the SVM on the hold-out rows (RMSE, R-squared, MAE).
svm_pred <- predict(svmRad, newdata = Xtest)
postResample(pred = svm_pred, obs = testing$PH)
##       RMSE   Rsquared        MAE 
## 0.11300721 0.56821917 0.08075468
# Variable importance for the SVM fit. The printed header reads
# "loess r-squared variable importance", and the table matches the one
# printed for the bagged model, so these scores are not specific to this model.
varImp(svmRad)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 32)
## 
##                   Overall
## Oxygen.Filler      100.00
## Filler.Level        77.41
## Mnf.Flow            57.53
## Filler.Speed        57.14
## Balling             56.65
## Hyd.Pressure3       53.30
## Hyd.Pressure2       50.01
## Bowl.Setpoint       49.18
## Hyd.Pressure1       48.92
## Fill.Pressure       47.44
## Usage.cont          45.06
## Pressure.Setpoint   38.38
## Balling.Lvl         32.55
## Brand.Code          32.21
## Density             30.99
## Carb.Pressure1      30.58
## Carb.Rel            29.98
## Alch.Rel            26.45
## PSC                 24.22
## Pressure.Vacuum     22.63

Cubist

set.seed(seed)
# Cubist model with caret's default resampling and tuning grid
# (no trControl or tuneLength is supplied).
cubist <- train(PH ~ ., data = training, method = "cubist")
cubist # Model performance
## Cubist 
## 
## 1799 samples
##   32 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1799, 1799, 1799, 1799, 1799, 1799, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE       Rsquared   MAE       
##    1          0          0.1545655  0.3699735  0.10539159
##    1          5          0.1537225  0.3914070  0.10427311
##    1          9          0.1531457  0.3894358  0.10404284
##   10          0          0.1105453  0.5930051  0.08003293
##   10          5          0.1090171  0.6085254  0.07839087
##   10          9          0.1090003  0.6070384  0.07851524
##   20          0          0.1063878  0.6218416  0.07671871
##   20          5          0.1048818  0.6341245  0.07508322
##   20          9          0.1048661  0.6333358  0.07524606
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 9.
# Cubist-specific variable importance (header: "cubist variable importance");
# Brand.Code appears as separate indicator columns such as Brand.CodeC.
varImp(cubist) # Display importance
## cubist variable importance
## 
##   only 20 most important variables shown (out of 34)
## 
##                 Overall
## Mnf.Flow         100.00
## Balling           78.57
## Alch.Rel          66.43
## Balling.Lvl       66.43
## Pressure.Vacuum   65.71
## Bowl.Setpoint     52.86
## Density           52.14
## Oxygen.Filler     48.57
## Air.Pressurer     47.86
## Carb.Pressure1    47.14
## Temperature       46.43
## Filler.Speed      45.00
## Hyd.Pressure3     42.14
## Usage.cont        42.14
## Carb.Rel          41.43
## Brand.CodeC       39.29
## Carb.Flow         36.43
## Hyd.Pressure2     31.43
## Filler.Level      25.71
## Hyd.Pressure1     23.57
# Generate hold-out predictions from the Cubist model, then evaluate them
# (RMSE, R-squared, MAE).
cubist_pred <- predict(cubist, newdata = Xtest)
postResample(pred = cubist_pred, obs = testing$PH)
##       RMSE   Rsquared        MAE 
## 0.10112638 0.65059882 0.06951564

Random Forest

# 10-fold cross-validation; caret may run resamples in parallel if a backend
# is registered.
ctrl <- trainControl(method = "cv", number = 10, allowParallel = TRUE)
set.seed(seed)
# ranger random forest with permutation importance, tuned over 10 mtry values.
rforest <- train(
  PH ~ .,
  data = training,
  method = "ranger",
  trControl = ctrl,
  tuneLength = 10,
  importance = "permutation"
)
rforest # Model performance
## Random Forest 
## 
## 1799 samples
##   32 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1619, 1618, 1619, 1618, 1619, 1620, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE        Rsquared   MAE       
##    2    variance    0.11317638  0.6225504  0.08711225
##    2    extratrees  0.11802169  0.5809723  0.09193537
##    5    variance    0.10492612  0.6644925  0.07924229
##    5    extratrees  0.10749416  0.6402277  0.08184601
##    9    variance    0.10203469  0.6739557  0.07617255
##    9    extratrees  0.10289887  0.6646787  0.07742827
##   12    variance    0.10074632  0.6801254  0.07497740
##   12    extratrees  0.10094060  0.6755731  0.07549004
##   16    variance    0.10008930  0.6815686  0.07417458
##   16    extratrees  0.09986389  0.6802924  0.07430426
##   19    variance    0.09959033  0.6824283  0.07356550
##   19    extratrees  0.09901741  0.6847864  0.07350240
##   23    variance    0.09927836  0.6831245  0.07312255
##   23    extratrees  0.09806920  0.6899312  0.07283373
##   26    variance    0.09925153  0.6813060  0.07306824
##   26    extratrees  0.09790106  0.6903094  0.07253947
##   30    variance    0.09946203  0.6785126  0.07277334
##   30    extratrees  0.09784219  0.6894772  0.07244412
##   34    variance    0.10014712  0.6717706  0.07290210
##   34    extratrees  0.09766238  0.6899119  0.07228101
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 34, splitrule = extratrees
##  and min.node.size = 5.
# Print the variable-importance ranking of the tuned random forest; the model
# was trained with importance = "permutation", so these are permutation scores
# as reported by caret (top predictor shown as 100).
varImp(rforest) # Variable importance
## ranger variable importance
## 
##   only 20 most important variables shown (out of 34)
## 
##                   Overall
## Mnf.Flow          100.000
## Brand.CodeC        36.191
## Bowl.Setpoint      31.641
## Alch.Rel           22.727
## Usage.cont         18.992
## Brand.CodeD        18.051
## Pressure.Vacuum    17.115
## Oxygen.Filler      16.518
## Filler.Level       10.821
## Air.Pressurer      10.520
## Density             9.102
## Pressure.Setpoint   9.042
## Brand.CodeB         8.861
## Carb.Rel            8.329
## Balling.Lvl         8.296
## Carb.Flow           7.818
## Balling             7.141
## Hyd.Pressure3       5.906
## Carb.Pressure1      5.823
## Temperature         5.337
# Score the test predictors (Xtest) with the tuned random forest.
rf_pred <- predict(rforest, newdata = Xtest)

# Compare predictions with the observed PH values; prints RMSE, Rsquared, MAE.
postResample(pred = rf_pred, obs = testing$PH)
##       RMSE   Rsquared        MAE 
## 0.09874278 0.67343561 0.06843792

Training a Random Forest per Brand Code

# Fit and evaluate one random forest per brand code. Each iteration trains on
# that brand's rows of `training` and is scored on the same brand's rows of
# `testing`; Brand.Code is dropped because it is constant within a subset.
# dplyr verbs are namespaced explicitly since MASS, plotly and stats export
# functions with the same names.
for (brand_code in unique(training$Brand.Code)) {
  print(paste0("Brand Code ", brand_code))

  # Training rows for this brand, without the brand column.
  temp_df <- dplyr::select(
    dplyr::filter(training, Brand.Code == brand_code),
    -Brand.Code
  )

  # Re-seed before every fit so each brand's resampling is reproducible.
  set.seed(seed)
  temp_rf <- train(
    PH ~ .,
    data = temp_df,
    method = "ranger",
    importance = "permutation",
    trControl = ctrl
  )
  print(temp_rf)
  print(varImp(temp_rf))

  # Test rows for this brand, prepared the same way as the training rows.
  temp_test <- dplyr::select(
    dplyr::filter(testing, Brand.Code == brand_code),
    -Brand.Code
  )

  # Test-set performance for this brand.
  temp_predictions <- predict(temp_rf, temp_test)
  print(postResample(pred = temp_predictions, obs = temp_test$PH))
}
## [1] "Brand Code B"
## Random Forest 
## 
## 922 samples
##  31 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 829, 831, 830, 831, 829, 830, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE        Rsquared   MAE       
##    2    variance    0.10506711  0.6537845  0.07982225
##    2    extratrees  0.10966083  0.6242749  0.08416500
##   16    variance    0.09249258  0.7166701  0.06786011
##   16    extratrees  0.09220161  0.7206038  0.06722659
##   31    variance    0.09239940  0.7124626  0.06753006
##   31    extratrees  0.08973770  0.7323507  0.06487345
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
##  and min.node.size = 5.
## ranger variable importance
## 
##   only 20 most important variables shown (out of 31)
## 
##                   Overall
## Mnf.Flow          100.000
## Bowl.Setpoint      45.438
## Filler.Level       18.724
## Air.Pressurer      15.129
## Oxygen.Filler      14.614
## Pressure.Vacuum    12.158
## Usage.cont         11.258
## Carb.Flow           9.818
## Carb.Rel            8.678
## Density             7.475
## Pressure.Setpoint   7.448
## Balling             6.160
## Temperature         5.362
## Balling.Lvl         4.557
## Hyd.Pressure3       3.887
## Alch.Rel            3.307
## Hyd.Pressure2       3.078
## Hyd.Pressure1       2.926
## Carb.Pressure1      2.794
## Fill.Pressure       2.102
##       RMSE   Rsquared        MAE 
## 0.09235979 0.71754410 0.06459907 
## [1] "Brand Code A"
## Random Forest 
## 
## 212 samples
##  31 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 190, 190, 192, 191, 192, 191, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE       Rsquared   MAE       
##    2    variance    0.1224133  0.4827949  0.09648991
##    2    extratrees  0.1268267  0.4437693  0.10134188
##   16    variance    0.1159913  0.5048723  0.08953930
##   16    extratrees  0.1153989  0.5173700  0.09111241
##   31    variance    0.1169424  0.4931588  0.09012204
##   31    extratrees  0.1139705  0.5248241  0.08972653
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
##  and min.node.size = 5.
## ranger variable importance
## 
##   only 20 most important variables shown (out of 31)
## 
##                   Overall
## Mnf.Flow          100.000
## Bowl.Setpoint      48.661
## Usage.cont         44.611
## Filler.Level       41.410
## Oxygen.Filler      32.046
## Pressure.Vacuum    25.983
## Carb.Flow          15.921
## Pressure.Setpoint  14.512
## Balling.Lvl        11.299
## Carb.Pressure1     10.272
## Balling             9.807
## Hyd.Pressure2       9.430
## Density             7.011
## Air.Pressurer       6.690
## Filler.Speed        6.638
## Hyd.Pressure3       6.596
## Hyd.Pressure1       5.358
## Alch.Rel            5.189
## Hyd.Pressure4       5.109
## Fill.Pressure       4.948
##       RMSE   Rsquared        MAE 
## 0.10320186 0.66962892 0.08155831 
## [1] "Brand Code C"
## Random Forest 
## 
## 231 samples
##  31 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 208, 208, 207, 208, 207, 207, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE       Rsquared   MAE      
##    2    variance    0.1444395  0.3763802  0.1102709
##    2    extratrees  0.1485511  0.3597609  0.1153524
##   16    variance    0.1431981  0.3387047  0.1052344
##   16    extratrees  0.1399040  0.4025821  0.1048006
##   31    variance    0.1440374  0.3294470  0.1039671
##   31    extratrees  0.1399701  0.3927119  0.1034873
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 16, splitrule = extratrees
##  and min.node.size = 5.
## ranger variable importance
## 
##   only 20 most important variables shown (out of 31)
## 
##                   Overall
## Oxygen.Filler     100.000
## Mnf.Flow           64.525
## Hyd.Pressure1      23.714
## Density            21.490
## Alch.Rel           19.166
## Pressure.Vacuum    17.903
## Carb.Rel           15.548
## Bowl.Setpoint      15.415
## Balling            14.915
## Pressure.Setpoint  13.606
## Balling.Lvl        12.747
## Hyd.Pressure3      12.345
## Filler.Level       11.679
## Usage.cont         11.159
## Filler.Speed       10.868
## PC.Volume          10.321
## PSC.Fill           10.197
## Hyd.Pressure2       9.477
## Carb.Flow           9.143
## MFR                 7.952
##      RMSE  Rsquared       MAE 
## 0.1517615 0.3425243 0.1021180 
## [1] "Brand Code D"
## Random Forest 
## 
## 434 samples
##  31 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 389, 391, 390, 392, 391, 391, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE        Rsquared   MAE       
##    2    variance    0.09923733  0.5935944  0.07922708
##    2    extratrees  0.10427177  0.5557178  0.08388437
##   16    variance    0.08664050  0.6467127  0.06814333
##   16    extratrees  0.08667069  0.6611741  0.06831416
##   31    variance    0.08720461  0.6268374  0.06791382
##   31    extratrees  0.08407906  0.6680614  0.06600962
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
##  and min.node.size = 5.
## ranger variable importance
## 
##   only 20 most important variables shown (out of 31)
## 
##                   Overall
## Mnf.Flow          100.000
## Usage.cont         35.872
## Pressure.Vacuum    27.797
## Hyd.Pressure3      17.767
## Carb.Pressure1     15.437
## Carb.Flow          12.231
## Bowl.Setpoint      10.698
## Density             8.908
## Filler.Speed        7.997
## Hyd.Pressure2       6.567
## Temperature         6.487
## Oxygen.Filler       6.438
## Alch.Rel            5.992
## Filler.Level        5.403
## Hyd.Pressure1       5.117
## Balling.Lvl         4.946
## Balling             4.673
## Pressure.Setpoint   4.372
## Carb.Rel            3.926
## Air.Pressurer       3.866
##       RMSE   Rsquared        MAE 
## 0.07596047 0.64840118 0.05534720

Predicting New Data

# Load the evaluation workbook whose PH values are to be predicted.
pfile <- read_excel("StudentEvaluation.xlsx")

# Prepare the predictors.
# Drop the PH column by exact name. The previous `-grep("PH", ...)` matched any
# column merely containing "PH" and, when nothing matched, `-integer(0)`
# selected zero columns, silently discarding every predictor.
test <- pfile[, colnames(pfile) != "PH", drop = FALSE]

# Impute missing predictor values with kNN(); imp_var = FALSE is passed so no
# extra indicator columns are added (presumably VIM::kNN -- confirm).
test <- kNN(test, imp_var = FALSE)

# Convert column names to syntactically valid, unique R names (e.g. spaces
# become dots) so they can be referenced the same way as in the training data.
colnames(test) <- make.names(colnames(test), unique = TRUE)

# Resampling scheme for the final model: 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)

# Seed the RNG right before training so the resampling is reproducible.
set.seed(seed)

# Tune the final ranger random forest for PH with permutation importance.
# NOTE(review): `train` passed as `data` must be a data frame defined earlier
# in the file; it shares its name with caret::train() -- confirm it exists.
rf_model <- train(
  PH ~ .,
  data = train,
  method = "ranger",
  importance = "permutation",
  tuneLength = 10,
  trControl = ctrl
)
## Growing trees.. Progress: 100%. Estimated remaining time: 0 seconds.
## Growing trees.. Progress: 100%. Estimated remaining time: 0 seconds.
## Growing trees.. Progress: 97%. Estimated remaining time: 0 seconds.
## Growing trees.. Progress: 86%. Estimated remaining time: 5 seconds.
# Predict PH for the prepared evaluation rows with the final model.
final_rf_pred <- predict(rf_model, newdata = as.data.frame(test))

# Store the predictions in the PH column of the evaluation data.
pfile$PH <- final_rf_pred

# Export the evaluation data, now including predicted PH, to Excel.
write_xlsx(pfile, "Predictions_file.xlsx")