1 Data Ingestion

student_evaluation_gh_file <- "https://github.com/hovig/Team5-Data624-Project2/raw/master/StudentEvaluation.xlsx"
student_data_gh_file <- "https://github.com/hovig/Team5-Data624-Project2/raw/master/StudentData.xlsx"
se_temp_file <- tempfile(fileext = ".xlsx")
sd_temp_file <- tempfile(fileext = ".xlsx")
se_data <- GET(student_evaluation_gh_file, authenticate(Sys.getenv("GITHUB_PAT"), ""), write_disk(path = se_temp_file))
df_data <- GET(student_data_gh_file, authenticate(Sys.getenv("GITHUB_PAT"), ""), write_disk(path = sd_temp_file))
se_data <- readxl::read_excel(se_temp_file)
sd_data <- readxl::read_excel(sd_temp_file)
hist.data.frame(se_data)

hist.data.frame(sd_data)

2 Data Preparation and EDA

bev_raw <- read_csv('https://raw.githubusercontent.com/hovig/Team5-Data624-Project2/master/StudentData.csv') 
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Brand Code` = col_character()
## )
## See spec(...) for full column specifications.
bev_raw %>%
  ggplot(aes(PH, fill=PH > 9)) + 
  geom_histogram(bins=30) +
  theme_bw() +
  theme(legend.position='center') +
  labs(y='Count', title='PH Levels in Dataset')
## Warning: Removed 4 rows containing non-finite values (stat_bin).

bev_raw <- bev_raw %>% filter(!is.na(bev_raw$PH), bev_raw$PH < 9) 
dim(bev_raw)
## [1] 2566   33
str(bev_raw)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 2566 obs. of  33 variables:
##  $ Brand Code       : chr  "B" "A" "B" "A" ...
##  $ Carb Volume      : num  5.34 5.43 5.29 5.44 5.49 ...
##  $ Fill Ounces      : num  24 24 24.1 24 24.3 ...
##  $ PC Volume        : num  0.263 0.239 0.263 0.293 0.111 ...
##  $ Carb Pressure    : num  68.2 68.4 70.8 63 67.2 66.6 64.2 67.6 64.2 72 ...
##  $ Carb Temp        : num  141 140 145 133 137 ...
##  $ PSC              : num  0.104 0.124 0.09 NA 0.026 0.09 0.128 0.154 0.132 0.014 ...
##  $ PSC Fill         : num  0.26 0.22 0.34 0.42 0.16 0.24 0.4 0.34 0.12 0.24 ...
##  $ PSC CO2          : num  0.04 0.04 0.16 0.04 0.12 0.04 0.04 0.04 0.14 0.06 ...
##  $ Mnf Flow         : num  -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 ...
##  $ Carb Pressure1   : num  119 122 120 115 118 ...
##  $ Fill Pressure    : num  46 46 46 46.4 45.8 45.6 51.8 46.8 46 45.2 ...
##  $ Hyd Pressure1    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure2    : num  NA NA NA 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure3    : num  NA NA NA 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure4    : num  118 106 82 92 92 116 124 132 90 108 ...
##  $ Filler Level     : num  121 119 120 118 119 ...
##  $ Filler Speed     : num  4002 3986 4020 4012 4010 ...
##  $ Temperature      : num  66 67.6 67 65.6 65.6 66.2 65.8 65.2 65.4 66.6 ...
##  $ Usage cont       : num  16.2 19.9 17.8 17.4 17.7 ...
##  $ Carb Flow        : num  2932 3144 2914 3062 3054 ...
##  $ Density          : num  0.88 0.92 1.58 1.54 1.54 1.52 0.84 0.84 0.9 0.9 ...
##  $ MFR              : num  725 727 735 731 723 ...
##  $ Balling          : num  1.4 1.5 3.14 3.04 3.04 ...
##  $ Pressure Vacuum  : num  -4 -4 -3.8 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 ...
##  $ PH               : num  8.36 8.26 8.94 8.24 8.26 8.32 8.4 8.38 8.38 8.5 ...
##  $ Oxygen Filler    : num  0.022 0.026 0.024 0.03 0.03 0.024 0.066 0.046 0.064 0.022 ...
##  $ Bowl Setpoint    : num  120 120 120 120 120 120 120 120 120 120 ...
##  $ Pressure Setpoint: num  46.4 46.8 46.6 46 46 46 46 46 46 46 ...
##  $ Air Pressurer    : num  143 143 142 146 146 ...
##  $ Alch Rel         : num  6.58 6.56 7.66 7.14 7.14 7.16 6.54 6.52 6.52 6.54 ...
##  $ Carb Rel         : num  5.32 5.3 5.84 5.42 5.44 5.44 5.38 5.34 5.34 5.34 ...
##  $ Balling Lvl      : num  1.48 1.56 3.28 3.04 3.04 3.02 1.44 1.44 1.44 1.38 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Brand Code` = col_character(),
##   ..   `Carb Volume` = col_double(),
##   ..   `Fill Ounces` = col_double(),
##   ..   `PC Volume` = col_double(),
##   ..   `Carb Pressure` = col_double(),
##   ..   `Carb Temp` = col_double(),
##   ..   PSC = col_double(),
##   ..   `PSC Fill` = col_double(),
##   ..   `PSC CO2` = col_double(),
##   ..   `Mnf Flow` = col_double(),
##   ..   `Carb Pressure1` = col_double(),
##   ..   `Fill Pressure` = col_double(),
##   ..   `Hyd Pressure1` = col_double(),
##   ..   `Hyd Pressure2` = col_double(),
##   ..   `Hyd Pressure3` = col_double(),
##   ..   `Hyd Pressure4` = col_double(),
##   ..   `Filler Level` = col_double(),
##   ..   `Filler Speed` = col_double(),
##   ..   Temperature = col_double(),
##   ..   `Usage cont` = col_double(),
##   ..   `Carb Flow` = col_double(),
##   ..   Density = col_double(),
##   ..   MFR = col_double(),
##   ..   Balling = col_double(),
##   ..   `Pressure Vacuum` = col_double(),
##   ..   PH = col_double(),
##   ..   `Oxygen Filler` = col_double(),
##   ..   `Bowl Setpoint` = col_double(),
##   ..   `Pressure Setpoint` = col_double(),
##   ..   `Air Pressurer` = col_double(),
##   ..   `Alch Rel` = col_double(),
##   ..   `Carb Rel` = col_double(),
##   ..   `Balling Lvl` = col_double()
##   .. )
hist.data.frame(bev_raw)

table(bev_raw$`Brand Code`)
## 
##    A    B    C    D 
##  293 1235  303  615
summary(bev_raw)
##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:2566        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23933  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27719  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##                     NA's   :10      NA's   :38      NA's   :39       
##  Carb Pressure     Carb Temp          PSC             PSC Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08468   Mean   :0.1953  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##  NA's   :27      NA's   :26      NA's   :33        NA's   :23      
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  70.70   Median :123.2   Median :46.40  
##  Mean   :0.05645   Mean   :  24.68   Mean   :122.6   Mean   :47.92  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  NA's   :39                          NA's   :32      NA's   :17     
##  Hyd Pressure1   Hyd Pressure2  Hyd Pressure3   Hyd Pressure4   
##  Min.   :-0.80   Min.   : 0.0   Min.   :-1.20   Min.   : 62.00  
##  1st Qu.: 0.00   1st Qu.: 0.0   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.6   Median :27.60   Median : 96.00  
##  Mean   :12.46   Mean   :21.0   Mean   :20.49   Mean   : 96.31  
##  3rd Qu.:20.20   3rd Qu.:34.6   3rd Qu.:33.40   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.4   Max.   :50.00   Max.   :142.00  
##  NA's   :11      NA's   :15     NA's   :15      NA's   :27      
##   Filler Level    Filler Speed   Temperature      Usage cont   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08  
##  1st Qu.: 98.3   1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.38  
##  Median :118.4   Median :3982   Median :65.60   Median :21.80  
##  Mean   :109.3   Mean   :3688   Mean   :65.96   Mean   :21.00  
##  3rd Qu.:120.0   3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.76  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90  
##  NA's   :15      NA's   :53     NA's   :11      NA's   :5      
##    Carb Flow       Density           MFR           Balling     
##  Min.   :  26   Min.   :0.300   Min.   : 31.4   Min.   :0.346  
##  1st Qu.:1170   1st Qu.:0.900   1st Qu.:706.3   1st Qu.:1.496  
##  Median :3030   Median :0.980   Median :724.0   Median :1.648  
##  Mean   :2473   Mean   :1.175   Mean   :704.0   Mean   :2.201  
##  3rd Qu.:3188   3rd Qu.:1.620   3rd Qu.:731.0   3rd Qu.:3.292  
##  Max.   :5104   Max.   :1.920   Max.   :868.6   Max.   :4.012  
##  NA's   :2                      NA's   :207                    
##  Pressure Vacuum        PH        Oxygen Filler     Bowl Setpoint  
##  Min.   :-6.600   Min.   :7.880   Min.   :0.00240   Min.   : 70.0  
##  1st Qu.:-5.600   1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0  
##  Median :-5.400   Median :8.540   Median :0.03340   Median :120.0  
##  Mean   :-5.216   Mean   :8.545   Mean   :0.04635   Mean   :109.3  
##  3rd Qu.:-5.000   3rd Qu.:8.680   3rd Qu.:0.05960   3rd Qu.:120.0  
##  Max.   :-3.600   Max.   :8.940   Max.   :0.40000   Max.   :140.0  
##                                   NA's   :11        NA's   :2      
##  Pressure Setpoint Air Pressurer      Alch Rel        Carb Rel    
##  Min.   :44.00     Min.   :140.8   Min.   :5.280   Min.   :4.960  
##  1st Qu.:46.00     1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340  
##  Median :46.00     Median :142.6   Median :6.560   Median :5.400  
##  Mean   :47.61     Mean   :142.8   Mean   :6.898   Mean   :5.437  
##  3rd Qu.:50.00     3rd Qu.:143.0   3rd Qu.:7.240   3rd Qu.:5.550  
##  Max.   :52.00     Max.   :148.2   Max.   :8.620   Max.   :6.060  
##  NA's   :12                        NA's   :6       NA's   :7      
##   Balling Lvl   
##  Min.   :0.000  
##  1st Qu.:1.380  
##  Median :1.480  
##  Mean   :2.052  
##  3rd Qu.:3.140  
##  Max.   :3.660  
##  NA's   :1
describe(bev_raw %>% select(-`Brand Code`))
##                   vars    n    mean      sd  median trimmed    mad     min
## Carb Volume          1 2556    5.37    0.11    5.35    5.37   0.11    5.04
## Fill Ounces          2 2528   23.97    0.09   23.97   23.98   0.08   23.63
## PC Volume            3 2527    0.28    0.06    0.27    0.27   0.05    0.08
## Carb Pressure        4 2539   68.19    3.54   68.20   68.12   3.56   57.00
## Carb Temp            5 2540  141.09    4.03  140.80  140.99   3.85  128.60
## PSC                  6 2533    0.08    0.05    0.08    0.08   0.05    0.00
## PSC Fill             7 2543    0.20    0.12    0.18    0.18   0.12    0.00
## PSC CO2              8 2527    0.06    0.04    0.04    0.05   0.03    0.00
## Mnf Flow             9 2566   24.68  119.50   70.70   21.20 160.57 -100.20
## Carb Pressure1      10 2534  122.56    4.72  123.20  122.52   4.45  105.60
## Fill Pressure       11 2549   47.92    3.18   46.40   47.71   2.37   34.60
## Hyd Pressure1       12 2555   12.46   12.43   11.40   10.87  16.90   -0.80
## Hyd Pressure2       13 2551   21.00   16.38   28.60   21.10  13.34    0.00
## Hyd Pressure3       14 2551   20.49   15.97   27.60   20.54  13.64   -1.20
## Hyd Pressure4       15 2539   96.31   13.10   96.00   95.46  11.86   62.00
## Filler Level        16 2551  109.25   15.70  118.40  111.04   9.19   55.80
## Filler Speed        17 2513 3688.11  769.63 3982.00 3920.25  47.44  998.00
## Temperature         18 2555   65.96    1.38   65.60   65.80   0.89   63.60
## Usage cont          19 2561   21.00    2.98   21.80   21.25   3.17   12.08
## Carb Flow           20 2564 2473.00 1069.56 3030.00 2604.91 323.21   26.00
## Density             21 2566    1.17    0.38    0.98    1.15   0.15    0.30
## MFR                 22 2359  704.05   73.90  724.00  718.16  15.42   31.40
## Balling             23 2566    2.20    0.93    1.65    2.13   0.37    0.35
## Pressure Vacuum     24 2566   -5.22    0.57   -5.40   -5.25   0.59   -6.60
## PH                  25 2566    8.55    0.17    8.54    8.55   0.18    7.88
## Oxygen Filler       26 2555    0.05    0.04    0.03    0.04   0.02    0.00
## Bowl Setpoint       27 2564  109.34   15.29  120.00  111.36   0.00   70.00
## Pressure Setpoint   28 2554   47.61    2.04   46.00   47.60   0.00   44.00
## Air Pressurer       29 2566  142.83    1.21  142.60  142.58   0.59  140.80
## Alch Rel            30 2560    6.90    0.51    6.56    6.84   0.06    5.28
## Carb Rel            31 2559    5.44    0.13    5.40    5.43   0.12    4.96
## Balling Lvl         32 2565    2.05    0.87    1.48    1.98   0.21    0.00
##                       max   range  skew kurtosis    se
## Carb Volume          5.70    0.66  0.39    -0.47  0.00
## Fill Ounces         24.32    0.69 -0.02     0.87  0.00
## PC Volume            0.48    0.40  0.35     0.67  0.00
## Carb Pressure       79.40   22.40  0.18    -0.01  0.07
## Carb Temp          154.00   25.40  0.24     0.23  0.08
## PSC                  0.27    0.27  0.85     0.65  0.00
## PSC Fill             0.62    0.62  0.93     0.77  0.00
## PSC CO2              0.24    0.24  1.73     3.71  0.00
## Mnf Flow           229.40  329.60  0.00    -1.87  2.36
## Carb Pressure1     140.20   34.60  0.03     0.10  0.09
## Fill Pressure       60.40   25.80  0.55     1.41  0.06
## Hyd Pressure1       58.00   58.80  0.78    -0.15  0.25
## Hyd Pressure2       59.40   59.40 -0.31    -1.56  0.32
## Hyd Pressure3       50.00   51.20 -0.32    -1.57  0.32
## Hyd Pressure4      142.00   80.00  0.56     0.61  0.26
## Filler Level       161.20  105.40 -0.85     0.05  0.31
## Filler Speed      4030.00 3032.00 -2.88     6.75 15.35
## Temperature         76.20   12.60  2.39    10.25  0.03
## Usage cont          25.90   13.82 -0.54    -1.02  0.06
## Carb Flow         5104.00 5078.00 -0.99    -0.57 21.12
## Density              1.92    1.62  0.54    -1.22  0.01
## MFR                868.60  837.20 -5.09    30.46  1.52
## Balling              4.01    3.67  0.60    -1.41  0.02
## Pressure Vacuum     -3.60    3.00  0.53    -0.03  0.01
## PH                   8.94    1.06 -0.33    -0.08  0.00
## Oxygen Filler        0.40    0.40  2.41     8.89  0.00
## Bowl Setpoint      140.00   70.00 -0.97    -0.06  0.30
## Pressure Setpoint   52.00    8.00  0.20    -1.60  0.04
## Air Pressurer      148.20    7.40  2.25     4.72  0.02
## Alch Rel             8.62    3.34  0.88    -0.85  0.01
## Carb Rel             6.06    1.10  0.50    -0.30  0.00
## Balling Lvl          3.66    3.66  0.60    -1.51  0.02

2.1 Zero variance

df1 <- bev_raw %>% select(-`Brand Code`) %>% mutate_each(funs(as.numeric(.))) %>% complete.cases() %>% as.data.frame()
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
## # Before:
## funs(name = f(.)
## 
## # After: 
## list(name = ~f(.))
## This warning is displayed once per session.
names(df1)[nearZeroVar(df1)]
## character(0)
nzv <- nearZeroVar(bev_raw,saveMetrics= TRUE)
nzv[nzv$nzv,]
##               freqRatio percentUnique zeroVar  nzv
## Hyd Pressure1        31      9.547935   FALSE TRUE

2.2 Box plot

df.m <- melt(bev_raw %>% select(-MFR, -`Filler Speed`, -`Carb Flow`,-`Bowl Setpoint`,`Carb Pressure1`,
            `Hyd Pressure4`, `Air Pressurer`, `Carb Temp`, `Filler Level`, `Mnf Flow`), id.var = "Brand Code")
p <-ggplot(data = df.m, aes(x=variable, y=value)) + 
  geom_boxplot(outlier.colour="red", outlier.shape=8, outlier.size=4,aes(fill=variable)) +     
  scale_y_continuous(name = "Predictors for PH", breaks = seq(0, 2, 0.5))  + 
  coord_flip()
p
## Warning: Removed 420 rows containing non-finite values (stat_boxplot).

2.3 Normality

bev_raw%>%
 select(-`Brand Code`) %>%   
  select(2:20) %>%         
  gather() %>%                            
  ggplot(aes(value)) +                     
    facet_wrap(~ key, scales = "free") +  
    geom_density()  
## Warning: Removed 428 rows containing non-finite values (stat_density).

bev_raw%>%
 select(-`Brand Code`) %>%   
  select(`Carb Flow`, Density, MFR, Balling, `Pressure Vacuum`, PH, `Oxygen Filler`, `Bowl Setpoint`, `Pressure Setpoint`, `Air Pressurer`, `Alch Rel`,`Carb Rel`,`Balling Lvl`)   %>%               
  gather() %>%                             
  ggplot(aes(value)) +                     
    facet_wrap(~ key, scales = "free") +  
    geom_density() 
## Warning: Removed 248 rows containing non-finite values (stat_density).

3 Data Preprocessing

We perform 3 data preperatoin steps - Remove the near-zero variance variables we previously mentioned - Impute missing values with a Random Forest Regression with the MICE package - Create dummy variables for the categorical variable, brand

Random Forest was chosen as the regression method in imputation because it requires very little tuning, allowing and there is no interface to tune the imputation model in MICE.

bev <- select(bev_raw, -PH)
cols <- str_replace(colnames(bev), '\\s', '')
colnames(bev) <- cols
bev <- mutate(bev, BrandCode = ifelse(is.na(BrandCode), 'Unknown', BrandCode))
y <- bev_raw$PH
zero_vars <- nearZeroVar(bev)
bev_new <- bev[, -zero_vars]
pred <- mice(data = bev_new, m = 2, method = 'rf', maxit = 3)
## Warning: Number of logged events: 1
bev_imputed <- complete(pred)
form <- dummyVars(~ ., data = bev_imputed)
bev_imputed <- predict(form, bev_imputed) %>% data.frame() %>% as.matrix()

4 Model Building

4.1 Test Train Split

samples <- createDataPartition(y, p = .75, list = F)
x_train <- bev_imputed[samples, ]
x_test<- bev_imputed[-samples, ]
y_train <- y[samples]
y_test <- y[-samples]

For state-of-the-art prediction quality, we will use a model stack. This will consist of tuning models separately and then combining the candidate models into a metamodel that will formulate predictions with a linear combination of our tuned models.

From the perspective of understanding the manufacturing process, the model stack will also provide benefits. The stack is like a panel of experts, each looking at the data through slightly different lenses to form their diagnoses. By looking at the predictors each model uses, we can gather assemble a complete picture of the factors that affect our manufacturing process.

#svm_grid <- expand.grid(.C = c(.5,1,2,4,8,16,32,64,128), .sigma = c(.005,.01,.5, .1))
xgb_grid <- expand.grid(eta = c(.025, .05), nrounds = c(1000), max_depth = c(5,6,7), gamma = c(0),
                          colsample_bytree = c(.6),
                          min_child_weight = c(.6,1),
                          subsample = c(.6))
cub_grid <- expand.grid(.committees = c(1,3,5), .neighbors = c(1,3,5,7,9))
#svm_grid <- expand.grid(.C = c(.5,1,2,4,8,16,32,64,128), .sigma = c(.005,.01,.5, .1))
mars_grid <- expand.grid(.degree = c(1,2), .nprune = seq(16,36,4))
dart_grid <- expand.grid(eta = c(.025), nrounds = c(1000), gamma = c(.001,.1), skip_drop = c(.4,.6), rate_drop = c(.4,.6), max_depth = c(6,7,8),
colsample_bytree = c(.6),
min_child_weight = c(.6),
subsample = c(.6))
tuning_list <-list(
  xgbt = caretModelSpec(method="xgbTree", tuneGrid = xgb_grid),
  xgbd = caretModelSpec(method="xgbDART", tuneGrid = dart_grid),
  cub = caretModelSpec(method="cubist", tuneGrid = cub_grid),
  #svm = caretModelSpec(method="rf", tuneGrid = svm_grid),
  rf = caretModelSpec(method="rf", tuneLength = 8, importance = TRUE),
  mars = caretModelSpec(method="bagEarth", tuneGrid = mars_grid)
)
cl <- makeCluster(detectCores())
registerDoParallel(cl)
set.seed(42)
my_control <- trainControl(method = 'cv',
                           number = 5,
                           savePredictions = 'final',
                           index = createFolds(y_train, 5),
                           allowParallel = TRUE) 
mod_list <- caretList(
  x = x_train,
  y = y_train,
  preProcess = c('center', 'scale'),
  trControl = my_control,
  tuneList = tuning_list
)
## Loading required package: earth
## Loading required package: plotmo
## Loading required package: plotrix
## 
## Attaching package: 'plotrix'
## The following object is masked from 'package:psych':
## 
##     rescale
## Loading required package: TeachingDemos
## 
## Attaching package: 'TeachingDemos'
## The following objects are masked from 'package:Hmisc':
## 
##     cnvrt.coords, subplot

4.2 MARS

# helper function
show_results <- function(model){ 
  rslts <- model$results %>%
    arrange(RMSE)
  head(rslts, 10) %>% kable () %>% kable_styling(bootstrap_options = "striped", full_width = F)
}
show_results(mod_list$mars)
degree nprune RMSE Rsquared MAE RMSESD RsquaredSD MAESD
2 24 0.1293923 0.4403201 0.0953474 0.0075449 0.0516960 0.0013165
2 32 0.1310015 0.4277821 0.0953529 0.0080684 0.0514205 0.0019951
2 20 0.1319812 0.4192975 0.0963224 0.0075199 0.0477922 0.0010510
2 36 0.1334755 0.4198462 0.0960772 0.0138693 0.0741590 0.0021868
2 16 0.1336589 0.4128170 0.0967001 0.0137420 0.0820265 0.0013145
2 28 0.1346241 0.4094923 0.0973064 0.0125917 0.0718070 0.0022925
1 20 0.1423016 0.3440586 0.1035250 0.0122296 0.0632738 0.0018322
1 16 0.1444258 0.3344042 0.1043301 0.0156439 0.0774990 0.0022212
1 32 0.1454984 0.3289925 0.1035073 0.0147017 0.0716365 0.0018291
1 28 0.1465086 0.3263716 0.1038117 0.0172102 0.0784272 0.0025939
plot(mod_list$mars, metric = "RMSE")

predict(mod_list$mars, x_test) %>%
  postResample(y_test)
##       RMSE   Rsquared        MAE 
## 0.11768340 0.54114820 0.08994945

4.3 RandomForest

show_results(mod_list$rf)
mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
20 0.1216090 0.5102311 0.0922215 0.0024957 0.0137902 0.0009319
25 0.1218832 0.5058131 0.0923514 0.0024969 0.0142283 0.0007659
30 0.1219013 0.5046008 0.0919565 0.0020568 0.0107744 0.0007848
35 0.1219320 0.5035474 0.0917918 0.0023466 0.0163757 0.0011995
16 0.1220984 0.5082859 0.0927908 0.0027069 0.0154447 0.0006614
11 0.1225461 0.5090084 0.0935186 0.0024073 0.0154126 0.0005869
6 0.1241553 0.5035695 0.0955263 0.0028192 0.0148004 0.0006278
2 0.1296299 0.4768001 0.1010981 0.0025645 0.0162408 0.0006438
varImp(mod_list$rf)
## rf variable importance
## 
##   only 20 most important variables shown (out of 35)
## 
##                Overall
## MnfFlow         100.00
## OxygenFiller     78.22
## BrandCodeC       75.11
## PressureVacuum   68.92
## AirPressurer     59.58
## AlchRel          50.68
## CarbRel          46.67
## BallingLvl       46.36
## Usagecont        45.27
## Temperature      43.46
## HydPressure3     42.95
## FillerSpeed      42.04
## CarbFlow         40.88
## BowlSetpoint     36.01
## CarbPressure1    33.19
## Balling          31.56
## Density          31.19
## HydPressure2     25.75
## FillPressure     24.94
## CarbVolume       22.79
predict(mod_list$rf, x_test) %>%
  postResample(y_test)
##       RMSE   Rsquared        MAE 
## 0.09590286 0.70562626 0.07034761

4.4 Cubist

show_results(mod_list$cub)
committees neighbors RMSE Rsquared MAE RMSESD RsquaredSD MAESD
5 9 0.1239713 0.4858053 0.0930532 0.0024913 0.0147704 0.0013004
5 7 0.1240558 0.4864425 0.0929082 0.0027640 0.0160241 0.0015062
5 5 0.1244089 0.4871237 0.0929181 0.0031986 0.0182843 0.0018258
3 9 0.1249519 0.4797600 0.0937455 0.0025296 0.0180437 0.0017614
3 7 0.1250366 0.4804635 0.0935250 0.0027000 0.0181316 0.0019011
3 5 0.1254043 0.4811706 0.0934969 0.0029540 0.0185639 0.0020603
5 3 0.1267162 0.4788117 0.0941511 0.0037661 0.0194011 0.0024820
3 3 0.1276668 0.4734841 0.0946269 0.0031972 0.0160383 0.0024565
1 9 0.1319210 0.4369440 0.0979594 0.0069178 0.0407367 0.0027153
1 7 0.1319991 0.4385218 0.0977720 0.0067198 0.0379849 0.0025801
plot(mod_list$cub, metric = "RMSE")

predict(mod_list$cub, x_test) %>%
  postResample(y_test)
##       RMSE   Rsquared        MAE 
## 0.10060583 0.66374962 0.07142763

4.5 XGB Trees

show_results(mod_list$xgbt)
eta max_depth gamma colsample_bytree min_child_weight subsample nrounds RMSE Rsquared MAE RMSESD RsquaredSD MAESD
0.025 7 0 0.6 1.0 0.6 1000 0.1233464 0.4835457 0.0934677 0.0031863 0.0201173 0.0015521
0.025 7 0 0.6 0.6 0.6 1000 0.1233538 0.4839877 0.0938334 0.0028762 0.0171870 0.0011988
0.025 5 0 0.6 1.0 0.6 1000 0.1235127 0.4817151 0.0939106 0.0032460 0.0191319 0.0013398
0.025 6 0 0.6 1.0 0.6 1000 0.1235708 0.4813217 0.0936810 0.0027849 0.0166430 0.0012594
0.025 5 0 0.6 0.6 0.6 1000 0.1236060 0.4807873 0.0938534 0.0029860 0.0186061 0.0013791
0.025 6 0 0.6 0.6 0.6 1000 0.1237410 0.4800653 0.0939809 0.0028105 0.0154844 0.0010134
0.050 6 0 0.6 1.0 0.6 1000 0.1240914 0.4765253 0.0943297 0.0030320 0.0189040 0.0014818
0.050 6 0 0.6 0.6 0.6 1000 0.1244609 0.4739067 0.0945946 0.0021863 0.0109887 0.0006778
0.050 5 0 0.6 1.0 0.6 1000 0.1247637 0.4714508 0.0950109 0.0023851 0.0138651 0.0010304
0.050 5 0 0.6 0.6 0.6 1000 0.1248728 0.4706914 0.0946345 0.0018045 0.0089909 0.0010138
plot(mod_list$xgbt, metric = "RMSE")

predict(mod_list$xgbt, x_test) %>%
  postResample(y_test)
##       RMSE   Rsquared        MAE 
## 0.09742288 0.68739087 0.07216124

4.6 XGB Dart

show_results(mod_list$xgbd)
max_depth eta rate_drop skip_drop min_child_weight subsample colsample_bytree gamma nrounds RMSE Rsquared MAE RMSESD RsquaredSD MAESD
7 0.025 0.4 0.6 0.6 0.6 0.6 0.001 1000 0.1235007 0.4831276 0.0938503 0.0025050 0.0138700 0.0010493
7 0.025 0.6 0.6 0.6 0.6 0.6 0.001 1000 0.1235108 0.4828335 0.0937119 0.0027835 0.0149381 0.0012693
7 0.025 0.6 0.4 0.6 0.6 0.6 0.001 1000 0.1236087 0.4829257 0.0942172 0.0029463 0.0171059 0.0014546
6 0.025 0.4 0.6 0.6 0.6 0.6 0.001 1000 0.1236879 0.4809211 0.0940993 0.0027245 0.0150394 0.0009246
8 0.025 0.6 0.6 0.6 0.6 0.6 0.001 1000 0.1237657 0.4803424 0.0940324 0.0031281 0.0186124 0.0010397
6 0.025 0.6 0.6 0.6 0.6 0.6 0.001 1000 0.1238856 0.4788876 0.0941897 0.0028283 0.0152732 0.0009722
7 0.025 0.4 0.4 0.6 0.6 0.6 0.001 1000 0.1241576 0.4783071 0.0945003 0.0027020 0.0156459 0.0009545
6 0.025 0.4 0.4 0.6 0.6 0.6 0.001 1000 0.1243230 0.4763978 0.0946515 0.0028976 0.0164406 0.0009582
6 0.025 0.6 0.4 0.6 0.6 0.6 0.001 1000 0.1244095 0.4752999 0.0946699 0.0030657 0.0181469 0.0008818
8 0.025 0.4 0.6 0.6 0.6 0.6 0.001 1000 0.1245896 0.4740741 0.0945840 0.0028670 0.0192961 0.0012824
plot(mod_list$xgbd, metric = "RMSE")

predict(mod_list$xgbd, x_test) %>%
  postResample(y_test)
##       RMSE   Rsquared        MAE 
## 0.09599037 0.69987470 0.07137964

4.7 Variable Importances

p1 <- varImp(mod_list$cub) %>%
  plot(top = 10, main = 'Cubist')
p2 <- varImp(mod_list$xgbt) %>%
  plot(top = 10, main = 'XGBTrees')
p3 <- varImp(mod_list$xgbd) %>%
  plot(top = 10, main = 'XGBDart')
p4 <- varImp(mod_list$mars) %>%
  plot(top = 10, main = 'MARS')
p5 <- varImp(mod_list$rf) %>%
  plot(top = 10, main = 'RF')
grid.arrange(p1,p2,p3,p4,p5, nrow = 3, ncol = 2)

Variable Counts among the Models

top_10 <- function(model) {
  varImp(model)$importance %>%
  rownames_to_column('var') %>%
  arrange(desc(Overall)) %>%
  head(10) %>%
  select(var) 
}
rbind(top_10(mod_list$xgbt), top_10(mod_list$xgbd), top_10(mod_list$cub), top_10(mod_list$mars), top_10(mod_list$rf)) %>%
  group_by(var) %>%
  summarise(ModelCount = n()) %>%
  arrange(desc(ModelCount)) %>%
  kable () %>% kable_styling(bootstrap_options = "striped", full_width = F)
var ModelCount
AlchRel 5
MnfFlow 5
PressureVacuum 5
Temperature 5
OxygenFiller 4
Usagecont 4
AirPressurer 3
BrandCodeC 3
CarbFlow 3
CarbPressure1 3
Balling 2
BallingLvl 2
CarbRel 2
FillerSpeed 2
BowlSetpoint 1
Density 1

Best Single Tree

rp_mod <- rpart::rpart(ph ~ ., data = data.frame(bev_imputed, ph = y), method = 'anova')
rpart.plot(rp_mod, uniform=TRUE)
## Warning: Bad 'data' field in model 'call' (expected a data.frame or a matrix).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

4.8 Test Stack

resamples <- resamples(mod_list)
modelCor(resamples)
##             xgbt        xgbd        cub          rf        mars
## xgbt  1.00000000  0.72915620  0.1680156  0.35586510 -0.07363005
## xgbd  0.72915620  1.00000000  0.6294197  0.05638985 -0.52568873
## cub   0.16801560  0.62941974  1.0000000 -0.59194316 -0.09449550
## rf    0.35586510  0.05638985 -0.5919432  1.00000000 -0.08872421
## mars -0.07363005 -0.52568873 -0.0944955 -0.08872421  1.00000000
correlations <- cor(modelCor(resamples))
corrplot::corrplot(correlations, order = "FPC", diag = TRUE)

ensemble1 <- caretStack(
  mod_list,
  method="glmnet",
  metric="RMSE",
  trControl=trainControl(
    method="cv",
    number=5,
    savePredictions="final"
  )
)
ensemble1
## A glmnet ensemble of 2 base models: xgbTree, xgbDART, cubist, rf, bagEarth
## 
## Ensemble results:
## glmnet 
## 
## 7704 samples
##    5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 6163, 6164, 6163, 6163, 6163 
## Resampling results across tuning parameters:
## 
##   alpha  lambda        RMSE       Rsquared   MAE       
##   0.10   0.0002434393  0.1179670  0.5256555  0.08853121
##   0.10   0.0024343929  0.1179673  0.5256535  0.08853875
##   0.10   0.0243439292  0.1182455  0.5246648  0.08897555
##   0.55   0.0002434393  0.1179732  0.5256107  0.08853850
##   0.55   0.0024343929  0.1179785  0.5256304  0.08857355
##   0.55   0.0243439292  0.1190255  0.5254893  0.09014219
##   1.00   0.0002434393  0.1179835  0.5255294  0.08854316
##   1.00   0.0024343929  0.1180083  0.5255103  0.08862443
##   1.00   0.0243439292  0.1206643  0.5250264  0.09196281
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.1 and lambda
##  = 0.0002434393.
ensemble2 <- caretEnsemble(mod_list, 
                        metric = 'RMSE', 
                        trControl=trainControl(
                          method="cv",
                          number=5,
                          savePredictions="final"
                        ))
ensemble2
## A glm ensemble of 2 base models: xgbTree, xgbDART, cubist, rf, bagEarth
## 
## Ensemble results:
## Generalized Linear Model 
## 
## 7704 samples
##    5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 6163, 6162, 6164, 6163, 6164 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.1180105  0.5249879  0.0885623

4.9 Predict on Train/Test Sets

y <- randomForest(x = x_train, y = y_train, importance=TRUE, ntree=1000)
predictions <- predict(y, newdata=x_test)
predictions %>% tibble::enframe(name = NULL) %>% datatable()
predictions %>% tibble::enframe(name = NULL) %>% rownames_to_column() %>% rename(PH = value, row = rowname)
## # A tibble: 640 x 2
##    row      PH
##    <chr> <dbl>
##  1 1      8.50
##  2 2      8.50
##  3 3      8.55
##  4 4      8.51
##  5 5      8.48
##  6 6      8.44
##  7 7      8.47
##  8 8      8.55
##  9 9      8.49
## 10 10     8.48
## # ... with 630 more rows
write.csv(predictions, file = "predictions.csv")