data <- read_excel("StudentData.xlsx")
eval <- read_excel("StudentEvaluation.xlsx")
eval_final <- eval
target_PH <- eval$PH
summary(data)
##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:2571        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23917  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27712  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##                     NA's   :10      NA's   :38      NA's   :39       
##  Carb Pressure     Carb Temp          PSC             PSC Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08457   Mean   :0.1954  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##  NA's   :27      NA's   :26      NA's   :33        NA's   :23      
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  65.20   Median :123.2   Median :46.40  
##  Mean   :0.05641   Mean   :  24.57   Mean   :122.6   Mean   :47.92  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  NA's   :39        NA's   :2         NA's   :32      NA's   :22     
##  Hyd Pressure1   Hyd Pressure2   Hyd Pressure3   Hyd Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 52.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.60   Median : 96.00  
##  Mean   :12.44   Mean   :20.96   Mean   :20.46   Mean   : 96.29  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.40   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##  NA's   :11      NA's   :15      NA's   :15      NA's   :30      
##   Filler Level    Filler Speed   Temperature      Usage cont      Carb Flow   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08   Min.   :  26  
##  1st Qu.: 98.3   1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.36   1st Qu.:1144  
##  Median :118.4   Median :3982   Median :65.60   Median :21.79   Median :3028  
##  Mean   :109.3   Mean   :3687   Mean   :65.97   Mean   :20.99   Mean   :2468  
##  3rd Qu.:120.0   3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.75   3rd Qu.:3186  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90   Max.   :5104  
##  NA's   :20      NA's   :57     NA's   :14      NA's   :5       NA's   :2     
##     Density           MFR           Balling       Pressure Vacuum 
##  Min.   :0.240   Min.   : 31.4   Min.   :-0.170   Min.   :-6.600  
##  1st Qu.:0.900   1st Qu.:706.3   1st Qu.: 1.496   1st Qu.:-5.600  
##  Median :0.980   Median :724.0   Median : 1.648   Median :-5.400  
##  Mean   :1.174   Mean   :704.0   Mean   : 2.198   Mean   :-5.216  
##  3rd Qu.:1.620   3rd Qu.:731.0   3rd Qu.: 3.292   3rd Qu.:-5.000  
##  Max.   :1.920   Max.   :868.6   Max.   : 4.012   Max.   :-3.600  
##  NA's   :1       NA's   :212     NA's   :1                        
##        PH        Oxygen Filler     Bowl Setpoint   Pressure Setpoint
##  Min.   :7.880   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0   1st Qu.:46.00    
##  Median :8.540   Median :0.03340   Median :120.0   Median :46.00    
##  Mean   :8.546   Mean   :0.04684   Mean   :109.3   Mean   :47.62    
##  3rd Qu.:8.680   3rd Qu.:0.06000   3rd Qu.:120.0   3rd Qu.:50.00    
##  Max.   :9.360   Max.   :0.40000   Max.   :140.0   Max.   :52.00    
##  NA's   :4       NA's   :12        NA's   :2       NA's   :12       
##  Air Pressurer      Alch Rel        Carb Rel      Balling Lvl  
##  Min.   :140.8   Min.   :5.280   Min.   :4.960   Min.   :0.00  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340   1st Qu.:1.38  
##  Median :142.6   Median :6.560   Median :5.400   Median :1.48  
##  Mean   :142.8   Mean   :6.897   Mean   :5.437   Mean   :2.05  
##  3rd Qu.:143.0   3rd Qu.:7.240   3rd Qu.:5.540   3rd Qu.:3.14  
##  Max.   :148.2   Max.   :8.620   Max.   :6.060   Max.   :3.66  
##                  NA's   :9       NA's   :10      NA's   :1
summary(eval)
##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:267         Min.   :5.147   Min.   :23.75   Min.   :0.09867  
##  Class :character   1st Qu.:5.287   1st Qu.:23.92   1st Qu.:0.23333  
##  Mode  :character   Median :5.340   Median :23.97   Median :0.27533  
##                     Mean   :5.369   Mean   :23.97   Mean   :0.27769  
##                     3rd Qu.:5.465   3rd Qu.:24.01   3rd Qu.:0.32200  
##                     Max.   :5.667   Max.   :24.20   Max.   :0.46400  
##                     NA's   :1       NA's   :6       NA's   :4        
##  Carb Pressure     Carb Temp          PSC             PSC Fill     
##  Min.   :60.20   Min.   :130.0   Min.   :0.00400   Min.   :0.0200  
##  1st Qu.:65.30   1st Qu.:138.4   1st Qu.:0.04450   1st Qu.:0.1000  
##  Median :68.00   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.25   Mean   :141.2   Mean   :0.08545   Mean   :0.1903  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :77.60   Max.   :154.0   Max.   :0.24600   Max.   :0.6200  
##                  NA's   :1       NA's   :5         NA's   :3       
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :113.0   Min.   :37.80  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:120.2   1st Qu.:46.00  
##  Median :0.04000   Median :   0.20   Median :123.4   Median :47.80  
##  Mean   :0.05107   Mean   :  21.03   Mean   :123.0   Mean   :48.14  
##  3rd Qu.:0.06000   3rd Qu.: 141.30   3rd Qu.:125.5   3rd Qu.:50.20  
##  Max.   :0.24000   Max.   : 220.40   Max.   :136.0   Max.   :60.20  
##  NA's   :5                           NA's   :4       NA's   :2      
##  Hyd Pressure1    Hyd Pressure2    Hyd Pressure3    Hyd Pressure4   
##  Min.   :-50.00   Min.   :-50.00   Min.   :-50.00   Min.   : 68.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.: 90.00  
##  Median : 10.40   Median : 26.80   Median : 27.70   Median : 98.00  
##  Mean   : 12.01   Mean   : 20.11   Mean   : 19.61   Mean   : 97.84  
##  3rd Qu.: 20.40   3rd Qu.: 34.80   3rd Qu.: 33.00   3rd Qu.:104.00  
##  Max.   : 50.00   Max.   : 61.40   Max.   : 49.20   Max.   :140.00  
##                   NA's   :1        NA's   :1        NA's   :4       
##   Filler Level    Filler Speed   Temperature      Usage cont      Carb Flow   
##  Min.   : 69.2   Min.   :1006   Min.   :63.80   Min.   :12.90   Min.   :   0  
##  1st Qu.:100.6   1st Qu.:3812   1st Qu.:65.40   1st Qu.:18.12   1st Qu.:1083  
##  Median :118.6   Median :3978   Median :65.80   Median :21.44   Median :3038  
##  Mean   :110.3   Mean   :3581   Mean   :66.23   Mean   :20.90   Mean   :2409  
##  3rd Qu.:120.2   3rd Qu.:3996   3rd Qu.:66.60   3rd Qu.:23.74   3rd Qu.:3215  
##  Max.   :153.2   Max.   :4020   Max.   :75.40   Max.   :24.60   Max.   :3858  
##  NA's   :2       NA's   :10     NA's   :2       NA's   :2                     
##     Density           MFR           Balling      Pressure Vacuum 
##  Min.   :0.060   Min.   : 15.6   Min.   :0.902   Min.   :-6.400  
##  1st Qu.:0.920   1st Qu.:707.0   1st Qu.:1.498   1st Qu.:-5.600  
##  Median :0.980   Median :724.6   Median :1.648   Median :-5.200  
##  Mean   :1.177   Mean   :697.8   Mean   :2.203   Mean   :-5.174  
##  3rd Qu.:1.600   3rd Qu.:731.5   3rd Qu.:3.242   3rd Qu.:-4.800  
##  Max.   :1.840   Max.   :784.8   Max.   :3.788   Max.   :-3.600  
##  NA's   :1       NA's   :31      NA's   :1       NA's   :1       
##     PH          Oxygen Filler     Bowl Setpoint   Pressure Setpoint
##  Mode:logical   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  NA's:267       1st Qu.:0.01960   1st Qu.:100.0   1st Qu.:46.00    
##                 Median :0.03370   Median :120.0   Median :46.00    
##                 Mean   :0.04666   Mean   :109.6   Mean   :47.73    
##                 3rd Qu.:0.05440   3rd Qu.:120.0   3rd Qu.:50.00    
##                 Max.   :0.39800   Max.   :130.0   Max.   :52.00    
##                 NA's   :3         NA's   :1       NA's   :2        
##  Air Pressurer      Alch Rel        Carb Rel     Balling Lvl   
##  Min.   :141.2   Min.   :6.400   Min.   :5.18   Min.   :0.000  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.34   1st Qu.:1.380  
##  Median :142.6   Median :6.580   Median :5.40   Median :1.480  
##  Mean   :142.8   Mean   :6.907   Mean   :5.44   Mean   :2.051  
##  3rd Qu.:142.8   3rd Qu.:7.180   3rd Qu.:5.56   3rd Qu.:3.080  
##  Max.   :147.2   Max.   :7.820   Max.   :5.74   Max.   :3.420  
##  NA's   :1       NA's   :3       NA's   :2
data_trim <- drop_na(data)

data[1,2]
data$`Carb Volume`[which(is.na(data$`Carb Volume`))] <- median(data$`Carb Volume`,na.rm=TRUE)

data$`PC Volume`[which(is.na(data$`PC Volume`))] <- median(data$`PC Volume`,na.rm=TRUE)

data$`Carb Pressure`[which(is.na(data$`Carb Pressure`))] <- median(data$`Carb Pressure`,na.rm=TRUE)

data$`Carb Temp`[which(is.na(data$`Carb Temp`))] <- median(data$`Carb Temp`,na.rm=TRUE)

data$`PSC`[which(is.na(data$`PSC`))] <- median(data$`PSC`,na.rm=TRUE)

data$`PSC Fill`[which(is.na(data$`PSC Fill`))] <- median(data$`PSC Fill`,na.rm=TRUE)

data$`PSC CO2`[which(is.na(data$`PSC CO2`))] <- median(data$`PSC CO2`,na.rm=TRUE)

data$`Mnf Flow`[which(is.na(data$`Mnf Flow`))] <- median(data$`Mnf Flow`,na.rm=TRUE)

data$`Carb Pressure1`[which(is.na(data$`Carb Pressure1`))] <- median(data$`Carb Pressure1`,na.rm=TRUE)

data$`Fill Pressure`[which(is.na(data$`Fill Pressure`))] <- median(data$`Fill Pressure`,na.rm=TRUE)

data$`Hyd Pressure1`[which(is.na(data$`Hyd Pressure1`))] <- median(data$`Hyd Pressure1`,na.rm=TRUE)

data$`Hyd Pressure2`[which(is.na(data$`Hyd Pressure2`))] <- median(data$`Hyd Pressure2`,na.rm=TRUE)
data$`Hyd Pressure3`[which(is.na(data$`Hyd Pressure3`))] <- median(data$`Hyd Pressure3`,na.rm=TRUE)
data$`Hyd Pressure4`[which(is.na(data$`Hyd Pressure4`))] <- median(data$`Hyd Pressure4`,na.rm=TRUE)
data$`Filler Level`[which(is.na(data$`Filler Level`))] <- median(data$`Filler Level`,na.rm=TRUE)
data$`Filler Speed`[which(is.na(data$`Filler Speed`))] <- median(data$`Filler Speed`,na.rm=TRUE)
data$`Temperature`[which(is.na(data$`Temperature`))] <- median(data$`Temperature`,na.rm=TRUE)
data$`Usage cont`[which(is.na(data$`Usage cont`))] <- median(data$`Usage cont`,na.rm=TRUE)
data$`Carb Flow`[which(is.na(data$`Carb Flow`))] <- median(data$`Carb Flow`,na.rm=TRUE)
data$`Density`[which(is.na(data$`Density`))] <- median(data$`Density`,na.rm=TRUE)
data$`MFR`[which(is.na(data$`MFR`))] <- median(data$`MFR`,na.rm=TRUE)
data$`Balling`[which(is.na(data$`Balling`))] <- median(data$`Balling`,na.rm=TRUE)
data$`Oxygen Filler`[which(is.na(data$`Oxygen Filler`))] <- median(data$`Oxygen Filler`,na.rm=TRUE)
data$`Bowl Setpoint`[which(is.na(data$`Bowl Setpoint`))] <- median(data$`Bowl Setpoint`,na.rm=TRUE)
data$`Pressure Setpoint`[which(is.na(data$`Pressure Setpoint`))] <- median(data$`Pressure Setpoint`,na.rm=TRUE)
data$`Alch Rel`[which(is.na(data$`Alch Rel`))] <- median(data$`Alch Rel`,na.rm=TRUE)
data$`Carb Rel`[which(is.na(data$`Carb Rel`))] <- median(data$`Carb Rel`,na.rm=TRUE)
data$`Balling Lvl`[which(is.na(data$`Balling Lvl`))] <- median(data$`Balling Lvl`,na.rm=TRUE)
data$`Fill Ounces`[which(is.na(data$`Fill Ounces`))] <- median(data$`Fill Ounces`,na.rm=TRUE)

data <- subset(data,!is.na(PH))

length(which(is.na(data$`Brand Code`)))
## [1] 120
which(is.na(eval$`Brand Code`))
## [1]  16 127 167 209 210 236 256 257
2567-2038
## [1] 529

There are a lot of NA values to manage. Removing all NAs leaves us with 2038 observations.

Fix Eval NAs

eval$`Carb Volume`[which(is.na(eval$`Carb Volume`))] <- median(eval$`Carb Volume`,na.rm=TRUE)

eval$`PC Volume`[which(is.na(eval$`PC Volume`))] <- median(eval$`PC Volume`,na.rm=TRUE)

eval$`Carb Pressure`[which(is.na(eval$`Carb Pressure`))] <- median(eval$`Carb Pressure`,na.rm=TRUE)

eval$`Carb Temp`[which(is.na(eval$`Carb Temp`))] <- median(eval$`Carb Temp`,na.rm=TRUE)

eval$`PSC`[which(is.na(eval$`PSC`))] <- median(eval$`PSC`,na.rm=TRUE)

eval$`PSC Fill`[which(is.na(eval$`PSC Fill`))] <- median(eval$`PSC Fill`,na.rm=TRUE)

eval$`PSC CO2`[which(is.na(eval$`PSC CO2`))] <- median(eval$`PSC CO2`,na.rm=TRUE)

eval$`Mnf Flow`[which(is.na(eval$`Mnf Flow`))] <- median(eval$`Mnf Flow`,na.rm=TRUE)

eval$`Carb Pressure1`[which(is.na(eval$`Carb Pressure1`))] <- median(eval$`Carb Pressure1`,na.rm=TRUE)

eval$`Fill Pressure`[which(is.na(eval$`Fill Pressure`))] <- median(eval$`Fill Pressure`,na.rm=TRUE)

eval$`Hyd Pressure1`[which(is.na(eval$`Hyd Pressure1`))] <- median(eval$`Hyd Pressure1`,na.rm=TRUE)

eval$`Hyd Pressure2`[which(is.na(eval$`Hyd Pressure2`))] <- median(eval$`Hyd Pressure2`,na.rm=TRUE)
eval$`Hyd Pressure3`[which(is.na(eval$`Hyd Pressure3`))] <- median(eval$`Hyd Pressure3`,na.rm=TRUE)
eval$`Hyd Pressure4`[which(is.na(eval$`Hyd Pressure4`))] <- median(eval$`Hyd Pressure4`,na.rm=TRUE)
eval$`Filler Level`[which(is.na(eval$`Filler Level`))] <- median(eval$`Filler Level`,na.rm=TRUE)
eval$`Filler Speed`[which(is.na(eval$`Filler Speed`))] <- median(eval$`Filler Speed`,na.rm=TRUE)
eval$`Temperature`[which(is.na(eval$`Temperature`))] <- median(eval$`Temperature`,na.rm=TRUE)
eval$`Usage cont`[which(is.na(eval$`Usage cont`))] <- median(eval$`Usage cont`,na.rm=TRUE)
eval$`Carb Flow`[which(is.na(eval$`Carb Flow`))] <- median(eval$`Carb Flow`,na.rm=TRUE)
eval$`Density`[which(is.na(eval$`Density`))] <- median(eval$`Density`,na.rm=TRUE)
eval$`MFR`[which(is.na(eval$`MFR`))] <- median(eval$`MFR`,na.rm=TRUE)
eval$`Balling`[which(is.na(eval$`Balling`))] <- median(eval$`Balling`,na.rm=TRUE)
eval$`Oxygen Filler`[which(is.na(eval$`Oxygen Filler`))] <- median(eval$`Oxygen Filler`,na.rm=TRUE)
eval$`Bowl Setpoint`[which(is.na(eval$`Bowl Setpoint`))] <- median(eval$`Bowl Setpoint`,na.rm=TRUE)
eval$`Pressure Setpoint`[which(is.na(eval$`Pressure Setpoint`))] <- median(eval$`Pressure Setpoint`,na.rm=TRUE)
eval$`Alch Rel`[which(is.na(eval$`Alch Rel`))] <- median(eval$`Alch Rel`,na.rm=TRUE)
eval$`Carb Rel`[which(is.na(eval$`Carb Rel`))] <- median(eval$`Carb Rel`,na.rm=TRUE)
eval$`Balling Lvl`[which(is.na(eval$`Balling Lvl`))] <- median(eval$`Balling Lvl`,na.rm=TRUE)
eval$`Fill Ounces`[which(is.na(eval$`Fill Ounces`))] <- median(eval$`Fill Ounces`,na.rm=TRUE)
eval$`Pressure Vacuum`[which(is.na(eval$`Pressure Vacuum`))] <- median(eval$`Pressure Vacuum`,na.rm=TRUE)
eval$`Air Pressurer`[which(is.na(eval$`Air Pressurer`))] <- median(eval$`Air Pressurer`,na.rm=TRUE)
str(data)
## tibble [2,567 × 33] (S3: tbl_df/tbl/data.frame)
##  $ Brand Code       : chr [1:2567] "B" "A" "B" "A" ...
##  $ Carb Volume      : num [1:2567] 5.34 5.43 5.29 5.44 5.49 ...
##  $ Fill Ounces      : num [1:2567] 24 24 24.1 24 24.3 ...
##  $ PC Volume        : num [1:2567] 0.263 0.239 0.263 0.293 0.111 ...
##  $ Carb Pressure    : num [1:2567] 68.2 68.4 70.8 63 67.2 66.6 64.2 67.6 64.2 72 ...
##  $ Carb Temp        : num [1:2567] 141 140 145 133 137 ...
##  $ PSC              : num [1:2567] 0.104 0.124 0.09 0.076 0.026 0.09 0.128 0.154 0.132 0.014 ...
##  $ PSC Fill         : num [1:2567] 0.26 0.22 0.34 0.42 0.16 ...
##  $ PSC CO2          : num [1:2567] 0.04 0.04 0.16 0.04 0.12 ...
##  $ Mnf Flow         : num [1:2567] -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 ...
##  $ Carb Pressure1   : num [1:2567] 119 122 120 115 118 ...
##  $ Fill Pressure    : num [1:2567] 46 46 46 46.4 45.8 45.6 51.8 46.8 46 45.2 ...
##  $ Hyd Pressure1    : num [1:2567] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure2    : num [1:2567] 28.6 28.6 28.6 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure3    : num [1:2567] 27.6 27.6 27.6 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure4    : num [1:2567] 118 106 82 92 92 116 124 132 90 108 ...
##  $ Filler Level     : num [1:2567] 121 119 120 118 119 ...
##  $ Filler Speed     : num [1:2567] 4002 3986 4020 4012 4010 ...
##  $ Temperature      : num [1:2567] 66 67.6 67 65.6 65.6 66.2 65.8 65.2 65.4 66.6 ...
##  $ Usage cont       : num [1:2567] 16.2 19.9 17.8 17.4 17.7 ...
##  $ Carb Flow        : num [1:2567] 2932 3144 2914 3062 3054 ...
##  $ Density          : num [1:2567] 0.88 0.92 1.58 1.54 1.54 1.52 0.84 0.84 0.9 0.9 ...
##  $ MFR              : num [1:2567] 725 727 735 731 723 ...
##  $ Balling          : num [1:2567] 1.4 1.5 3.14 3.04 3.04 ...
##  $ Pressure Vacuum  : num [1:2567] -4 -4 -3.8 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 ...
##  $ PH               : num [1:2567] 8.36 8.26 8.94 8.24 8.26 8.32 8.4 8.38 8.38 8.5 ...
##  $ Oxygen Filler    : num [1:2567] 0.022 0.026 0.024 0.03 0.03 0.024 0.066 0.046 0.064 0.022 ...
##  $ Bowl Setpoint    : num [1:2567] 120 120 120 120 120 120 120 120 120 120 ...
##  $ Pressure Setpoint: num [1:2567] 46.4 46.8 46.6 46 46 46 46 46 46 46 ...
##  $ Air Pressurer    : num [1:2567] 143 143 142 146 146 ...
##  $ Alch Rel         : num [1:2567] 6.58 6.56 7.66 7.14 7.14 7.16 6.54 6.52 6.52 6.54 ...
##  $ Carb Rel         : num [1:2567] 5.32 5.3 5.84 5.42 5.44 5.44 5.38 5.34 5.34 5.34 ...
##  $ Balling Lvl      : num [1:2567] 1.48 1.56 3.28 3.04 3.04 3.02 1.44 1.44 1.44 1.38 ...
#corr <- round(cor(data[,2:33]), 1)

#corr25 <- corr[1:25,1:25]

#ggcorrplot(corr,method="circle")
#ggcorrplot(corr25,method="circle")

data$`Brand Code` <- as.factor(data$`Brand Code`)

ggplot(data,aes(x=c(1:2567),y=data$PH,color = data$`Brand Code`))+
  geom_point()

median(data$PH[which(data$`Brand Code`=="A")])
## [1] 8.52
median(data$PH[which(data$`Brand Code`=="B")])
## [1] 8.56
median(data$PH[which(data$`Brand Code`=="C")])
## [1] 8.42
median(data$PH[which(data$`Brand Code`=="D")])
## [1] 8.62
median(data$PH[which(is.na(data$`Brand Code`))])
## [1] 8.51
median(data$PH)
## [1] 8.54
ggplot(data)+
  geom_boxplot(aes(x=PH))

data$`Brand Code` <- as.character(data$`Brand Code`)
data_with_brand <- data

data_with_brand$Brand_A <- 0
data_with_brand$Brand_B <- 0
data_with_brand$Brand_C <- 0
data_with_brand$Brand_D <- 0
data_with_brand$Brand_NA <- 0

data_with_brand$Brand_A[which(data_with_brand$`Brand Code`=="A")] <- 1
data_with_brand$Brand_A[which(data_with_brand$`Brand Code`!="A")] <- 0
data_with_brand$Brand_B[which(data_with_brand$`Brand Code`=="B")] <- 1
data_with_brand$Brand_B[which(data_with_brand$`Brand Code`!="B")] <- 0
data_with_brand$Brand_C[which(data_with_brand$`Brand Code`=="C")] <- 1
data_with_brand$Brand_C[which(data_with_brand$`Brand Code`!="C")] <- 0
data_with_brand$Brand_D[which(data_with_brand$`Brand Code`=="D")] <- 1
data_with_brand$Brand_D[which(data_with_brand$`Brand Code`!="D")] <- 0
data_with_brand$Brand_NA[which(is.na(data_with_brand$`Brand Code`))] <- 1
data_with_brand$Brand_NA[which(!is.na(data_with_brand$`Brand Code`))] <- 0


quantile(data$PH)
##   0%  25%  50%  75% 100% 
## 7.88 8.44 8.54 8.68 9.36
eval_with_brand <- eval


eval_with_brand$Brand_A <- 0
eval_with_brand$Brand_B <- 0
eval_with_brand$Brand_C <- 0
eval_with_brand$Brand_D <- 0
eval_with_brand$Brand_NA <- 0


eval_with_brand$Brand_A[which(eval_with_brand$`Brand Code`=="A")] <- 1
eval_with_brand$Brand_A[which(eval_with_brand$`Brand Code`!="A")] <- 0
eval_with_brand$Brand_B[which(eval_with_brand$`Brand Code`=="B")] <- 1
eval_with_brand$Brand_B[which(eval_with_brand$`Brand Code`!="B")] <- 0
eval_with_brand$Brand_C[which(eval_with_brand$`Brand Code`=="C")] <- 1
eval_with_brand$Brand_C[which(eval_with_brand$`Brand Code`!="C")] <- 0
eval_with_brand$Brand_D[which(eval_with_brand$`Brand Code`=="D")] <- 1
eval_with_brand$Brand_D[which(eval_with_brand$`Brand Code`!="D")] <- 0
eval_with_brand$Brand_NA[which(is.na(eval_with_brand$`Brand Code`))] <- 1
eval_with_brand$Brand_NA[which(!is.na(eval_with_brand$`Brand Code`))] <- 0

Only a few variables are positively correlated with PH. Filler Level, Bowl Setpoint both have a correlation of 0.3 with PH.

PLS

set.seed(34)

data <- subset(data, !is.na(PH))
data_with_brand <- subset(data_with_brand, !is.na(PH))



train_idx <- sample(c(TRUE,FALSE), nrow(data_with_brand), 
                 replace=TRUE, prob=c(0.7,0.3))

train_set <- data_with_brand[train_idx,-c(1)]
test_set <- data_with_brand[!train_idx,-c(1)]

train_PH <- data.frame(data_with_brand[train_idx,26])
test_PH <- data.frame(data_with_brand[!train_idx,26])

colnames(train_PH) <- c('PH')
colnames(test_PH) <- c('PH')

which(is.na(train_set))
## integer(0)
summary(train_set)
##   Carb Volume     Fill Ounces      PC Volume       Carb Pressure  
##  Min.   :5.040   Min.   :23.65   Min.   :0.07933   Min.   :57.00  
##  1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.24067   1st Qu.:65.60  
##  Median :5.347   Median :23.97   Median :0.27133   Median :68.20  
##  Mean   :5.371   Mean   :23.97   Mean   :0.27841   Mean   :68.22  
##  3rd Qu.:5.460   3rd Qu.:24.03   3rd Qu.:0.31267   3rd Qu.:70.60  
##  Max.   :5.700   Max.   :24.31   Max.   :0.47800   Max.   :79.40  
##    Carb Temp          PSC             PSC Fill        PSC CO2       
##  Min.   :128.6   Min.   :0.00200   Min.   :0.000   Min.   :0.00000  
##  1st Qu.:138.4   1st Qu.:0.05000   1st Qu.:0.100   1st Qu.:0.02000  
##  Median :140.8   Median :0.07700   Median :0.180   Median :0.04000  
##  Mean   :141.1   Mean   :0.08565   Mean   :0.196   Mean   :0.05539  
##  3rd Qu.:143.8   3rd Qu.:0.11400   3rd Qu.:0.260   3rd Qu.:0.06000  
##  Max.   :154.0   Max.   :0.27000   Max.   :0.600   Max.   :0.24000  
##     Mnf Flow       Carb Pressure1  Fill Pressure  Hyd Pressure1  
##  Min.   :-100.20   Min.   :105.6   Min.   :34.6   Min.   :-0.80  
##  1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.0   1st Qu.: 0.00  
##  Median :  86.50   Median :123.2   Median :46.4   Median :11.40  
##  Mean   :  25.68   Mean   :122.6   Mean   :47.9   Mean   :12.53  
##  3rd Qu.: 140.45   3rd Qu.:125.4   3rd Qu.:50.0   3rd Qu.:20.20  
##  Max.   : 229.40   Max.   :140.2   Max.   :60.4   Max.   :58.00  
##  Hyd Pressure2   Hyd Pressure3   Hyd Pressure4     Filler Level  
##  Min.   : 0.00   Min.   :-1.20   Min.   : 62.00   Min.   : 59.0  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00   1st Qu.: 98.9  
##  Median :28.80   Median :27.60   Median : 96.00   Median :118.4  
##  Mean   :21.19   Mean   :20.63   Mean   : 96.39   Mean   :109.3  
##  3rd Qu.:34.60   3rd Qu.:33.20   3rd Qu.:102.00   3rd Qu.:120.0  
##  Max.   :59.40   Max.   :49.80   Max.   :142.00   Max.   :153.2  
##   Filler Speed   Temperature      Usage cont      Carb Flow       Density     
##  Min.   : 998   Min.   :63.60   Min.   :12.08   Min.   :  26   Min.   :0.240  
##  1st Qu.:3890   1st Qu.:65.20   1st Qu.:18.34   1st Qu.:1111   1st Qu.:0.900  
##  Median :3982   Median :65.60   Median :21.78   Median :3028   Median :0.980  
##  Mean   :3709   Mean   :65.93   Mean   :20.98   Mean   :2467   Mean   :1.183  
##  3rd Qu.:3996   3rd Qu.:66.40   3rd Qu.:23.76   3rd Qu.:3180   3rd Qu.:1.640  
##  Max.   :4028   Max.   :76.20   Max.   :25.90   Max.   :5104   Max.   :1.920  
##       MFR           Balling      Pressure Vacuum        PH       
##  Min.   : 31.4   Min.   :0.160   Min.   :-6.600   Min.   :8.000  
##  1st Qu.:708.0   1st Qu.:1.496   1st Qu.:-5.600   1st Qu.:8.440  
##  Median :724.0   Median :1.648   Median :-5.400   Median :8.540  
##  Mean   :706.6   Mean   :2.214   Mean   :-5.225   Mean   :8.545  
##  3rd Qu.:730.2   3rd Qu.:3.296   3rd Qu.:-5.000   3rd Qu.:8.680  
##  Max.   :849.4   Max.   :4.012   Max.   :-3.600   Max.   :9.360  
##  Oxygen Filler     Bowl Setpoint   Pressure Setpoint Air Pressurer  
##  Min.   :0.00240   Min.   : 70.0   Min.   :44.00     Min.   :141.0  
##  1st Qu.:0.02200   1st Qu.:100.0   1st Qu.:46.00     1st Qu.:142.2  
##  Median :0.03340   Median :120.0   Median :46.00     Median :142.6  
##  Mean   :0.04613   Mean   :109.3   Mean   :47.63     Mean   :142.8  
##  3rd Qu.:0.05810   3rd Qu.:120.0   3rd Qu.:50.00     3rd Qu.:143.0  
##  Max.   :0.40000   Max.   :140.0   Max.   :52.00     Max.   :148.2  
##     Alch Rel        Carb Rel      Balling Lvl       Brand_A      
##  Min.   :6.400   Min.   :4.960   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:6.540   1st Qu.:5.340   1st Qu.:1.380   1st Qu.:0.0000  
##  Median :6.580   Median :5.400   Median :1.480   Median :0.0000  
##  Mean   :6.903   Mean   :5.438   Mean   :2.062   Mean   :0.1161  
##  3rd Qu.:7.280   3rd Qu.:5.560   3rd Qu.:3.140   3rd Qu.:0.0000  
##  Max.   :8.600   Max.   :6.060   Max.   :3.660   Max.   :1.0000  
##     Brand_B         Brand_C          Brand_D         Brand_NA     
##  Min.   :0.000   Min.   :0.0000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :0.000   Median :0.0000   Median :0.000   Median :0.0000  
##  Mean   :0.471   Mean   :0.1228   Mean   :0.245   Mean   :0.0452  
##  3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.:0.000   3rd Qu.:0.0000  
##  Max.   :1.000   Max.   :1.0000   Max.   :1.000   Max.   :1.0000
fit <- train(PH ~.,train_set, method = "pls",tunelength = 25, trControl = trainControl(method = "cv"))

summary(data)
##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:2567        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23933  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27715  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31067  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##  Carb Pressure     Carb Temp          PSC             PSC Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.05000   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08453   Mean   :0.1952  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  70.20   Median :123.2   Median :46.40  
##  Mean   :0.05619   Mean   :  24.63   Mean   :122.6   Mean   :47.91  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  Hyd Pressure1   Hyd Pressure2   Hyd Pressure3   Hyd Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 62.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.60   Median : 96.00  
##  Mean   :12.45   Mean   :21.04   Mean   :20.52   Mean   : 96.31  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.20   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##   Filler Level    Filler Speed   Temperature      Usage cont      Carb Flow   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08   Min.   :  26  
##  1st Qu.: 98.7   1st Qu.:3890   1st Qu.:65.20   1st Qu.:18.38   1st Qu.:1169  
##  Median :118.4   Median :3982   Median :65.60   Median :21.79   Median :3028  
##  Mean   :109.3   Mean   :3694   Mean   :65.96   Mean   :21.00   Mean   :2472  
##  3rd Qu.:120.0   3rd Qu.:3997   3rd Qu.:66.40   3rd Qu.:23.74   3rd Qu.:3187  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90   Max.   :5104  
##     Density           MFR           Balling      Pressure Vacuum 
##  Min.   :0.240   Min.   : 31.4   Min.   :0.160   Min.   :-6.600  
##  1st Qu.:0.900   1st Qu.:708.2   1st Qu.:1.496   1st Qu.:-5.600  
##  Median :0.980   Median :724.0   Median :1.648   Median :-5.400  
##  Mean   :1.174   Mean   :705.7   Mean   :2.200   Mean   :-5.216  
##  3rd Qu.:1.620   3rd Qu.:730.4   3rd Qu.:3.292   3rd Qu.:-5.000  
##  Max.   :1.920   Max.   :868.6   Max.   :4.012   Max.   :-3.600  
##        PH        Oxygen Filler     Bowl Setpoint   Pressure Setpoint
##  Min.   :7.880   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0   1st Qu.:46.00    
##  Median :8.540   Median :0.03340   Median :120.0   Median :46.00    
##  Mean   :8.546   Mean   :0.04637   Mean   :109.4   Mean   :47.61    
##  3rd Qu.:8.680   3rd Qu.:0.05880   3rd Qu.:120.0   3rd Qu.:50.00    
##  Max.   :9.360   Max.   :0.40000   Max.   :140.0   Max.   :52.00    
##  Air Pressurer      Alch Rel        Carb Rel      Balling Lvl   
##  Min.   :140.8   Min.   :5.280   Min.   :4.960   Min.   :0.000  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340   1st Qu.:1.380  
##  Median :142.6   Median :6.560   Median :5.400   Median :1.480  
##  Mean   :142.8   Mean   :6.897   Mean   :5.437   Mean   :2.051  
##  3rd Qu.:143.0   3rd Qu.:7.230   3rd Qu.:5.540   3rd Qu.:3.140  
##  Max.   :148.2   Max.   :8.620   Max.   :6.060   Max.   :3.660
plot(fit)

fit
## Partial Least Squares 
## 
## 1792 samples
##   36 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1612, 1612, 1614, 1612, 1613, 1612, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE       Rsquared    MAE      
##   1      0.1684792  0.03232073  0.1352104
##   2      0.1670441  0.04847840  0.1341937
##   3      0.1540556  0.19149438  0.1207404
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
plsPred <- predict(fit,test_set)

postResample(plsPred,test_PH$PH)
##      RMSE  Rsquared       MAE 
## 0.1544459 0.2301354 0.1233213

SVM

set.seed(34)

data_PH <- data_with_brand[,26]

data_data <- data_with_brand[,-26]
eval_data <- eval_with_brand[,-26]

train_idx <- sample(c(TRUE,FALSE), nrow(data_data), 
                 replace=TRUE, prob=c(0.7,0.3))

train_set <- data_data[train_idx,-c(1)]
test_set <- data_data[!train_idx,-c(1)]

train_PH <- data.frame(data_PH[train_idx,])
test_PH <- data.frame(data_PH[!train_idx,])

set.seed(34)
svmRModel <- train(x = train_set,y = train_PH$PH,method = "svmRadial",tuneLength = 15,trControl = trainControl(method = "cv"),preProcess = c("center","scale"))

svmPred <- predict(svmRModel, newdata = test_set)
postResample(svmPred,test_PH$PH)
##       RMSE   Rsquared        MAE 
## 0.11603414 0.56523103 0.08581204

Earth Model

set.seed(34)
earthGrid <- expand.grid(.degree = 1:2,.nprune = 2:38)
earthModel <- train(x = train_set,y = train_PH$PH,method = "earth",tuneGrid = expand.grid(.degree = 1,.nprune = 2:25),trControl = trainControl(method = "cv"),preProcess = c("center","scale"))


earthPred <- predict(earthModel, newdata = test_set)
postResample(earthPred,test_PH$PH)
##      RMSE  Rsquared       MAE 
## 0.1336247 0.4223821 0.1037217

NNET

nnetGrid <- expand.grid(size = seq(1, 10),decay = c(0,.01,.1),bag = FALSE)

set.seed(34)
nnetModel <- train(x = train_set,y = train_PH$PH,method = "avNNet",tuneGrid = nnetGrid,preProc = c("center", "scale"),linout = TRUE,trace = FALSE,maxit = 100,MaxNWts = 10 * (ncol(train_set) + 1) + 10 + 1,trControl = trainControl(method = "cv"),preProcess = c("center","scale"))

nnetPred <- predict(nnetModel, newdata = test_set)
postResample(nnetPred,test_PH$PH)
##       RMSE   Rsquared        MAE 
## 0.11546625 0.56833764 0.08422327

KNN

knnModel <- train(x = train_set,y = train_PH$PH,method = "knn",preProc = c("center", "scale"),tuneLength = 10)

knnModel
## k-Nearest Neighbors 
## 
## 1792 samples
##   36 predictor
## 
## Pre-processing: centered (36), scaled (36) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1792, 1792, 1792, 1792, 1792, 1792, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE       
##    5  0.1339611  0.4249665  0.09806591
##    7  0.1319827  0.4318388  0.09771453
##    9  0.1313512  0.4324094  0.09802806
##   11  0.1310453  0.4323258  0.09838087
##   13  0.1312755  0.4292511  0.09926105
##   15  0.1315345  0.4263844  0.09974055
##   17  0.1317595  0.4240527  0.10026194
##   19  0.1323016  0.4193815  0.10095903
##   21  0.1326620  0.4163454  0.10128024
##   23  0.1332462  0.4110770  0.10179214
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
knnPred <- predict(knnModel, newdata = eval_data)
postResample(pred = knnPred, obs = test_PH$PH)
##      RMSE  Rsquared       MAE 
## 0.2143956        NA 0.1700031

Cubist

set.seed(34)

cubistTune <- cubist(train_set,train_PH$PH,committees=100)

cubistTune_predictions <- predict(cubistTune,test_set)

postResample(cubistTune_predictions,test_PH$PH)
##       RMSE   Rsquared        MAE 
## 0.10185133 0.67572056 0.07410222

GBM

set.seed(34)

gbmTune <- gbm(train_PH$PH~.,data=train_set, distribution = "gaussian")


gbmTune_predictions <- predict(gbmTune,test_set)
## Using 100 trees...
postResample(gbmTune_predictions,test_PH$PH)
##      RMSE  Rsquared       MAE 
## 0.1328323 0.4463457 0.1051459
#nrow(test_PH)

#median_prediction <- test_PH

#median_prediction$Median <- median(train_PH$PH)

#postResample(median_prediction$Median,median_prediction$PH)

XGBoost

Step 1: Load Dataframe

  • Convert Categorical Variables to Numeric

Step 2: Convert Character Vectors to Dummy Variables

  • Use model.matrix() to convert character vectors to numeric matrices
data <- subset(data, !is.na(`Brand Code`))

brand <- model.matrix(~`Brand Code` - 1, data) # -1 to remove intercept

data <- data[,-c(1)]

data_comb <- cbind(data,brand)

Step 3: Split Dataset and Construct Matrices

set.seed(34); 
sample_rows <- sample(nrow(data_comb),nrow(data_comb) *.7)

dt <- sort(sample_rows)
test_set_xgb <- data_comb[-dt,]
train_set <- data_comb[dt,]


train_x_mat <- data.matrix(train_set[,-25]) #xgboost requires a matrix as the input
train_y <- train_set[,25] #Response Variable

test_x_mat <- data.matrix(test_set_xgb[,-25]) 
test_y <- test_set_xgb[,25] #Response Variable

Step 4: Construct DMatrix and Run Model

  • To show how easy it can be, I am not setting any parameters
    • Watchlist is still necessary to clearly define
    • verbose dictates whether each individual iteration will print model metrics
xgb_train <- xgb.DMatrix(data = train_x_mat,label=train_y)
xgb_test <- xgb.DMatrix(data = test_x_mat,label=test_y)

watchlist <- list(train = xgb_train,test = xgb_test)

xgb_model <- xgb.train(data=xgb_train,max.depth=3,watchlist=watchlist,
                       nrounds=100,verbose=0)

#summary(xgb_model)

Step 5: Visualize Model Performance

  • This model is inefficient because there are too many iterations.
    • We can get away with 25 or fewer iterations
e <- data.frame(xgb_model$evaluation_log)

#which(e$test_rmse == min(e$test_rmse))

fig_2 <- ggplot(e, aes(iter,train_rmse))+
  geom_point(color = 'blue',shape = 1)+
  geom_line(data = e, aes(x=iter,y=test_rmse),color = 'red')+
  ggtitle("XGBoost Model Root Mean Squared Error Per Iteration")+
  theme(plot.title = element_text(hjust = 0.5))

imp <- xgb.importance(colnames(xgb_train),model = xgb_model)
fig_2

Step 6: Revise Model

  • For the revised model, I only changed:
    • nrounds = 25
    • eta = 0.25 (down from 0.3)
xgb_model_tuned <- xgb.train(data=xgb_train,max.depth=3,watchlist=watchlist,
                             nrounds=25,verbose=0,eta = 0.25) 
# eta of 0.01 removed all but month, clouds_all

e <- data.frame(xgb_model_tuned$evaluation_log)

#which(e$test_rmse == min(e$test_rmse))

fig_3 <- ggplot(e, aes(iter,train_rmse))+
  geom_point(color = 'blue',shape = 1)+
  geom_line(data = e, aes(x=iter,y=test_rmse),color = 'red')+
  ggtitle("XGBoost Model Root Mean Squared Error Per Iteration")+
  theme(plot.title = element_text(hjust = 0.5))

imp <- xgb.importance(colnames(xgb_train),model = xgb_model_tuned)

Step 6: Revise Model

XGBoost Regression Example

Step 6: Revise Model

XGBoost Model Metrics
XGBoost Tuned Model Score
Mean_Squared_Error 0.014
Root_Mean_Squared_Error 0.118
Test_Mean 8.539
R_Squared -0.131

There is a cyclical shape to the training data PH values when plotted by index. Peculiar.

Evaluation

eval <- subset(eval, !is.na(`Brand Code`))

brand <- model.matrix(~`Brand Code` - 1, eval) # -1 to remove intercept

eval <- eval[,-c(1)]

eval_comb <- cbind(eval,brand)

eval_x_mat <- data.matrix(eval_comb[,-25]) #xgboost requires a matrix as the input
eval_y <- eval_comb[,25]

xgb_eval <- xgb.DMatrix(data = eval_x_mat)


xgb_pred <- predict(xgb_model_tuned,xgb_eval)

predictions_xgb <- data.frame(cbind(xgb_pred))
colnames(predictions_xgb) <- c("XGBoost Predictions")

xgb_Pred <- predict(xgb_model_tuned,xgb_test)
postResample(xgb_Pred,test_y)
##       RMSE   Rsquared        MAE 
## 0.11822114 0.53937458 0.09032222

Model Performance

#pct_diff <- mean(1 - estimates_table_17_30$Ratio_ETS_vs_Census_Projection)

model_performance1 <- data.frame("PLS" = t(postResample(plsPred,test_PH$PH)))

model_performance2 <-data.frame("SVM" = t(postResample(svmPred,test_PH$PH)))

model_performance_bad <-data.frame("Earth" = t(postResample(earthPred,test_PH$PH)))

model_performance3 <-data.frame("NNET" = t(postResample(nnetPred,test_PH$PH)))

model_performance4 <-data.frame("KNN" = t(postResample(pred = knnPred, obs = test_PH$PH)))

model_performance5 <-data.frame("GBM" = t(postResample(gbmTune_predictions,test_PH$PH)))

model_performance6 <-data.frame("Cubist" = t(postResample(cubistTune_predictions,test_PH$PH)))

model_performance8 <-data.frame("XGBoost" = t(postResample(xgb_Pred,test_y)))



colnames(model_performance1) <- c("RMSE","R-Squared","MAE")
colnames(model_performance2) <- c("RMSE","R-Squared","MAE")
colnames(model_performance_bad) <- c("RMSE","R-Squared","MAE")
colnames(model_performance3) <- c("RMSE","R-Squared","MAE")
colnames(model_performance4) <- c("RMSE","R-Squared","MAE")
colnames(model_performance5) <- c("RMSE","R-Squared","MAE")
colnames(model_performance6) <- c("RMSE","R-Squared","MAE")
colnames(model_performance8) <- c("RMSE","R-Squared","MAE")

model_performance <- rbind(model_performance1,model_performance2,model_performance_bad,model_performance3,model_performance4,model_performance5,model_performance6,model_performance8)

rownames(model_performance) <- c("PLS","SVM","Earth","NNET","KNN","GBM","Cubist","XGBoost")

model_performance
kbl(model_performance, longtable = T, booktabs = T, caption = "Model Performance on Test Data") %>%
  kable_styling(latex_options = c("repeat_header"))
Model Performance on Test Data
RMSE R-Squared MAE
PLS 0.1544459 0.2301354 0.1233213
SVM 0.1160341 0.5652310 0.0858120
Earth 0.1336247 0.4223821 0.1037217
NNET 0.1154662 0.5683376 0.0842233
KNN 0.2143956 NA 0.1700031
GBM 0.1328323 0.4463457 0.1051459
Cubist 0.1018513 0.6757206 0.0741022
XGBoost 0.1182211 0.5393746 0.0903222

Final Predictions Table

eval_data <- eval_data[,-c(1)]

eval_ph_pred <- predict(cubistTune,eval_data)

eval_final$PH <- eval_ph_pred

file_path <- "eval_SHylton_Final_624.xlsx"

write.xlsx(eval_final, file_path, rowNames = FALSE)