data <- read_excel("StudentData.xlsx")
eval <- read_excel("StudentEvaluation.xlsx")
eval_final <- eval
target_PH <- eval$PHsummary(data)## Brand Code Carb Volume Fill Ounces PC Volume
## Length:2571 Min. :5.040 Min. :23.63 Min. :0.07933
## Class :character 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23917
## Mode :character Median :5.347 Median :23.97 Median :0.27133
## Mean :5.370 Mean :23.97 Mean :0.27712
## 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31200
## Max. :5.700 Max. :24.32 Max. :0.47800
## NA's :10 NA's :38 NA's :39
## Carb Pressure Carb Temp PSC PSC Fill
## Min. :57.00 Min. :128.6 Min. :0.00200 Min. :0.0000
## 1st Qu.:65.60 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.1000
## Median :68.20 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.19 Mean :141.1 Mean :0.08457 Mean :0.1954
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :79.40 Max. :154.0 Max. :0.27000 Max. :0.6200
## NA's :27 NA's :26 NA's :33 NA's :23
## PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure
## Min. :0.00000 Min. :-100.20 Min. :105.6 Min. :34.60
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.00
## Median :0.04000 Median : 65.20 Median :123.2 Median :46.40
## Mean :0.05641 Mean : 24.57 Mean :122.6 Mean :47.92
## 3rd Qu.:0.08000 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00
## Max. :0.24000 Max. : 229.40 Max. :140.2 Max. :60.40
## NA's :39 NA's :2 NA's :32 NA's :22
## Hyd Pressure1 Hyd Pressure2 Hyd Pressure3 Hyd Pressure4
## Min. :-0.80 Min. : 0.00 Min. :-1.20 Min. : 52.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00
## Median :11.40 Median :28.60 Median :27.60 Median : 96.00
## Mean :12.44 Mean :20.96 Mean :20.46 Mean : 96.29
## 3rd Qu.:20.20 3rd Qu.:34.60 3rd Qu.:33.40 3rd Qu.:102.00
## Max. :58.00 Max. :59.40 Max. :50.00 Max. :142.00
## NA's :11 NA's :15 NA's :15 NA's :30
## Filler Level Filler Speed Temperature Usage cont Carb Flow
## Min. : 55.8 Min. : 998 Min. :63.60 Min. :12.08 Min. : 26
## 1st Qu.: 98.3 1st Qu.:3888 1st Qu.:65.20 1st Qu.:18.36 1st Qu.:1144
## Median :118.4 Median :3982 Median :65.60 Median :21.79 Median :3028
## Mean :109.3 Mean :3687 Mean :65.97 Mean :20.99 Mean :2468
## 3rd Qu.:120.0 3rd Qu.:3998 3rd Qu.:66.40 3rd Qu.:23.75 3rd Qu.:3186
## Max. :161.2 Max. :4030 Max. :76.20 Max. :25.90 Max. :5104
## NA's :20 NA's :57 NA's :14 NA's :5 NA's :2
## Density MFR Balling Pressure Vacuum
## Min. :0.240 Min. : 31.4 Min. :-0.170 Min. :-6.600
## 1st Qu.:0.900 1st Qu.:706.3 1st Qu.: 1.496 1st Qu.:-5.600
## Median :0.980 Median :724.0 Median : 1.648 Median :-5.400
## Mean :1.174 Mean :704.0 Mean : 2.198 Mean :-5.216
## 3rd Qu.:1.620 3rd Qu.:731.0 3rd Qu.: 3.292 3rd Qu.:-5.000
## Max. :1.920 Max. :868.6 Max. : 4.012 Max. :-3.600
## NA's :1 NA's :212 NA's :1
## PH Oxygen Filler Bowl Setpoint Pressure Setpoint
## Min. :7.880 Min. :0.00240 Min. : 70.0 Min. :44.00
## 1st Qu.:8.440 1st Qu.:0.02200 1st Qu.:100.0 1st Qu.:46.00
## Median :8.540 Median :0.03340 Median :120.0 Median :46.00
## Mean :8.546 Mean :0.04684 Mean :109.3 Mean :47.62
## 3rd Qu.:8.680 3rd Qu.:0.06000 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :9.360 Max. :0.40000 Max. :140.0 Max. :52.00
## NA's :4 NA's :12 NA's :2 NA's :12
## Air Pressurer Alch Rel Carb Rel Balling Lvl
## Min. :140.8 Min. :5.280 Min. :4.960 Min. :0.00
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.340 1st Qu.:1.38
## Median :142.6 Median :6.560 Median :5.400 Median :1.48
## Mean :142.8 Mean :6.897 Mean :5.437 Mean :2.05
## 3rd Qu.:143.0 3rd Qu.:7.240 3rd Qu.:5.540 3rd Qu.:3.14
## Max. :148.2 Max. :8.620 Max. :6.060 Max. :3.66
## NA's :9 NA's :10 NA's :1
summary(eval)## Brand Code Carb Volume Fill Ounces PC Volume
## Length:267 Min. :5.147 Min. :23.75 Min. :0.09867
## Class :character 1st Qu.:5.287 1st Qu.:23.92 1st Qu.:0.23333
## Mode :character Median :5.340 Median :23.97 Median :0.27533
## Mean :5.369 Mean :23.97 Mean :0.27769
## 3rd Qu.:5.465 3rd Qu.:24.01 3rd Qu.:0.32200
## Max. :5.667 Max. :24.20 Max. :0.46400
## NA's :1 NA's :6 NA's :4
## Carb Pressure Carb Temp PSC PSC Fill
## Min. :60.20 Min. :130.0 Min. :0.00400 Min. :0.0200
## 1st Qu.:65.30 1st Qu.:138.4 1st Qu.:0.04450 1st Qu.:0.1000
## Median :68.00 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.25 Mean :141.2 Mean :0.08545 Mean :0.1903
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :77.60 Max. :154.0 Max. :0.24600 Max. :0.6200
## NA's :1 NA's :5 NA's :3
## PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure
## Min. :0.00000 Min. :-100.20 Min. :113.0 Min. :37.80
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:120.2 1st Qu.:46.00
## Median :0.04000 Median : 0.20 Median :123.4 Median :47.80
## Mean :0.05107 Mean : 21.03 Mean :123.0 Mean :48.14
## 3rd Qu.:0.06000 3rd Qu.: 141.30 3rd Qu.:125.5 3rd Qu.:50.20
## Max. :0.24000 Max. : 220.40 Max. :136.0 Max. :60.20
## NA's :5 NA's :4 NA's :2
## Hyd Pressure1 Hyd Pressure2 Hyd Pressure3 Hyd Pressure4
## Min. :-50.00 Min. :-50.00 Min. :-50.00 Min. : 68.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 90.00
## Median : 10.40 Median : 26.80 Median : 27.70 Median : 98.00
## Mean : 12.01 Mean : 20.11 Mean : 19.61 Mean : 97.84
## 3rd Qu.: 20.40 3rd Qu.: 34.80 3rd Qu.: 33.00 3rd Qu.:104.00
## Max. : 50.00 Max. : 61.40 Max. : 49.20 Max. :140.00
## NA's :1 NA's :1 NA's :4
## Filler Level Filler Speed Temperature Usage cont Carb Flow
## Min. : 69.2 Min. :1006 Min. :63.80 Min. :12.90 Min. : 0
## 1st Qu.:100.6 1st Qu.:3812 1st Qu.:65.40 1st Qu.:18.12 1st Qu.:1083
## Median :118.6 Median :3978 Median :65.80 Median :21.44 Median :3038
## Mean :110.3 Mean :3581 Mean :66.23 Mean :20.90 Mean :2409
## 3rd Qu.:120.2 3rd Qu.:3996 3rd Qu.:66.60 3rd Qu.:23.74 3rd Qu.:3215
## Max. :153.2 Max. :4020 Max. :75.40 Max. :24.60 Max. :3858
## NA's :2 NA's :10 NA's :2 NA's :2
## Density MFR Balling Pressure Vacuum
## Min. :0.060 Min. : 15.6 Min. :0.902 Min. :-6.400
## 1st Qu.:0.920 1st Qu.:707.0 1st Qu.:1.498 1st Qu.:-5.600
## Median :0.980 Median :724.6 Median :1.648 Median :-5.200
## Mean :1.177 Mean :697.8 Mean :2.203 Mean :-5.174
## 3rd Qu.:1.600 3rd Qu.:731.5 3rd Qu.:3.242 3rd Qu.:-4.800
## Max. :1.840 Max. :784.8 Max. :3.788 Max. :-3.600
## NA's :1 NA's :31 NA's :1 NA's :1
## PH Oxygen Filler Bowl Setpoint Pressure Setpoint
## Mode:logical Min. :0.00240 Min. : 70.0 Min. :44.00
## NA's:267 1st Qu.:0.01960 1st Qu.:100.0 1st Qu.:46.00
## Median :0.03370 Median :120.0 Median :46.00
## Mean :0.04666 Mean :109.6 Mean :47.73
## 3rd Qu.:0.05440 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :0.39800 Max. :130.0 Max. :52.00
## NA's :3 NA's :1 NA's :2
## Air Pressurer Alch Rel Carb Rel Balling Lvl
## Min. :141.2 Min. :6.400 Min. :5.18 Min. :0.000
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.34 1st Qu.:1.380
## Median :142.6 Median :6.580 Median :5.40 Median :1.480
## Mean :142.8 Mean :6.907 Mean :5.44 Mean :2.051
## 3rd Qu.:142.8 3rd Qu.:7.180 3rd Qu.:5.56 3rd Qu.:3.080
## Max. :147.2 Max. :7.820 Max. :5.74 Max. :3.420
## NA's :1 NA's :3 NA's :2
data_trim <- drop_na(data)
data[1,2]data$`Carb Volume`[which(is.na(data$`Carb Volume`))] <- median(data$`Carb Volume`,na.rm=TRUE)
data$`PC Volume`[which(is.na(data$`PC Volume`))] <- median(data$`PC Volume`,na.rm=TRUE)
data$`Carb Pressure`[which(is.na(data$`Carb Pressure`))] <- median(data$`Carb Pressure`,na.rm=TRUE)
data$`Carb Temp`[which(is.na(data$`Carb Temp`))] <- median(data$`Carb Temp`,na.rm=TRUE)
data$`PSC`[which(is.na(data$`PSC`))] <- median(data$`PSC`,na.rm=TRUE)
data$`PSC Fill`[which(is.na(data$`PSC Fill`))] <- median(data$`PSC Fill`,na.rm=TRUE)
data$`PSC CO2`[which(is.na(data$`PSC CO2`))] <- median(data$`PSC CO2`,na.rm=TRUE)
data$`Mnf Flow`[which(is.na(data$`Mnf Flow`))] <- median(data$`Mnf Flow`,na.rm=TRUE)
data$`Carb Pressure1`[which(is.na(data$`Carb Pressure1`))] <- median(data$`Carb Pressure1`,na.rm=TRUE)
data$`Fill Pressure`[which(is.na(data$`Fill Pressure`))] <- median(data$`Fill Pressure`,na.rm=TRUE)
data$`Hyd Pressure1`[which(is.na(data$`Hyd Pressure1`))] <- median(data$`Hyd Pressure1`,na.rm=TRUE)
data$`Hyd Pressure2`[which(is.na(data$`Hyd Pressure2`))] <- median(data$`Hyd Pressure2`,na.rm=TRUE)
data$`Hyd Pressure3`[which(is.na(data$`Hyd Pressure3`))] <- median(data$`Hyd Pressure3`,na.rm=TRUE)
data$`Hyd Pressure4`[which(is.na(data$`Hyd Pressure4`))] <- median(data$`Hyd Pressure4`,na.rm=TRUE)
data$`Filler Level`[which(is.na(data$`Filler Level`))] <- median(data$`Filler Level`,na.rm=TRUE)
data$`Filler Speed`[which(is.na(data$`Filler Speed`))] <- median(data$`Filler Speed`,na.rm=TRUE)
data$`Temperature`[which(is.na(data$`Temperature`))] <- median(data$`Temperature`,na.rm=TRUE)
data$`Usage cont`[which(is.na(data$`Usage cont`))] <- median(data$`Usage cont`,na.rm=TRUE)
data$`Carb Flow`[which(is.na(data$`Carb Flow`))] <- median(data$`Carb Flow`,na.rm=TRUE)
data$`Density`[which(is.na(data$`Density`))] <- median(data$`Density`,na.rm=TRUE)
data$`MFR`[which(is.na(data$`MFR`))] <- median(data$`MFR`,na.rm=TRUE)
data$`Balling`[which(is.na(data$`Balling`))] <- median(data$`Balling`,na.rm=TRUE)
data$`Oxygen Filler`[which(is.na(data$`Oxygen Filler`))] <- median(data$`Oxygen Filler`,na.rm=TRUE)
data$`Bowl Setpoint`[which(is.na(data$`Bowl Setpoint`))] <- median(data$`Bowl Setpoint`,na.rm=TRUE)
data$`Pressure Setpoint`[which(is.na(data$`Pressure Setpoint`))] <- median(data$`Pressure Setpoint`,na.rm=TRUE)
data$`Alch Rel`[which(is.na(data$`Alch Rel`))] <- median(data$`Alch Rel`,na.rm=TRUE)
data$`Carb Rel`[which(is.na(data$`Carb Rel`))] <- median(data$`Carb Rel`,na.rm=TRUE)
data$`Balling Lvl`[which(is.na(data$`Balling Lvl`))] <- median(data$`Balling Lvl`,na.rm=TRUE)
data$`Fill Ounces`[which(is.na(data$`Fill Ounces`))] <- median(data$`Fill Ounces`,na.rm=TRUE)
data <- subset(data,!is.na(PH))
length(which(is.na(data$`Brand Code`)))## [1] 120
which(is.na(eval$`Brand Code`))## [1] 16 127 167 209 210 236 256 257
2567-2038## [1] 529
There are a lot of NA values to manage. Removing all NAs leaves us with 2038 observations.
Fix Eval NAs
eval$`Carb Volume`[which(is.na(eval$`Carb Volume`))] <- median(eval$`Carb Volume`,na.rm=TRUE)
eval$`PC Volume`[which(is.na(eval$`PC Volume`))] <- median(eval$`PC Volume`,na.rm=TRUE)
eval$`Carb Pressure`[which(is.na(eval$`Carb Pressure`))] <- median(eval$`Carb Pressure`,na.rm=TRUE)
eval$`Carb Temp`[which(is.na(eval$`Carb Temp`))] <- median(eval$`Carb Temp`,na.rm=TRUE)
eval$`PSC`[which(is.na(eval$`PSC`))] <- median(eval$`PSC`,na.rm=TRUE)
eval$`PSC Fill`[which(is.na(eval$`PSC Fill`))] <- median(eval$`PSC Fill`,na.rm=TRUE)
eval$`PSC CO2`[which(is.na(eval$`PSC CO2`))] <- median(eval$`PSC CO2`,na.rm=TRUE)
eval$`Mnf Flow`[which(is.na(eval$`Mnf Flow`))] <- median(eval$`Mnf Flow`,na.rm=TRUE)
eval$`Carb Pressure1`[which(is.na(eval$`Carb Pressure1`))] <- median(eval$`Carb Pressure1`,na.rm=TRUE)
eval$`Fill Pressure`[which(is.na(eval$`Fill Pressure`))] <- median(eval$`Fill Pressure`,na.rm=TRUE)
eval$`Hyd Pressure1`[which(is.na(eval$`Hyd Pressure1`))] <- median(eval$`Hyd Pressure1`,na.rm=TRUE)
eval$`Hyd Pressure2`[which(is.na(eval$`Hyd Pressure2`))] <- median(eval$`Hyd Pressure2`,na.rm=TRUE)
eval$`Hyd Pressure3`[which(is.na(eval$`Hyd Pressure3`))] <- median(eval$`Hyd Pressure3`,na.rm=TRUE)
eval$`Hyd Pressure4`[which(is.na(eval$`Hyd Pressure4`))] <- median(eval$`Hyd Pressure4`,na.rm=TRUE)
eval$`Filler Level`[which(is.na(eval$`Filler Level`))] <- median(eval$`Filler Level`,na.rm=TRUE)
eval$`Filler Speed`[which(is.na(eval$`Filler Speed`))] <- median(eval$`Filler Speed`,na.rm=TRUE)
eval$`Temperature`[which(is.na(eval$`Temperature`))] <- median(eval$`Temperature`,na.rm=TRUE)
eval$`Usage cont`[which(is.na(eval$`Usage cont`))] <- median(eval$`Usage cont`,na.rm=TRUE)
eval$`Carb Flow`[which(is.na(eval$`Carb Flow`))] <- median(eval$`Carb Flow`,na.rm=TRUE)
eval$`Density`[which(is.na(eval$`Density`))] <- median(eval$`Density`,na.rm=TRUE)
eval$`MFR`[which(is.na(eval$`MFR`))] <- median(eval$`MFR`,na.rm=TRUE)
eval$`Balling`[which(is.na(eval$`Balling`))] <- median(eval$`Balling`,na.rm=TRUE)
eval$`Oxygen Filler`[which(is.na(eval$`Oxygen Filler`))] <- median(eval$`Oxygen Filler`,na.rm=TRUE)
eval$`Bowl Setpoint`[which(is.na(eval$`Bowl Setpoint`))] <- median(eval$`Bowl Setpoint`,na.rm=TRUE)
eval$`Pressure Setpoint`[which(is.na(eval$`Pressure Setpoint`))] <- median(eval$`Pressure Setpoint`,na.rm=TRUE)
eval$`Alch Rel`[which(is.na(eval$`Alch Rel`))] <- median(eval$`Alch Rel`,na.rm=TRUE)
eval$`Carb Rel`[which(is.na(eval$`Carb Rel`))] <- median(eval$`Carb Rel`,na.rm=TRUE)
eval$`Balling Lvl`[which(is.na(eval$`Balling Lvl`))] <- median(eval$`Balling Lvl`,na.rm=TRUE)
eval$`Fill Ounces`[which(is.na(eval$`Fill Ounces`))] <- median(eval$`Fill Ounces`,na.rm=TRUE)
eval$`Pressure Vacuum`[which(is.na(eval$`Pressure Vacuum`))] <- median(eval$`Pressure Vacuum`,na.rm=TRUE)
eval$`Air Pressurer`[which(is.na(eval$`Air Pressurer`))] <- median(eval$`Air Pressurer`,na.rm=TRUE)str(data)## tibble [2,567 × 33] (S3: tbl_df/tbl/data.frame)
## $ Brand Code : chr [1:2567] "B" "A" "B" "A" ...
## $ Carb Volume : num [1:2567] 5.34 5.43 5.29 5.44 5.49 ...
## $ Fill Ounces : num [1:2567] 24 24 24.1 24 24.3 ...
## $ PC Volume : num [1:2567] 0.263 0.239 0.263 0.293 0.111 ...
## $ Carb Pressure : num [1:2567] 68.2 68.4 70.8 63 67.2 66.6 64.2 67.6 64.2 72 ...
## $ Carb Temp : num [1:2567] 141 140 145 133 137 ...
## $ PSC : num [1:2567] 0.104 0.124 0.09 0.076 0.026 0.09 0.128 0.154 0.132 0.014 ...
## $ PSC Fill : num [1:2567] 0.26 0.22 0.34 0.42 0.16 ...
## $ PSC CO2 : num [1:2567] 0.04 0.04 0.16 0.04 0.12 ...
## $ Mnf Flow : num [1:2567] -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 ...
## $ Carb Pressure1 : num [1:2567] 119 122 120 115 118 ...
## $ Fill Pressure : num [1:2567] 46 46 46 46.4 45.8 45.6 51.8 46.8 46 45.2 ...
## $ Hyd Pressure1 : num [1:2567] 0 0 0 0 0 0 0 0 0 0 ...
## $ Hyd Pressure2 : num [1:2567] 28.6 28.6 28.6 0 0 0 0 0 0 0 ...
## $ Hyd Pressure3 : num [1:2567] 27.6 27.6 27.6 0 0 0 0 0 0 0 ...
## $ Hyd Pressure4 : num [1:2567] 118 106 82 92 92 116 124 132 90 108 ...
## $ Filler Level : num [1:2567] 121 119 120 118 119 ...
## $ Filler Speed : num [1:2567] 4002 3986 4020 4012 4010 ...
## $ Temperature : num [1:2567] 66 67.6 67 65.6 65.6 66.2 65.8 65.2 65.4 66.6 ...
## $ Usage cont : num [1:2567] 16.2 19.9 17.8 17.4 17.7 ...
## $ Carb Flow : num [1:2567] 2932 3144 2914 3062 3054 ...
## $ Density : num [1:2567] 0.88 0.92 1.58 1.54 1.54 1.52 0.84 0.84 0.9 0.9 ...
## $ MFR : num [1:2567] 725 727 735 731 723 ...
## $ Balling : num [1:2567] 1.4 1.5 3.14 3.04 3.04 ...
## $ Pressure Vacuum : num [1:2567] -4 -4 -3.8 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 ...
## $ PH : num [1:2567] 8.36 8.26 8.94 8.24 8.26 8.32 8.4 8.38 8.38 8.5 ...
## $ Oxygen Filler : num [1:2567] 0.022 0.026 0.024 0.03 0.03 0.024 0.066 0.046 0.064 0.022 ...
## $ Bowl Setpoint : num [1:2567] 120 120 120 120 120 120 120 120 120 120 ...
## $ Pressure Setpoint: num [1:2567] 46.4 46.8 46.6 46 46 46 46 46 46 46 ...
## $ Air Pressurer : num [1:2567] 143 143 142 146 146 ...
## $ Alch Rel : num [1:2567] 6.58 6.56 7.66 7.14 7.14 7.16 6.54 6.52 6.52 6.54 ...
## $ Carb Rel : num [1:2567] 5.32 5.3 5.84 5.42 5.44 5.44 5.38 5.34 5.34 5.34 ...
## $ Balling Lvl : num [1:2567] 1.48 1.56 3.28 3.04 3.04 3.02 1.44 1.44 1.44 1.38 ...
#corr <- round(cor(data[,2:33]), 1)
#corr25 <- corr[1:25,1:25]
#ggcorrplot(corr,method="circle")
#ggcorrplot(corr25,method="circle")
data$`Brand Code` <- as.factor(data$`Brand Code`)
ggplot(data,aes(x=c(1:2567),y=data$PH,color = data$`Brand Code`))+
geom_point()median(data$PH[which(data$`Brand Code`=="A")])## [1] 8.52
median(data$PH[which(data$`Brand Code`=="B")])## [1] 8.56
median(data$PH[which(data$`Brand Code`=="C")])## [1] 8.42
median(data$PH[which(data$`Brand Code`=="D")])## [1] 8.62
median(data$PH[which(is.na(data$`Brand Code`))])## [1] 8.51
median(data$PH)## [1] 8.54
ggplot(data)+
geom_boxplot(aes(x=PH))data$`Brand Code` <- as.character(data$`Brand Code`)
data_with_brand <- data
data_with_brand$Brand_A <- 0
data_with_brand$Brand_B <- 0
data_with_brand$Brand_C <- 0
data_with_brand$Brand_D <- 0
data_with_brand$Brand_NA <- 0
data_with_brand$Brand_A[which(data_with_brand$`Brand Code`=="A")] <- 1
data_with_brand$Brand_A[which(data_with_brand$`Brand Code`!="A")] <- 0
data_with_brand$Brand_B[which(data_with_brand$`Brand Code`=="B")] <- 1
data_with_brand$Brand_B[which(data_with_brand$`Brand Code`!="B")] <- 0
data_with_brand$Brand_C[which(data_with_brand$`Brand Code`=="C")] <- 1
data_with_brand$Brand_C[which(data_with_brand$`Brand Code`!="C")] <- 0
data_with_brand$Brand_D[which(data_with_brand$`Brand Code`=="D")] <- 1
data_with_brand$Brand_D[which(data_with_brand$`Brand Code`!="D")] <- 0
data_with_brand$Brand_NA[which(is.na(data_with_brand$`Brand Code`))] <- 1
data_with_brand$Brand_NA[which(!is.na(data_with_brand$`Brand Code`))] <- 0
quantile(data$PH)## 0% 25% 50% 75% 100%
## 7.88 8.44 8.54 8.68 9.36
eval_with_brand <- eval
eval_with_brand$Brand_A <- 0
eval_with_brand$Brand_B <- 0
eval_with_brand$Brand_C <- 0
eval_with_brand$Brand_D <- 0
eval_with_brand$Brand_NA <- 0
eval_with_brand$Brand_A[which(eval_with_brand$`Brand Code`=="A")] <- 1
eval_with_brand$Brand_A[which(eval_with_brand$`Brand Code`!="A")] <- 0
eval_with_brand$Brand_B[which(eval_with_brand$`Brand Code`=="B")] <- 1
eval_with_brand$Brand_B[which(eval_with_brand$`Brand Code`!="B")] <- 0
eval_with_brand$Brand_C[which(eval_with_brand$`Brand Code`=="C")] <- 1
eval_with_brand$Brand_C[which(eval_with_brand$`Brand Code`!="C")] <- 0
eval_with_brand$Brand_D[which(eval_with_brand$`Brand Code`=="D")] <- 1
eval_with_brand$Brand_D[which(eval_with_brand$`Brand Code`!="D")] <- 0
eval_with_brand$Brand_NA[which(is.na(eval_with_brand$`Brand Code`))] <- 1
eval_with_brand$Brand_NA[which(!is.na(eval_with_brand$`Brand Code`))] <- 0Only a few variables are positively correlated with PH. Filler Level, Bowl Setpoint both have a correlation of 0.3 with PH.
PLS
set.seed(34)
data <- subset(data, !is.na(PH))
data_with_brand <- subset(data_with_brand, !is.na(PH))
train_idx <- sample(c(TRUE,FALSE), nrow(data_with_brand),
replace=TRUE, prob=c(0.7,0.3))
train_set <- data_with_brand[train_idx,-c(1)]
test_set <- data_with_brand[!train_idx,-c(1)]
train_PH <- data.frame(data_with_brand[train_idx,26])
test_PH <- data.frame(data_with_brand[!train_idx,26])
colnames(train_PH) <- c('PH')
colnames(test_PH) <- c('PH')
which(is.na(train_set))## integer(0)
summary(train_set)## Carb Volume Fill Ounces PC Volume Carb Pressure
## Min. :5.040 Min. :23.65 Min. :0.07933 Min. :57.00
## 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.24067 1st Qu.:65.60
## Median :5.347 Median :23.97 Median :0.27133 Median :68.20
## Mean :5.371 Mean :23.97 Mean :0.27841 Mean :68.22
## 3rd Qu.:5.460 3rd Qu.:24.03 3rd Qu.:0.31267 3rd Qu.:70.60
## Max. :5.700 Max. :24.31 Max. :0.47800 Max. :79.40
## Carb Temp PSC PSC Fill PSC CO2
## Min. :128.6 Min. :0.00200 Min. :0.000 Min. :0.00000
## 1st Qu.:138.4 1st Qu.:0.05000 1st Qu.:0.100 1st Qu.:0.02000
## Median :140.8 Median :0.07700 Median :0.180 Median :0.04000
## Mean :141.1 Mean :0.08565 Mean :0.196 Mean :0.05539
## 3rd Qu.:143.8 3rd Qu.:0.11400 3rd Qu.:0.260 3rd Qu.:0.06000
## Max. :154.0 Max. :0.27000 Max. :0.600 Max. :0.24000
## Mnf Flow Carb Pressure1 Fill Pressure Hyd Pressure1
## Min. :-100.20 Min. :105.6 Min. :34.6 Min. :-0.80
## 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.0 1st Qu.: 0.00
## Median : 86.50 Median :123.2 Median :46.4 Median :11.40
## Mean : 25.68 Mean :122.6 Mean :47.9 Mean :12.53
## 3rd Qu.: 140.45 3rd Qu.:125.4 3rd Qu.:50.0 3rd Qu.:20.20
## Max. : 229.40 Max. :140.2 Max. :60.4 Max. :58.00
## Hyd Pressure2 Hyd Pressure3 Hyd Pressure4 Filler Level
## Min. : 0.00 Min. :-1.20 Min. : 62.00 Min. : 59.0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00 1st Qu.: 98.9
## Median :28.80 Median :27.60 Median : 96.00 Median :118.4
## Mean :21.19 Mean :20.63 Mean : 96.39 Mean :109.3
## 3rd Qu.:34.60 3rd Qu.:33.20 3rd Qu.:102.00 3rd Qu.:120.0
## Max. :59.40 Max. :49.80 Max. :142.00 Max. :153.2
## Filler Speed Temperature Usage cont Carb Flow Density
## Min. : 998 Min. :63.60 Min. :12.08 Min. : 26 Min. :0.240
## 1st Qu.:3890 1st Qu.:65.20 1st Qu.:18.34 1st Qu.:1111 1st Qu.:0.900
## Median :3982 Median :65.60 Median :21.78 Median :3028 Median :0.980
## Mean :3709 Mean :65.93 Mean :20.98 Mean :2467 Mean :1.183
## 3rd Qu.:3996 3rd Qu.:66.40 3rd Qu.:23.76 3rd Qu.:3180 3rd Qu.:1.640
## Max. :4028 Max. :76.20 Max. :25.90 Max. :5104 Max. :1.920
## MFR Balling Pressure Vacuum PH
## Min. : 31.4 Min. :0.160 Min. :-6.600 Min. :8.000
## 1st Qu.:708.0 1st Qu.:1.496 1st Qu.:-5.600 1st Qu.:8.440
## Median :724.0 Median :1.648 Median :-5.400 Median :8.540
## Mean :706.6 Mean :2.214 Mean :-5.225 Mean :8.545
## 3rd Qu.:730.2 3rd Qu.:3.296 3rd Qu.:-5.000 3rd Qu.:8.680
## Max. :849.4 Max. :4.012 Max. :-3.600 Max. :9.360
## Oxygen Filler Bowl Setpoint Pressure Setpoint Air Pressurer
## Min. :0.00240 Min. : 70.0 Min. :44.00 Min. :141.0
## 1st Qu.:0.02200 1st Qu.:100.0 1st Qu.:46.00 1st Qu.:142.2
## Median :0.03340 Median :120.0 Median :46.00 Median :142.6
## Mean :0.04613 Mean :109.3 Mean :47.63 Mean :142.8
## 3rd Qu.:0.05810 3rd Qu.:120.0 3rd Qu.:50.00 3rd Qu.:143.0
## Max. :0.40000 Max. :140.0 Max. :52.00 Max. :148.2
## Alch Rel Carb Rel Balling Lvl Brand_A
## Min. :6.400 Min. :4.960 Min. :0.000 Min. :0.0000
## 1st Qu.:6.540 1st Qu.:5.340 1st Qu.:1.380 1st Qu.:0.0000
## Median :6.580 Median :5.400 Median :1.480 Median :0.0000
## Mean :6.903 Mean :5.438 Mean :2.062 Mean :0.1161
## 3rd Qu.:7.280 3rd Qu.:5.560 3rd Qu.:3.140 3rd Qu.:0.0000
## Max. :8.600 Max. :6.060 Max. :3.660 Max. :1.0000
## Brand_B Brand_C Brand_D Brand_NA
## Min. :0.000 Min. :0.0000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000
## Median :0.000 Median :0.0000 Median :0.000 Median :0.0000
## Mean :0.471 Mean :0.1228 Mean :0.245 Mean :0.0452
## 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.0000 Max. :1.000 Max. :1.0000
fit <- train(PH ~.,train_set, method = "pls",tunelength = 25, trControl = trainControl(method = "cv"))
summary(data)## Brand Code Carb Volume Fill Ounces PC Volume
## Length:2567 Min. :5.040 Min. :23.63 Min. :0.07933
## Class :character 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23933
## Mode :character Median :5.347 Median :23.97 Median :0.27133
## Mean :5.370 Mean :23.97 Mean :0.27715
## 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31067
## Max. :5.700 Max. :24.32 Max. :0.47800
## Carb Pressure Carb Temp PSC PSC Fill
## Min. :57.00 Min. :128.6 Min. :0.00200 Min. :0.0000
## 1st Qu.:65.60 1st Qu.:138.4 1st Qu.:0.05000 1st Qu.:0.1000
## Median :68.20 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.19 Mean :141.1 Mean :0.08453 Mean :0.1952
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :79.40 Max. :154.0 Max. :0.27000 Max. :0.6200
## PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure
## Min. :0.00000 Min. :-100.20 Min. :105.6 Min. :34.60
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.00
## Median :0.04000 Median : 70.20 Median :123.2 Median :46.40
## Mean :0.05619 Mean : 24.63 Mean :122.6 Mean :47.91
## 3rd Qu.:0.08000 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00
## Max. :0.24000 Max. : 229.40 Max. :140.2 Max. :60.40
## Hyd Pressure1 Hyd Pressure2 Hyd Pressure3 Hyd Pressure4
## Min. :-0.80 Min. : 0.00 Min. :-1.20 Min. : 62.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00
## Median :11.40 Median :28.60 Median :27.60 Median : 96.00
## Mean :12.45 Mean :21.04 Mean :20.52 Mean : 96.31
## 3rd Qu.:20.20 3rd Qu.:34.60 3rd Qu.:33.20 3rd Qu.:102.00
## Max. :58.00 Max. :59.40 Max. :50.00 Max. :142.00
## Filler Level Filler Speed Temperature Usage cont Carb Flow
## Min. : 55.8 Min. : 998 Min. :63.60 Min. :12.08 Min. : 26
## 1st Qu.: 98.7 1st Qu.:3890 1st Qu.:65.20 1st Qu.:18.38 1st Qu.:1169
## Median :118.4 Median :3982 Median :65.60 Median :21.79 Median :3028
## Mean :109.3 Mean :3694 Mean :65.96 Mean :21.00 Mean :2472
## 3rd Qu.:120.0 3rd Qu.:3997 3rd Qu.:66.40 3rd Qu.:23.74 3rd Qu.:3187
## Max. :161.2 Max. :4030 Max. :76.20 Max. :25.90 Max. :5104
## Density MFR Balling Pressure Vacuum
## Min. :0.240 Min. : 31.4 Min. :0.160 Min. :-6.600
## 1st Qu.:0.900 1st Qu.:708.2 1st Qu.:1.496 1st Qu.:-5.600
## Median :0.980 Median :724.0 Median :1.648 Median :-5.400
## Mean :1.174 Mean :705.7 Mean :2.200 Mean :-5.216
## 3rd Qu.:1.620 3rd Qu.:730.4 3rd Qu.:3.292 3rd Qu.:-5.000
## Max. :1.920 Max. :868.6 Max. :4.012 Max. :-3.600
## PH Oxygen Filler Bowl Setpoint Pressure Setpoint
## Min. :7.880 Min. :0.00240 Min. : 70.0 Min. :44.00
## 1st Qu.:8.440 1st Qu.:0.02200 1st Qu.:100.0 1st Qu.:46.00
## Median :8.540 Median :0.03340 Median :120.0 Median :46.00
## Mean :8.546 Mean :0.04637 Mean :109.4 Mean :47.61
## 3rd Qu.:8.680 3rd Qu.:0.05880 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :9.360 Max. :0.40000 Max. :140.0 Max. :52.00
## Air Pressurer Alch Rel Carb Rel Balling Lvl
## Min. :140.8 Min. :5.280 Min. :4.960 Min. :0.000
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.340 1st Qu.:1.380
## Median :142.6 Median :6.560 Median :5.400 Median :1.480
## Mean :142.8 Mean :6.897 Mean :5.437 Mean :2.051
## 3rd Qu.:143.0 3rd Qu.:7.230 3rd Qu.:5.540 3rd Qu.:3.140
## Max. :148.2 Max. :8.620 Max. :6.060 Max. :3.660
plot(fit)fit## Partial Least Squares
##
## 1792 samples
## 36 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1612, 1612, 1614, 1612, 1613, 1612, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.1684792 0.03232073 0.1352104
## 2 0.1670441 0.04847840 0.1341937
## 3 0.1540556 0.19149438 0.1207404
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
plsPred <- predict(fit,test_set)
postResample(plsPred,test_PH$PH)## RMSE Rsquared MAE
## 0.1544459 0.2301354 0.1233213
SVM
set.seed(34)
data_PH <- data_with_brand[,26]
data_data <- data_with_brand[,-26]
eval_data <- eval_with_brand[,-26]
train_idx <- sample(c(TRUE,FALSE), nrow(data_data),
replace=TRUE, prob=c(0.7,0.3))
train_set <- data_data[train_idx,-c(1)]
test_set <- data_data[!train_idx,-c(1)]
train_PH <- data.frame(data_PH[train_idx,])
test_PH <- data.frame(data_PH[!train_idx,])
set.seed(34)
svmRModel <- train(x = train_set,y = train_PH$PH,method = "svmRadial",tuneLength = 15,trControl = trainControl(method = "cv"),preProcess = c("center","scale"))
svmPred <- predict(svmRModel, newdata = test_set)
postResample(svmPred,test_PH$PH)## RMSE Rsquared MAE
## 0.11603414 0.56523103 0.08581204
Earth Model
set.seed(34)
earthGrid <- expand.grid(.degree = 1:2,.nprune = 2:38)
earthModel <- train(x = train_set,y = train_PH$PH,method = "earth",tuneGrid = expand.grid(.degree = 1,.nprune = 2:25),trControl = trainControl(method = "cv"),preProcess = c("center","scale"))
earthPred <- predict(earthModel, newdata = test_set)
postResample(earthPred,test_PH$PH)## RMSE Rsquared MAE
## 0.1336247 0.4223821 0.1037217
NNET
nnetGrid <- expand.grid(size = seq(1, 10),decay = c(0,.01,.1),bag = FALSE)
set.seed(34)
nnetModel <- train(x = train_set,y = train_PH$PH,method = "avNNet",tuneGrid = nnetGrid,preProc = c("center", "scale"),linout = TRUE,trace = FALSE,maxit = 100,MaxNWts = 10 * (ncol(train_set) + 1) + 10 + 1,trControl = trainControl(method = "cv"),preProcess = c("center","scale"))
nnetPred <- predict(nnetModel, newdata = test_set)
postResample(nnetPred,test_PH$PH)## RMSE Rsquared MAE
## 0.11546625 0.56833764 0.08422327
KNN
knnModel <- train(x = train_set,y = train_PH$PH,method = "knn",preProc = c("center", "scale"),tuneLength = 10)
knnModel## k-Nearest Neighbors
##
## 1792 samples
## 36 predictor
##
## Pre-processing: centered (36), scaled (36)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 1792, 1792, 1792, 1792, 1792, 1792, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.1339611 0.4249665 0.09806591
## 7 0.1319827 0.4318388 0.09771453
## 9 0.1313512 0.4324094 0.09802806
## 11 0.1310453 0.4323258 0.09838087
## 13 0.1312755 0.4292511 0.09926105
## 15 0.1315345 0.4263844 0.09974055
## 17 0.1317595 0.4240527 0.10026194
## 19 0.1323016 0.4193815 0.10095903
## 21 0.1326620 0.4163454 0.10128024
## 23 0.1332462 0.4110770 0.10179214
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
knnPred <- predict(knnModel, newdata = eval_data)
postResample(pred = knnPred, obs = test_PH$PH)## RMSE Rsquared MAE
## 0.2143956 NA 0.1700031
Cubist
set.seed(34)
cubistTune <- cubist(train_set,train_PH$PH,committees=100)
cubistTune_predictions <- predict(cubistTune,test_set)
postResample(cubistTune_predictions,test_PH$PH)## RMSE Rsquared MAE
## 0.10185133 0.67572056 0.07410222
GBM
set.seed(34)
gbmTune <- gbm(train_PH$PH~.,data=train_set, distribution = "gaussian")
gbmTune_predictions <- predict(gbmTune,test_set)## Using 100 trees...
postResample(gbmTune_predictions,test_PH$PH)## RMSE Rsquared MAE
## 0.1328323 0.4463457 0.1051459
#nrow(test_PH)
#median_prediction <- test_PH
#median_prediction$Median <- median(train_PH$PH)
#postResample(median_prediction$Median,median_prediction$PH)XGBoost
Step 1: Load Dataframe
- Convert Categorical Variables to Numeric
Step 2: Convert Character Vectors to Dummy Variables
- Use model.matrix() to convert character vectors to numeric matrices
data <- subset(data, !is.na(`Brand Code`))
brand <- model.matrix(~`Brand Code` - 1, data) # -1 to remove intercept
data <- data[,-c(1)]
data_comb <- cbind(data,brand)Step 3: Split Dataset and Construct Matrices
set.seed(34);
sample_rows <- sample(nrow(data_comb),nrow(data_comb) *.7)
dt <- sort(sample_rows)
test_set_xgb <- data_comb[-dt,]
train_set <- data_comb[dt,]
train_x_mat <- data.matrix(train_set[,-25]) #xgboost requires a matrix as the input
train_y <- train_set[,25] #Response Variable
test_x_mat <- data.matrix(test_set_xgb[,-25])
test_y <- test_set_xgb[,25] #Response VariableStep 4: Construct DMatrix and Run Model
- To show how easy it can be, I am not setting any parameters
- Watchlist is still necessary to clearly define
- verbose dictates whether each individual iteration will print model metrics
xgb_train <- xgb.DMatrix(data = train_x_mat,label=train_y)
xgb_test <- xgb.DMatrix(data = test_x_mat,label=test_y)
watchlist <- list(train = xgb_train,test = xgb_test)
xgb_model <- xgb.train(data=xgb_train,max.depth=3,watchlist=watchlist,
nrounds=100,verbose=0)
#summary(xgb_model)Step 5: Visualize Model Performance
- This model is inefficient because there are too many iterations.
- We can get away with 25 or fewer iterations
e <- data.frame(xgb_model$evaluation_log)
#which(e$test_rmse == min(e$test_rmse))
fig_2 <- ggplot(e, aes(iter,train_rmse))+
geom_point(color = 'blue',shape = 1)+
geom_line(data = e, aes(x=iter,y=test_rmse),color = 'red')+
ggtitle("XGBoost Model Root Mean Squared Error Per Iteration")+
theme(plot.title = element_text(hjust = 0.5))
imp <- xgb.importance(colnames(xgb_train),model = xgb_model)fig_2Step 6: Revise Model
- For the revised model, I only changed:
- nrounds = 25
- eta = 0.25 (down from 0.3)
xgb_model_tuned <- xgb.train(data=xgb_train,max.depth=3,watchlist=watchlist,
nrounds=25,verbose=0,eta = 0.25)
# eta of 0.01 removed all but month, clouds_all
e <- data.frame(xgb_model_tuned$evaluation_log)
#which(e$test_rmse == min(e$test_rmse))
fig_3 <- ggplot(e, aes(iter,train_rmse))+
geom_point(color = 'blue',shape = 1)+
geom_line(data = e, aes(x=iter,y=test_rmse),color = 'red')+
ggtitle("XGBoost Model Root Mean Squared Error Per Iteration")+
theme(plot.title = element_text(hjust = 0.5))
imp <- xgb.importance(colnames(xgb_train),model = xgb_model_tuned)Step 6: Revise Model
XGBoost Regression Example
Step 6: Revise Model
| XGBoost Tuned Model Score | |
|---|---|
| Mean_Squared_Error | 0.014 |
| Root_Mean_Squared_Error | 0.118 |
| Test_Mean | 8.539 |
| R_Squared | -0.131 |
There is a cyclical shape to the training data PH values when plotted by index. Peculiar.
Evaluation
eval <- subset(eval, !is.na(`Brand Code`))
brand <- model.matrix(~`Brand Code` - 1, eval) # -1 to remove intercept
eval <- eval[,-c(1)]
eval_comb <- cbind(eval,brand)
eval_x_mat <- data.matrix(eval_comb[,-25]) #xgboost requires a matrix as the input
eval_y <- eval_comb[,25]
xgb_eval <- xgb.DMatrix(data = eval_x_mat)
xgb_pred <- predict(xgb_model_tuned,xgb_eval)
predictions_xgb <- data.frame(cbind(xgb_pred))
colnames(predictions_xgb) <- c("XGBoost Predictions")
xgb_Pred <- predict(xgb_model_tuned,xgb_test)
postResample(xgb_Pred,test_y)## RMSE Rsquared MAE
## 0.11822114 0.53937458 0.09032222
Model Performance
#pct_diff <- mean(1 - estimates_table_17_30$Ratio_ETS_vs_Census_Projection)
model_performance1 <- data.frame("PLS" = t(postResample(plsPred,test_PH$PH)))
model_performance2 <-data.frame("SVM" = t(postResample(svmPred,test_PH$PH)))
model_performance_bad <-data.frame("Earth" = t(postResample(earthPred,test_PH$PH)))
model_performance3 <-data.frame("NNET" = t(postResample(nnetPred,test_PH$PH)))
model_performance4 <-data.frame("KNN" = t(postResample(pred = knnPred, obs = test_PH$PH)))
model_performance5 <-data.frame("GBM" = t(postResample(gbmTune_predictions,test_PH$PH)))
model_performance6 <-data.frame("Cubist" = t(postResample(cubistTune_predictions,test_PH$PH)))
model_performance8 <-data.frame("XGBoost" = t(postResample(xgb_Pred,test_y)))
colnames(model_performance1) <- c("RMSE","R-Squared","MAE")
colnames(model_performance2) <- c("RMSE","R-Squared","MAE")
colnames(model_performance_bad) <- c("RMSE","R-Squared","MAE")
colnames(model_performance3) <- c("RMSE","R-Squared","MAE")
colnames(model_performance4) <- c("RMSE","R-Squared","MAE")
colnames(model_performance5) <- c("RMSE","R-Squared","MAE")
colnames(model_performance6) <- c("RMSE","R-Squared","MAE")
colnames(model_performance8) <- c("RMSE","R-Squared","MAE")
model_performance <- rbind(model_performance1,model_performance2,model_performance_bad,model_performance3,model_performance4,model_performance5,model_performance6,model_performance8)
rownames(model_performance) <- c("PLS","SVM","Earth","NNET","KNN","GBM","Cubist","XGBoost")
model_performancekbl(model_performance, longtable = T, booktabs = T, caption = "Model Performance on Test Data") %>%
kable_styling(latex_options = c("repeat_header"))| RMSE | R-Squared | MAE | |
|---|---|---|---|
| PLS | 0.1544459 | 0.2301354 | 0.1233213 |
| SVM | 0.1160341 | 0.5652310 | 0.0858120 |
| Earth | 0.1336247 | 0.4223821 | 0.1037217 |
| NNET | 0.1154662 | 0.5683376 | 0.0842233 |
| KNN | 0.2143956 | NA | 0.1700031 |
| GBM | 0.1328323 | 0.4463457 | 0.1051459 |
| Cubist | 0.1018513 | 0.6757206 | 0.0741022 |
| XGBoost | 0.1182211 | 0.5393746 | 0.0903222 |
Final Predictions Table
eval_data <- eval_data[,-c(1)]
eval_ph_pred <- predict(cubistTune,eval_data)
eval_final$PH <- eval_ph_pred
file_path <- "eval_SHylton_Final_624.xlsx"
write.xlsx(eval_final, file_path, rowNames = FALSE)