DATA 624 Final Project
DATA 624 Final Project
R Libraries
Exploratory Data Analysis and PreProcessing
Data Statistics
## [1] 2571 33
## [1] 267 33
All predictors are numeric except for "Brand Code" (the 1st variable).
## Brand Code Carb Volume Fill Ounces PC Volume
## Length:2571 Min. :5.040 Min. :23.63 Min. :0.07933
## Class :character 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23917
## Mode :character Median :5.347 Median :23.97 Median :0.27133
## Mean :5.370 Mean :23.97 Mean :0.27712
## 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31200
## Max. :5.700 Max. :24.32 Max. :0.47800
## NA's :10 NA's :38 NA's :39
## Carb Pressure Carb Temp PSC PSC Fill
## Min. :57.00 Min. :128.6 Min. :0.00200 Min. :0.0000
## 1st Qu.:65.60 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.1000
## Median :68.20 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.19 Mean :141.1 Mean :0.08457 Mean :0.1954
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :79.40 Max. :154.0 Max. :0.27000 Max. :0.6200
## NA's :27 NA's :26 NA's :33 NA's :23
## PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure
## Min. :0.00000 Min. :-100.20 Min. :105.6 Min. :34.60
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.00
## Median :0.04000 Median : 65.20 Median :123.2 Median :46.40
## Mean :0.05641 Mean : 24.57 Mean :122.6 Mean :47.92
## 3rd Qu.:0.08000 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00
## Max. :0.24000 Max. : 229.40 Max. :140.2 Max. :60.40
## NA's :39 NA's :2 NA's :32 NA's :22
## Hyd Pressure1 Hyd Pressure2 Hyd Pressure3 Hyd Pressure4
## Min. :-0.80 Min. : 0.00 Min. :-1.20 Min. : 52.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00
## Median :11.40 Median :28.60 Median :27.60 Median : 96.00
## Mean :12.44 Mean :20.96 Mean :20.46 Mean : 96.29
## 3rd Qu.:20.20 3rd Qu.:34.60 3rd Qu.:33.40 3rd Qu.:102.00
## Max. :58.00 Max. :59.40 Max. :50.00 Max. :142.00
## NA's :11 NA's :15 NA's :15 NA's :30
## Filler Level Filler Speed Temperature Usage cont Carb Flow
## Min. : 55.8 Min. : 998 Min. :63.60 Min. :12.08 Min. : 26
## 1st Qu.: 98.3 1st Qu.:3888 1st Qu.:65.20 1st Qu.:18.36 1st Qu.:1144
## Median :118.4 Median :3982 Median :65.60 Median :21.79 Median :3028
## Mean :109.3 Mean :3687 Mean :65.97 Mean :20.99 Mean :2468
## 3rd Qu.:120.0 3rd Qu.:3998 3rd Qu.:66.40 3rd Qu.:23.75 3rd Qu.:3186
## Max. :161.2 Max. :4030 Max. :76.20 Max. :25.90 Max. :5104
## NA's :20 NA's :57 NA's :14 NA's :5 NA's :2
## Density MFR Balling Pressure Vacuum
## Min. :0.240 Min. : 31.4 Min. :-0.170 Min. :-6.600
## 1st Qu.:0.900 1st Qu.:706.3 1st Qu.: 1.496 1st Qu.:-5.600
## Median :0.980 Median :724.0 Median : 1.648 Median :-5.400
## Mean :1.174 Mean :704.0 Mean : 2.198 Mean :-5.216
## 3rd Qu.:1.620 3rd Qu.:731.0 3rd Qu.: 3.292 3rd Qu.:-5.000
## Max. :1.920 Max. :868.6 Max. : 4.012 Max. :-3.600
## NA's :1 NA's :212 NA's :1
## PH Oxygen Filler Bowl Setpoint Pressure Setpoint
## Min. :7.880 Min. :0.00240 Min. : 70.0 Min. :44.00
## 1st Qu.:8.440 1st Qu.:0.02200 1st Qu.:100.0 1st Qu.:46.00
## Median :8.540 Median :0.03340 Median :120.0 Median :46.00
## Mean :8.546 Mean :0.04684 Mean :109.3 Mean :47.62
## 3rd Qu.:8.680 3rd Qu.:0.06000 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :9.360 Max. :0.40000 Max. :140.0 Max. :52.00
## NA's :4 NA's :12 NA's :2 NA's :12
## Air Pressurer Alch Rel Carb Rel Balling Lvl
## Min. :140.8 Min. :5.280 Min. :4.960 Min. :0.00
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.340 1st Qu.:1.38
## Median :142.6 Median :6.560 Median :5.400 Median :1.48
## Mean :142.8 Mean :6.897 Mean :5.437 Mean :2.05
## 3rd Qu.:143.0 3rd Qu.:7.240 3rd Qu.:5.540 3rd Qu.:3.14
## Max. :148.2 Max. :8.620 Max. :6.060 Max. :3.66
## NA's :9 NA's :10 NA's :1
## Brand Code Carb Volume Fill Ounces PC Volume
## Length:267 Min. :5.147 Min. :23.75 Min. :0.09867
## Class :character 1st Qu.:5.287 1st Qu.:23.92 1st Qu.:0.23333
## Mode :character Median :5.340 Median :23.97 Median :0.27533
## Mean :5.369 Mean :23.97 Mean :0.27769
## 3rd Qu.:5.465 3rd Qu.:24.01 3rd Qu.:0.32200
## Max. :5.667 Max. :24.20 Max. :0.46400
## NA's :1 NA's :6 NA's :4
## Carb Pressure Carb Temp PSC PSC Fill
## Min. :60.20 Min. :130.0 Min. :0.00400 Min. :0.0200
## 1st Qu.:65.30 1st Qu.:138.4 1st Qu.:0.04450 1st Qu.:0.1000
## Median :68.00 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.25 Mean :141.2 Mean :0.08545 Mean :0.1903
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :77.60 Max. :154.0 Max. :0.24600 Max. :0.6200
## NA's :1 NA's :5 NA's :3
## PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure
## Min. :0.00000 Min. :-100.20 Min. :113.0 Min. :37.80
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:120.2 1st Qu.:46.00
## Median :0.04000 Median : 0.20 Median :123.4 Median :47.80
## Mean :0.05107 Mean : 21.03 Mean :123.0 Mean :48.14
## 3rd Qu.:0.06000 3rd Qu.: 141.30 3rd Qu.:125.5 3rd Qu.:50.20
## Max. :0.24000 Max. : 220.40 Max. :136.0 Max. :60.20
## NA's :5 NA's :4 NA's :2
## Hyd Pressure1 Hyd Pressure2 Hyd Pressure3 Hyd Pressure4
## Min. :-50.00 Min. :-50.00 Min. :-50.00 Min. : 68.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 90.00
## Median : 10.40 Median : 26.80 Median : 27.70 Median : 98.00
## Mean : 12.01 Mean : 20.11 Mean : 19.61 Mean : 97.84
## 3rd Qu.: 20.40 3rd Qu.: 34.80 3rd Qu.: 33.00 3rd Qu.:104.00
## Max. : 50.00 Max. : 61.40 Max. : 49.20 Max. :140.00
## NA's :1 NA's :1 NA's :4
## Filler Level Filler Speed Temperature Usage cont Carb Flow
## Min. : 69.2 Min. :1006 Min. :63.80 Min. :12.90 Min. : 0
## 1st Qu.:100.6 1st Qu.:3812 1st Qu.:65.40 1st Qu.:18.12 1st Qu.:1083
## Median :118.6 Median :3978 Median :65.80 Median :21.44 Median :3038
## Mean :110.3 Mean :3581 Mean :66.23 Mean :20.90 Mean :2409
## 3rd Qu.:120.2 3rd Qu.:3996 3rd Qu.:66.60 3rd Qu.:23.74 3rd Qu.:3215
## Max. :153.2 Max. :4020 Max. :75.40 Max. :24.60 Max. :3858
## NA's :2 NA's :10 NA's :2 NA's :2
## Density MFR Balling Pressure Vacuum
## Min. :0.060 Min. : 15.6 Min. :0.902 Min. :-6.400
## 1st Qu.:0.920 1st Qu.:707.0 1st Qu.:1.498 1st Qu.:-5.600
## Median :0.980 Median :724.6 Median :1.648 Median :-5.200
## Mean :1.177 Mean :697.8 Mean :2.203 Mean :-5.174
## 3rd Qu.:1.600 3rd Qu.:731.5 3rd Qu.:3.242 3rd Qu.:-4.800
## Max. :1.840 Max. :784.8 Max. :3.788 Max. :-3.600
## NA's :1 NA's :31 NA's :1 NA's :1
## PH Oxygen Filler Bowl Setpoint Pressure Setpoint
## Mode:logical Min. :0.00240 Min. : 70.0 Min. :44.00
## NA's:267 1st Qu.:0.01960 1st Qu.:100.0 1st Qu.:46.00
## Median :0.03370 Median :120.0 Median :46.00
## Mean :0.04666 Mean :109.6 Mean :47.73
## 3rd Qu.:0.05440 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :0.39800 Max. :130.0 Max. :52.00
## NA's :3 NA's :1 NA's :2
## Air Pressurer Alch Rel Carb Rel Balling Lvl
## Min. :141.2 Min. :6.400 Min. :5.18 Min. :0.000
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.34 1st Qu.:1.380
## Median :142.6 Median :6.580 Median :5.40 Median :1.480
## Mean :142.8 Mean :6.907 Mean :5.44 Mean :2.051
## 3rd Qu.:142.8 3rd Qu.:7.180 3rd Qu.:5.56 3rd Qu.:3.080
## Max. :147.2 Max. :7.820 Max. :5.74 Max. :3.420
## NA's :1 NA's :3 NA's :2
Handling missing values (Train and Test sets)
Correlation Analysis
# Analyze correlations with the response variable (PH).
# Predictor names are obtained by excluding the response by name instead of by
# a hard-coded column position, so this keeps working if columns are reordered.
names <- setdiff(colnames(train), "PH")
# Scatterplot matrix of PH against the first eight predictors.
pairs.panels(train[, c("PH", names[1:8])])
Top correlated features to PH: - Mnf Flow (-0.45) - Bowl Setpoint (0.35) - Filler Level (0.32) - Usage Cont (-0.32) - Pressure Setpoint (-0.31) - Hyd Pressure3 (-0.24) - Pressure Vacuum (0.22) - Hyd Pressure2 (-0.20)
# Spearman rank correlations over every column except the character-valued
# brand code, which cor() cannot handle.
train2 <- train %>% dplyr::select(-"Brand Code")
mydata.cor <- cor(train2, method = "spearman")
corrplot(mydata.cor, cl.cex = 0.7, tl.cex = 0.7, diag = TRUE)
Near Zero Variance Predictors
## NULL
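“Hyd Pressure1” has near-zero variance and should be removed from the dataset: it is not strictly constant (it ranges from -0.80 to 58.00 in the training data), but a large share of its observations take the same value (0).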
Data Distribution & Variability
############ Look at some Decision Trees ##################
# Loads and runs an R script straight from a remote GitHub URL; whatever it
# defines or attaches ends up in the current session (the package start-up
# messages that follow come from this call).
# NOTE(review): sourcing unpinned remote code is a reproducibility and
# security risk -- consider vendoring the script or pinning the URL to a
# commit hash.
# NOTE(review): presumably this script defines RF_with_Nulls(), which is
# called further down -- confirm.
source("https://raw.githubusercontent.com/crarnouts/Data_605_Final/master/RandomForestNulls_testing.R")
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:xgboost':
##
## xgboost
## The following object is masked from 'package:VIM':
##
## wine
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
## Loading required package: libcoin
##
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
## node_terminal, varimp
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'memisc'
## The following objects are masked from 'package:modeltools':
##
## Lapply, relabel
## The following object is masked from 'package:Matrix':
##
## as.array
## The following objects are masked from 'package:dplyr':
##
## collect, recode, rename, syms
## The following object is masked from 'package:purrr':
##
## %@%
## The following object is masked from 'package:tibble':
##
## view
## The following object is masked from 'package:ggplot2':
##
## syms
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
##
## as.array
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:memisc':
##
## rename, style
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:xgboost':
##
## slice
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Coerce both sets to plain data frames and make their column names
# syntactically valid (e.g. "Brand Code" becomes "Brand.Code").
train <- as.data.frame(train)
test <- as.data.frame(test)
colnames(train) <- make.names(colnames(train), unique = TRUE)
colnames(test) <- make.names(colnames(test), unique = TRUE)
# RF_with_Nulls() is not defined in this file; its arguments are passed
# positionally exactly as before. Its result replaces `test`.
test <- RF_with_Nulls(train, test, "PH", .5, 5, 10, .01, 5, 1)
Modeling & Evaluation
Linear Regression
##
## Call:
## lm(formula = PH ~ ., data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43780 -0.07713 0.01232 0.08596 0.81044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.097e+01 1.119e+00 9.798 < 2e-16 ***
## Brand.CodeB 4.413e-02 2.697e-02 1.636 0.101981
## Brand.CodeC -8.757e-02 2.701e-02 -3.242 0.001208 **
## Brand.CodeD 5.665e-02 1.843e-02 3.074 0.002143 **
## Carb.Volume -7.979e-02 7.379e-02 -1.081 0.279718
## Fill.Ounces -9.033e-02 3.767e-02 -2.398 0.016598 *
## PC.Volume -7.776e-02 6.486e-02 -1.199 0.230738
## Carb.Pressure 3.782e-04 2.924e-03 0.129 0.897092
## Carb.Temp 1.057e-03 2.337e-03 0.452 0.651270
## PSC -6.627e-02 6.986e-02 -0.949 0.342964
## PSC.Fill -2.460e-02 2.758e-02 -0.892 0.372442
## PSC.CO2 -1.191e-01 7.473e-02 -1.594 0.111098
## Mnf.Flow -7.351e-04 5.399e-05 -13.615 < 2e-16 ***
## Carb.Pressure1 7.027e-03 8.146e-04 8.626 < 2e-16 ***
## Fill.Pressure 5.140e-04 1.443e-03 0.356 0.721753
## Hyd.Pressure1 -3.977e-05 4.380e-04 -0.091 0.927661
## Hyd.Pressure2 -9.505e-04 6.220e-04 -1.528 0.126657
## Hyd.Pressure3 3.362e-03 6.882e-04 4.885 1.13e-06 ***
## Hyd.Pressure4 1.527e-04 4.073e-04 0.375 0.707680
## Filler.Level -1.302e-03 6.280e-04 -2.073 0.038295 *
## Filler.Speed 1.500e-05 8.213e-06 1.826 0.067993 .
## Temperature -1.733e-02 2.712e-03 -6.391 2.10e-10 ***
## Usage.cont -7.608e-03 1.338e-03 -5.688 1.51e-08 ***
## Carb.Flow 8.991e-06 4.499e-06 1.998 0.045843 *
## Density -1.342e-01 3.378e-02 -3.972 7.41e-05 ***
## MFR -9.279e-05 4.829e-05 -1.922 0.054804 .
## Balling -9.081e-02 2.866e-02 -3.169 0.001559 **
## Pressure.Vacuum -1.816e-02 9.041e-03 -2.009 0.044693 *
## Oxygen.Filler -2.438e-01 8.488e-02 -2.873 0.004119 **
## Bowl.Setpoint 3.196e-03 6.649e-04 4.807 1.67e-06 ***
## Pressure.Setpoint -7.781e-03 2.294e-03 -3.391 0.000711 ***
## Air.Pressurer -2.829e-03 2.815e-03 -1.005 0.315143
## Alch.Rel 4.153e-02 2.456e-02 1.691 0.090969 .
## Carb.Rel 1.259e-01 5.661e-02 2.223 0.026310 *
## Balling.Lvl 1.167e-01 2.800e-02 4.168 3.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.131 on 1764 degrees of freedom
## Multiple R-squared: 0.4281, Adjusted R-squared: 0.4171
## F-statistic: 38.84 on 34 and 1764 DF, p-value: < 2.2e-16
Bagged Tree
# Bagged trees via caret's "bag" method, built from the ctreeBag helper
# functions and evaluated with 5-fold cross-validation.
set.seed(seed)
# Named `bag_ctrl` so the object no longer shadows caret's bagControl().
bag_ctrl <- bagControl(
  fit = ctreeBag$fit,
  predict = ctreeBag$pred,
  aggregate = ctreeBag$aggregate
)
# `center = TRUE`, `scale = TRUE` and `tuneLength = 25` were removed: they are
# not honoured here -- the fitted model reports "No pre-processing" and a
# single tuning value ('vars' held constant) -- so they only misled readers.
# Use `preProcess = c("center", "scale")` if scaling is ever actually wanted.
bag_model <- train(
  PH ~ .,
  data = training,
  method = "bag",
  bagControl = bag_ctrl,
  trControl = trainControl("cv", number = 5)
)
bag_model
## Bagged Model
##
## 1799 samples
## 32 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1440, 1440, 1438, 1439, 1439
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1145181 0.5545603 0.08621781
##
## Tuning parameter 'vars' was held constant at a value of 34
## RMSE Rsquared MAE
## 0.11960157 0.53213860 0.08729692
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 32)
##
## Overall
## Oxygen.Filler 100.00
## Filler.Level 80.56
## Balling 67.86
## Mnf.Flow 67.62
## Filler.Speed 66.02
## Hyd.Pressure3 59.08
## Bowl.Setpoint 55.79
## Fill.Pressure 55.30
## Hyd.Pressure1 55.29
## Hyd.Pressure2 51.49
## Usage.cont 48.60
## Pressure.Setpoint 47.31
## Density 42.23
## Carb.Pressure1 39.17
## Balling.Lvl 32.79
## Brand.Code 32.31
## Temperature 29.51
## Carb.Rel 29.16
## Carb.Flow 27.12
## Carb.Volume 26.51
XGBoost (Extreme Gradient Boosting)
# Converting datasets to matrices
# xgboost needs numeric input: rows with a missing brand code are dropped, then
# model.matrix() expands the non-numeric Brand.Code into indicator columns
# (`+ 0` suppresses the intercept column). PH is excluded by name and supplied
# separately as the label.
training2 <- training %>% drop_na(Brand.Code)
testing2 <- testing %>% drop_na(Brand.Code)
trainingmx <- model.matrix(
  ~ . + 0,
  data = training2[, setdiff(names(training2), "PH")]
)
testingmx <- model.matrix(
  ~ . + 0,
  data = testing2[, setdiff(names(testing2), "PH")]
)
trainingdmx <- xgb.DMatrix(data = trainingmx, label = training2$PH)
testingdmx <- xgb.DMatrix(data = testingmx, label = testing2$PH)
# Default parameters
# NOTE(review): "reg:linear" is a deprecated alias of "reg:squarederror" in
# newer xgboost releases -- switch once the installed version supports it.
params <- list(
  booster = "gbtree", objective = "reg:linear", eta = 0.3, gamma = 0,
  max_depth = 6, min_child_weight = 1, subsample = 1, colsample_bytree = 1
)
# Determine the best nrounds value (it controls the maximum number of boosting
# iterations; for classification it is similar to the number of trees to grow).
# The early-stopping argument is spelled `early_stopping_rounds`; the previous
# `early_stop_rounds` was not recognised, which is why all 300 rounds ran.
# The seed makes the random fold assignment reproducible.
set.seed(seed)
xgbcv <- xgb.cv(
  params = params, data = trainingdmx, nrounds = 300, nfold = 5,
  showsd = TRUE, stratified = TRUE, print_every_n = 10,
  early_stopping_rounds = 20, maximize = FALSE
)
## [1] train-rmse:5.636696+0.001515 test-rmse:5.636732+0.009227
## [11] train-rmse:0.191947+0.000879 test-rmse:0.206348+0.004370
## [21] train-rmse:0.066225+0.002361 test-rmse:0.113374+0.007932
## [31] train-rmse:0.046765+0.002375 test-rmse:0.110666+0.008287
## [41] train-rmse:0.035374+0.002012 test-rmse:0.109527+0.008160
## [51] train-rmse:0.026846+0.002387 test-rmse:0.108965+0.008202
## [61] train-rmse:0.021231+0.001837 test-rmse:0.108551+0.008480
## [71] train-rmse:0.016327+0.001237 test-rmse:0.108549+0.008515
## [81] train-rmse:0.012567+0.000949 test-rmse:0.108247+0.008466
## [91] train-rmse:0.009701+0.000745 test-rmse:0.108126+0.008447
## [101] train-rmse:0.007588+0.000251 test-rmse:0.108021+0.008429
## [111] train-rmse:0.005808+0.000187 test-rmse:0.108028+0.008379
## [121] train-rmse:0.004417+0.000120 test-rmse:0.108032+0.008357
## [131] train-rmse:0.003541+0.000087 test-rmse:0.108003+0.008371
## [141] train-rmse:0.002766+0.000155 test-rmse:0.108009+0.008362
## [151] train-rmse:0.002180+0.000108 test-rmse:0.107965+0.008336
## [161] train-rmse:0.001672+0.000114 test-rmse:0.107956+0.008345
## [171] train-rmse:0.001352+0.000110 test-rmse:0.107959+0.008327
## [181] train-rmse:0.001081+0.000060 test-rmse:0.107953+0.008321
## [191] train-rmse:0.000958+0.000052 test-rmse:0.107945+0.008326
## [201] train-rmse:0.000958+0.000052 test-rmse:0.107945+0.008326
## [211] train-rmse:0.000958+0.000052 test-rmse:0.107945+0.008326
## [221] train-rmse:0.000958+0.000052 test-rmse:0.107945+0.008326
## [231] train-rmse:0.000958+0.000052 test-rmse:0.107945+0.008326
## [241] train-rmse:0.000958+0.000052 test-rmse:0.107945+0.008326
## [251] train-rmse:0.000958+0.000052 test-rmse:0.107945+0.008326
## [261] train-rmse:0.000958+0.000052 test-rmse:0.107945+0.008326
## [271] train-rmse:0.000958+0.000052 test-rmse:0.107945+0.008326
## [281] train-rmse:0.000958+0.000052 test-rmse:0.107945+0.008326
## [291] train-rmse:0.000958+0.000052 test-rmse:0.107945+0.008326
## [300] train-rmse:0.000958+0.000052 test-rmse:0.107945+0.008326
# Fit the boosted model with the shared `params`, reporting RMSE on both the
# training matrix and the held-out matrix every 10 rounds.
set.seed(seed)
# Two fixes relative to the earlier call:
#  * the argument is `early_stopping_rounds` (the misspelled `early_stop_round`
#    was ignored, so training always ran the full 260 rounds);
#  * early stopping tracks the LAST watchlist entry, so the held-out set is now
#    listed last instead of the training set.
# NOTE(review): stopping on the same hold-out set that is later used to report
# performance makes that estimate optimistic -- consider a separate validation
# split.
xgb_model1 <- xgb.train(
  params = params, data = trainingdmx, nrounds = 260,
  watchlist = list(train = trainingdmx, val = testingdmx),
  print_every_n = 10, early_stopping_rounds = 10, maximize = FALSE
)
## [1] val-rmse:5.634269 train-rmse:5.636359
## [11] val-rmse:0.207695 train-rmse:0.191605
## [21] val-rmse:0.111853 train-rmse:0.065841
## [31] val-rmse:0.108567 train-rmse:0.049571
## [41] val-rmse:0.107455 train-rmse:0.036783
## [51] val-rmse:0.107182 train-rmse:0.029606
## [61] val-rmse:0.106665 train-rmse:0.022887
## [71] val-rmse:0.106672 train-rmse:0.018759
## [81] val-rmse:0.106710 train-rmse:0.015035
## [91] val-rmse:0.106878 train-rmse:0.011372
## [101] val-rmse:0.106759 train-rmse:0.009078
## [111] val-rmse:0.106818 train-rmse:0.007439
## [121] val-rmse:0.106748 train-rmse:0.005986
## [131] val-rmse:0.106701 train-rmse:0.005089
## [141] val-rmse:0.106724 train-rmse:0.004086
## [151] val-rmse:0.106759 train-rmse:0.003042
## [161] val-rmse:0.106750 train-rmse:0.002497
## [171] val-rmse:0.106786 train-rmse:0.002073
## [181] val-rmse:0.106786 train-rmse:0.001660
## [191] val-rmse:0.106784 train-rmse:0.001341
## [201] val-rmse:0.106782 train-rmse:0.001143
## [211] val-rmse:0.106782 train-rmse:0.001143
## [221] val-rmse:0.106782 train-rmse:0.001143
## [231] val-rmse:0.106782 train-rmse:0.001143
## [241] val-rmse:0.106782 train-rmse:0.001143
## [251] val-rmse:0.106782 train-rmse:0.001143
## [260] val-rmse:0.106782 train-rmse:0.001143
# Feature importance of the fitted booster, labelled with the design-matrix
# column names, followed by its importance plot.
mat <- xgb.importance(
  model = xgb_model1,
  feature_names = colnames(trainingmx)
)
xgb.plot.importance(importance_matrix = mat)
SVM
# Radial-basis-function SVM on centred and scaled predictors, tuned over 14
# candidate cost values with 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)
set.seed(seed)
svmRad <- train(
  PH ~ .,
  data = training,
  method = "svmRadial",
  # Spelled out in full: `preProc` only worked through partial argument
  # matching, which breaks silently if another argument shares the prefix.
  preProcess = c("center", "scale"),
  tuneLength = 14,
  trControl = ctrl
)
svmRad
## Support Vector Machines with Radial Basis Function Kernel
##
## 1799 samples
## 32 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1619, 1619, 1621, 1619, 1619, 1619, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.1240703 0.4871182 0.09330828
## 0.50 0.1207528 0.5096012 0.09002699
## 1.00 0.1179431 0.5302617 0.08711590
## 2.00 0.1155894 0.5483718 0.08514449
## 4.00 0.1150420 0.5530793 0.08481448
## 8.00 0.1147432 0.5581778 0.08512456
## 16.00 0.1159939 0.5551358 0.08672156
## 32.00 0.1182233 0.5467311 0.08840054
## 64.00 0.1226181 0.5252908 0.09197129
## 128.00 0.1283420 0.4958995 0.09620044
## 256.00 0.1343225 0.4673940 0.10031193
## 512.00 0.1388779 0.4463327 0.10344380
## 1024.00 0.1401036 0.4415901 0.10427770
## 2048.00 0.1401036 0.4415901 0.10427770
##
## Tuning parameter 'sigma' was held constant at a value of 0.02032842
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.02032842 and C = 8.
## RMSE Rsquared MAE
## 0.11906789 0.54290102 0.08752188
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 32)
##
## Overall
## Oxygen.Filler 100.00
## Filler.Level 80.56
## Balling 67.86
## Mnf.Flow 67.62
## Filler.Speed 66.02
## Hyd.Pressure3 59.08
## Bowl.Setpoint 55.79
## Fill.Pressure 55.30
## Hyd.Pressure1 55.29
## Hyd.Pressure2 51.49
## Usage.cont 48.60
## Pressure.Setpoint 47.31
## Density 42.23
## Carb.Pressure1 39.17
## Balling.Lvl 32.79
## Brand.Code 32.31
## Temperature 29.51
## Carb.Rel 29.16
## Carb.Flow 27.12
## Carb.Volume 26.51
Cubist
# Cubist model on all predictors. No trControl is supplied, so caret falls back
# to its default resampling scheme rather than the cross-validation used for
# the other models.
set.seed(seed)
cubist <- train(PH ~ ., data = training, method = "cubist")
# Display model performance
print(cubist)
## Cubist
##
## 1799 samples
## 32 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 1799, 1799, 1799, 1799, 1799, 1799, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 0.1560886 0.3659507 0.10532814
## 1 5 0.1544346 0.3899526 0.10396623
## 1 9 0.1540815 0.3871304 0.10378719
## 10 0 0.1102978 0.5827615 0.08022497
## 10 5 0.1083469 0.6004825 0.07832614
## 10 9 0.1083713 0.5991803 0.07850992
## 20 0 0.1057385 0.6155050 0.07683105
## 20 5 0.1036342 0.6311569 0.07473664
## 20 9 0.1037185 0.6301280 0.07505344
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
## cubist variable importance
##
## only 20 most important variables shown (out of 34)
##
## Overall
## Mnf.Flow 100.00
## Balling.Lvl 63.58
## Balling 60.49
## Alch.Rel 60.49
## Pressure.Vacuum 55.56
## Density 52.47
## Temperature 52.47
## Air.Pressurer 51.23
## Oxygen.Filler 48.77
## Bowl.Setpoint 40.12
## Hyd.Pressure3 40.12
## Brand.CodeC 39.51
## Carb.Pressure1 38.89
## Usage.cont 37.04
## Hyd.Pressure2 36.42
## Carb.Rel 33.95
## Carb.Flow 29.01
## Filler.Speed 28.40
## Hyd.Pressure1 24.07
## Filler.Level 22.84
# Generate hold-out predictions from the Cubist model.
# NOTE(review): `Xtest` is defined outside this excerpt -- presumably the
# predictor columns of `testing`; confirm.
cubist_pred <- predict(cubist, newdata = Xtest)
# Evaluate the model over the test set (RMSE, R-squared, MAE).
postResample(pred = cubist_pred, obs = testing$PH)
## RMSE Rsquared MAE
## 0.09990521 0.67382390 0.07168616
Random Forest
# Random forest (ranger) with permutation importance, tuned over 10 candidate
# settings using 10-fold cross-validation; parallel resampling is allowed.
ctrl <- trainControl(method = "cv", number = 10, allowParallel = TRUE)
set.seed(seed)
rforest <- train(
  PH ~ .,
  data = training,
  method = "ranger",
  trControl = ctrl,
  tuneLength = 10,
  importance = "permutation"
)
# Display model performance
rforest
## Random Forest
##
## 1799 samples
## 32 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1619, 1619, 1621, 1619, 1619, 1619, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.11091069 0.6278445 0.08427415
## 2 extratrees 0.11631393 0.5838127 0.08967899
## 5 variance 0.10338131 0.6654449 0.07705305
## 5 extratrees 0.10586207 0.6448788 0.07973515
## 9 variance 0.10028320 0.6787463 0.07427501
## 9 extratrees 0.10106000 0.6707174 0.07508786
## 12 variance 0.09915982 0.6832656 0.07318312
## 12 extratrees 0.09950329 0.6791071 0.07360056
## 16 variance 0.09784055 0.6896823 0.07209405
## 16 extratrees 0.09825349 0.6852784 0.07251289
## 19 variance 0.09746701 0.6900985 0.07156427
## 19 extratrees 0.09775457 0.6873250 0.07191439
## 23 variance 0.09703644 0.6912087 0.07115587
## 23 extratrees 0.09670827 0.6933017 0.07128369
## 26 variance 0.09650332 0.6936401 0.07080292
## 26 extratrees 0.09688076 0.6912843 0.07127656
## 30 variance 0.09628807 0.6932562 0.07059972
## 30 extratrees 0.09675070 0.6911588 0.07119352
## 34 variance 0.09619021 0.6929683 0.07044697
## 34 extratrees 0.09652509 0.6918887 0.07096247
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 34, splitrule = variance
## and min.node.size = 5.
## ranger variable importance
##
## only 20 most important variables shown (out of 34)
##
## Overall
## Mnf.Flow 100.000
## Alch.Rel 30.543
## Brand.CodeC 28.366
## Oxygen.Filler 14.921
## Air.Pressurer 13.583
## Balling.Lvl 13.401
## Usage.cont 13.235
## Pressure.Vacuum 11.925
## Bowl.Setpoint 10.271
## Carb.Rel 9.427
## Carb.Flow 7.645
## Balling 7.592
## Temperature 6.354
## Filler.Speed 6.278
## Carb.Pressure1 5.234
## Density 5.085
## Hyd.Pressure3 5.036
## MFR 3.959
## Filler.Level 3.906
## Carb.Volume 2.863
# Generate hold-out predictions from the random forest.
# NOTE(review): `Xtest` is defined outside this excerpt -- presumably the
# predictor columns of `testing`; confirm.
rf_pred <- predict(rforest, newdata = Xtest)
# Evaluate the model over the test set (RMSE, R-squared, MAE).
postResample(pred = rf_pred, obs = testing$PH)
## RMSE Rsquared MAE
## 0.09796899 0.69068834 0.07041744
Training Random Forest over Individual Brand Codes
# Fit and evaluate a separate ranger random forest for every brand code:
# train on that brand's rows of `training`, print the tuning results and
# variable importance, then score on the matching rows of `testing`.
# The dplyr verbs are namespace-qualified (as the correlation section already
# does for select) because MASS and plotly, attached earlier, both mask
# `select`, and plotly masks `filter`.
for (brand_code in unique(training$Brand.Code)) {
  # `Brand.Code == NA` never matches any row, so a missing brand code cannot
  # be modelled; skip it instead of failing on an empty training subset.
  if (is.na(brand_code)) {
    next
  }
  print(paste("Brand Code", brand_code))
  temp_df <- training %>%
    dplyr::filter(Brand.Code == brand_code) %>%
    dplyr::select(-Brand.Code)
  # Re-seed per brand so each fit uses the same resampling randomness.
  set.seed(seed)
  temp_rf <- train(
    PH ~ .,
    data = temp_df,
    method = "ranger",
    importance = "permutation",
    trControl = ctrl
  )
  print(temp_rf)
  print(varImp(temp_rf))
  temp_test <- testing %>%
    dplyr::filter(Brand.Code == brand_code) %>%
    dplyr::select(-Brand.Code)
  # Nothing to score if the hold-out set has no rows for this brand.
  if (nrow(temp_test) == 0) {
    message("No test rows for brand code ", brand_code, "; skipping evaluation")
    next
  }
  temp_predictions <- predict(temp_rf, temp_test)
  print(postResample(pred = temp_predictions, obs = temp_test$PH))
}
## [1] "Brand Code B"
## Random Forest
##
## 945 samples
## 31 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 850, 850, 851, 850, 851, 852, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.10470334 0.6452117 0.07942344
## 2 extratrees 0.10927524 0.6177290 0.08433308
## 16 variance 0.09343016 0.7024812 0.06827668
## 16 extratrees 0.09157151 0.7163960 0.06780085
## 31 variance 0.09294968 0.7025579 0.06703315
## 31 extratrees 0.08940891 0.7275844 0.06585480
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
## and min.node.size = 5.
## ranger variable importance
##
## only 20 most important variables shown (out of 31)
##
## Overall
## Mnf.Flow 100.000
## Bowl.Setpoint 33.310
## Air.Pressurer 19.445
## Oxygen.Filler 12.418
## Filler.Level 12.253
## Pressure.Setpoint 8.454
## Pressure.Vacuum 8.370
## Carb.Flow 8.348
## Usage.cont 7.334
## Carb.Rel 7.245
## Density 6.582
## Balling 6.027
## Temperature 4.735
## Balling.Lvl 4.425
## Alch.Rel 3.993
## Hyd.Pressure3 3.929
## Hyd.Pressure2 3.478
## Carb.Pressure1 2.921
## Hyd.Pressure1 2.704
## Fill.Pressure 2.000
## RMSE Rsquared MAE
## 0.09294953 0.72636362 0.06617225
## [1] "Brand Code A"
## Random Forest
##
## 192 samples
## 31 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 173, 171, 172, 172, 174, 174, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.1169269 0.5391498 0.09014456
## 2 extratrees 0.1198074 0.5299681 0.09380621
## 16 variance 0.1123055 0.5434992 0.08681613
## 16 extratrees 0.1094826 0.5785774 0.08439492
## 31 variance 0.1136257 0.5243826 0.08751850
## 31 extratrees 0.1094616 0.5692463 0.08417489
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
## and min.node.size = 5.
## ranger variable importance
##
## only 20 most important variables shown (out of 31)
##
## Overall
## Mnf.Flow 100.000
## Filler.Level 42.237
## Usage.cont 38.028
## Bowl.Setpoint 37.465
## Pressure.Vacuum 31.191
## Oxygen.Filler 26.377
## Pressure.Setpoint 17.390
## Carb.Flow 12.625
## Carb.Pressure1 9.461
## Balling.Lvl 8.911
## Carb.Volume 7.826
## Air.Pressurer 7.292
## Hyd.Pressure3 7.252
## Fill.Ounces 6.705
## Filler.Speed 6.702
## Balling 6.017
## Hyd.Pressure1 5.453
## Carb.Temp 5.243
## Hyd.Pressure2 5.188
## Alch.Rel 5.162
## RMSE Rsquared MAE
## 0.11403307 0.55179038 0.09189937
## [1] "Brand Code C"
## Random Forest
##
## 226 samples
## 31 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 203, 203, 204, 204, 203, 205, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.1464799 0.4011552 0.10941478
## 2 extratrees 0.1492540 0.3985656 0.11347936
## 16 variance 0.1352869 0.4436095 0.10063078
## 16 extratrees 0.1366081 0.4702384 0.10189248
## 31 variance 0.1314457 0.4626753 0.09781079
## 31 extratrees 0.1343789 0.4815739 0.09988419
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 31, splitrule = variance
## and min.node.size = 5.
## ranger variable importance
##
## only 20 most important variables shown (out of 31)
##
## Overall
## Oxygen.Filler 100.000
## Carb.Rel 42.658
## Alch.Rel 16.311
## Filler.Speed 14.316
## Pressure.Vacuum 12.050
## Hyd.Pressure1 9.698
## MFR 7.925
## Density 5.515
## Fill.Pressure 4.632
## Balling 4.160
## PC.Volume 3.565
## Mnf.Flow 2.964
## Balling.Lvl 2.707
## Carb.Flow 2.475
## Usage.cont 2.273
## Pressure.Setpoint 1.863
## Hyd.Pressure3 1.707
## Hyd.Pressure4 1.544
## Hyd.Pressure2 1.411
## Carb.Pressure1 1.371
## RMSE Rsquared MAE
## 0.1616575 0.2025728 0.1115120
## [1] "Brand Code D"
## Random Forest
##
## 436 samples
## 31 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 392, 392, 393, 393, 392, 393, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.09835599 0.5757454 0.07750084
## 2 extratrees 0.10451008 0.5122990 0.08307862
## 16 variance 0.08787819 0.6221327 0.06751841
## 16 extratrees 0.08786363 0.6335255 0.06775610
## 31 variance 0.08680853 0.6206075 0.06651967
## 31 extratrees 0.08578657 0.6413579 0.06604398
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
## and min.node.size = 5.
## ranger variable importance
##
## only 20 most important variables shown (out of 31)
##
## Overall
## Mnf.Flow 100.000
## Usage.cont 29.552
## Pressure.Vacuum 24.613
## Hyd.Pressure3 15.997
## Carb.Pressure1 10.552
## Density 10.477
## Oxygen.Filler 9.951
## Bowl.Setpoint 9.338
## Carb.Flow 8.009
## Temperature 7.724
## Filler.Speed 7.227
## Alch.Rel 5.928
## Filler.Level 5.902
## Balling.Lvl 5.803
## Hyd.Pressure2 5.101
## Air.Pressurer 4.214
## Pressure.Setpoint 4.175
## Balling 3.902
## Hyd.Pressure1 3.467
## Carb.Rel 3.455
## RMSE Rsquared MAE
## 0.07858216 0.67593756 0.05911230
Predicting New Data w/Random Forest
# Score the evaluation workbook with a random forest trained on the full
# training data, then write the workbook back out with PH filled in.
pfile <- read_excel("StudentEvaluation.xlsx")
# Preparing the dataset we will ultimately predict PH on.
# PH is dropped by exact name: the previous `-grep("PH", ...)` would also drop
# any column merely containing "PH", and would select zero columns if nothing
# matched (negating an empty index keeps nothing).
test <- pfile[, setdiff(colnames(pfile), "PH")]
# Impute missing predictor values with k-nearest neighbours; `imp_var = FALSE`
# suppresses the extra indicator columns.
test <- kNN(test, imp_var = FALSE)
colnames(test) <- make.names(colnames(test), unique = TRUE)
ctrl <- trainControl(method = "cv", number = 10)
set.seed(seed)
rf_model <- train(
  PH ~ .,
  data = train,
  method = "ranger",
  importance = "permutation",
  tuneLength = 10,
  trControl = ctrl
)
final_rf_pred <- predict(rf_model, newdata = as.data.frame(test))
# Guard against silently misaligned output if any rows were dropped while
# predicting.
stopifnot(length(final_rf_pred) == nrow(pfile))
pfile$PH <- final_rf_pred # applying predictions to unimputed dataset
write_xlsx(pfile, "Predictions_file.xlsx") # write to excel