DATA 624 Final Project
DATA 624 Final Project
R Libraries
## Loading required package: lattice
## Loading required package: ggplot2
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------------ tidyverse 1.2.1 --
## v tibble 2.1.1 v purrr 0.2.5
## v tidyr 0.8.3 v dplyr 0.8.0.1
## v readr 1.1.1 v stringr 1.3.1
## v tibble 2.1.1 v forcats 0.3.0
## -- Conflicts --------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## corrplot 0.84 loaded
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
Exploratory Data Analysis and PreProcessing
Data Statistics
## [1] 2571 33
## [1] 267 33
All predictors are numeric except for "Brand Code" (the first variable), which is a character column.
## Brand Code Carb Volume Fill Ounces PC Volume
## Length:2571 Min. :5.040 Min. :23.63 Min. :0.07933
## Class :character 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23917
## Mode :character Median :5.347 Median :23.97 Median :0.27133
## Mean :5.370 Mean :23.97 Mean :0.27712
## 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31200
## Max. :5.700 Max. :24.32 Max. :0.47800
## NA's :10 NA's :38 NA's :39
## Carb Pressure Carb Temp PSC PSC Fill
## Min. :57.00 Min. :128.6 Min. :0.00200 Min. :0.0000
## 1st Qu.:65.60 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.1000
## Median :68.20 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.19 Mean :141.1 Mean :0.08457 Mean :0.1954
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :79.40 Max. :154.0 Max. :0.27000 Max. :0.6200
## NA's :27 NA's :26 NA's :33 NA's :23
## PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure
## Min. :0.00000 Min. :-100.20 Min. :105.6 Min. :34.60
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.00
## Median :0.04000 Median : 65.20 Median :123.2 Median :46.40
## Mean :0.05641 Mean : 24.57 Mean :122.6 Mean :47.92
## 3rd Qu.:0.08000 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00
## Max. :0.24000 Max. : 229.40 Max. :140.2 Max. :60.40
## NA's :39 NA's :2 NA's :32 NA's :22
## Hyd Pressure1 Hyd Pressure2 Hyd Pressure3 Hyd Pressure4
## Min. :-0.80 Min. : 0.00 Min. :-1.20 Min. : 52.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00
## Median :11.40 Median :28.60 Median :27.60 Median : 96.00
## Mean :12.44 Mean :20.96 Mean :20.46 Mean : 96.29
## 3rd Qu.:20.20 3rd Qu.:34.60 3rd Qu.:33.40 3rd Qu.:102.00
## Max. :58.00 Max. :59.40 Max. :50.00 Max. :142.00
## NA's :11 NA's :15 NA's :15 NA's :30
## Filler Level Filler Speed Temperature Usage cont
## Min. : 55.8 Min. : 998 Min. :63.60 Min. :12.08
## 1st Qu.: 98.3 1st Qu.:3888 1st Qu.:65.20 1st Qu.:18.36
## Median :118.4 Median :3982 Median :65.60 Median :21.79
## Mean :109.3 Mean :3687 Mean :65.97 Mean :20.99
## 3rd Qu.:120.0 3rd Qu.:3998 3rd Qu.:66.40 3rd Qu.:23.75
## Max. :161.2 Max. :4030 Max. :76.20 Max. :25.90
## NA's :20 NA's :57 NA's :14 NA's :5
## Carb Flow Density MFR Balling
## Min. : 26 Min. :0.240 Min. : 31.4 Min. :-0.170
## 1st Qu.:1144 1st Qu.:0.900 1st Qu.:706.3 1st Qu.: 1.496
## Median :3028 Median :0.980 Median :724.0 Median : 1.648
## Mean :2468 Mean :1.174 Mean :704.0 Mean : 2.198
## 3rd Qu.:3186 3rd Qu.:1.620 3rd Qu.:731.0 3rd Qu.: 3.292
## Max. :5104 Max. :1.920 Max. :868.6 Max. : 4.012
## NA's :2 NA's :1 NA's :212 NA's :1
## Pressure Vacuum PH Oxygen Filler Bowl Setpoint
## Min. :-6.600 Min. :7.880 Min. :0.00240 Min. : 70.0
## 1st Qu.:-5.600 1st Qu.:8.440 1st Qu.:0.02200 1st Qu.:100.0
## Median :-5.400 Median :8.540 Median :0.03340 Median :120.0
## Mean :-5.216 Mean :8.546 Mean :0.04684 Mean :109.3
## 3rd Qu.:-5.000 3rd Qu.:8.680 3rd Qu.:0.06000 3rd Qu.:120.0
## Max. :-3.600 Max. :9.360 Max. :0.40000 Max. :140.0
## NA's :4 NA's :12 NA's :2
## Pressure Setpoint Air Pressurer Alch Rel Carb Rel
## Min. :44.00 Min. :140.8 Min. :5.280 Min. :4.960
## 1st Qu.:46.00 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.340
## Median :46.00 Median :142.6 Median :6.560 Median :5.400
## Mean :47.62 Mean :142.8 Mean :6.897 Mean :5.437
## 3rd Qu.:50.00 3rd Qu.:143.0 3rd Qu.:7.240 3rd Qu.:5.540
## Max. :52.00 Max. :148.2 Max. :8.620 Max. :6.060
## NA's :12 NA's :9 NA's :10
## Balling Lvl
## Min. :0.00
## 1st Qu.:1.38
## Median :1.48
## Mean :2.05
## 3rd Qu.:3.14
## Max. :3.66
## NA's :1
## Brand Code Carb Volume Fill Ounces PC Volume
## Length:267 Min. :5.147 Min. :23.75 Min. :0.09867
## Class :character 1st Qu.:5.287 1st Qu.:23.92 1st Qu.:0.23333
## Mode :character Median :5.340 Median :23.97 Median :0.27533
## Mean :5.369 Mean :23.97 Mean :0.27769
## 3rd Qu.:5.465 3rd Qu.:24.01 3rd Qu.:0.32200
## Max. :5.667 Max. :24.20 Max. :0.46400
## NA's :1 NA's :6 NA's :4
## Carb Pressure Carb Temp PSC PSC Fill
## Min. :60.20 Min. :130.0 Min. :0.00400 Min. :0.0200
## 1st Qu.:65.30 1st Qu.:138.4 1st Qu.:0.04450 1st Qu.:0.1000
## Median :68.00 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.25 Mean :141.2 Mean :0.08545 Mean :0.1903
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :77.60 Max. :154.0 Max. :0.24600 Max. :0.6200
## NA's :1 NA's :5 NA's :3
## PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure
## Min. :0.00000 Min. :-100.20 Min. :113.0 Min. :37.80
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:120.2 1st Qu.:46.00
## Median :0.04000 Median : 0.20 Median :123.4 Median :47.80
## Mean :0.05107 Mean : 21.03 Mean :123.0 Mean :48.14
## 3rd Qu.:0.06000 3rd Qu.: 141.30 3rd Qu.:125.5 3rd Qu.:50.20
## Max. :0.24000 Max. : 220.40 Max. :136.0 Max. :60.20
## NA's :5 NA's :4 NA's :2
## Hyd Pressure1 Hyd Pressure2 Hyd Pressure3 Hyd Pressure4
## Min. :-50.00 Min. :-50.00 Min. :-50.00 Min. : 68.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 90.00
## Median : 10.40 Median : 26.80 Median : 27.70 Median : 98.00
## Mean : 12.01 Mean : 20.11 Mean : 19.61 Mean : 97.84
## 3rd Qu.: 20.40 3rd Qu.: 34.80 3rd Qu.: 33.00 3rd Qu.:104.00
## Max. : 50.00 Max. : 61.40 Max. : 49.20 Max. :140.00
## NA's :1 NA's :1 NA's :4
## Filler Level Filler Speed Temperature Usage cont
## Min. : 69.2 Min. :1006 Min. :63.80 Min. :12.90
## 1st Qu.:100.6 1st Qu.:3812 1st Qu.:65.40 1st Qu.:18.12
## Median :118.6 Median :3978 Median :65.80 Median :21.44
## Mean :110.3 Mean :3581 Mean :66.23 Mean :20.90
## 3rd Qu.:120.2 3rd Qu.:3996 3rd Qu.:66.60 3rd Qu.:23.74
## Max. :153.2 Max. :4020 Max. :75.40 Max. :24.60
## NA's :2 NA's :10 NA's :2 NA's :2
## Carb Flow Density MFR Balling
## Min. : 0 Min. :0.060 Min. : 15.6 Min. :0.902
## 1st Qu.:1083 1st Qu.:0.920 1st Qu.:707.0 1st Qu.:1.498
## Median :3038 Median :0.980 Median :724.6 Median :1.648
## Mean :2409 Mean :1.177 Mean :697.8 Mean :2.203
## 3rd Qu.:3215 3rd Qu.:1.600 3rd Qu.:731.5 3rd Qu.:3.242
## Max. :3858 Max. :1.840 Max. :784.8 Max. :3.788
## NA's :1 NA's :31 NA's :1
## Pressure Vacuum PH Oxygen Filler Bowl Setpoint
## Min. :-6.400 Mode:logical Min. :0.00240 Min. : 70.0
## 1st Qu.:-5.600 NA's:267 1st Qu.:0.01960 1st Qu.:100.0
## Median :-5.200 Median :0.03370 Median :120.0
## Mean :-5.174 Mean :0.04666 Mean :109.6
## 3rd Qu.:-4.800 3rd Qu.:0.05440 3rd Qu.:120.0
## Max. :-3.600 Max. :0.39800 Max. :130.0
## NA's :1 NA's :3 NA's :1
## Pressure Setpoint Air Pressurer Alch Rel Carb Rel
## Min. :44.00 Min. :141.2 Min. :6.400 Min. :5.18
## 1st Qu.:46.00 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.34
## Median :46.00 Median :142.6 Median :6.580 Median :5.40
## Mean :47.73 Mean :142.8 Mean :6.907 Mean :5.44
## 3rd Qu.:50.00 3rd Qu.:142.8 3rd Qu.:7.180 3rd Qu.:5.56
## Max. :52.00 Max. :147.2 Max. :7.820 Max. :5.74
## NA's :2 NA's :1 NA's :3 NA's :2
## Balling Lvl
## Min. :0.000
## 1st Qu.:1.380
## Median :1.480
## Mean :2.051
## 3rd Qu.:3.080
## Max. :3.420
##
Handling missing values (Train and Test sets)
Correlation Analysis
# Analyze correlations with the response variable.
# Predictor names are obtained by excluding the response *by name* instead of
# by the hard-coded position 26, so this keeps working if columns are added,
# removed, or reordered upstream.
# NOTE(review): the variable is still called `names` (shadowing base::names as
# a value, not as a function) in case other chunks index it — confirm before
# renaming.
names <- setdiff(colnames(train), "PH")
# Scatterplot matrix of PH against the first eight predictors.
pairs.panels(train[, c("PH", names[1:8])])
Features most strongly correlated with PH: Mnf Flow (-0.45), Bowl Setpoint (0.35), Filler Level (0.32), Usage cont (-0.32), Pressure Setpoint (-0.31), Hyd Pressure3 (-0.24), Pressure Vacuum (0.22), and Hyd Pressure2 (-0.20).
# Spearman rank-correlation matrix of every column except the character
# column "Brand Code", rendered as a correlation plot.
train2 <- dplyr::select(train, -"Brand Code")
mydata.cor <- cor(train2, method = "spearman")
corrplot(mydata.cor, diag = TRUE, tl.cex = 0.7, cl.cex = 0.7)
Near Zero Variance Predictors
## NULL
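“Hyd Pressure1” is a candidate for removal from the dataset because it shows near-zero variance. It is not strictly constant (the summary above reports values from -0.80 to 58.00), but its first quartile is 0.00.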
Data Distribution & Variability
############ Look at some Decision Trees ##################
# Executes an R script fetched over the network at knit time; the package
# start-up messages printed after this call come from that script.
# NOTE(review): the URL points at a `master` branch, so the code that runs can
# change without notice. Consider pinning a commit hash or vendoring the file.
# Presumably this script defines RF_with_Nulls(), which is called further
# down — confirm.
source("https://raw.githubusercontent.com/crarnouts/Data_605_Final/master/RandomForestNulls_testing.R")
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:xgboost':
##
## xgboost
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
## Loading required package: libcoin
##
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner,
## node_surv, node_terminal, varimp
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'memisc'
## The following objects are masked from 'package:modeltools':
##
## Lapply, relabel
## The following object is masked from 'package:Matrix':
##
## as.array
## The following objects are masked from 'package:dplyr':
##
## collect, recode, rename, syms
## The following object is masked from 'package:purrr':
##
## %@%
## The following object is masked from 'package:ggplot2':
##
## syms
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
##
## as.array
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:memisc':
##
## rename, style
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:xgboost':
##
## slice
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Training set: make column names syntactically valid (e.g. "Brand Code"
# becomes "Brand.Code"), then coerce to a plain data.frame.
colnames(train) <- make.names(colnames(train), unique = TRUE)
train <- as.data.frame(train)

# Same treatment for the evaluation set so both share identical column names.
colnames(test) <- make.names(colnames(test), unique = TRUE)
test <- as.data.frame(test)

# RF_with_Nulls() is not defined in this document.
# NOTE(review): the meaning of the six positional numeric arguments cannot be
# determined from here — check the function's definition.
test <- RF_with_Nulls(train, test, "PH", 0.5, 5, 10, 0.01, 5, 1)
Modeling & Evaluation
Linear Regression
##
## Call:
## lm(formula = PH ~ ., data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47998 -0.07698 0.01066 0.08812 0.47485
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.097e+01 1.136e+00 9.651 < 2e-16 ***
## Brand.CodeB 8.664e-02 2.883e-02 3.005 0.002695 **
## Brand.CodeC -5.662e-02 2.864e-02 -1.977 0.048190 *
## Brand.CodeD 6.790e-02 1.868e-02 3.634 0.000287 ***
## Carb.Volume -1.064e-01 7.612e-02 -1.398 0.162291
## Fill.Ounces -9.734e-02 3.889e-02 -2.503 0.012402 *
## PC.Volume -1.661e-01 6.619e-02 -2.509 0.012184 *
## Carb.Pressure -1.279e-03 2.835e-03 -0.451 0.651871
## Carb.Temp 1.776e-03 2.310e-03 0.769 0.441978
## PSC -1.328e-01 6.855e-02 -1.937 0.052945 .
## PSC.Fill -4.870e-02 2.802e-02 -1.738 0.082363 .
## PSC.CO2 -5.049e-02 7.716e-02 -0.654 0.512942
## Mnf.Flow -6.649e-04 5.452e-05 -12.194 < 2e-16 ***
## Carb.Pressure1 7.140e-03 8.238e-04 8.666 < 2e-16 ***
## Fill.Pressure 3.073e-03 1.444e-03 2.127 0.033521 *
## Hyd.Pressure1 -5.071e-05 4.265e-04 -0.119 0.905361
## Hyd.Pressure2 -9.632e-04 6.209e-04 -1.551 0.121002
## Hyd.Pressure3 3.291e-03 6.956e-04 4.732 2.40e-06 ***
## Hyd.Pressure4 -4.748e-05 3.792e-04 -0.125 0.900367
## Filler.Level -1.449e-03 6.556e-04 -2.209 0.027276 *
## Filler.Speed -1.488e-07 7.724e-06 -0.019 0.984635
## Temperature -1.594e-02 2.636e-03 -6.047 1.79e-09 ***
## Usage.cont -6.853e-03 1.366e-03 -5.016 5.81e-07 ***
## Carb.Flow 1.275e-05 4.589e-06 2.778 0.005534 **
## Density -7.346e-02 3.462e-02 -2.122 0.033970 *
## MFR -1.353e-05 4.705e-05 -0.288 0.773744
## Balling -1.115e-01 2.945e-02 -3.786 0.000158 ***
## Pressure.Vacuum -2.130e-02 9.248e-03 -2.303 0.021394 *
## Oxygen.Filler -2.868e-01 8.242e-02 -3.480 0.000513 ***
## Bowl.Setpoint 3.943e-03 6.960e-04 5.666 1.71e-08 ***
## Pressure.Setpoint -9.011e-03 2.325e-03 -3.876 0.000110 ***
## Air.Pressurer -2.858e-05 2.853e-03 -0.010 0.992009
## Alch.Rel 2.617e-02 2.485e-02 1.053 0.292349
## Carb.Rel 6.546e-02 5.633e-02 1.162 0.245374
## Balling.Lvl 1.550e-01 2.832e-02 5.473 5.06e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1325 on 1764 degrees of freedom
## Multiple R-squared: 0.4245, Adjusted R-squared: 0.4134
## F-statistic: 38.27 on 34 and 1764 DF, p-value: < 2.2e-16
Bagged Tree
set.seed(8)
# Bagging specification assembled from caret's conditional-inference-tree
# helpers (fit / predict / aggregate functions bundled in `ctreeBag`).
# The object keeps the name `bagControl` for compatibility with any other
# chunk that refers to it; R still resolves `bagControl(...)` to the caret
# function because call lookup skips non-function bindings.
bagControl <- bagControl(
  fit = ctreeBag$fit,
  predict = ctreeBag$pred,
  aggregate = ctreeBag$aggregate
)
# Bagged trees evaluated with 5-fold cross-validation.
# Removed `center = TRUE` / `scale = TRUE`: they are not pre-processing
# arguments of train() (that would be `preProcess = c("center", "scale")`),
# and the printed model reports "No pre-processing", i.e. they had no effect.
# Removed `tuneLength = 25`: for method = "bag" the only tuning parameter,
# `vars`, is held at a single value, so there is nothing to tune.
bag_model <- train(
  PH ~ .,
  data = training,
  method = "bag",
  bagControl = bagControl,
  trControl = trainControl("cv", number = 5)
)
bag_model
## Bagged Model
##
## 1799 samples
## 32 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1439, 1439, 1439, 1440, 1439
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1151136 0.5618405 0.08674401
##
## Tuning parameter 'vars' was held constant at a value of 34
XGBoost (Extreme Gradient Boosting)
# Converting datasets to matrices.
# Rows with a missing Brand.Code are dropped first.
training2 <- training %>% drop_na(Brand.Code)
testing2 <- testing %>% drop_na(Brand.Code)
# Build numeric design matrices without an intercept column (`+ 0`) from
# every column except the response.
trainingmx <- model.matrix(
  ~ . + 0,
  data = training2[, names(training2) != "PH"]
)
testingmx <- model.matrix(
  ~ . + 0,
  data = testing2[, names(testing2) != "PH"]
)
# NOTE(review): model.matrix() drops rows containing NA under the default
# na.action, which would misalign the labels below — this assumes the
# predictors were fully imputed earlier; confirm.
trainingdmx <- xgb.DMatrix(data = trainingmx, label = training2$PH)
testingdmx <- xgb.DMatrix(data = testingmx, label = testing2$PH)

# Default parameters.
# NOTE(review): recent xgboost releases name this objective
# "reg:squarederror" and treat "reg:linear" as deprecated — verify against
# the installed version before changing it.
params <- list(
  booster = "gbtree",
  objective = "reg:linear",
  eta = 0.3,
  gamma = 0,
  max_depth = 6,
  min_child_weight = 1,
  subsample = 1,
  colsample_bytree = 1
)

# Determine the best `nrounds` (the maximum number of boosting iterations,
# i.e. the number of trees grown) by 5-fold cross-validation.
# Fixes:
#  * the argument is `early_stopping_rounds`; the previous spelling
#    `early_stop_rounds` was not recognised, so CV always ran all 300 rounds
#    even though the test RMSE had stopped improving long before;
#  * a seed is set so the random fold assignment is reproducible;
#  * TRUE/FALSE are spelled out instead of the reassignable T/F.
set.seed(8)
xgbcv <- xgb.cv(
  params = params,
  data = trainingdmx,
  nrounds = 300,
  nfold = 5,
  showsd = TRUE,
  stratified = TRUE,
  print_every_n = 10,
  early_stopping_rounds = 20,
  maximize = FALSE
)
## [1] train-rmse:5.635957+0.001916 test-rmse:5.635938+0.011816
## [11] train-rmse:0.191816+0.001336 test-rmse:0.205438+0.007184
## [21] train-rmse:0.062389+0.001301 test-rmse:0.113478+0.003886
## [31] train-rmse:0.045012+0.001369 test-rmse:0.110685+0.003705
## [41] train-rmse:0.034613+0.000964 test-rmse:0.109608+0.003563
## [51] train-rmse:0.026346+0.001259 test-rmse:0.109126+0.003638
## [61] train-rmse:0.020640+0.000670 test-rmse:0.108629+0.003721
## [71] train-rmse:0.016143+0.000452 test-rmse:0.108784+0.003506
## [81] train-rmse:0.012481+0.000642 test-rmse:0.108693+0.003590
## [91] train-rmse:0.009715+0.000433 test-rmse:0.108667+0.003422
## [101] train-rmse:0.007441+0.000207 test-rmse:0.108540+0.003498
## [111] train-rmse:0.005857+0.000244 test-rmse:0.108464+0.003481
## [121] train-rmse:0.004518+0.000098 test-rmse:0.108392+0.003433
## [131] train-rmse:0.003656+0.000106 test-rmse:0.108379+0.003446
## [141] train-rmse:0.002903+0.000084 test-rmse:0.108354+0.003471
## [151] train-rmse:0.002272+0.000091 test-rmse:0.108310+0.003429
## [161] train-rmse:0.001747+0.000120 test-rmse:0.108316+0.003425
## [171] train-rmse:0.001356+0.000079 test-rmse:0.108306+0.003449
## [181] train-rmse:0.001087+0.000103 test-rmse:0.108303+0.003447
## [191] train-rmse:0.000984+0.000057 test-rmse:0.108305+0.003448
## [201] train-rmse:0.000969+0.000053 test-rmse:0.108305+0.003448
## [211] train-rmse:0.000969+0.000053 test-rmse:0.108305+0.003448
## [221] train-rmse:0.000969+0.000053 test-rmse:0.108305+0.003448
## [231] train-rmse:0.000969+0.000053 test-rmse:0.108305+0.003448
## [241] train-rmse:0.000969+0.000053 test-rmse:0.108305+0.003448
## [251] train-rmse:0.000969+0.000053 test-rmse:0.108305+0.003448
## [261] train-rmse:0.000969+0.000053 test-rmse:0.108305+0.003448
## [271] train-rmse:0.000969+0.000053 test-rmse:0.108305+0.003448
## [281] train-rmse:0.000969+0.000053 test-rmse:0.108305+0.003448
## [291] train-rmse:0.000969+0.000053 test-rmse:0.108305+0.003448
## [300] train-rmse:0.000969+0.000053 test-rmse:0.108305+0.003448
set.seed(8)
# Fit the boosted model with at most 260 rounds.
# Fixes:
#  * `early_stop_round` was not a recognised argument, so early stopping was
#    never active; the correct name is `early_stopping_rounds`;
#  * xgboost applies early stopping to the LAST watchlist entry, so the
#    validation set is now listed last. With the previous order the training
#    RMSE would have been monitored once early stopping was enabled.
# NOTE(review): `testingdmx` is built from the hold-out set; using it to pick
# the stopping round makes later hold-out metrics optimistic. Consider a
# separate validation split.
xgb_model1 <- xgb.train(
  params = params,
  data = trainingdmx,
  nrounds = 260,
  watchlist = list(train = trainingdmx, val = testingdmx),
  print_every_n = 10,
  early_stopping_rounds = 10,
  maximize = FALSE
)
## [1] val-rmse:5.637133 train-rmse:5.635623
## [11] val-rmse:0.207384 train-rmse:0.192538
## [21] val-rmse:0.113042 train-rmse:0.065535
## [31] val-rmse:0.110132 train-rmse:0.050887
## [41] val-rmse:0.109743 train-rmse:0.041158
## [51] val-rmse:0.109709 train-rmse:0.031971
## [61] val-rmse:0.109340 train-rmse:0.025109
## [71] val-rmse:0.108982 train-rmse:0.019163
## [81] val-rmse:0.108853 train-rmse:0.016024
## [91] val-rmse:0.108704 train-rmse:0.013278
## [101] val-rmse:0.108695 train-rmse:0.011163
## [111] val-rmse:0.108516 train-rmse:0.009321
## [121] val-rmse:0.108365 train-rmse:0.007779
## [131] val-rmse:0.108309 train-rmse:0.006741
## [141] val-rmse:0.108163 train-rmse:0.005150
## [151] val-rmse:0.108100 train-rmse:0.004559
## [161] val-rmse:0.108050 train-rmse:0.003697
## [171] val-rmse:0.107962 train-rmse:0.002979
## [181] val-rmse:0.107924 train-rmse:0.002386
## [191] val-rmse:0.107937 train-rmse:0.001853
## [201] val-rmse:0.107927 train-rmse:0.001433
## [211] val-rmse:0.107937 train-rmse:0.001257
## [221] val-rmse:0.107933 train-rmse:0.001048
## [231] val-rmse:0.107933 train-rmse:0.001048
## [241] val-rmse:0.107933 train-rmse:0.001048
## [251] val-rmse:0.107933 train-rmse:0.001048
## [260] val-rmse:0.107933 train-rmse:0.001048
# Feature-importance table for the fitted booster, labelled with the design
# matrix column names, followed by its bar chart.
mat <- xgb.importance(
  model = xgb_model1,
  feature_names = colnames(trainingmx)
)
xgb.plot.importance(importance_matrix = mat)
SVM
# Resampling scheme: 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)
set.seed(8)
# Radial-basis-kernel SVM on centred and scaled predictors, trying 14
# candidate values of the tuning grid.
svmRad <- train(
  PH ~ .,
  data = training,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneLength = 14
)
svmRad
## Support Vector Machines with Radial Basis Function Kernel
##
## 1799 samples
## 32 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1618, 1619, 1620, 1619, 1620, 1621, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.1258726 0.4818995 0.09449519
## 0.50 0.1217355 0.5118512 0.09085609
## 1.00 0.1179886 0.5399821 0.08782349
## 2.00 0.1149755 0.5617383 0.08551357
## 4.00 0.1127782 0.5772827 0.08398958
## 8.00 0.1125225 0.5806748 0.08416775
## 16.00 0.1137899 0.5754004 0.08528021
## 32.00 0.1167588 0.5618870 0.08754950
## 64.00 0.1214338 0.5397424 0.09095877
## 128.00 0.1269152 0.5163802 0.09508690
## 256.00 0.1324107 0.4940749 0.09921596
## 512.00 0.1347824 0.4860424 0.10129155
## 1024.00 0.1348395 0.4858562 0.10133308
## 2048.00 0.1348395 0.4858562 0.10133308
##
## Tuning parameter 'sigma' was held constant at a value of 0.02093111
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.02093111 and C = 8.
## RMSE Rsquared MAE
## 0.11816659 0.53021907 0.08545353
Cubist
set.seed(8)
# Cubist model fitted with caret's defaults: no trControl or tuning grid is
# supplied here.
cubist <- train(PH ~ ., data = training, method = "cubist")
# Display model performance.
cubist
## Cubist
##
## 1799 samples
## 32 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 1799, 1799, 1799, 1799, 1799, 1799, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 0.1563180 0.3688488 0.10686450
## 1 5 0.1544195 0.3963511 0.10484345
## 1 9 0.1539255 0.3931365 0.10471406
## 10 0 0.1096610 0.6033887 0.07951621
## 10 5 0.1076683 0.6197043 0.07785767
## 10 9 0.1076233 0.6189923 0.07797430
## 20 0 0.1053019 0.6338267 0.07631629
## 20 5 0.1030417 0.6489188 0.07430904
## 20 9 0.1030093 0.6488571 0.07451852
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 9.
## cubist variable importance
##
## only 20 most important variables shown (out of 34)
##
## Overall
## Mnf.Flow 100.00
## Balling.Lvl 65.82
## Pressure.Vacuum 61.39
## Alch.Rel 56.33
## Balling 56.33
## Density 53.16
## Temperature 44.30
## Carb.Flow 42.41
## Hyd.Pressure3 42.41
## Air.Pressurer 42.41
## Carb.Pressure1 40.51
## Oxygen.Filler 39.24
## Carb.Rel 38.61
## Usage.cont 38.61
## Bowl.Setpoint 37.97
## Brand.CodeC 32.28
## Hyd.Pressure2 30.38
## Filler.Level 25.95
## Hyd.Pressure1 19.62
## Filler.Speed 17.09
# Score the Cubist model on the hold-out predictors (`Xtest` is created in a
# chunk not shown here) and compare against the observed hold-out PH.
cubist_pred <- predict(cubist, newdata = Xtest)
postResample(pred = cubist_pred, obs = testing$PH)
## RMSE Rsquared MAE
## 0.1033775 0.6388871 0.0728324
Random Forest
# Resampling scheme: 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)
set.seed(8)
# Random forest via the ranger engine, with permutation-based variable
# importance and a tuning grid of length 10.
rforest <- train(
  PH ~ .,
  data = training,
  method = "ranger",
  trControl = ctrl,
  tuneLength = 10,
  importance = "permutation"
)
# Display model performance.
rforest
## Random Forest
##
## 1799 samples
## 32 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1618, 1619, 1620, 1619, 1620, 1621, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.11151739 0.6354668 0.08524535
## 2 extratrees 0.11642749 0.5981672 0.09038674
## 5 variance 0.10323307 0.6753713 0.07726420
## 5 extratrees 0.10541449 0.6579924 0.08013946
## 9 variance 0.10016483 0.6872247 0.07433043
## 9 extratrees 0.10116996 0.6797159 0.07609173
## 12 variance 0.09912001 0.6908705 0.07352341
## 12 extratrees 0.09924550 0.6892281 0.07446955
## 16 variance 0.09767099 0.6978233 0.07203220
## 16 extratrees 0.09818093 0.6943683 0.07340745
## 19 variance 0.09735473 0.6982720 0.07173595
## 19 extratrees 0.09759577 0.6962345 0.07257982
## 23 variance 0.09710838 0.6971347 0.07121258
## 23 extratrees 0.09701654 0.6985425 0.07209662
## 26 variance 0.09645654 0.7002343 0.07092141
## 26 extratrees 0.09664826 0.6997303 0.07190026
## 30 variance 0.09623082 0.6999246 0.07047931
## 30 extratrees 0.09677284 0.6978898 0.07191384
## 34 variance 0.09659049 0.6955995 0.07064884
## 34 extratrees 0.09679665 0.6969302 0.07189735
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 30, splitrule =
## variance and min.node.size = 5.
## ranger variable importance
##
## only 20 most important variables shown (out of 34)
##
## Overall
## Mnf.Flow 100.000
## Brand.CodeC 31.928
## Usage.cont 26.039
## Alch.Rel 25.674
## Oxygen.Filler 14.022
## Balling.Lvl 13.706
## Pressure.Vacuum 13.628
## Bowl.Setpoint 12.060
## Air.Pressurer 10.673
## Carb.Rel 10.192
## Temperature 9.847
## Carb.Flow 9.272
## Balling 8.978
## Carb.Pressure1 7.366
## Hyd.Pressure3 6.269
## MFR 5.687
## Filler.Level 5.333
## Filler.Speed 5.009
## Fill.Pressure 4.492
## Density 4.426
# Score the random forest on the hold-out predictors (`Xtest` is created in a
# chunk not shown here) and compare against the observed hold-out PH.
rf_pred <- predict(rforest, newdata = Xtest)
postResample(pred = rf_pred, obs = testing$PH)
## RMSE Rsquared MAE
## 0.10262240 0.64474014 0.07338062
Training Random Forest over Individual Brand Codes
# Fit and evaluate one ranger random forest per brand.
# Fixes:
#  * the column is `Brand.Code` (names were passed through make.names()
#    earlier), not `Brand Code`. With the old name `training$`Brand Code``
#    evaluated to NULL, so the loop body never ran and nothing was printed;
#  * `tc` is not defined in any visible chunk; the 10-fold CV control object
#    `ctrl` created for the random forest above is used instead
#    (NOTE(review): restore `tc` if it is defined in a hidden chunk);
#  * dplyr verbs are namespace-qualified because several attached packages
#    mask `select`/`filter`;
#  * missing brand codes are skipped, since `Brand.Code == NA` matches no rows.
for (brand_code in as.character(unique(na.omit(training$Brand.Code)))) {
  print(paste("Brand Code", brand_code))

  # Training rows for this brand; the now-constant brand column is dropped.
  temp_df <- training %>%
    dplyr::filter(Brand.Code == brand_code) %>%
    dplyr::select(-Brand.Code)

  set.seed(8)
  temp_rf <- train(
    PH ~ .,
    data = temp_df,
    method = "ranger",
    importance = "permutation",
    trControl = ctrl
  )
  print(temp_rf)
  print(varImp(temp_rf))

  # Hold-out rows for the same brand, prepared the same way.
  temp_test <- testing %>%
    dplyr::filter(Brand.Code == brand_code) %>%
    dplyr::select(-Brand.Code)

  temp_predictions <- predict(temp_rf, temp_test)
  print(postResample(pred = temp_predictions, obs = temp_test$PH))
}
Predicting New Data w/Random Forest
pfile <- read_excel("StudentEvaluation.xlsx")

# Preparing the dataset we will ultimately predict PH on.
# Drop the response by exact name. The previous `-grep("PH", ...)` was a
# substring match (it would also remove any column merely containing "PH")
# and, when nothing matched, `-integer(0)` would have dropped every column.
test <- pfile[, setdiff(colnames(pfile), "PH")]
# Impute missing predictor values with k-nearest neighbours; imp_var = FALSE
# keeps the indicator columns out of the result.
test <- kNN(test, imp_var = FALSE)
# Match the syntactic column names used by the training data.
colnames(test) <- make.names(colnames(test), unique = TRUE)

# Refit the random forest on `train` with 10-fold CV.
ctrl <- trainControl(method = "cv", number = 10)
set.seed(8)
rf_model <- train(
  PH ~ .,
  data = train,
  method = "ranger",
  importance = "permutation",
  tuneLength = 10,
  trControl = ctrl
)

final_rf_pred <- predict(rf_model, newdata = as.data.frame(test))
# Guard against silently misaligned output: one prediction per input row.
stopifnot(length(final_rf_pred) == nrow(pfile))
pfile$PH <- final_rf_pred # applying predictions to unimputed dataset
write_xlsx(pfile, "Predictions_file.xlsx") # write to excel