## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
##
## Attaching package: 'memisc'
## The following objects are masked from 'package:plotly':
##
## rename, style
## The following object is masked from 'package:ggplot2':
##
## syms
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
##
## as.array
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x purrr::%@%() masks memisc::%@%()
## x dplyr::collect() masks memisc::collect()
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x dplyr::recode() masks memisc::recode()
## x dplyr::rename() masks memisc::rename(), plotly::rename()
## x dplyr::select() masks MASS::select(), plotly::select()
## x dplyr::syms() masks memisc::syms(), ggplot2::syms()
## x tibble::view() masks memisc::view()
## corrplot 0.84 loaded
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
## Loading required package: colorspace
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
## The following object is masked from 'package:plotly':
##
## slice
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## [1] 2571 33
## [1] 267 33
## Brand.Code Carb.Volume Fill.Ounces PC.Volume
## Length:2571 Min. :5.040 Min. :23.63 Min. :0.07933
## Class :character 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23917
## Mode :character Median :5.347 Median :23.97 Median :0.27133
## Mean :5.370 Mean :23.97 Mean :0.27712
## 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31200
## Max. :5.700 Max. :24.32 Max. :0.47800
## NA's :10 NA's :38 NA's :39
## Carb.Pressure Carb.Temp PSC PSC.Fill
## Min. :57.00 Min. :128.6 Min. :0.00200 Min. :0.0000
## 1st Qu.:65.60 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.1000
## Median :68.20 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.19 Mean :141.1 Mean :0.08457 Mean :0.1954
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :79.40 Max. :154.0 Max. :0.27000 Max. :0.6200
## NA's :27 NA's :26 NA's :33 NA's :23
## PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure
## Min. :0.00000 Min. :-100.20 Min. :105.6 Min. :34.60
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.00
## Median :0.04000 Median : 65.20 Median :123.2 Median :46.40
## Mean :0.05641 Mean : 24.57 Mean :122.6 Mean :47.92
## 3rd Qu.:0.08000 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00
## Max. :0.24000 Max. : 229.40 Max. :140.2 Max. :60.40
## NA's :39 NA's :2 NA's :32 NA's :22
## Hyd.Pressure1 Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4
## Min. :-0.80 Min. : 0.00 Min. :-1.20 Min. : 52.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00
## Median :11.40 Median :28.60 Median :27.60 Median : 96.00
## Mean :12.44 Mean :20.96 Mean :20.46 Mean : 96.29
## 3rd Qu.:20.20 3rd Qu.:34.60 3rd Qu.:33.40 3rd Qu.:102.00
## Max. :58.00 Max. :59.40 Max. :50.00 Max. :142.00
## NA's :11 NA's :15 NA's :15 NA's :30
## Filler.Level Filler.Speed Temperature Usage.cont Carb.Flow
## Min. : 55.8 Min. : 998 Min. :63.60 Min. :12.08 Min. : 26
## 1st Qu.: 98.3 1st Qu.:3888 1st Qu.:65.20 1st Qu.:18.36 1st Qu.:1144
## Median :118.4 Median :3982 Median :65.60 Median :21.79 Median :3028
## Mean :109.3 Mean :3687 Mean :65.97 Mean :20.99 Mean :2468
## 3rd Qu.:120.0 3rd Qu.:3998 3rd Qu.:66.40 3rd Qu.:23.75 3rd Qu.:3186
## Max. :161.2 Max. :4030 Max. :76.20 Max. :25.90 Max. :5104
## NA's :20 NA's :57 NA's :14 NA's :5 NA's :2
## Density MFR Balling Pressure.Vacuum
## Min. :0.240 Min. : 31.4 Min. :-0.170 Min. :-6.600
## 1st Qu.:0.900 1st Qu.:706.3 1st Qu.: 1.496 1st Qu.:-5.600
## Median :0.980 Median :724.0 Median : 1.648 Median :-5.400
## Mean :1.174 Mean :704.0 Mean : 2.198 Mean :-5.216
## 3rd Qu.:1.620 3rd Qu.:731.0 3rd Qu.: 3.292 3rd Qu.:-5.000
## Max. :1.920 Max. :868.6 Max. : 4.012 Max. :-3.600
## NA's :1 NA's :212 NA's :1
## PH Oxygen.Filler Bowl.Setpoint Pressure.Setpoint
## Min. :7.880 Min. :0.00240 Min. : 70.0 Min. :44.00
## 1st Qu.:8.440 1st Qu.:0.02200 1st Qu.:100.0 1st Qu.:46.00
## Median :8.540 Median :0.03340 Median :120.0 Median :46.00
## Mean :8.546 Mean :0.04684 Mean :109.3 Mean :47.62
## 3rd Qu.:8.680 3rd Qu.:0.06000 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :9.360 Max. :0.40000 Max. :140.0 Max. :52.00
## NA's :4 NA's :12 NA's :2 NA's :12
## Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
## Min. :140.8 Min. :5.280 Min. :4.960 Min. :0.00
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.340 1st Qu.:1.38
## Median :142.6 Median :6.560 Median :5.400 Median :1.48
## Mean :142.8 Mean :6.897 Mean :5.437 Mean :2.05
## 3rd Qu.:143.0 3rd Qu.:7.240 3rd Qu.:5.540 3rd Qu.:3.14
## Max. :148.2 Max. :8.620 Max. :6.060 Max. :3.66
## NA's :9 NA's :10 NA's :1
## Brand Code Carb Volume Fill Ounces PC Volume
## Length:267 Min. :5.147 Min. :23.75 Min. :0.09867
## Class :character 1st Qu.:5.287 1st Qu.:23.92 1st Qu.:0.23333
## Mode :character Median :5.340 Median :23.97 Median :0.27533
## Mean :5.369 Mean :23.97 Mean :0.27769
## 3rd Qu.:5.465 3rd Qu.:24.01 3rd Qu.:0.32200
## Max. :5.667 Max. :24.20 Max. :0.46400
## NA's :1 NA's :6 NA's :4
## Carb Pressure Carb Temp PSC PSC Fill
## Min. :60.20 Min. :130.0 Min. :0.00400 Min. :0.0200
## 1st Qu.:65.30 1st Qu.:138.4 1st Qu.:0.04450 1st Qu.:0.1000
## Median :68.00 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.25 Mean :141.2 Mean :0.08545 Mean :0.1903
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :77.60 Max. :154.0 Max. :0.24600 Max. :0.6200
## NA's :1 NA's :5 NA's :3
## PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure
## Min. :0.00000 Min. :-100.20 Min. :113.0 Min. :37.80
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:120.2 1st Qu.:46.00
## Median :0.04000 Median : 0.20 Median :123.4 Median :47.80
## Mean :0.05107 Mean : 21.03 Mean :123.0 Mean :48.14
## 3rd Qu.:0.06000 3rd Qu.: 141.30 3rd Qu.:125.5 3rd Qu.:50.20
## Max. :0.24000 Max. : 220.40 Max. :136.0 Max. :60.20
## NA's :5 NA's :4 NA's :2
## Hyd Pressure1 Hyd Pressure2 Hyd Pressure3 Hyd Pressure4
## Min. :-50.00 Min. :-50.00 Min. :-50.00 Min. : 68.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 90.00
## Median : 10.40 Median : 26.80 Median : 27.70 Median : 98.00
## Mean : 12.01 Mean : 20.11 Mean : 19.61 Mean : 97.84
## 3rd Qu.: 20.40 3rd Qu.: 34.80 3rd Qu.: 33.00 3rd Qu.:104.00
## Max. : 50.00 Max. : 61.40 Max. : 49.20 Max. :140.00
## NA's :1 NA's :1 NA's :4
## Filler Level Filler Speed Temperature Usage cont Carb Flow
## Min. : 69.2 Min. :1006 Min. :63.80 Min. :12.90 Min. : 0
## 1st Qu.:100.6 1st Qu.:3812 1st Qu.:65.40 1st Qu.:18.12 1st Qu.:1083
## Median :118.6 Median :3978 Median :65.80 Median :21.44 Median :3038
## Mean :110.3 Mean :3581 Mean :66.23 Mean :20.90 Mean :2409
## 3rd Qu.:120.2 3rd Qu.:3996 3rd Qu.:66.60 3rd Qu.:23.74 3rd Qu.:3215
## Max. :153.2 Max. :4020 Max. :75.40 Max. :24.60 Max. :3858
## NA's :2 NA's :10 NA's :2 NA's :2
## Density MFR Balling Pressure Vacuum
## Min. :0.060 Min. : 15.6 Min. :0.902 Min. :-6.400
## 1st Qu.:0.920 1st Qu.:707.0 1st Qu.:1.498 1st Qu.:-5.600
## Median :0.980 Median :724.6 Median :1.648 Median :-5.200
## Mean :1.177 Mean :697.8 Mean :2.203 Mean :-5.174
## 3rd Qu.:1.600 3rd Qu.:731.5 3rd Qu.:3.242 3rd Qu.:-4.800
## Max. :1.840 Max. :784.8 Max. :3.788 Max. :-3.600
## NA's :1 NA's :31 NA's :1 NA's :1
## PH Oxygen Filler Bowl Setpoint Pressure Setpoint
## Mode:logical Min. :0.00240 Min. : 70.0 Min. :44.00
## NA's:267 1st Qu.:0.01960 1st Qu.:100.0 1st Qu.:46.00
## Median :0.03370 Median :120.0 Median :46.00
## Mean :0.04666 Mean :109.6 Mean :47.73
## 3rd Qu.:0.05440 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :0.39800 Max. :130.0 Max. :52.00
## NA's :3 NA's :1 NA's :2
## Air Pressurer Alch Rel Carb Rel Balling Lvl
## Min. :141.2 Min. :6.400 Min. :5.18 Min. :0.000
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.34 1st Qu.:1.380
## Median :142.6 Median :6.580 Median :5.40 Median :1.480
## Mean :142.8 Mean :6.907 Mean :5.44 Mean :2.051
## 3rd Qu.:142.8 3rd Qu.:7.180 3rd Qu.:5.56 3rd Qu.:3.080
## Max. :147.2 Max. :7.820 Max. :5.74 Max. :3.420
## NA's :1 NA's :3 NA's :2
# Correlations with response variable
names <- colnames(train[,-26])
pairs.panels(train[, c("PH", names[1:8])])Top correlated features to PH are Mnf Flow (-0.45), Bowl Setpoint (0.35), Filler Level (0.32), Usage Cont (-0.32), Pressure Setpoint (-0.31),Hyd Pressure3 (-0.24), Pressure Vacuum (0.22), Hyd Pressure2 (-0.20).
train2 <- train %>% dplyr::select(-'Brand.Code')
mydata.cor = cor(train2, method = c("spearman"))
corrplot(mydata.cor,cl.cex = 0.7,tl.cex = .7,diag = TRUE)## NULL
The variable “Hyd Pressure1” will be removed as it is constant.
# Decision Trees
source("https://raw.githubusercontent.com/IsARam/CUNY_SPS/master/DATA624/RandomForestNulls_testing.R")## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:xgboost':
##
## xgboost
## The following object is masked from 'package:VIM':
##
## wine
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following objects are masked from 'package:memisc':
##
## Lapply, relabel
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
##
## Attaching package: 'party'
## The following objects are masked from 'package:partykit':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
## node_terminal, varimp
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
colnames(train)<- make.names(colnames(train), unique=TRUE)
colnames(test)<- make.names(colnames(test), unique=TRUE)
train <- as.data.frame(train)
test <- as.data.frame(test)
test <- RF_with_Nulls(train,test,"PH",.5,5,10,.01,5,1)##
## Call:
## lm(formula = PH ~ ., data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50464 -0.08073 0.01061 0.08836 0.43181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.034e+01 1.140e+00 9.072 < 2e-16 ***
## Brand.CodeB 5.934e-02 2.695e-02 2.202 0.027820 *
## Brand.CodeC -7.816e-02 2.679e-02 -2.917 0.003575 **
## Brand.CodeD 7.196e-02 1.788e-02 4.024 5.96e-05 ***
## Carb.Volume -3.674e-02 7.637e-02 -0.481 0.630530
## Fill.Ounces -8.759e-02 3.798e-02 -2.306 0.021199 *
## PC.Volume -1.178e-01 6.419e-02 -1.835 0.066608 .
## Carb.Pressure -1.184e-03 3.009e-03 -0.393 0.694079
## Carb.Temp 2.293e-03 2.400e-03 0.955 0.339584
## PSC -7.539e-02 6.729e-02 -1.120 0.262698
## PSC.Fill -3.331e-02 2.787e-02 -1.196 0.232047
## PSC.CO2 -8.567e-02 7.722e-02 -1.109 0.267433
## Mnf.Flow -6.774e-04 5.534e-05 -12.240 < 2e-16 ***
## Carb.Pressure1 7.120e-03 8.429e-04 8.447 < 2e-16 ***
## Fill.Pressure 1.564e-03 1.461e-03 1.071 0.284469
## Hyd.Pressure1 2.585e-04 4.486e-04 0.576 0.564572
## Hyd.Pressure2 -1.242e-03 6.404e-04 -1.939 0.052624 .
## Hyd.Pressure3 3.096e-03 7.159e-04 4.324 1.62e-05 ***
## Hyd.Pressure4 3.208e-04 3.917e-04 0.819 0.412928
## Filler.Level -1.239e-03 6.854e-04 -1.807 0.070898 .
## Filler.Speed 8.026e-06 7.748e-06 1.036 0.300413
## Temperature -1.594e-02 2.750e-03 -5.794 8.12e-09 ***
## Usage.cont -8.749e-03 1.368e-03 -6.395 2.06e-10 ***
## Carb.Flow 9.696e-06 4.538e-06 2.137 0.032772 *
## Density -1.158e-01 3.412e-02 -3.393 0.000706 ***
## MFR 1.329e-05 4.886e-05 0.272 0.785657
## Balling -1.229e-01 2.989e-02 -4.110 4.14e-05 ***
## Pressure.Vacuum -3.337e-02 9.456e-03 -3.529 0.000427 ***
## Oxygen.Filler -3.459e-01 8.805e-02 -3.929 8.87e-05 ***
## Bowl.Setpoint 3.480e-03 7.301e-04 4.767 2.03e-06 ***
## Pressure.Setpoint -8.166e-03 2.366e-03 -3.451 0.000571 ***
## Air.Pressurer -9.530e-04 2.779e-03 -0.343 0.731674
## Alch.Rel 5.443e-02 2.429e-02 2.241 0.025157 *
## Carb.Rel 5.520e-02 5.562e-02 0.992 0.321170
## Balling.Lvl 1.459e-01 2.715e-02 5.375 8.68e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.133 on 1764 degrees of freedom
## Multiple R-squared: 0.4223, Adjusted R-squared: 0.4112
## F-statistic: 37.93 on 34 and 1764 DF, p-value: < 2.2e-16
set.seed(seed)
bagControl = bagControl(fit = ctreeBag$fit, predict = ctreeBag$pred, aggregate = ctreeBag$aggregate)
bag_model <- train(PH ~.,
data = training, method="bag", bagControl = bagControl,
center = TRUE,
scale = TRUE,
trControl = trainControl("cv", number = 5),
tuneLength = 25)
bag_model## Bagged Model
##
## 1799 samples
## 32 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1438, 1440, 1440, 1439, 1439
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1174488 0.5433736 0.08911483
##
## Tuning parameter 'vars' was held constant at a value of 34
## RMSE Rsquared MAE
## 0.11337276 0.56067003 0.08018872
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 32)
##
## Overall
## Oxygen.Filler 100.00
## Filler.Level 77.41
## Mnf.Flow 57.53
## Filler.Speed 57.14
## Balling 56.65
## Hyd.Pressure3 53.30
## Hyd.Pressure2 50.01
## Bowl.Setpoint 49.18
## Hyd.Pressure1 48.92
## Fill.Pressure 47.44
## Usage.cont 45.06
## Pressure.Setpoint 38.38
## Balling.Lvl 32.55
## Brand.Code 32.21
## Density 30.99
## Carb.Pressure1 30.58
## Carb.Rel 29.98
## Alch.Rel 26.45
## PSC 24.22
## Pressure.Vacuum 22.63
# Converting data to matrix
training2 <- training %>% drop_na(`Brand.Code`)
testing2 <- testing %>% drop_na(`Brand.Code`)
trainingmx<-model.matrix(~.+0,data=training2[,names(training2) != c("PH")])
testingmx<-model.matrix(~.+0,data=testing2[,names(testing2) != c("PH")])
trainingdmx <- xgb.DMatrix(data = trainingmx, label=training2$PH)
testingdmx <- xgb.DMatrix(data = testingmx, label=testing2$PH)
# Default parameters
params <- list(booster = "gbtree", objective = "reg:linear", eta=0.3, gamma=0, max_depth=6, min_child_weight=1, subsample=1, colsample_bytree=1)
# nround parameter
xgbcv <- xgb.cv( params = params, data = trainingdmx, nrounds = 300, nfold = 5, showsd = T, stratified = T, print_every_n = 10, early_stop_rounds = 20, maximize = F) ## [03:34:08] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
## [03:34:08] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
## [03:34:08] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
## [03:34:09] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
## [03:34:09] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
## [1] train-rmse:5.636829+0.001723 test-rmse:5.636822+0.010651
## [11] train-rmse:0.192335+0.000457 test-rmse:0.206971+0.005313
## [21] train-rmse:0.062187+0.002178 test-rmse:0.113529+0.006683
## [31] train-rmse:0.045677+0.000585 test-rmse:0.111656+0.006910
## [41] train-rmse:0.034947+0.000515 test-rmse:0.111015+0.006828
## [51] train-rmse:0.026761+0.000283 test-rmse:0.110441+0.007108
## [61] train-rmse:0.019877+0.001281 test-rmse:0.109963+0.007370
## [71] train-rmse:0.015932+0.001660 test-rmse:0.109905+0.007427
## [81] train-rmse:0.012154+0.001150 test-rmse:0.109764+0.007490
## [91] train-rmse:0.009684+0.000790 test-rmse:0.109655+0.007524
## [101] train-rmse:0.007590+0.000682 test-rmse:0.109519+0.007569
## [111] train-rmse:0.006150+0.000628 test-rmse:0.109456+0.007552
## [121] train-rmse:0.004868+0.000552 test-rmse:0.109478+0.007555
## [131] train-rmse:0.003860+0.000509 test-rmse:0.109463+0.007553
## [141] train-rmse:0.003021+0.000383 test-rmse:0.109398+0.007527
## [151] train-rmse:0.002387+0.000292 test-rmse:0.109374+0.007541
## [161] train-rmse:0.001976+0.000228 test-rmse:0.109342+0.007531
## [171] train-rmse:0.001517+0.000140 test-rmse:0.109334+0.007507
## [181] train-rmse:0.001229+0.000151 test-rmse:0.109332+0.007506
## [191] train-rmse:0.001060+0.000098 test-rmse:0.109332+0.007514
## [201] train-rmse:0.001001+0.000064 test-rmse:0.109338+0.007524
## [211] train-rmse:0.000990+0.000076 test-rmse:0.109337+0.007522
## [221] train-rmse:0.000990+0.000076 test-rmse:0.109337+0.007522
## [231] train-rmse:0.000990+0.000076 test-rmse:0.109337+0.007522
## [241] train-rmse:0.000990+0.000076 test-rmse:0.109337+0.007522
## [251] train-rmse:0.000990+0.000076 test-rmse:0.109337+0.007522
## [261] train-rmse:0.000990+0.000076 test-rmse:0.109337+0.007522
## [271] train-rmse:0.000990+0.000076 test-rmse:0.109337+0.007522
## [281] train-rmse:0.000990+0.000076 test-rmse:0.109337+0.007522
## [291] train-rmse:0.000990+0.000076 test-rmse:0.109337+0.007522
## [300] train-rmse:0.000990+0.000076 test-rmse:0.109337+0.007522
set.seed(seed)
xgb_model1 <- xgb.train (params = params, data = trainingdmx, nrounds = 260, watchlist = list(val=testingdmx,train=trainingdmx), print_every_n = 10, early_stop_round = 10, maximize = F)## [03:44:21] WARNING: amalgamation/../src/objective/regression_obj.cu:174: reg:linear is now deprecated in favor of reg:squarederror.
## [03:44:21] WARNING: amalgamation/../src/learner.cc:516:
## Parameters: { early_stop_round } might not be used.
##
## This may not be accurate due to some parameters are only used in language bindings but
## passed down to XGBoost core. Or some parameters are not used but slip through this
## verification. Please open an issue if you find above cases.
##
##
## [1] val-rmse:5.633858 train-rmse:5.636496
## [11] val-rmse:0.201779 train-rmse:0.192384
## [21] val-rmse:0.107885 train-rmse:0.063078
## [31] val-rmse:0.106375 train-rmse:0.050422
## [41] val-rmse:0.105411 train-rmse:0.040096
## [51] val-rmse:0.105364 train-rmse:0.031308
## [61] val-rmse:0.105399 train-rmse:0.025489
## [71] val-rmse:0.106176 train-rmse:0.020108
## [81] val-rmse:0.105993 train-rmse:0.016881
## [91] val-rmse:0.105840 train-rmse:0.014073
## [101] val-rmse:0.105786 train-rmse:0.012095
## [111] val-rmse:0.105836 train-rmse:0.009654
## [121] val-rmse:0.105658 train-rmse:0.007734
## [131] val-rmse:0.105724 train-rmse:0.006184
## [141] val-rmse:0.105739 train-rmse:0.005214
## [151] val-rmse:0.105758 train-rmse:0.003922
## [161] val-rmse:0.105785 train-rmse:0.003175
## [171] val-rmse:0.105773 train-rmse:0.002706
## [181] val-rmse:0.105754 train-rmse:0.002244
## [191] val-rmse:0.105754 train-rmse:0.001716
## [201] val-rmse:0.105738 train-rmse:0.001361
## [211] val-rmse:0.105731 train-rmse:0.001107
## [221] val-rmse:0.105729 train-rmse:0.001093
## [231] val-rmse:0.105729 train-rmse:0.001093
## [241] val-rmse:0.105729 train-rmse:0.001093
## [251] val-rmse:0.105728 train-rmse:0.001093
## [260] val-rmse:0.105729 train-rmse:0.001093
mat <- xgb.importance (feature_names = colnames(trainingmx),model = xgb_model1)
xgb.plot.importance (importance_matrix = mat) ctrl = trainControl(method='cv', number = 10)
set.seed(seed)
svmRad <- train(PH ~.,
data=training,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 14,
trControl = ctrl)
svmRad## Support Vector Machines with Radial Basis Function Kernel
##
## 1799 samples
## 32 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1619, 1618, 1619, 1618, 1619, 1620, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.1274342 0.4711883 0.09611690
## 0.50 0.1235253 0.4985630 0.09238116
## 1.00 0.1200843 0.5239995 0.08937250
## 2.00 0.1173177 0.5435048 0.08720979
## 4.00 0.1155158 0.5556946 0.08631336
## 8.00 0.1163903 0.5512329 0.08752803
## 16.00 0.1190036 0.5391637 0.08906792
## 32.00 0.1239753 0.5165933 0.09252462
## 64.00 0.1312589 0.4823998 0.09775870
## 128.00 0.1386002 0.4498103 0.10266553
## 256.00 0.1454519 0.4245452 0.10758489
## 512.00 0.1496935 0.4103727 0.11093162
## 1024.00 0.1513000 0.4038688 0.11217682
## 2048.00 0.1513000 0.4038688 0.11217682
##
## Tuning parameter 'sigma' was held constant at a value of 0.01959052
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01959052 and C = 4.
## RMSE Rsquared MAE
## 0.11300721 0.56821917 0.08075468
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 32)
##
## Overall
## Oxygen.Filler 100.00
## Filler.Level 77.41
## Mnf.Flow 57.53
## Filler.Speed 57.14
## Balling 56.65
## Hyd.Pressure3 53.30
## Hyd.Pressure2 50.01
## Bowl.Setpoint 49.18
## Hyd.Pressure1 48.92
## Fill.Pressure 47.44
## Usage.cont 45.06
## Pressure.Setpoint 38.38
## Balling.Lvl 32.55
## Brand.Code 32.21
## Density 30.99
## Carb.Pressure1 30.58
## Carb.Rel 29.98
## Alch.Rel 26.45
## PSC 24.22
## Pressure.Vacuum 22.63
## Cubist
##
## 1799 samples
## 32 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 1799, 1799, 1799, 1799, 1799, 1799, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 0.1545655 0.3699735 0.10539159
## 1 5 0.1537225 0.3914070 0.10427311
## 1 9 0.1531457 0.3894358 0.10404284
## 10 0 0.1105453 0.5930051 0.08003293
## 10 5 0.1090171 0.6085254 0.07839087
## 10 9 0.1090003 0.6070384 0.07851524
## 20 0 0.1063878 0.6218416 0.07671871
## 20 5 0.1048818 0.6341245 0.07508322
## 20 9 0.1048661 0.6333358 0.07524606
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 9.
## cubist variable importance
##
## only 20 most important variables shown (out of 34)
##
## Overall
## Mnf.Flow 100.00
## Balling 78.57
## Alch.Rel 66.43
## Balling.Lvl 66.43
## Pressure.Vacuum 65.71
## Bowl.Setpoint 52.86
## Density 52.14
## Oxygen.Filler 48.57
## Air.Pressurer 47.86
## Carb.Pressure1 47.14
## Temperature 46.43
## Filler.Speed 45.00
## Hyd.Pressure3 42.14
## Usage.cont 42.14
## Carb.Rel 41.43
## Brand.CodeC 39.29
## Carb.Flow 36.43
## Hyd.Pressure2 31.43
## Filler.Level 25.71
## Hyd.Pressure1 23.57
cubist_pred <- predict(cubist, newdata=Xtest) # Generate predictions
postResample(obs=testing$PH, pred=cubist_pred) # Evaluate model## RMSE Rsquared MAE
## 0.10112638 0.65059882 0.06951564
ctrl = trainControl(method='cv', number = 10, allowParallel = TRUE)
set.seed(seed)
rforest <- train(PH ~.,
data = training,
method = "ranger",
importance = "permutation",
tuneLength = 10,
trControl = ctrl
)
rforest # Model performance## Random Forest
##
## 1799 samples
## 32 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1619, 1618, 1619, 1618, 1619, 1620, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.11317638 0.6225504 0.08711225
## 2 extratrees 0.11802169 0.5809723 0.09193537
## 5 variance 0.10492612 0.6644925 0.07924229
## 5 extratrees 0.10749416 0.6402277 0.08184601
## 9 variance 0.10203469 0.6739557 0.07617255
## 9 extratrees 0.10289887 0.6646787 0.07742827
## 12 variance 0.10074632 0.6801254 0.07497740
## 12 extratrees 0.10094060 0.6755731 0.07549004
## 16 variance 0.10008930 0.6815686 0.07417458
## 16 extratrees 0.09986389 0.6802924 0.07430426
## 19 variance 0.09959033 0.6824283 0.07356550
## 19 extratrees 0.09901741 0.6847864 0.07350240
## 23 variance 0.09927836 0.6831245 0.07312255
## 23 extratrees 0.09806920 0.6899312 0.07283373
## 26 variance 0.09925153 0.6813060 0.07306824
## 26 extratrees 0.09790106 0.6903094 0.07253947
## 30 variance 0.09946203 0.6785126 0.07277334
## 30 extratrees 0.09784219 0.6894772 0.07244412
## 34 variance 0.10014712 0.6717706 0.07290210
## 34 extratrees 0.09766238 0.6899119 0.07228101
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 34, splitrule = extratrees
## and min.node.size = 5.
## ranger variable importance
##
## only 20 most important variables shown (out of 34)
##
## Overall
## Mnf.Flow 100.000
## Brand.CodeC 36.191
## Bowl.Setpoint 31.641
## Alch.Rel 22.727
## Usage.cont 18.992
## Brand.CodeD 18.051
## Pressure.Vacuum 17.115
## Oxygen.Filler 16.518
## Filler.Level 10.821
## Air.Pressurer 10.520
## Density 9.102
## Pressure.Setpoint 9.042
## Brand.CodeB 8.861
## Carb.Rel 8.329
## Balling.Lvl 8.296
## Carb.Flow 7.818
## Balling 7.141
## Hyd.Pressure3 5.906
## Carb.Pressure1 5.823
## Temperature 5.337
rf_pred <- predict(rforest, newdata = Xtest) # Generate predictions
postResample(obs = testing$PH, pred=rf_pred) # Evaluate model## RMSE Rsquared MAE
## 0.09874278 0.67343561 0.06843792
for (brand_code in unique(training$Brand.Code)){
print(paste("Brand Code", brand_code))
temp_df <- training %>%
filter(Brand.Code == brand_code) %>%
select(-Brand.Code)
set.seed(seed)
temp_rf <- train(PH ~ ., data = temp_df, method = "ranger", importance = "permutation", trControl = ctrl)
print(temp_rf)
print(varImp(temp_rf))
temp_test <- testing %>%
filter(Brand.Code == brand_code) %>%
select(-Brand.Code)
temp_predictions <- predict(temp_rf, temp_test)
print(postResample(pred = temp_predictions, obs = temp_test$PH))
}## [1] "Brand Code B"
## Random Forest
##
## 922 samples
## 31 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 829, 831, 830, 831, 829, 830, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.10506711 0.6537845 0.07982225
## 2 extratrees 0.10966083 0.6242749 0.08416500
## 16 variance 0.09249258 0.7166701 0.06786011
## 16 extratrees 0.09220161 0.7206038 0.06722659
## 31 variance 0.09239940 0.7124626 0.06753006
## 31 extratrees 0.08973770 0.7323507 0.06487345
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
## and min.node.size = 5.
## ranger variable importance
##
## only 20 most important variables shown (out of 31)
##
## Overall
## Mnf.Flow 100.000
## Bowl.Setpoint 45.438
## Filler.Level 18.724
## Air.Pressurer 15.129
## Oxygen.Filler 14.614
## Pressure.Vacuum 12.158
## Usage.cont 11.258
## Carb.Flow 9.818
## Carb.Rel 8.678
## Density 7.475
## Pressure.Setpoint 7.448
## Balling 6.160
## Temperature 5.362
## Balling.Lvl 4.557
## Hyd.Pressure3 3.887
## Alch.Rel 3.307
## Hyd.Pressure2 3.078
## Hyd.Pressure1 2.926
## Carb.Pressure1 2.794
## Fill.Pressure 2.102
## RMSE Rsquared MAE
## 0.09235979 0.71754410 0.06459907
## [1] "Brand Code A"
## Random Forest
##
## 212 samples
## 31 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 190, 190, 192, 191, 192, 191, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.1224133 0.4827949 0.09648991
## 2 extratrees 0.1268267 0.4437693 0.10134188
## 16 variance 0.1159913 0.5048723 0.08953930
## 16 extratrees 0.1153989 0.5173700 0.09111241
## 31 variance 0.1169424 0.4931588 0.09012204
## 31 extratrees 0.1139705 0.5248241 0.08972653
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
## and min.node.size = 5.
## ranger variable importance
##
## only 20 most important variables shown (out of 31)
##
## Overall
## Mnf.Flow 100.000
## Bowl.Setpoint 48.661
## Usage.cont 44.611
## Filler.Level 41.410
## Oxygen.Filler 32.046
## Pressure.Vacuum 25.983
## Carb.Flow 15.921
## Pressure.Setpoint 14.512
## Balling.Lvl 11.299
## Carb.Pressure1 10.272
## Balling 9.807
## Hyd.Pressure2 9.430
## Density 7.011
## Air.Pressurer 6.690
## Filler.Speed 6.638
## Hyd.Pressure3 6.596
## Hyd.Pressure1 5.358
## Alch.Rel 5.189
## Hyd.Pressure4 5.109
## Fill.Pressure 4.948
## RMSE Rsquared MAE
## 0.10320186 0.66962892 0.08155831
## [1] "Brand Code C"
## Random Forest
##
## 231 samples
## 31 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 208, 208, 207, 208, 207, 207, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.1444395 0.3763802 0.1102709
## 2 extratrees 0.1485511 0.3597609 0.1153524
## 16 variance 0.1431981 0.3387047 0.1052344
## 16 extratrees 0.1399040 0.4025821 0.1048006
## 31 variance 0.1440374 0.3294470 0.1039671
## 31 extratrees 0.1399701 0.3927119 0.1034873
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 16, splitrule = extratrees
## and min.node.size = 5.
## ranger variable importance
##
## only 20 most important variables shown (out of 31)
##
## Overall
## Oxygen.Filler 100.000
## Mnf.Flow 64.525
## Hyd.Pressure1 23.714
## Density 21.490
## Alch.Rel 19.166
## Pressure.Vacuum 17.903
## Carb.Rel 15.548
## Bowl.Setpoint 15.415
## Balling 14.915
## Pressure.Setpoint 13.606
## Balling.Lvl 12.747
## Hyd.Pressure3 12.345
## Filler.Level 11.679
## Usage.cont 11.159
## Filler.Speed 10.868
## PC.Volume 10.321
## PSC.Fill 10.197
## Hyd.Pressure2 9.477
## Carb.Flow 9.143
## MFR 7.952
## RMSE Rsquared MAE
## 0.1517615 0.3425243 0.1021180
## [1] "Brand Code D"
## Random Forest
##
## 434 samples
## 31 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 389, 391, 390, 392, 391, 391, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.09923733 0.5935944 0.07922708
## 2 extratrees 0.10427177 0.5557178 0.08388437
## 16 variance 0.08664050 0.6467127 0.06814333
## 16 extratrees 0.08667069 0.6611741 0.06831416
## 31 variance 0.08720461 0.6268374 0.06791382
## 31 extratrees 0.08407906 0.6680614 0.06600962
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
## and min.node.size = 5.
## ranger variable importance
##
## only 20 most important variables shown (out of 31)
##
## Overall
## Mnf.Flow 100.000
## Usage.cont 35.872
## Pressure.Vacuum 27.797
## Hyd.Pressure3 17.767
## Carb.Pressure1 15.437
## Carb.Flow 12.231
## Bowl.Setpoint 10.698
## Density 8.908
## Filler.Speed 7.997
## Hyd.Pressure2 6.567
## Temperature 6.487
## Oxygen.Filler 6.438
## Alch.Rel 5.992
## Filler.Level 5.403
## Hyd.Pressure1 5.117
## Balling.Lvl 4.946
## Balling 4.673
## Pressure.Setpoint 4.372
## Carb.Rel 3.926
## Air.Pressurer 3.866
## RMSE Rsquared MAE
## 0.07596047 0.64840118 0.05534720
pfile <- read_excel("StudentEvaluation.xlsx")
#Preparing the dataset
test <- pfile[,-grep("PH", colnames(pfile))]
test <- kNN(test, imp_var=FALSE)
colnames(test)<- make.names(colnames(test), unique=TRUE)
ctrl = trainControl(method='cv', number = 10)
set.seed(seed)
rf_model <- train(PH ~.,
data = train,
method = "ranger",
importance = "permutation",
tuneLength = 10,
trControl = ctrl
)## Growing trees.. Progress: 100%. Estimated remaining time: 0 seconds.
## Growing trees.. Progress: 100%. Estimated remaining time: 0 seconds.
## Growing trees.. Progress: 97%. Estimated remaining time: 0 seconds.
## Growing trees.. Progress: 86%. Estimated remaining time: 5 seconds.