library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(readxl)
library(tidymodels)
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.4 ──
## ✓ broom 0.7.10 ✓ rsample 0.1.1
## ✓ dials 0.0.10 ✓ tibble 3.1.6
## ✓ dplyr 1.0.7 ✓ tidyr 1.1.4
## ✓ infer 1.0.0 ✓ tune 0.1.6
## ✓ modeldata 0.1.1 ✓ workflows 0.2.4
## ✓ parsnip 0.1.7 ✓ workflowsets 0.1.0
## ✓ purrr 0.3.4 ✓ yardstick 0.0.9
## ✓ recipes 0.1.17
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x yardstick::precision() masks caret::precision()
## x yardstick::recall() masks caret::recall()
## x yardstick::sensitivity() masks caret::sensitivity()
## x yardstick::specificity() masks caret::specificity()
## x recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(VSURF)
##
## Attaching package: 'VSURF'
## The following object is masked from 'package:tune':
##
## tune
library(writexl)
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
# "Not in" operator: TRUE for each element of `x` that has no match in `table`.
`%!in%` <- function(x, table) {
  !(x %in% table)
}
# Load the evaluation workbook; "universal" name repair makes the column
# names syntactic (e.g. `Brand Code` becomes Brand.Code).
StudentEval <- readxl::read_excel(
  path = "StudentEvaluation.xlsx",
  .name_repair = "universal"
)
## New names:
## * `Brand Code` -> Brand.Code
## * `Carb Volume` -> Carb.Volume
## * `Fill Ounces` -> Fill.Ounces
## * `PC Volume` -> PC.Volume
## * `Carb Pressure` -> Carb.Pressure
## * ...
# Load the training workbook with the same name repair as the evaluation
# set so both tables share identical, syntactic column names.
StudentData <- readxl::read_excel(
  path = "StudentData.xlsx",
  .name_repair = "universal"
)
## New names:
## * `Brand Code` -> Brand.Code
## * `Carb Volume` -> Carb.Volume
## * `Fill Ounces` -> Fill.Ounces
## * `PC Volume` -> PC.Volume
## * `Carb Pressure` -> Carb.Pressure
## * ...
# Brand.Code is the only categorical column; store it as a factor in both
# tables so the recipes below can dummy-encode it.
StudentData[["Brand.Code"]] <- as.factor(StudentData[["Brand.Code"]])
StudentEval[["Brand.Code"]] <- as.factor(StudentEval[["Brand.Code"]])
head(StudentData, n = 6)
## # A tibble: 6 × 33
## Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 B 5.34 24.0 0.263 68.2 141. 0.104
## 2 A 5.43 24.0 0.239 68.4 140. 0.124
## 3 B 5.29 24.1 0.263 70.8 145. 0.09
## 4 A 5.44 24.0 0.293 63 133. NA
## 5 A 5.49 24.3 0.111 67.2 137. 0.026
## 6 A 5.38 23.9 0.269 66.6 138. 0.09
## # … with 26 more variables: PSC.Fill <dbl>, PSC.CO2 <dbl>, Mnf.Flow <dbl>,
## # Carb.Pressure1 <dbl>, Fill.Pressure <dbl>, Hyd.Pressure1 <dbl>,
## # Hyd.Pressure2 <dbl>, Hyd.Pressure3 <dbl>, Hyd.Pressure4 <dbl>,
## # Filler.Level <dbl>, Filler.Speed <dbl>, Temperature <dbl>,
## # Usage.cont <dbl>, Carb.Flow <dbl>, Density <dbl>, MFR <dbl>, Balling <dbl>,
## # Pressure.Vacuum <dbl>, PH <dbl>, Oxygen.Filler <dbl>, Bowl.Setpoint <dbl>,
## # Pressure.Setpoint <dbl>, Air.Pressurer <dbl>, Alch.Rel <dbl>, …
# Per-column five-number summaries and NA counts for the training data.
summary(object = StudentData)
## Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure
## A : 293 Min. :5.040 Min. :23.63 Min. :0.07933 Min. :57.00
## B :1239 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23917 1st Qu.:65.60
## C : 304 Median :5.347 Median :23.97 Median :0.27133 Median :68.20
## D : 615 Mean :5.370 Mean :23.97 Mean :0.27712 Mean :68.19
## NA's: 120 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31200 3rd Qu.:70.60
## Max. :5.700 Max. :24.32 Max. :0.47800 Max. :79.40
## NA's :10 NA's :38 NA's :39 NA's :27
## Carb.Temp PSC PSC.Fill PSC.CO2
## Min. :128.6 Min. :0.00200 Min. :0.0000 Min. :0.00000
## 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.1000 1st Qu.:0.02000
## Median :140.8 Median :0.07600 Median :0.1800 Median :0.04000
## Mean :141.1 Mean :0.08457 Mean :0.1954 Mean :0.05641
## 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600 3rd Qu.:0.08000
## Max. :154.0 Max. :0.27000 Max. :0.6200 Max. :0.24000
## NA's :26 NA's :33 NA's :23 NA's :39
## Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure1
## Min. :-100.20 Min. :105.6 Min. :34.60 Min. :-0.80
## 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.00 1st Qu.: 0.00
## Median : 65.20 Median :123.2 Median :46.40 Median :11.40
## Mean : 24.57 Mean :122.6 Mean :47.92 Mean :12.44
## 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00 3rd Qu.:20.20
## Max. : 229.40 Max. :140.2 Max. :60.40 Max. :58.00
## NA's :2 NA's :32 NA's :22 NA's :11
## Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4 Filler.Level
## Min. : 0.00 Min. :-1.20 Min. : 52.00 Min. : 55.8
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00 1st Qu.: 98.3
## Median :28.60 Median :27.60 Median : 96.00 Median :118.4
## Mean :20.96 Mean :20.46 Mean : 96.29 Mean :109.3
## 3rd Qu.:34.60 3rd Qu.:33.40 3rd Qu.:102.00 3rd Qu.:120.0
## Max. :59.40 Max. :50.00 Max. :142.00 Max. :161.2
## NA's :15 NA's :15 NA's :30 NA's :20
## Filler.Speed Temperature Usage.cont Carb.Flow Density
## Min. : 998 Min. :63.60 Min. :12.08 Min. : 26 Min. :0.240
## 1st Qu.:3888 1st Qu.:65.20 1st Qu.:18.36 1st Qu.:1144 1st Qu.:0.900
## Median :3982 Median :65.60 Median :21.79 Median :3028 Median :0.980
## Mean :3687 Mean :65.97 Mean :20.99 Mean :2468 Mean :1.174
## 3rd Qu.:3998 3rd Qu.:66.40 3rd Qu.:23.75 3rd Qu.:3186 3rd Qu.:1.620
## Max. :4030 Max. :76.20 Max. :25.90 Max. :5104 Max. :1.920
## NA's :57 NA's :14 NA's :5 NA's :2 NA's :1
## MFR Balling Pressure.Vacuum PH
## Min. : 31.4 Min. :-0.170 Min. :-6.600 Min. :7.880
## 1st Qu.:706.3 1st Qu.: 1.496 1st Qu.:-5.600 1st Qu.:8.440
## Median :724.0 Median : 1.648 Median :-5.400 Median :8.540
## Mean :704.0 Mean : 2.198 Mean :-5.216 Mean :8.546
## 3rd Qu.:731.0 3rd Qu.: 3.292 3rd Qu.:-5.000 3rd Qu.:8.680
## Max. :868.6 Max. : 4.012 Max. :-3.600 Max. :9.360
## NA's :212 NA's :1 NA's :4
## Oxygen.Filler Bowl.Setpoint Pressure.Setpoint Air.Pressurer
## Min. :0.00240 Min. : 70.0 Min. :44.00 Min. :140.8
## 1st Qu.:0.02200 1st Qu.:100.0 1st Qu.:46.00 1st Qu.:142.2
## Median :0.03340 Median :120.0 Median :46.00 Median :142.6
## Mean :0.04684 Mean :109.3 Mean :47.62 Mean :142.8
## 3rd Qu.:0.06000 3rd Qu.:120.0 3rd Qu.:50.00 3rd Qu.:143.0
## Max. :0.40000 Max. :140.0 Max. :52.00 Max. :148.2
## NA's :12 NA's :2 NA's :12
## Alch.Rel Carb.Rel Balling.Lvl
## Min. :5.280 Min. :4.960 Min. :0.00
## 1st Qu.:6.540 1st Qu.:5.340 1st Qu.:1.38
## Median :6.560 Median :5.400 Median :1.48
## Mean :6.897 Mean :5.437 Mean :2.05
## 3rd Qu.:7.240 3rd Qu.:5.540 3rd Qu.:3.14
## Max. :8.620 Max. :6.060 Max. :3.66
## NA's :9 NA's :10 NA's :1
# Pairwise missing-data pattern counts for every pair of columns.
pairData <- mice::md.pairs(StudentData)
# "mm": rows where both the listed variable and PH are missing.
pairData[["mm"]][, "PH"]
## Brand.Code Carb.Volume Fill.Ounces PC.Volume
## 0 0 0 0
## Carb.Pressure Carb.Temp PSC PSC.Fill
## 0 0 0 0
## PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure
## 0 2 0 4
## Hyd.Pressure1 Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4
## 0 0 0 2
## Filler.Level Filler.Speed Temperature Usage.cont
## 4 3 2 0
## Carb.Flow Density MFR Balling
## 0 1 4 1
## Pressure.Vacuum PH Oxygen.Filler Bowl.Setpoint
## 0 4 1 0
## Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 0 0 2 2
## Balling.Lvl
## 0
# "mr": rows where the listed variable is missing but PH is observed.
pairData[["mr"]][, "PH"]
## Brand.Code Carb.Volume Fill.Ounces PC.Volume
## 120 10 38 39
## Carb.Pressure Carb.Temp PSC PSC.Fill
## 27 26 33 23
## PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure
## 39 0 32 18
## Hyd.Pressure1 Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4
## 11 15 15 28
## Filler.Level Filler.Speed Temperature Usage.cont
## 16 54 12 5
## Carb.Flow Density MFR Balling
## 2 0 208 0
## Pressure.Vacuum PH Oxygen.Filler Bowl.Setpoint
## 0 0 11 2
## Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 12 0 7 8
## Balling.Lvl
## 1
# "rm": rows where the listed variable is observed but PH is missing.
pairData[["rm"]][, "PH"]
## Brand.Code Carb.Volume Fill.Ounces PC.Volume
## 4 4 4 4
## Carb.Pressure Carb.Temp PSC PSC.Fill
## 4 4 4 4
## PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure
## 4 2 4 0
## Hyd.Pressure1 Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4
## 4 4 4 2
## Filler.Level Filler.Speed Temperature Usage.cont
## 0 1 2 4
## Carb.Flow Density MFR Balling
## 4 3 0 3
## Pressure.Vacuum PH Oxygen.Filler Bowl.Setpoint
## 4 0 3 4
## Pressure.Setpoint Air.Pressurer Alch.Rel Carb.Rel
## 4 4 2 2
## Balling.Lvl
## 4
# Correlation of each numeric predictor with PH, plus a scatterplot matrix of
# the first five predictors.
numericPredictors <- StudentData %>% select(-c(PH, Brand.Code))
# PH and most predictors contain NAs; without `use`, cor() would return NA for
# every predictor, so correlate on the rows that are complete for each pair.
phCor <- cor(numericPredictors, StudentData$PH, use = "pairwise.complete.obs")
pairs(numericPredictors[1:5])
Looking at some of the pairs, we see there are some obvious correlations.
# Preprocessing recipe trained on the full training table:
#   1. one-hot encode Brand.Code,
#   2. bag-impute every predictor,
#   3. drop highly correlated predictors (step_corr defaults).
dataRec <- recipe(PH ~ ., data = StudentData)
dataRec <- step_dummy(dataRec, Brand.Code, one_hot = TRUE)
dataRec <- step_impute_bag(dataRec, all_predictors())
dataRec <- step_corr(dataRec, all_predictors())
dataRec <- prep(dataRec)
## Warning: There are new levels in a factor: NA
# Apply the trained recipe to the training table.
fittedData <- bake(dataRec, new_data = StudentData)
## Warning: There are new levels in a factor: NA
# Apply the same trained recipe to the evaluation table.
fittedDataEval <- bake(dataRec, new_data = StudentEval)
## Warning: There are new levels in a factor: NA
# Original columns that no longer exist after baking (Brand.Code was replaced
# by dummy columns; the rest were removed by the recipe).
setdiff(colnames(StudentData), colnames(fittedData))
## [1] "Brand.Code" "Hyd.Pressure3" "Filler.Level" "Filler.Speed"
## [5] "Density" "Balling" "Balling.Lvl"
# Preview the preprocessed training table.
head(fittedData, n = 6)
## # A tibble: 6 × 30
## Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp PSC PSC.Fill
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5.34 24.0 0.263 68.2 141. 0.104 0.260
## 2 5.43 24.0 0.239 68.4 140. 0.124 0.220
## 3 5.29 24.1 0.263 70.8 145. 0.09 0.34
## 4 5.44 24.0 0.293 63 133. 0.106 0.420
## 5 5.49 24.3 0.111 67.2 137. 0.026 0.16
## 6 5.38 23.9 0.269 66.6 138. 0.09 0.240
## # … with 23 more variables: PSC.CO2 <dbl>, Mnf.Flow <dbl>,
## # Carb.Pressure1 <dbl>, Fill.Pressure <dbl>, Hyd.Pressure1 <dbl>,
## # Hyd.Pressure2 <dbl>, Hyd.Pressure4 <dbl>, Temperature <dbl>,
## # Usage.cont <dbl>, Carb.Flow <dbl>, MFR <dbl>, Pressure.Vacuum <dbl>,
## # Oxygen.Filler <dbl>, Bowl.Setpoint <dbl>, Pressure.Setpoint <dbl>,
## # Air.Pressurer <dbl>, Alch.Rel <dbl>, Carb.Rel <dbl>, PH <dbl>,
## # Brand.Code_A <dbl>, Brand.Code_B <dbl>, Brand.Code_C <dbl>, …
We first dummy-encode the Brand Code variable, then bag-impute all predictors to fill in the substantial number of missing values, and then remove 6 highly correlated predictors. We also remove rows with NAs in the dependent variable, as there were only 4.
# Same comparison as before, minus the first entry (Brand.Code), leaving only
# the predictors that the recipe removed.
setdiff(colnames(StudentData), colnames(fittedData))[-1]
## [1] "Hyd.Pressure3" "Filler.Level" "Filler.Speed" "Density"
## [5] "Balling" "Balling.Lvl"
# Variable selection with VSURF on the rows that have no missing values.
completeRows <- fittedData %>% drop_na()
vsurfPredictors <- completeRows %>% select(-PH)
SurfedData <- VSURF(vsurfPredictors, completeRows$PH)
predData <- SurfedData$varselect.pred
# NOTE(review): the VSURF result above is immediately replaced by these fixed
# column indices -- presumably pinned from an earlier run, since no seed is set
# before VSURF(); confirm. The indices refer to columns of `vsurfPredictors`.
predData <- c(9, 21, 16, 28, 24, 20, 25, 19, 17, 22, 13, 23)
colnames(vsurfPredictors)[predData]
## [1] "Mnf.Flow" "Bowl.Setpoint" "Usage.cont"
## [4] "Brand.Code_C" "Alch.Rel" "Oxygen.Filler"
## [7] "Carb.Rel" "Pressure.Vacuum" "Carb.Flow"
## [10] "Pressure.Setpoint" "Hyd.Pressure2" "Air.Pressurer"
Using VSURF to select predictors, we see that no brand except C is important, and that, apart from the Pressure and Carb groups with two predictors each, only one predictor of each type is selected.
First, aiming for a model that is as simple as practical, we attempt to fit a linear regression model.
# Linear regression on the VSURF-selected predictors, fit with a tidymodels
# workflow on a 75/25 train/test split of the raw training table.
set.seed(6354)
studentSplit <- initial_split(StudentData, prop = 0.75)

# Names of the selected predictors, looked up by index in the baked table.
keptPredictors <- colnames(fittedData %>% drop_na() %>% select(-PH))[predData]

# Untrained recipe: one-hot encode Brand.Code, keep only the selected
# predictors (and PH), bag-impute them, then apply Yeo-Johnson.
lmRecipe <- recipe(PH ~ ., data = training(studentSplit)) %>%
  step_dummy(Brand.Code, one_hot = TRUE) %>%
  step_rm(everything(), -all_of(keptPredictors), -PH) %>%
  step_impute_bag(all_predictors()) %>%
  step_YeoJohnson(all_predictors())

# `dataRec` is kept as the trained recipe, as before. Prepping here (before the
# workflow fit) also keeps the random-number sequence used by bagged imputation
# in the same order as the original script.
dataRec <- prep(lmRecipe)

lr_mod <- parsnip::linear_reg() %>%
  set_engine("lm")

# A workflow trains its recipe itself, so it must be given the untrained
# recipe rather than the prepped one.
studentData_wflow <- workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(lmRecipe)

studentData_fit <- fit(studentData_wflow, data = training(studentSplit))
We use the VSURF-selected predictors and apply a Yeo-Johnson transformation.
# Coefficient table of the fitted linear model.
tidy(extract_fit_parsnip(studentData_fit))
## # A tibble: 13 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 7.81e+ 4 2.40e+ 4 3.25 1.17e- 3
## 2 Mnf.Flow -6.61e- 4 5.41e- 5 -12.2 5.01e-33
## 3 Hyd.Pressure2 6.13e- 3 1.12e- 3 5.48 4.80e- 8
## 4 Usage.cont -3.09e- 6 4.95e- 7 -6.23 5.59e-10
## 5 Carb.Flow -1.35e- 7 3.51e- 7 -0.384 7.01e- 1
## 6 Pressure.Vacuum -2.40e- 5 1.40e- 4 -0.171 8.64e- 1
## 7 Oxygen.Filler -2.89e- 1 8.75e- 2 -3.31 9.64e- 4
## 8 Bowl.Setpoint 3.72e-10 9.49e-11 3.92 9.28e- 5
## 9 Pressure.Setpoint -2.76e+ 5 8.49e+ 4 -3.25 1.17e- 3
## 10 Air.Pressurer NA NA NA NA
## 11 Alch.Rel 2.46e- 2 1.24e- 2 1.99 4.68e- 2
## 12 Carb.Rel -2.45e- 2 4.90e- 2 -0.501 6.16e- 1
## 13 Brand.Code_C -1.31e- 1 1.02e- 2 -12.8 6.05e-36
# Diagnostic plots for the underlying lm object and a variable-importance plot
# for the parsnip fit.
lmFit <- extract_fit_parsnip(studentData_fit)
plot(lmFit$fit)
vip(lmFit)
# Score the linear model on the held-out 25%: predictions are bound to the
# observed PH and summarised with the default regression metrics.
heldOut <- testing(studentSplit)
evalResults <- metrics(
  bind_cols(predict(studentData_fit, heldOut), select(heldOut, PH)),
  truth = PH,
  estimate = .pred
)
evalResults
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.143
## 2 rsq standard 0.304
## 3 mae standard 0.114
The linear regression model (labelled GLM in the output file) produces a fairly poor fit, with an R-squared of only 0.30 on the held-out test set.
# Predict PH for the evaluation table with the linear model and export it.
evaledStudent <- predict(studentData_fit, new_data = StudentEval)
write_xlsx(evaledStudent, path = "GLM Predictions.xlsx")
# Train/test split of the preprocessed table for the caret models, followed by
# a single-hidden-layer neural network tuned by caret.
set.seed(6354)
partition <- createDataPartition(as.factor(fittedData$PH), p = 0.75, list = FALSE)
# Note: from here on `training` and `testing` are data tables; calls such as
# training(split) still resolve to the rsample functions.
training <- fittedData[partition, ]
testing <- fittedData[-partition, ]
# The recipe imputes predictors only, so rows with a missing PH are dropped.
training <- na.omit(training)
testing <- na.omit(testing)
fd <- data.matrix(training)
# caret assigns row names while resampling; passing a plain data.frame avoids
# one "Setting row names on a tibble is deprecated" warning per resample.
tx <- as.data.frame(training[, !(names(training) %in% c("PH"))])
ty <- fd[, c("PH")]
fitnn <- train(
  x = tx, y = ty,
  method = "nnet",
  metric = "RMSE",
  # Regression needs a linear output unit. nnet's default logistic output is
  # bounded to (0, 1), so it cannot reach PH values near 8.5 (the resulting
  # RMSE is about mean(PH) - 1).
  linout = TRUE,
  trace = FALSE,
  maxit = 500,
  # Centre and scale so predictors on very different scales do not saturate
  # the hidden units.
  preProcess = c("corr", "center", "scale", "medianImpute")
)
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## Warning: Setting row names on a tibble is deprecated.
# Resampling summary for the neural network tuning grid.
print(fitnn)
## Neural Network
##
## 1945 samples
## 29 predictor
##
## Pre-processing: median imputation (29)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 1945, 1945, 1945, 1945, 1945, 1945, ...
## Resampling results across tuning parameters:
##
## size decay RMSE Rsquared MAE
## 1 0e+00 7.545983 NaN 7.543938
## 1 1e-04 7.545983 NaN 7.543938
## 1 1e-01 7.546018 0.003046929 7.543973
## 3 0e+00 7.545983 NaN 7.543938
## 3 1e-04 7.545983 0.016317132 7.543938
## 3 1e-01 7.546002 0.002105523 7.543957
## 5 0e+00 7.545983 NaN 7.543938
## 5 1e-04 7.545983 0.015630800 7.543938
## 5 1e-01 7.545997 0.002428585 7.543951
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 1e-04.
# Tuning profile plot, then the full resampling table including the standard
# deviations of each metric.
plot(fitnn)
fitnn[["results"]]
## size decay RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 0e+00 7.545983 NaN 7.543938 0.004916061 NA 0.004921555
## 2 1 1e-04 7.545983 NaN 7.543938 0.004916061 NA 0.004921555
## 3 1 1e-01 7.546018 0.003046929 7.543973 0.004916143 0.005766027 0.004921640
## 4 3 0e+00 7.545983 NaN 7.543938 0.004916061 NA 0.004921555
## 5 3 1e-04 7.545983 0.016317132 7.543938 0.004916029 0.022953698 0.004921523
## 6 3 1e-01 7.546002 0.002105523 7.543957 0.004916308 0.004694322 0.004921797
## 7 5 0e+00 7.545983 NaN 7.543938 0.004916061 NA 0.004921555
## 8 5 1e-04 7.545983 0.015630800 7.543938 0.004916039 0.023167731 0.004921532
## 9 5 1e-01 7.545997 0.002428585 7.543951 0.004916165 0.003082247 0.004921677
# Evaluate the neural network on the held-out rows: split predictors from PH,
# predict, and compute RMSE / R-squared / MAE.
tex <- testing[, setdiff(names(testing), "PH")]
fdt <- data.matrix(testing)
tey <- fdt[, "PH"]
p_nn <- predict(fitnn, newdata = tex)
nn_re <- postResample(pred = p_nn, obs = tey)
nn_re
## RMSE Rsquared MAE
## 7.549642 NA 7.547846
# Predict PH for the evaluation table with the neural network and export it.
ex <- fittedDataEval[, !(names(fittedDataEval) %in% c("PH"))]
fde <- data.matrix(fittedDataEval)
# Take PH from the evaluation matrix (the original read it from the test-set
# matrix `fdt`). Fall back to NA if the evaluation table carries no PH column.
ey <- if ("PH" %in% colnames(fde)) fde[, "PH"] else rep(NA_real_, nrow(fde))
# The predict() call is kept inline because as.data.frame() derives the
# exported column name from this expression.
nn_eval <- as.data.frame(predict(fitnn, newdata = ex))
write_xlsx(nn_eval, "Neural Network Predictions.xlsx")
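The best neural network model produced no meaningful R-squared (NaN or close to zero) and an RMSE over 7, which indicates that, as fitted, this is not an appropriate model for this dataset. Note that an RMSE this close to the mean PH minus 1 suggests the network's output is bounded to the 0–1 range rather than being fit on the PH scale.

### SVM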
# Radial-basis SVM on the same training predictors/response, tuned over 15
# candidate values with cross-validation (trainControl defaults for "cv").
set.seed(6354)
svmControl <- trainControl(method = "cv")
fitsvm <- train(
  x = tx,
  y = ty,
  method = "svmRadial",
  tuneLength = 15,
  trControl = svmControl
)
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
# Show the resampling summary for the tuned radial-kernel SVM.
print(fitsvm)
## Support Vector Machines with Radial Basis Function Kernel
##
## 1945 samples
## 29 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1750, 1751, 1751, 1750, 1751, 1751, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.1277695 0.4728566 0.09509423
## 0.50 0.1254846 0.4896870 0.09258716
## 1.00 0.1239154 0.5006250 0.09095321
## 2.00 0.1221471 0.5136656 0.08952312
## 4.00 0.1202203 0.5283408 0.08895078
## 8.00 0.1192508 0.5378317 0.08860846
## 16.00 0.1204856 0.5339572 0.08956444
## 32.00 0.1261711 0.5038953 0.09351332
## 64.00 0.1328531 0.4735981 0.09819104
## 128.00 0.1412642 0.4359073 0.10439197
## 256.00 0.1496643 0.4025497 0.11076674
## 512.00 0.1556915 0.3810760 0.11539728
## 1024.00 0.1568058 0.3766817 0.11633906
## 2048.00 0.1568058 0.3766817 0.11633906
## 4096.00 0.1568058 0.3766817 0.11633906
##
## Tuning parameter 'sigma' was held constant at a value of 0.02367136
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.02367136 and C = 8.
# Plot the resampled performance profile across the tuning grid.
plot(fitsvm)
# Full resampling table, including the standard deviation columns.
fitsvm[["results"]]
## sigma C RMSE Rsquared MAE RMSESD RsquaredSD
## 1 0.02367136 0.25 0.1277695 0.4728566 0.09509423 0.009115992 0.03793672
## 2 0.02367136 0.50 0.1254846 0.4896870 0.09258716 0.009258153 0.03779600
## 3 0.02367136 1.00 0.1239154 0.5006250 0.09095321 0.009581259 0.04105732
## 4 0.02367136 2.00 0.1221471 0.5136656 0.08952312 0.009541825 0.04290615
## 5 0.02367136 4.00 0.1202203 0.5283408 0.08895078 0.009663695 0.04684523
## 6 0.02367136 8.00 0.1192508 0.5378317 0.08860846 0.009876091 0.04679622
## 7 0.02367136 16.00 0.1204856 0.5339572 0.08956444 0.010542802 0.04882154
## 8 0.02367136 32.00 0.1261711 0.5038953 0.09351332 0.011312364 0.05390020
## 9 0.02367136 64.00 0.1328531 0.4735981 0.09819104 0.012665420 0.06415286
## 10 0.02367136 128.00 0.1412642 0.4359073 0.10439197 0.014881888 0.07273232
## 11 0.02367136 256.00 0.1496643 0.4025497 0.11076674 0.015464292 0.07383697
## 12 0.02367136 512.00 0.1556915 0.3810760 0.11539728 0.015186030 0.06982638
## 13 0.02367136 1024.00 0.1568058 0.3766817 0.11633906 0.014400043 0.06721525
## 14 0.02367136 2048.00 0.1568058 0.3766817 0.11633906 0.014400043 0.06721525
## 15 0.02367136 4096.00 0.1568058 0.3766817 0.11633906 0.014400043 0.06721525
## MAESD
## 1 0.005417798
## 2 0.005831441
## 3 0.006382767
## 4 0.006246796
## 5 0.006316028
## 6 0.006384480
## 7 0.006646433
## 8 0.007352110
## 9 0.008605265
## 10 0.009985204
## 11 0.010852249
## 12 0.010579870
## 13 0.010083748
## 14 0.010083748
## 15 0.010083748
# Tuning parameter combination selected by the resampling procedure.
fitsvm[["bestTune"]]
## sigma C
## 6 0.02367136 8
# Predict the held-out predictors (tex) with the tuned SVM, then score the
# predictions against the observed held-out responses (tey).
p_svm <- predict(object = fitsvm, newdata = tex)
svm_re <- postResample(pred = p_svm, obs = tey)
print(svm_re)
## RMSE Rsquared MAE
## 0.11643501 0.51059468 0.08496464
# Predict the evaluation set (ex) with the tuned SVM and export the result.
svm_eval <- as.data.frame(predict(fitsvm, newdata = ex))
# Write svm_eval here: the previous code wrote nn_eval, which put the neural
# network's predictions into the file named "SVM Predictions.xlsx".
write_xlsx(svm_eval, "SVM Predictions.xlsx")
The best SVM model produced an RMSE of 0.116 and an Rsquared of just over 0.51, which makes it a much better predictor on this dataset than the neural network.

## Model Performance
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Collect the GLM metrics into a one-row data frame whose columns are ordered
# rmse, rsq, mae, so they line up positionally with svm_re (RMSE, Rsquared,
# MAE as returned by postResample()).
# Each metric is looked up by name in evalResults$.metric instead of being
# copied from hard-coded cell positions of a reshaped table; the positional
# version only worked while the metric values happened to sort as
# mae < rmse < rsq.
# NOTE(review): assumes evalResults holds exactly one row per metric -- confirm.
metric_order <- c("rmse", "rsq", "mae")
glm_re <- as.data.frame(
  as.list(setNames(
    evalResults$.estimate[match(metric_order, evalResults$.metric)],
    metric_order
  ))
)

# Side-by-side comparison table: one row per model.
rbind(
  "GLM" = glm_re,
  "SVM" = svm_re,
  "Neural Network" = nn_re
)
## rmse rsq mae
## GLM 0.143392 0.3037629 0.11408250
## SVM 0.116435 0.5105947 0.08496464
## Neural Network 7.549642 NA 7.54784566
When comparing all three models, it is clear from RMSE, MAE, and Rsquared that the SVM model is the best predictor; thus, we will use the SVM as our evaluation model.