Project 2

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(readxl)
library(tidymodels)

## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip

## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.4 ──

## ✓ broom        0.7.10     ✓ rsample      0.1.1 
## ✓ dials        0.0.10     ✓ tibble       3.1.6 
## ✓ dplyr        1.0.7      ✓ tidyr        1.1.4 
## ✓ infer        1.0.0      ✓ tune         0.1.6 
## ✓ modeldata    0.1.1      ✓ workflows    0.2.4 
## ✓ parsnip      0.1.7      ✓ workflowsets 0.1.0 
## ✓ purrr        0.3.4      ✓ yardstick    0.0.9 
## ✓ recipes      0.1.17

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard()         masks scales::discard()
## x dplyr::filter()          masks stats::filter()
## x dplyr::lag()             masks stats::lag()
## x purrr::lift()            masks caret::lift()
## x yardstick::precision()   masks caret::precision()
## x yardstick::recall()      masks caret::recall()
## x yardstick::sensitivity() masks caret::sensitivity()
## x yardstick::specificity() masks caret::specificity()
## x recipes::step()          masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

library(VSURF)

## 
## Attaching package: 'VSURF'

## The following object is masked from 'package:tune':
## 
##     tune

library(writexl)
library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

# Negated membership operator: TRUE where an element of `x` is absent from `table`.
`%!in%` <- function(x, table) !(x %in% table)

Summary

Data Exploration

# Load the evaluation workbook; "universal" name repair rewrites headers such
# as `Brand Code` into syntactic names (Brand.Code).
StudentEval <- readxl::read_excel(
  path = "StudentEvaluation.xlsx",
  .name_repair = "universal"
)

## New names:
## * `Brand Code` -> Brand.Code
## * `Carb Volume` -> Carb.Volume
## * `Fill Ounces` -> Fill.Ounces
## * `PC Volume` -> PC.Volume
## * `Carb Pressure` -> Carb.Pressure
## * ...

# Load the training workbook with the same syntactic-name repair as the
# evaluation data so both tables share column names.
StudentData <- readxl::read_excel(
  path = "StudentData.xlsx",
  .name_repair = "universal"
)

## New names:
## * `Brand Code` -> Brand.Code
## * `Carb Volume` -> Carb.Volume
## * `Fill Ounces` -> Fill.Ounces
## * `PC Volume` -> PC.Volume
## * `Carb Pressure` -> Carb.Pressure
## * ...

# Treat the brand code as a categorical variable in both data sets.
StudentData[["Brand.Code"]] <- as.factor(StudentData[["Brand.Code"]])
StudentEval[["Brand.Code"]] <- as.factor(StudentEval[["Brand.Code"]])
head(StudentData)

## # A tibble: 6 × 33
##   Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp    PSC
##   <fct>            <dbl>       <dbl>     <dbl>         <dbl>     <dbl>  <dbl>
## 1 B                 5.34        24.0     0.263          68.2      141.  0.104
## 2 A                 5.43        24.0     0.239          68.4      140.  0.124
## 3 B                 5.29        24.1     0.263          70.8      145.  0.09 
## 4 A                 5.44        24.0     0.293          63        133. NA    
## 5 A                 5.49        24.3     0.111          67.2      137.  0.026
## 6 A                 5.38        23.9     0.269          66.6      138.  0.09 
## # … with 26 more variables: PSC.Fill <dbl>, PSC.CO2 <dbl>, Mnf.Flow <dbl>,
## #   Carb.Pressure1 <dbl>, Fill.Pressure <dbl>, Hyd.Pressure1 <dbl>,
## #   Hyd.Pressure2 <dbl>, Hyd.Pressure3 <dbl>, Hyd.Pressure4 <dbl>,
## #   Filler.Level <dbl>, Filler.Speed <dbl>, Temperature <dbl>,
## #   Usage.cont <dbl>, Carb.Flow <dbl>, Density <dbl>, MFR <dbl>, Balling <dbl>,
## #   Pressure.Vacuum <dbl>, PH <dbl>, Oxygen.Filler <dbl>, Bowl.Setpoint <dbl>,
## #   Pressure.Setpoint <dbl>, Air.Pressurer <dbl>, Alch.Rel <dbl>, …

summary(StudentData)

##  Brand.Code   Carb.Volume     Fill.Ounces      PC.Volume       Carb.Pressure  
##  A   : 293   Min.   :5.040   Min.   :23.63   Min.   :0.07933   Min.   :57.00  
##  B   :1239   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23917   1st Qu.:65.60  
##  C   : 304   Median :5.347   Median :23.97   Median :0.27133   Median :68.20  
##  D   : 615   Mean   :5.370   Mean   :23.97   Mean   :0.27712   Mean   :68.19  
##  NA's: 120   3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200   3rd Qu.:70.60  
##              Max.   :5.700   Max.   :24.32   Max.   :0.47800   Max.   :79.40  
##              NA's   :10      NA's   :38      NA's   :39        NA's   :27     
##    Carb.Temp          PSC             PSC.Fill         PSC.CO2       
##  Min.   :128.6   Min.   :0.00200   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000   1st Qu.:0.02000  
##  Median :140.8   Median :0.07600   Median :0.1800   Median :0.04000  
##  Mean   :141.1   Mean   :0.08457   Mean   :0.1954   Mean   :0.05641  
##  3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600   3rd Qu.:0.08000  
##  Max.   :154.0   Max.   :0.27000   Max.   :0.6200   Max.   :0.24000  
##  NA's   :26      NA's   :33        NA's   :23       NA's   :39       
##     Mnf.Flow       Carb.Pressure1  Fill.Pressure   Hyd.Pressure1  
##  Min.   :-100.20   Min.   :105.6   Min.   :34.60   Min.   :-0.80  
##  1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00   1st Qu.: 0.00  
##  Median :  65.20   Median :123.2   Median :46.40   Median :11.40  
##  Mean   :  24.57   Mean   :122.6   Mean   :47.92   Mean   :12.44  
##  3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00   3rd Qu.:20.20  
##  Max.   : 229.40   Max.   :140.2   Max.   :60.40   Max.   :58.00  
##  NA's   :2         NA's   :32      NA's   :22      NA's   :11     
##  Hyd.Pressure2   Hyd.Pressure3   Hyd.Pressure4     Filler.Level  
##  Min.   : 0.00   Min.   :-1.20   Min.   : 52.00   Min.   : 55.8  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00   1st Qu.: 98.3  
##  Median :28.60   Median :27.60   Median : 96.00   Median :118.4  
##  Mean   :20.96   Mean   :20.46   Mean   : 96.29   Mean   :109.3  
##  3rd Qu.:34.60   3rd Qu.:33.40   3rd Qu.:102.00   3rd Qu.:120.0  
##  Max.   :59.40   Max.   :50.00   Max.   :142.00   Max.   :161.2  
##  NA's   :15      NA's   :15      NA's   :30       NA's   :20     
##   Filler.Speed   Temperature      Usage.cont      Carb.Flow       Density     
##  Min.   : 998   Min.   :63.60   Min.   :12.08   Min.   :  26   Min.   :0.240  
##  1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.36   1st Qu.:1144   1st Qu.:0.900  
##  Median :3982   Median :65.60   Median :21.79   Median :3028   Median :0.980  
##  Mean   :3687   Mean   :65.97   Mean   :20.99   Mean   :2468   Mean   :1.174  
##  3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.75   3rd Qu.:3186   3rd Qu.:1.620  
##  Max.   :4030   Max.   :76.20   Max.   :25.90   Max.   :5104   Max.   :1.920  
##  NA's   :57     NA's   :14      NA's   :5       NA's   :2      NA's   :1      
##       MFR           Balling       Pressure.Vacuum        PH       
##  Min.   : 31.4   Min.   :-0.170   Min.   :-6.600   Min.   :7.880  
##  1st Qu.:706.3   1st Qu.: 1.496   1st Qu.:-5.600   1st Qu.:8.440  
##  Median :724.0   Median : 1.648   Median :-5.400   Median :8.540  
##  Mean   :704.0   Mean   : 2.198   Mean   :-5.216   Mean   :8.546  
##  3rd Qu.:731.0   3rd Qu.: 3.292   3rd Qu.:-5.000   3rd Qu.:8.680  
##  Max.   :868.6   Max.   : 4.012   Max.   :-3.600   Max.   :9.360  
##  NA's   :212     NA's   :1                         NA's   :4      
##  Oxygen.Filler     Bowl.Setpoint   Pressure.Setpoint Air.Pressurer  
##  Min.   :0.00240   Min.   : 70.0   Min.   :44.00     Min.   :140.8  
##  1st Qu.:0.02200   1st Qu.:100.0   1st Qu.:46.00     1st Qu.:142.2  
##  Median :0.03340   Median :120.0   Median :46.00     Median :142.6  
##  Mean   :0.04684   Mean   :109.3   Mean   :47.62     Mean   :142.8  
##  3rd Qu.:0.06000   3rd Qu.:120.0   3rd Qu.:50.00     3rd Qu.:143.0  
##  Max.   :0.40000   Max.   :140.0   Max.   :52.00     Max.   :148.2  
##  NA's   :12        NA's   :2       NA's   :12                       
##     Alch.Rel        Carb.Rel      Balling.Lvl  
##  Min.   :5.280   Min.   :4.960   Min.   :0.00  
##  1st Qu.:6.540   1st Qu.:5.340   1st Qu.:1.38  
##  Median :6.560   Median :5.400   Median :1.48  
##  Mean   :6.897   Mean   :5.437   Mean   :2.05  
##  3rd Qu.:7.240   3rd Qu.:5.540   3rd Qu.:3.14  
##  Max.   :8.620   Max.   :6.060   Max.   :3.66  
##  NA's   :9       NA's   :10      NA's   :1

# Pairwise missingness counts for every pair of variables.
pairData <- mice::md.pairs(StudentData)
# Number of rows in which both the listed variable and PH are missing.
pairData[["mm"]][, "PH"]

##        Brand.Code       Carb.Volume       Fill.Ounces         PC.Volume 
##                 0                 0                 0                 0 
##     Carb.Pressure         Carb.Temp               PSC          PSC.Fill 
##                 0                 0                 0                 0 
##           PSC.CO2          Mnf.Flow    Carb.Pressure1     Fill.Pressure 
##                 0                 2                 0                 4 
##     Hyd.Pressure1     Hyd.Pressure2     Hyd.Pressure3     Hyd.Pressure4 
##                 0                 0                 0                 2 
##      Filler.Level      Filler.Speed       Temperature        Usage.cont 
##                 4                 3                 2                 0 
##         Carb.Flow           Density               MFR           Balling 
##                 0                 1                 4                 1 
##   Pressure.Vacuum                PH     Oxygen.Filler     Bowl.Setpoint 
##                 0                 4                 1                 0 
## Pressure.Setpoint     Air.Pressurer          Alch.Rel          Carb.Rel 
##                 0                 0                 2                 2 
##       Balling.Lvl 
##                 0

# Number of rows in which the listed variable is missing but PH is observed.
pairData$mr[,"PH"]

##        Brand.Code       Carb.Volume       Fill.Ounces         PC.Volume 
##               120                10                38                39 
##     Carb.Pressure         Carb.Temp               PSC          PSC.Fill 
##                27                26                33                23 
##           PSC.CO2          Mnf.Flow    Carb.Pressure1     Fill.Pressure 
##                39                 0                32                18 
##     Hyd.Pressure1     Hyd.Pressure2     Hyd.Pressure3     Hyd.Pressure4 
##                11                15                15                28 
##      Filler.Level      Filler.Speed       Temperature        Usage.cont 
##                16                54                12                 5 
##         Carb.Flow           Density               MFR           Balling 
##                 2                 0               208                 0 
##   Pressure.Vacuum                PH     Oxygen.Filler     Bowl.Setpoint 
##                 0                 0                11                 2 
## Pressure.Setpoint     Air.Pressurer          Alch.Rel          Carb.Rel 
##                12                 0                 7                 8 
##       Balling.Lvl 
##                 1

# Number of rows in which the listed variable is observed but PH is missing.
pairData$rm[,"PH"]

##        Brand.Code       Carb.Volume       Fill.Ounces         PC.Volume 
##                 4                 4                 4                 4 
##     Carb.Pressure         Carb.Temp               PSC          PSC.Fill 
##                 4                 4                 4                 4 
##           PSC.CO2          Mnf.Flow    Carb.Pressure1     Fill.Pressure 
##                 4                 2                 4                 0 
##     Hyd.Pressure1     Hyd.Pressure2     Hyd.Pressure3     Hyd.Pressure4 
##                 4                 4                 4                 2 
##      Filler.Level      Filler.Speed       Temperature        Usage.cont 
##                 0                 1                 2                 4 
##         Carb.Flow           Density               MFR           Balling 
##                 4                 3                 0                 3 
##   Pressure.Vacuum                PH     Oxygen.Filler     Bowl.Setpoint 
##                 4                 0                 3                 4 
## Pressure.Setpoint     Air.Pressurer          Alch.Rel          Carb.Rel 
##                 4                 4                 2                 2 
##       Balling.Lvl 
##                 4

# Correlation of every numeric predictor with PH. PH and almost every
# predictor contain NAs, so the correlations are computed on
# pairwise-complete observations; cor()'s default (use = "everything")
# would return NA for each of them.
phCor <- cor(
  StudentData %>% select(!c(PH, Brand.Code)),
  StudentData$PH,
  use = "pairwise.complete.obs"
)

# Scatterplot matrix of the first five numeric predictors.
pairs((StudentData %>% select(!c(PH, Brand.Code)))[1:5])

Looking at some of the pairs, we see there are some obvious correlations.

Data Cleanup

# Preprocessing recipe trained on the full data set: one-hot encode the brand
# code, bag-impute every predictor, then filter out correlated predictors.
dataRec <- recipe(PH ~ ., data = StudentData) %>%
  step_dummy(Brand.Code, one_hot = TRUE) %>%
  step_impute_bag(all_predictors()) %>%
  step_corr(all_predictors()) %>%
  prep()

## Warning: There are new levels in a factor: NA

# Apply the trained recipe to the training data.
fittedData <- bake(dataRec, new_data = StudentData)

## Warning: There are new levels in a factor: NA

# Apply the same trained recipe to the evaluation data.
fittedDataEval <- bake(dataRec, new_data = StudentEval)

## Warning: There are new levels in a factor: NA

# Original columns that no longer appear after the recipe was applied.
setdiff(colnames(StudentData), colnames(fittedData))

## [1] "Brand.Code"    "Hyd.Pressure3" "Filler.Level"  "Filler.Speed" 
## [5] "Density"       "Balling"       "Balling.Lvl"

head(fittedData)

## # A tibble: 6 × 30
##   Carb.Volume Fill.Ounces PC.Volume Carb.Pressure Carb.Temp   PSC PSC.Fill
##         <dbl>       <dbl>     <dbl>         <dbl>     <dbl> <dbl>    <dbl>
## 1        5.34        24.0     0.263          68.2      141. 0.104    0.260
## 2        5.43        24.0     0.239          68.4      140. 0.124    0.220
## 3        5.29        24.1     0.263          70.8      145. 0.09     0.34 
## 4        5.44        24.0     0.293          63        133. 0.106    0.420
## 5        5.49        24.3     0.111          67.2      137. 0.026    0.16 
## 6        5.38        23.9     0.269          66.6      138. 0.09     0.240
## # … with 23 more variables: PSC.CO2 <dbl>, Mnf.Flow <dbl>,
## #   Carb.Pressure1 <dbl>, Fill.Pressure <dbl>, Hyd.Pressure1 <dbl>,
## #   Hyd.Pressure2 <dbl>, Hyd.Pressure4 <dbl>, Temperature <dbl>,
## #   Usage.cont <dbl>, Carb.Flow <dbl>, MFR <dbl>, Pressure.Vacuum <dbl>,
## #   Oxygen.Filler <dbl>, Bowl.Setpoint <dbl>, Pressure.Setpoint <dbl>,
## #   Air.Pressurer <dbl>, Alch.Rel <dbl>, Carb.Rel <dbl>, PH <dbl>,
## #   Brand.Code_A <dbl>, Brand.Code_B <dbl>, Brand.Code_C <dbl>, …

We first dummy-encode the Brand Code variable, then bag-impute all predictors to fill in the substantial number of missing values, and then remove 6 highly correlated predictors. We also remove rows with NAs in the dependent variable, as there were only 4.

# Same list without its first entry (Brand.Code, which was replaced by dummy
# columns rather than filtered out).
setdiff(colnames(StudentData), colnames(fittedData))[-1]

## [1] "Hyd.Pressure3" "Filler.Level"  "Filler.Speed"  "Density"      
## [5] "Balling"       "Balling.Lvl"

Suggested Predictors

# Variable selection with VSURF on the complete cases of the baked data.
vsurfRows <- fittedData %>% drop_na()
SurfedData <- VSURF(select(vsurfRows, -PH), vsurfRows$PH)
predData <- SurfedData$varselect.pred

# The VSURF result is immediately replaced by a fixed set of column indices.
# NOTE(review): presumably these are the indices from an earlier VSURF run,
# pinned so the report does not change between runs - confirm.
predData <- c(9, 21, 16, 28, 24, 20, 25, 19, 17, 22, 13, 23)
colnames(select(vsurfRows, -PH))[predData]

##  [1] "Mnf.Flow"          "Bowl.Setpoint"     "Usage.cont"       
##  [4] "Brand.Code_C"      "Alch.Rel"          "Oxygen.Filler"    
##  [7] "Carb.Rel"          "Pressure.Vacuum"   "Carb.Flow"        
## [10] "Pressure.Setpoint" "Hyd.Pressure2"     "Air.Pressurer"

Using VSURF to select predictors we see that no Brand except C is important, and that except for Pressure and Carb with two each only one predictor from each type is selected.

Models

Linear Regression

First, in trying to select as simple a model as practical, we attempt to fit a linear regression model.

# Linear regression on the VSURF-selected predictors, fit as a tidymodels
# workflow on a 75/25 train/test split.
set.seed(6354)
studentSplit <- initial_split(StudentData, prop = 0.75)

# Names of the predictors chosen in the variable-selection step.
selectedPredictors <-
  colnames(fittedData %>% drop_na() %>% select(-PH))[predData]

# The recipe is deliberately left untrained: the workflow trains it on the
# data passed to fit(), and add_recipe() expects a recipe that has not
# already been through prep().
dataRec <- recipe(PH ~ ., data = training(studentSplit)) %>%
  step_dummy(Brand.Code, one_hot = TRUE) %>%
  # Keep only the selected predictors and the outcome.
  step_rm(everything(), -all_of(selectedPredictors), -PH) %>%
  step_impute_bag(all_predictors()) %>%
  step_YeoJohnson(all_predictors())

# Ordinary least squares via the "lm" engine.
lr_mod <-
  parsnip::linear_reg() %>%
  set_engine("lm")

studentData_wflow <-
  workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(dataRec)

studentData_fit <-
  studentData_wflow %>%
  fit(data = training(studentSplit))

We use the VSURF selected predictors and apply a Yeo Johnson transformation.

# Coefficient table of the fitted linear model.
tidy(extract_fit_parsnip(studentData_fit))

## # A tibble: 13 × 5
##    term               estimate std.error statistic   p.value
##    <chr>                 <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)        7.81e+ 4  2.40e+ 4     3.25   1.17e- 3
##  2 Mnf.Flow          -6.61e- 4  5.41e- 5   -12.2    5.01e-33
##  3 Hyd.Pressure2      6.13e- 3  1.12e- 3     5.48   4.80e- 8
##  4 Usage.cont        -3.09e- 6  4.95e- 7    -6.23   5.59e-10
##  5 Carb.Flow         -1.35e- 7  3.51e- 7    -0.384  7.01e- 1
##  6 Pressure.Vacuum   -2.40e- 5  1.40e- 4    -0.171  8.64e- 1
##  7 Oxygen.Filler     -2.89e- 1  8.75e- 2    -3.31   9.64e- 4
##  8 Bowl.Setpoint      3.72e-10  9.49e-11     3.92   9.28e- 5
##  9 Pressure.Setpoint -2.76e+ 5  8.49e+ 4    -3.25   1.17e- 3
## 10 Air.Pressurer     NA        NA           NA     NA       
## 11 Alch.Rel           2.46e- 2  1.24e- 2     1.99   4.68e- 2
## 12 Carb.Rel          -2.45e- 2  4.90e- 2    -0.501  6.16e- 1
## 13 Brand.Code_C      -1.31e- 1  1.02e- 2   -12.8    6.05e-36

# Pull the parsnip fit out of the workflow once and reuse it.
lmParsnipFit <- extract_fit_parsnip(studentData_fit)

# Plot the underlying engine fit.
plot(lmParsnipFit$fit)

# Variable-importance plot for the parsnip model.
vip(lmParsnipFit)

# Hold-out performance: pair the observed PH of the test split with the
# workflow's predictions and compute the default regression metrics.
evalResults <- testing(studentSplit) %>%
  select(PH) %>%
  bind_cols(predict(studentData_fit, new_data = testing(studentSplit))) %>%
  metrics(truth = PH, estimate = .pred)
evalResults

## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.143
## 2 rsq     standard       0.304
## 3 mae     standard       0.114

The linear regression model produces a fairly poor fit, with an R-squared of only 0.3.

# Score the evaluation set with the linear model and export the predictions.
evaledStudent <- predict(studentData_fit, new_data = StudentEval)
write_xlsx(x = evaledStudent, path = "GLM Predictions.xlsx")

ANN

# Neural network (caret + nnet) on the recipe-processed data, using a 75/25
# partition of fittedData; rows with any NA are dropped from both parts.
set.seed(6354)
partition <- createDataPartition(
  as.factor(fittedData$PH),
  p = 0.75,
  list = FALSE
)
training <- na.omit(fittedData[partition, ])
testing <- na.omit(fittedData[-partition, ])

# Predictors (everything but PH) and the numeric outcome vector.
fd <- data.matrix(training)
tx <- training[, !(names(training) %in% c("PH"))]
ty <- fd[, c("PH")]

fitnn <- train(
  x = tx, y = ty,
  method = "nnet",
  metric = "RMSE",
  trace = FALSE,
  maxit = 500,
  # nnet() defaults to a logistic output unit (linout = FALSE), which confines
  # predictions to [0, 1]; PH is around 8.5, so a linear output unit is
  # required for regression.
  linout = TRUE,
  # Centering and scaling keep the hidden units from saturating on predictors
  # measured on very different scales.
  preProcess = c("corr", "medianImpute", "center", "scale")
)
fitnn

## Neural Network 
## 
## 1945 samples
##   29 predictor
## 
## Pre-processing: median imputation (29) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1945, 1945, 1945, 1945, 1945, 1945, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  RMSE      Rsquared     MAE     
##   1     0e+00  7.545983          NaN  7.543938
##   1     1e-04  7.545983          NaN  7.543938
##   1     1e-01  7.546018  0.001875545  7.543973
##   3     0e+00  7.545983          NaN  7.543938
##   3     1e-04  7.545983  0.011863214  7.543938
##   3     1e-01  7.546002  0.002109082  7.543957
##   5     0e+00  7.545983          NaN  7.543938
##   5     1e-04  7.545983  0.020851360  7.543938
##   5     1e-01  7.545997  0.001909324  7.543951
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 1e-04.

plot(fitnn)

fitnn$results

##   size decay     RMSE    Rsquared      MAE      RMSESD  RsquaredSD       MAESD
## 1    1 0e+00 7.545983         NaN 7.543938 0.004916061          NA 0.004921555
## 2    1 1e-04 7.545983         NaN 7.543938 0.004916061          NA 0.004921555
## 3    1 1e-01 7.546018 0.001875545 7.543973 0.004916056 0.002522860 0.004921552
## 4    3 0e+00 7.545983         NaN 7.543938 0.004916061          NA 0.004921555
## 5    3 1e-04 7.545983 0.011863214 7.543938 0.004916054 0.021869427 0.004921549
## 6    3 1e-01 7.546002 0.002109082 7.543957 0.004916262 0.003978054 0.004921754
## 7    5 0e+00 7.545983         NaN 7.543938 0.004916061          NA 0.004921555
## 8    5 1e-04 7.545983 0.020851360 7.543938 0.004916009 0.022547998 0.004921503
## 9    5 1e-01 7.545997 0.001909324 7.543951 0.004916058 0.003658934 0.004921534

# Evaluate the neural network on the held-out partition.
tex <- testing[, setdiff(names(testing), "PH")]
fdt <- data.matrix(testing)
tey <- fdt[, "PH"]
p_nn <- predict(fitnn, newdata = tex)
nn_re <- postResample(pred = p_nn, obs = tey)
nn_re

##     RMSE Rsquared      MAE 
## 7.549642       NA 7.547846

# Score the evaluation set with the neural network and export the predictions.
ex <- fittedDataEval[, !(names(fittedDataEval) %in% c("PH"))]
fde <- data.matrix(fittedDataEval)
# Outcome column of the evaluation matrix (the original read it from the
# test-partition matrix `fdt` by mistake).
ey <- fde[, c("PH")]
nn_eval <- as.data.frame(predict(fitnn, newdata = ex))
write_xlsx(nn_eval, "Neural Network Predictions.xlsx")

The best neural network model did not produce an R-squared and had an RMSE over 7, which indicates that this is not an appropriate model for this dataset.

SVM

# Radial-basis SVM tuned over 15 candidate values with cross-validation, on the
# same training predictors and outcome as the neural network.
set.seed(6354)
svmControl <- trainControl(method = "cv")
fitsvm <- train(
  x = tx,
  y = ty,
  method = "svmRadial",
  trControl = svmControl,
  tuneLength = 15
)
fitsvm

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 1945 samples
##   29 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1750, 1751, 1751, 1750, 1751, 1751, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE       Rsquared   MAE       
##      0.25  0.1277875  0.4725107  0.09508687
##      0.50  0.1254809  0.4895223  0.09259978
##      1.00  0.1239135  0.5005366  0.09095224
##      2.00  0.1221460  0.5135644  0.08952809
##      4.00  0.1201263  0.5289391  0.08883449
##      8.00  0.1190654  0.5389634  0.08852029
##     16.00  0.1203716  0.5345683  0.08961751
##     32.00  0.1260139  0.5046649  0.09350379
##     64.00  0.1326441  0.4744984  0.09811574
##    128.00  0.1410275  0.4368802  0.10428917
##    256.00  0.1494325  0.4032239  0.11071780
##    512.00  0.1553329  0.3822656  0.11521993
##   1024.00  0.1564404  0.3777610  0.11608891
##   2048.00  0.1564404  0.3777610  0.11608891
##   4096.00  0.1564404  0.3777610  0.11608891
## 
## Tuning parameter 'sigma' was held constant at a value of 0.02367588
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.02367588 and C = 8.

plot(fitsvm)

fitsvm$results

##         sigma       C      RMSE  Rsquared        MAE      RMSESD RsquaredSD
## 1  0.02367588    0.25 0.1277875 0.4725107 0.09508687 0.009024608 0.03797997
## 2  0.02367588    0.50 0.1254809 0.4895223 0.09259978 0.009188028 0.03797645
## 3  0.02367588    1.00 0.1239135 0.5005366 0.09095224 0.009584777 0.04165056
## 4  0.02367588    2.00 0.1221460 0.5135644 0.08952809 0.009508832 0.04316189
## 5  0.02367588    4.00 0.1201263 0.5289391 0.08883449 0.009614778 0.04656298
## 6  0.02367588    8.00 0.1190654 0.5389634 0.08852029 0.009794605 0.04654667
## 7  0.02367588   16.00 0.1203716 0.5345683 0.08961751 0.010470623 0.04827264
## 8  0.02367588   32.00 0.1260139 0.5046649 0.09350379 0.011288960 0.05359478
## 9  0.02367588   64.00 0.1326441 0.4744984 0.09811574 0.012690111 0.06396350
## 10 0.02367588  128.00 0.1410275 0.4368802 0.10428917 0.014925475 0.07216448
## 11 0.02367588  256.00 0.1494325 0.4032239 0.11071780 0.015606362 0.07374019
## 12 0.02367588  512.00 0.1553329 0.3822656 0.11521993 0.015396357 0.06959972
## 13 0.02367588 1024.00 0.1564404 0.3777610 0.11608891 0.014663574 0.06722616
## 14 0.02367588 2048.00 0.1564404 0.3777610 0.11608891 0.014663574 0.06722616
## 15 0.02367588 4096.00 0.1564404 0.3777610 0.11608891 0.014663574 0.06722616
##          MAESD
## 1  0.005382872
## 2  0.005825921
## 3  0.006413386
## 4  0.006227097
## 5  0.006275635
## 6  0.006366314
## 7  0.006634785
## 8  0.007317281
## 9  0.008607219
## 10 0.010063037
## 11 0.010947315
## 12 0.010663053
## 13 0.010205878
## 14 0.010205878
## 15 0.010205878

fitsvm$bestTune

##        sigma C
## 6 0.02367588 8

# Evaluate the tuned SVM on the held-out partition.
p_svm <- predict(fitsvm, newdata = tex)
svm_re <- postResample(obs = tey, pred = p_svm)
svm_re

##       RMSE   Rsquared        MAE 
## 0.11661343 0.50921404 0.08499967

# Score the evaluation set with the tuned SVM.
svm_eval <- as.data.frame(predict(fitsvm, newdata = ex))
# Export the SVM predictions (the original wrote nn_eval, i.e. the neural
# network output, into this file).
write_xlsx(svm_eval, "SVM Predictions.xlsx")

The best SVM model produced an RMSE of 0.116 and an R-squared of over 0.51, which makes it a much better predictor on this dataset than the neural network.

Model Performance

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

# One-row data frame of the linear model's test metrics, looked up by metric
# name and ordered to match postResample() (RMSE, R-squared, MAE). Selecting
# by name avoids relying on how the metric values happen to sort.
glm_re <- as.data.frame(
  as.list(setNames(evalResults$.estimate, evalResults$.metric))
)[, c("rmse", "rsq", "mae")]

# Side-by-side comparison of the three models on their held-out data.
rbind("OLS" = glm_re,
      "SVM" = svm_re,
      "Neural Network" = nn_re)

##                     rmse       rsq        mae
## OLS            0.1433920 0.3037629 0.11408250
## SVM            0.1166134 0.5092140 0.08499967
## Neural Network 7.5496415        NA 7.54784566

When comparing all 3 models, it is clear by RMSE, MAE and R-squared that the SVM model is the best predictor; thus we will be using SVM as our evaluation model.

Data 624 Project 2

Scott Reed, Adam Gersowitz, Josef Waples

12/3/2021