## Loading required package: lattice
## Loading required package: ggplot2
## corrplot 0.84 loaded
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
## -- Attaching packages -------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble 2.1.3 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## v purrr 0.3.3
## -- Conflicts ----------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::lift() masks caret::lift()
## x purrr::transpose() masks data.table::transpose()
#studentData <- read_xlsx("../StudentData.xlsx")
studentData <- read_xlsx("./StudentData.xlsx")
str(studentData)## Classes 'tbl_df', 'tbl' and 'data.frame': 2571 obs. of 33 variables:
## $ Brand Code : chr "B" "A" "B" "A" ...
## $ Carb Volume : num 5.34 5.43 5.29 5.44 5.49 ...
## $ Fill Ounces : num 24 24 24.1 24 24.3 ...
## $ PC Volume : num 0.263 0.239 0.263 0.293 0.111 ...
## $ Carb Pressure : num 68.2 68.4 70.8 63 67.2 66.6 64.2 67.6 64.2 72 ...
## $ Carb Temp : num 141 140 145 133 137 ...
## $ PSC : num 0.104 0.124 0.09 NA 0.026 0.09 0.128 0.154 0.132 0.014 ...
## $ PSC Fill : num 0.26 0.22 0.34 0.42 0.16 ...
## $ PSC CO2 : num 0.04 0.04 0.16 0.04 0.12 ...
## $ Mnf Flow : num -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 ...
## $ Carb Pressure1 : num 119 122 120 115 118 ...
## $ Fill Pressure : num 46 46 46 46.4 45.8 45.6 51.8 46.8 46 45.2 ...
## $ Hyd Pressure1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Hyd Pressure2 : num NA NA NA 0 0 0 0 0 0 0 ...
## $ Hyd Pressure3 : num NA NA NA 0 0 0 0 0 0 0 ...
## $ Hyd Pressure4 : num 118 106 82 92 92 116 124 132 90 108 ...
## $ Filler Level : num 121 119 120 118 119 ...
## $ Filler Speed : num 4002 3986 4020 4012 4010 ...
## $ Temperature : num 66 67.6 67 65.6 65.6 66.2 65.8 65.2 65.4 66.6 ...
## $ Usage cont : num 16.2 19.9 17.8 17.4 17.7 ...
## $ Carb Flow : num 2932 3144 2914 3062 3054 ...
## $ Density : num 0.88 0.92 1.58 1.54 1.54 1.52 0.84 0.84 0.9 0.9 ...
## $ MFR : num 725 727 735 731 723 ...
## $ Balling : num 1.4 1.5 3.14 3.04 3.04 ...
## $ Pressure Vacuum : num -4 -4 -3.8 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 ...
## $ PH : num 8.36 8.26 8.94 8.24 8.26 8.32 8.4 8.38 8.38 8.5 ...
## $ Oxygen Filler : num 0.022 0.026 0.024 0.03 0.03 0.024 0.066 0.046 0.064 0.022 ...
## $ Bowl Setpoint : num 120 120 120 120 120 120 120 120 120 120 ...
## $ Pressure Setpoint: num 46.4 46.8 46.6 46 46 46 46 46 46 46 ...
## $ Air Pressurer : num 143 143 142 146 146 ...
## $ Alch Rel : num 6.58 6.56 7.66 7.14 7.14 7.16 6.54 6.52 6.52 6.54 ...
## $ Carb Rel : num 5.32 5.3 5.84 5.42 5.44 5.44 5.38 5.34 5.34 5.34 ...
## $ Balling Lvl : num 1.48 1.56 3.28 3.04 3.04 3.02 1.44 1.44 1.44 1.38 ...
## Remove any rows with no response values
colY <- which(colnames(studentData) == "PH")
blankResponseRows <- which(is.na(studentData[, colY]))
studentData <- studentData[-blankResponseRows, ]
(naCols <- names(which(sapply(studentData, anyNA))))## [1] "Brand Code" "Carb Volume" "Fill Ounces"
## [4] "PC Volume" "Carb Pressure" "Carb Temp"
## [7] "PSC" "PSC Fill" "PSC CO2"
## [10] "Carb Pressure1" "Fill Pressure" "Hyd Pressure1"
## [13] "Hyd Pressure2" "Hyd Pressure3" "Hyd Pressure4"
## [16] "Filler Level" "Filler Speed" "Temperature"
## [19] "Usage cont" "Carb Flow" "MFR"
## [22] "Oxygen Filler" "Bowl Setpoint" "Pressure Setpoint"
## [25] "Alch Rel" "Carb Rel" "Balling Lvl"
print(paste(
"There are",
length(naCols),
"columns with missing values out of",
ncol(studentData)
))## [1] "There are 27 columns with missing values out of 33"
print(paste(
"There are",
sum(complete.cases(studentData)),
"complete rows out of total",
nrow(studentData)
))## [1] "There are 2038 complete rows out of total 2567"
ggr_plot <- aggr(
studentData,
col = c('navyblue', 'red'),
numbers = TRUE,
sortVars = TRUE,
labels = names(studentData),
cex.axis = .7,
gap = 3,
ylab = c("Histogram of missing data", "Pattern")
)##
## Variables sorted by number of missings:
## Variable Count
## MFR 0.0810284379
## Brand Code 0.0467471757
## Filler Speed 0.0210362291
## PC Volume 0.0151928321
## PSC CO2 0.0151928321
## Fill Ounces 0.0148032723
## PSC 0.0128554733
## Carb Pressure1 0.0124659135
## Hyd Pressure4 0.0109076743
## Carb Pressure 0.0105181145
## Carb Temp 0.0101285547
## PSC Fill 0.0089598753
## Fill Pressure 0.0070120764
## Filler Level 0.0062329568
## Hyd Pressure2 0.0058433970
## Hyd Pressure3 0.0058433970
## Temperature 0.0046747176
## Pressure Setpoint 0.0046747176
## Hyd Pressure1 0.0042851578
## Oxygen Filler 0.0042851578
## Carb Volume 0.0038955980
## Carb Rel 0.0031164784
## Alch Rel 0.0027269186
## Usage cont 0.0019477990
## Carb Flow 0.0007791196
## Bowl Setpoint 0.0007791196
## Balling Lvl 0.0003895598
## Mnf Flow 0.0000000000
## Density 0.0000000000
## Balling 0.0000000000
## Pressure Vacuum 0.0000000000
## PH 0.0000000000
## Air Pressurer 0.0000000000
##
## A B C D <NA>
## 293 1235 304 615 120
## assign "U" to unknown brand codes
studentData[is.na(studentData$`Brand Code`),"Brand Code"] <- "U"
## Convert `Brand Code` to factors
studentData$`Brand Code` <- as.factor(studentData$`Brand Code`)
table(studentData$`Brand Code`,useNA = "ifany")##
## A B C D U
## 293 1235 304 615 120
## Use **MICE**: "Multivariate Imputation by Chained Equations" to come up with imputed cases
cn <- colnames(studentData)
icStudentData <- studentData
colnames(icStudentData) <- sub(" ", "_", cn)
icdf <- mice(
icStudentData,
m = 2,
maxit = 10,
meth = 'pmm',
seed = 500,
print = FALSE
)
icStudentData <- complete(icdf)
rm(icdf)
colnames(icStudentData) <- cn#### no longer necessary to omit the first column, as Brand Code has been
#### dropped through one-hot encoding
####mCor <- cor(studentData[, -1], use = "na.or.complete")
mCor <- cor(icStudentData, use = "na.or.complete")
### be sure to specify corrplot::corrplot because the namespace may be masked by pls::corrplot
corrplot::corrplot(
mCor,
method = "circle",
type = "upper",
order = "hclust",
tl.cex = 0.7
)featurePlot(
x = icStudentData[, 1:10],
y = icStudentData[, colY],
between = list(x = 1, y = 1),
type = c("g", "p", "smooth")
)featurePlot(
x = icStudentData[, 11:19],
y = icStudentData[, colY],
between = list(x = 1, y = 1),
type = c("g", "p", "smooth")
)featurePlot(
x = icStudentData[, 12:20],
y = icStudentData[, colY],
between = list(x = 1, y = 1),
type = c("g", "p", "smooth")
)featurePlot(
##### x = icStudentData[, c(21:25, 27)], "PH" has now been moved from column 26 to column 25
x = icStudentData[, c(21:24, 26:27)],
y = icStudentData[, colY],
between = list(x = 1, y = 1),
type = c("g", "p", "smooth")
)featurePlot(
x = icStudentData[, 28:32],
y = icStudentData[, colY],
between = list(x = 1, y = 1),
type = c("g", "p", "smooth")
)###PH vs. each of the brands
featurePlot(
x = icStudentData[, 33:37],
y = icStudentData[, colY],
between = list(x = 1, y = 1),
type = c("g", "p", "smooth")
)The final group of plots – based on each of the now separate Brand Codes – is not very informative, because only the column above the “1.0” is meaningful.
## Split the data into a training and a test set
set.seed(1234)
trainRows <-
createDataPartition(icStudentData$PH, p = .80, list = FALSE)
ctrl <- trainControl(method = "cv")trainX <- icStudentData[trainRows,-colY]
trainY <- icStudentData[trainRows, colY]
testX <- icStudentData[-trainRows,-colY]
testY <- icStudentData[-trainRows, colY]## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## RMSE Rsquared MAE
## 0.1261174 0.4334206 0.1005960
There is perfect multicollinearity on the five Brand Code dummies - so drop the final column Create new trainX and testX matrices which omit the final column (Brand Code U)
## [1] 2055 36
## [1] 2055 35
## [1] 512 36
## [1] 512 35
set.seed(100)
lmTune0U <- train(x = trainXU, y = trainY,
method = "lm",
preProcess = c( "center", "scale"),
trControl = ctrl)
lmTune0U ## Linear Regression
##
## 2055 samples
## 35 predictor
##
## Pre-processing: centered (35), scaled (35)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1849, 1849, 1849, 1849, 1850, 1851, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1351187 0.3972839 0.104106
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
## RMSE Rsquared MAE
## 0.1261174 0.4334206 0.1005960
set.seed(100)
plsTune <- train(x = trainXU, y = trainY,
method = "pls",
tuneGrid = expand.grid(ncomp = 1:35),
trControl = ctrl)
plsTune## Partial Least Squares
##
## 2055 samples
## 35 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1849, 1849, 1849, 1849, 1850, 1851, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.1704800 0.03689837 0.1368212
## 2 0.1688215 0.05587677 0.1359948
## 3 0.1549284 0.20850850 0.1210411
## 4 0.1524105 0.23250508 0.1183762
## 5 0.1508588 0.24789647 0.1175638
## 6 0.1491563 0.26502677 0.1164563
## 7 0.1471461 0.28459419 0.1151224
## 8 0.1466613 0.28933932 0.1141979
## 9 0.1456388 0.29990938 0.1133353
## 10 0.1437830 0.31714219 0.1118687
## 11 0.1427181 0.32659359 0.1110814
## 12 0.1418692 0.33453064 0.1098993
## 13 0.1409193 0.34367671 0.1090609
## 14 0.1399150 0.35317192 0.1085857
## 15 0.1380490 0.37026530 0.1066573
## 16 0.1369673 0.37979708 0.1059585
## 17 0.1361741 0.38697672 0.1056408
## 18 0.1357763 0.39090752 0.1054258
## 19 0.1356595 0.39182047 0.1053973
## 20 0.1355282 0.39326890 0.1053321
## 21 0.1354635 0.39387183 0.1053188
## 22 0.1351353 0.39715672 0.1043958
## 23 0.1350412 0.39780110 0.1042284
## 24 0.1348922 0.39910952 0.1039375
## 25 0.1351748 0.39671476 0.1042567
## 26 0.1351719 0.39668936 0.1042212
## 27 0.1351047 0.39742445 0.1041317
## 28 0.1350783 0.39763698 0.1040690
## 29 0.1351111 0.39736192 0.1040984
## 30 0.1351355 0.39713370 0.1041019
## 31 0.1351219 0.39725452 0.1041028
## 32 0.1351163 0.39730456 0.1041020
## 33 0.1351186 0.39728442 0.1041044
## 34 0.1351188 0.39728316 0.1041060
## 35 0.1351187 0.39728394 0.1041060
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 24.
set.seed(100)
pcrTune <- train(x = trainXU, y = trainY,
method = "pcr",
tuneGrid = expand.grid(ncomp = 1:35),
trControl = ctrl)
pcrTune ## Principal Component Analysis
##
## 2055 samples
## 35 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1849, 1849, 1849, 1849, 1850, 1851, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.1722779 0.01654685 0.1379617
## 2 0.1698978 0.04380722 0.1369348
## 3 0.1551088 0.20671751 0.1212318
## 4 0.1551128 0.20662275 0.1212610
## 5 0.1531013 0.22570560 0.1191162
## 6 0.1530839 0.22568883 0.1189919
## 7 0.1509958 0.24684251 0.1179033
## 8 0.1501828 0.25462137 0.1171343
## 9 0.1501824 0.25460186 0.1171003
## 10 0.1502008 0.25434438 0.1170895
## 11 0.1492295 0.26423765 0.1163619
## 12 0.1477011 0.27968397 0.1150982
## 13 0.1473319 0.28354577 0.1149473
## 14 0.1470092 0.28655810 0.1146122
## 15 0.1467861 0.28840763 0.1146154
## 16 0.1454270 0.30185025 0.1130238
## 17 0.1443003 0.31213166 0.1120801
## 18 0.1441737 0.31324049 0.1120183
## 19 0.1435635 0.31881385 0.1113666
## 20 0.1406799 0.34598766 0.1090204
## 21 0.1372901 0.37667689 0.1064448
## 22 0.1358466 0.39025231 0.1057067
## 23 0.1359661 0.38923172 0.1058280
## 24 0.1359886 0.38881283 0.1057245
## 25 0.1359890 0.38887796 0.1057324
## 26 0.1348934 0.39919046 0.1042371
## 27 0.1349265 0.39875240 0.1043062
## 28 0.1349996 0.39805685 0.1043042
## 29 0.1351171 0.39700957 0.1043088
## 30 0.1350984 0.39725811 0.1044394
## 31 0.1350681 0.39733671 0.1044241
## 32 0.1352739 0.39565307 0.1046034
## 33 0.1352341 0.39622546 0.1045886
## 34 0.1350839 0.39754816 0.1040473
## 35 0.1351187 0.39728394 0.1041060
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 26.
plsResamples <- plsTune$results
plsResamples$Model <- "PLS"
pcrResamples <- pcrTune$results
pcrResamples$Model <- "PCR"
plsPlotData <- rbind(plsResamples, pcrResamples)xyplot(RMSE ~ ncomp,
data = plsPlotData,
#aspect = 1,
xlab = "# Components",
ylab = "RMSE (Cross-Validation)",
auto.key = list(columns = 2),
groups = Model,
type = c("o", "g"))##
## Attaching package: 'pls'
## The following object is masked from 'package:corrplot':
##
## corrplot
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
(A boxplot of PH by Brand Code was displayed above.)
set.seed(100)
ridgeTune <- train(x = trainXU, y = trainY,
method = "ridge",
tuneGrid = ridgeGrid,
trControl = ctrl,
preProc = c("BoxCox", "center", "scale")
#preProc = c("center", "scale")
)
ridgeTune## Ridge Regression
##
## 2055 samples
## 35 predictor
##
## Pre-processing: Box-Cox transformation (23), centered (35), scaled (35)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1849, 1849, 1849, 1849, 1850, 1851, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.0000 0.1341363 0.4060369 0.1040628
## 0.0025 0.1340463 0.4067550 0.1040562
## 0.0050 0.1340069 0.4070712 0.1040650
## 0.0075 0.1339974 0.4071448 0.1040878
## 0.0100 0.1340063 0.4070657 0.1041256
## 0.0125 0.1340271 0.4068876 0.1041636
## 0.0150 0.1340555 0.4066440 0.1042017
## 0.0175 0.1340890 0.4063567 0.1042395
## 0.0200 0.1341258 0.4060400 0.1042760
## 0.0225 0.1341648 0.4057038 0.1043113
## 0.0250 0.1342051 0.4053550 0.1043472
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.0075.
enetGrid <- expand.grid(#lambda = c(0, 0.005, 0.01, 0.05, 0.1),
lambda = c(seq(0,1,length=6)),
fraction = seq(.05, 1, length = 20))
set.seed(100)
enetTune <- train(x = trainXU, y = trainY,
method = "enet",
tuneGrid = enetGrid,
trControl = ctrl,
preProc = c("BoxCox", "center", "scale")
#preProc = c("center", "scale")
)
enetTune## Elasticnet
##
## 2055 samples
## 35 predictor
##
## Pre-processing: Box-Cox transformation (23), centered (35), scaled (35)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1849, 1849, 1849, 1849, 1850, 1851, ...
## Resampling results across tuning parameters:
##
## lambda fraction RMSE Rsquared MAE
## 0.0 0.05 0.1602745 0.2403376 0.1272700
## 0.0 0.10 0.1516848 0.2910029 0.1197740
## 0.0 0.15 0.1462028 0.3190322 0.1151215
## 0.0 0.20 0.1426767 0.3415148 0.1121029
## 0.0 0.25 0.1404830 0.3568719 0.1101965
## 0.0 0.30 0.1387711 0.3700483 0.1085791
## 0.0 0.35 0.1375154 0.3790731 0.1074189
## 0.0 0.40 0.1366780 0.3850547 0.1066084
## 0.0 0.45 0.1360737 0.3896365 0.1059854
## 0.0 0.50 0.1356743 0.3929325 0.1055406
## 0.0 0.55 0.1351829 0.3971425 0.1051380
## 0.0 0.60 0.1347618 0.4007395 0.1047835
## 0.0 0.65 0.1344662 0.4031794 0.1045361
## 0.0 0.70 0.1342588 0.4048724 0.1043594
## 0.0 0.75 0.1341098 0.4060731 0.1042100
## 0.0 0.80 0.1339882 0.4070700 0.1040726
## 0.0 0.85 0.1339298 0.4075550 0.1039946
## 0.0 0.90 0.1339531 0.4074161 0.1039699
## 0.0 0.95 0.1340238 0.4068885 0.1040094
## 0.0 1.00 0.1341363 0.4060369 0.1040628
## 0.2 0.05 0.1651808 0.2040309 0.1316864
## 0.2 0.10 0.1589971 0.2521545 0.1261805
## 0.2 0.15 0.1540566 0.2841162 0.1217976
## 0.2 0.20 0.1501105 0.3009689 0.1183970
## 0.2 0.25 0.1470453 0.3139988 0.1158416
## 0.2 0.30 0.1447392 0.3246401 0.1138649
## 0.2 0.35 0.1430268 0.3338636 0.1123578
## 0.2 0.40 0.1417104 0.3429537 0.1112788
## 0.2 0.45 0.1405953 0.3511871 0.1103118
## 0.2 0.50 0.1397109 0.3576178 0.1095289
## 0.2 0.55 0.1390577 0.3623701 0.1089056
## 0.2 0.60 0.1385572 0.3663168 0.1083726
## 0.2 0.65 0.1380873 0.3704801 0.1079057
## 0.2 0.70 0.1378059 0.3731729 0.1075794
## 0.2 0.75 0.1374928 0.3761727 0.1072723
## 0.2 0.80 0.1372059 0.3789918 0.1070239
## 0.2 0.85 0.1369789 0.3813011 0.1068293
## 0.2 0.90 0.1367859 0.3833894 0.1066638
## 0.2 0.95 0.1366106 0.3853551 0.1065275
## 0.2 1.00 0.1364984 0.3868067 0.1064539
## 0.4 0.05 0.1655532 0.2040309 0.1320209
## 0.4 0.10 0.1596278 0.2481577 0.1267064
## 0.4 0.15 0.1548734 0.2806613 0.1225217
## 0.4 0.20 0.1509483 0.2977792 0.1191070
## 0.4 0.25 0.1478906 0.3081852 0.1165268
## 0.4 0.30 0.1456325 0.3165718 0.1146634
## 0.4 0.35 0.1439232 0.3246976 0.1131877
## 0.4 0.40 0.1426413 0.3324346 0.1120715
## 0.4 0.45 0.1417446 0.3384057 0.1112756
## 0.4 0.50 0.1408766 0.3455017 0.1104953
## 0.4 0.55 0.1402094 0.3512029 0.1098789
## 0.4 0.60 0.1397575 0.3555228 0.1094061
## 0.4 0.65 0.1394646 0.3589986 0.1090983
## 0.4 0.70 0.1392961 0.3619799 0.1088671
## 0.4 0.75 0.1393030 0.3638117 0.1087751
## 0.4 0.80 0.1391733 0.3661661 0.1086251
## 0.4 0.85 0.1390212 0.3685732 0.1085045
## 0.4 0.90 0.1389015 0.3707128 0.1084294
## 0.4 0.95 0.1387991 0.3727633 0.1083673
## 0.4 1.00 0.1387453 0.3745074 0.1083519
## 0.6 0.05 0.1656086 0.2040309 0.1320703
## 0.6 0.10 0.1597747 0.2465043 0.1268196
## 0.6 0.15 0.1550837 0.2787053 0.1227001
## 0.6 0.20 0.1511880 0.2952538 0.1192978
## 0.6 0.25 0.1482301 0.3032170 0.1167971
## 0.6 0.30 0.1459938 0.3112741 0.1149818
## 0.6 0.35 0.1443430 0.3187046 0.1135957
## 0.6 0.40 0.1431365 0.3262918 0.1125033
## 0.6 0.45 0.1423352 0.3318861 0.1117477
## 0.6 0.50 0.1418128 0.3364824 0.1112140
## 0.6 0.55 0.1412393 0.3423475 0.1106780
## 0.6 0.60 0.1408262 0.3473787 0.1102570
## 0.6 0.65 0.1406240 0.3512269 0.1100248
## 0.6 0.70 0.1405733 0.3543731 0.1098978
## 0.6 0.75 0.1406635 0.3569590 0.1099098
## 0.6 0.80 0.1408551 0.3587292 0.1100465
## 0.6 0.85 0.1409903 0.3605101 0.1101538
## 0.6 0.90 0.1411295 0.3620884 0.1102838
## 0.6 0.95 0.1411884 0.3638009 0.1103493
## 0.6 1.00 0.1412492 0.3653933 0.1104303
## 0.8 0.05 0.1655386 0.2040309 0.1320079
## 0.8 0.10 0.1597414 0.2456919 0.1267847
## 0.8 0.15 0.1550404 0.2776313 0.1226534
## 0.8 0.20 0.1512119 0.2923916 0.1193066
## 0.8 0.25 0.1483043 0.2994696 0.1168474
## 0.8 0.30 0.1461352 0.3072198 0.1150975
## 0.8 0.35 0.1445895 0.3143171 0.1138315
## 0.8 0.40 0.1435385 0.3212465 0.1128478
## 0.8 0.45 0.1428112 0.3270875 0.1121060
## 0.8 0.50 0.1424134 0.3316087 0.1116507
## 0.8 0.55 0.1422054 0.3357336 0.1114108
## 0.8 0.60 0.1419757 0.3404088 0.1111420
## 0.8 0.65 0.1418991 0.3444713 0.1109905
## 0.8 0.70 0.1419860 0.3478455 0.1110019
## 0.8 0.75 0.1422006 0.3507406 0.1112218
## 0.8 0.80 0.1425827 0.3527251 0.1115779
## 0.8 0.85 0.1429667 0.3542579 0.1119006
## 0.8 0.90 0.1433186 0.3557221 0.1121769
## 0.8 0.95 0.1436968 0.3569415 0.1124976
## 0.8 1.00 0.1440480 0.3579445 0.1127978
## 1.0 0.05 0.1653575 0.2040309 0.1318454
## 1.0 0.10 0.1595435 0.2461644 0.1266128
## 1.0 0.15 0.1547983 0.2769713 0.1224344
## 1.0 0.20 0.1510057 0.2902353 0.1191081
## 1.0 0.25 0.1481733 0.2967655 0.1167339
## 1.0 0.30 0.1461126 0.3043076 0.1150975
## 1.0 0.35 0.1447398 0.3109783 0.1139730
## 1.0 0.40 0.1438355 0.3176725 0.1130841
## 1.0 0.45 0.1432325 0.3235865 0.1124099
## 1.0 0.50 0.1429962 0.3280260 0.1120694
## 1.0 0.55 0.1430745 0.3313701 0.1120656
## 1.0 0.60 0.1432010 0.3349502 0.1120690
## 1.0 0.65 0.1433064 0.3389811 0.1120928
## 1.0 0.70 0.1436102 0.3421491 0.1123580
## 1.0 0.75 0.1440626 0.3447911 0.1128114
## 1.0 0.80 0.1446861 0.3465860 0.1133464
## 1.0 0.85 0.1453400 0.3480227 0.1138641
## 1.0 0.90 0.1459419 0.3493569 0.1143475
## 1.0 0.95 0.1465338 0.3505457 0.1148450
## 1.0 1.00 0.1470863 0.3515943 0.1152918
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.85 and lambda = 0.
#### Eval ElasticNet results on test holdout
## RMSE Rsquared MAE
## 0.1270674 0.4248886 0.1016689
### Neural Network
## Create training and test split on the data
trainX <- icStudentData[trainRows,-colY]
trainY <- icStudentData[trainRows, colY]
testX <- icStudentData[-trainRows,-colY]
testY <- icStudentData[-trainRows, colY]
## Create a set of models to evaluate
nnetGrid <- expand.grid(size = c(1:10),
decay = c(0, 0.01, 0.1))
nnetModel <- train(
trainX,
trainY,
method = "nnet",
tuneGrid = nnetGrid,
trControl = ctrl,
preProcess = c("BoxCox", "center", "scale", "pca"),
linout = TRUE,
trace = FALSE,
MaxNWts = 10 * (ncol(trainX) + 1) + 10 + 1,
maxit = 500
)
nnetModel## Neural Network
##
## 2055 samples
## 36 predictor
##
## Pre-processing: Box-Cox transformation (23), centered (36), scaled
## (36), principal component signal extraction (36)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1849, 1850, 1848, 1850, 1850, 1849, ...
## Resampling results across tuning parameters:
##
## size decay RMSE Rsquared MAE
## 1 0.00 0.1431134 0.3198840 0.11193714
## 1 0.01 0.1392075 0.3609829 0.10901393
## 1 0.10 0.1398917 0.3532768 0.11006820
## 2 0.00 0.2479545 0.2998425 0.12005016
## 2 0.01 0.1366916 0.3846947 0.10669069
## 2 0.10 0.1348356 0.3991393 0.10506230
## 3 0.00 0.1466705 0.3578204 0.10866279
## 3 0.01 0.1323555 0.4238115 0.10173071
## 3 0.10 0.1323119 0.4256071 0.10177294
## 4 0.00 0.1373658 0.3846831 0.10585217
## 4 0.01 0.1292647 0.4550443 0.09905576
## 4 0.10 0.1279593 0.4620413 0.09890188
## 5 0.00 0.1338247 0.4226488 0.10212347
## 5 0.01 0.1271268 0.4739085 0.09770427
## 5 0.10 0.1280582 0.4649329 0.09715941
## 6 0.00 0.1373721 0.4043096 0.10475265
## 6 0.01 0.1268750 0.4775401 0.09716372
## 6 0.10 0.1266302 0.4804003 0.09712960
## 7 0.00 0.1380890 0.4056416 0.10436005
## 7 0.01 0.1304963 0.4546324 0.09962521
## 7 0.10 0.1259417 0.4840560 0.09560961
## 8 0.00 0.1380887 0.4116317 0.10478179
## 8 0.01 0.1293825 0.4661321 0.09779170
## 8 0.10 0.1257344 0.4886613 0.09536145
## 9 0.00 0.1367075 0.4205171 0.10437443
## 9 0.01 0.1297172 0.4673196 0.09784726
## 9 0.10 0.1302668 0.4573297 0.09763129
## 10 0.00 0.1381875 0.4239263 0.10637629
## 10 0.01 0.1309902 0.4640729 0.09950640
## 10 0.10 0.1246336 0.5010111 0.09545923
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 10 and decay = 0.1.
#plot(nnetModel)
#varImp(nnetModel)
nnetPred <- predict(nnetModel, newdata = testX)
postResample(pred = nnetPred, obs = testY)## RMSE Rsquared MAE
## 0.12064541 0.50800650 0.09202389
## Create a set of models to evaluate
nnetGrid2 <- expand.grid(decay = c(0, 0.01, 0.1),
size = c(1:10),
bag = FALSE)
nnetModel2 <- train(
trainX,
trainY,
method = "avNNet",
tuneGrid = nnetGrid2,
trControl = ctrl,
preProcess = c("BoxCox", "center", "scale", "pca"),
linout = TRUE,
trace = FALSE,
MaxNWts = 10 * (ncol(trainX) + 1) + 10 + 1,
maxit = 500
)
nnetModel2## Model Averaged Neural Network
##
## 2055 samples
## 36 predictor
##
## Pre-processing: Box-Cox transformation (23), centered (36), scaled
## (36), principal component signal extraction (36)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1849, 1851, 1849, 1849, 1850, 1851, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 0.1405428 0.3591191 0.11053158
## 0.00 2 0.1369253 0.3902333 0.10736619
## 0.00 3 0.1433086 0.3855765 0.10345011
## 0.00 4 1.2253747 0.4190064 0.20662378
## 0.00 5 0.1405028 0.4463213 0.09714877
## 0.00 6 0.1212936 0.5144067 0.09253824
## 0.00 7 0.9025580 0.4364573 0.19462175
## 0.00 8 0.2811093 0.4990257 0.11041779
## 0.00 9 0.1172683 0.5459352 0.08888831
## 0.00 10 0.1180809 0.5402557 0.08944739
## 0.01 1 0.1389596 0.3606577 0.10911215
## 0.01 2 0.1315497 0.4276536 0.10212834
## 0.01 3 0.1254312 0.4795049 0.09651374
## 0.01 4 0.1228193 0.5013359 0.09336491
## 0.01 5 0.1200122 0.5233585 0.09133642
## 0.01 6 0.1178578 0.5408698 0.08918306
## 0.01 7 0.1142952 0.5679524 0.08690076
## 0.01 8 0.1175160 0.5443007 0.08804553
## 0.01 9 0.1170163 0.5494343 0.08801095
## 0.01 10 0.1159938 0.5565924 0.08700520
## 0.10 1 0.1397263 0.3537879 0.11026666
## 0.10 2 0.1314096 0.4288529 0.10282549
## 0.10 3 0.1263822 0.4713280 0.09698862
## 0.10 4 0.1230882 0.4998254 0.09342653
## 0.10 5 0.1213127 0.5133004 0.09198119
## 0.10 6 0.1202194 0.5230287 0.09149006
## 0.10 7 0.1197261 0.5264710 0.09117539
## 0.10 8 0.1198194 0.5274190 0.09091644
## 0.10 9 0.1163312 0.5545665 0.08843793
## 0.10 10 0.1194308 0.5339662 0.08967344
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 7, decay = 0.01 and bag = FALSE.
#plot(nnetModel2)
#varImp(nnetModel2)
nnetPred2 <- predict(nnetModel2, newdata = testX)
postResample(pred = nnetPred2, obs = testY)## RMSE Rsquared MAE
## 0.11331097 0.54482241 0.08613199
### MARS: Multivariate Adaptive Regression Splines
library(earth)
## Create training and test split on the data
trainX <- icStudentData[trainRows,-colY]
trainY <- icStudentData[trainRows, colY]
testX <- icStudentData[-trainRows,-colY]
testY <- icStudentData[-trainRows, colY]
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:30)
marsModel <- train(
trainX,
trainY,
method = "earth",
tuneGrid = marsGrid,
trControl = ctrl
)
marsModel## Multivariate Adaptive Regression Spline
##
## 2055 samples
## 36 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1850, 1848, 1849, 1850, 1849, 1850, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 0.1536855 0.2186838 0.11966687
## 1 3 0.1450564 0.3053209 0.11320681
## 1 4 0.1436850 0.3186002 0.11176409
## 1 5 0.1427015 0.3290418 0.11145592
## 1 6 0.1403504 0.3501351 0.10820909
## 1 7 0.1398978 0.3550511 0.10749100
## 1 8 0.1376532 0.3745650 0.10618216
## 1 9 0.1356637 0.3927946 0.10440556
## 1 10 0.1334482 0.4125707 0.10320300
## 1 11 0.1332496 0.4145837 0.10281181
## 1 12 0.1333377 0.4135357 0.10253510
## 1 13 0.1326997 0.4191212 0.10209514
## 1 14 0.1318543 0.4264765 0.10182482
## 1 15 0.1316103 0.4285980 0.10136944
## 1 16 0.1314962 0.4298202 0.10130143
## 1 17 0.1314540 0.4302730 0.10122605
## 1 18 0.1314231 0.4306406 0.10110827
## 1 19 0.1314884 0.4301230 0.10109542
## 1 20 0.1314884 0.4301230 0.10109542
## 1 21 0.1314884 0.4301230 0.10109542
## 1 22 0.1314884 0.4301230 0.10109542
## 1 23 0.1314884 0.4301230 0.10109542
## 1 24 0.1314884 0.4301230 0.10109542
## 1 25 0.1314884 0.4301230 0.10109542
## 1 26 0.1314884 0.4301230 0.10109542
## 1 27 0.1314884 0.4301230 0.10109542
## 1 28 0.1314884 0.4301230 0.10109542
## 1 29 0.1314884 0.4301230 0.10109542
## 1 30 0.1314884 0.4301230 0.10109542
## 2 2 0.1536855 0.2186838 0.11966687
## 2 3 0.1463234 0.2933551 0.11438365
## 2 4 0.1438170 0.3172980 0.11228989
## 2 5 0.1418996 0.3357970 0.11039062
## 2 6 0.1399931 0.3534793 0.10888086
## 2 7 0.1388464 0.3655405 0.10736007
## 2 8 0.1374990 0.3766367 0.10581595
## 2 9 0.1353717 0.3955871 0.10377425
## 2 10 0.1328284 0.4183388 0.10168978
## 2 11 0.1322849 0.4237925 0.10057750
## 2 12 0.1317020 0.4295875 0.09998406
## 2 13 0.1309455 0.4357108 0.09872763
## 2 14 0.1313115 0.4338443 0.09906983
## 2 15 0.1297974 0.4463834 0.09787733
## 2 16 0.1287933 0.4547013 0.09717566
## 2 17 0.1279619 0.4614130 0.09675009
## 2 18 0.1272367 0.4676927 0.09582992
## 2 19 0.1268629 0.4706801 0.09557628
## 2 20 0.1264203 0.4739263 0.09505489
## 2 21 0.1267973 0.4717258 0.09506021
## 2 22 0.1264241 0.4749043 0.09486406
## 2 23 0.1260751 0.4780597 0.09434735
## 2 24 0.1258472 0.4799106 0.09399175
## 2 25 0.1255086 0.4827233 0.09384152
## 2 26 0.1254218 0.4836567 0.09365109
## 2 27 0.1253820 0.4844177 0.09358541
## 2 28 0.1253622 0.4845887 0.09345948
## 2 29 0.1255458 0.4832101 0.09350864
## 2 30 0.1256708 0.4821584 0.09366350
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 28 and degree = 2.
#varImp(marsModel)
marsPred <- predict(marsModel, newdata = testX)
postResample(pred = marsPred, obs = testY)## RMSE Rsquared MAE
## 0.11684379 0.51394487 0.08950574
### SVM: Support Vector Machines
## Create training and test split on the data
trainX <- icStudentData[trainRows,-colY]
trainY <- icStudentData[trainRows, colY]
testX <- icStudentData[-trainRows,-colY]
testY <- icStudentData[-trainRows, colY]
#trainX$`Brand Code` <- as.numeric(trainX$`Brand Code`)
#testX$`Brand Code` <- as.numeric(testX$`Brand Code`)svmModel <- train(
trainX,
trainY,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 14,
trControl = ctrl
)
svmModel## Support Vector Machines with Radial Basis Function Kernel
##
## 2055 samples
## 36 predictor
##
## Pre-processing: centered (36), scaled (36)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1849, 1850, 1849, 1851, 1850, 1848, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.1266244 0.4793626 0.09353797
## 0.50 0.1232076 0.5037179 0.09018774
## 1.00 0.1201454 0.5253778 0.08771376
## 2.00 0.1174010 0.5455901 0.08547101
## 4.00 0.1160249 0.5561104 0.08450325
## 8.00 0.1155326 0.5619311 0.08445268
## 16.00 0.1171583 0.5557748 0.08590326
## 32.00 0.1211153 0.5367617 0.08916189
## 64.00 0.1261794 0.5138281 0.09268589
## 128.00 0.1326212 0.4854892 0.09715716
## 256.00 0.1389351 0.4598707 0.10165385
## 512.00 0.1434567 0.4420474 0.10523635
## 1024.00 0.1448496 0.4363816 0.10647879
## 2048.00 0.1448496 0.4363816 0.10647879
##
## Tuning parameter 'sigma' was held constant at a value of 0.02009679
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.02009679 and C = 8.
#varImp(svmModel)
svmPred <- predict(svmModel, newdata = testX)
postResample(pred = svmPred, obs = testY)## RMSE Rsquared MAE
## 0.10966532 0.57985298 0.08138952
svmModel2 <- train(
trainX,
trainY,
method = "svmPoly",
preProc = c("center", "scale"),
#tuneLength = 14,
trControl = ctrl
)
svmModel2## Support Vector Machines with Polynomial Kernel
##
## 2055 samples
## 36 predictor
##
## Pre-processing: centered (36), scaled (36)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1850, 1849, 1850, 1849, 1850, 1849, ...
## Resampling results across tuning parameters:
##
## degree scale C RMSE Rsquared MAE
## 1 0.001 0.25 0.1497989 0.3080314 0.11740020
## 1 0.001 0.50 0.1449267 0.3342995 0.11281277
## 1 0.001 1.00 0.1412718 0.3555379 0.10918814
## 1 0.010 0.25 0.1381732 0.3766992 0.10588396
## 1 0.010 0.50 0.1370406 0.3850736 0.10448293
## 1 0.010 1.00 0.1368336 0.3872390 0.10382608
## 1 0.100 0.25 0.1366761 0.3891079 0.10327690
## 1 0.100 0.50 0.1366972 0.3889629 0.10317031
## 1 0.100 1.00 0.1368634 0.3880585 0.10321219
## 2 0.001 0.25 0.1447571 0.3363250 0.11265334
## 2 0.001 0.50 0.1409062 0.3593076 0.10886201
## 2 0.001 1.00 0.1380860 0.3792780 0.10579017
## 2 0.010 0.25 0.1300320 0.4475448 0.09714647
## 2 0.010 0.50 0.1281077 0.4627621 0.09454513
## 2 0.010 1.00 0.1263796 0.4765243 0.09256072
## 2 0.100 0.25 0.1268549 0.4842409 0.09214793
## 2 0.100 0.50 0.1300539 0.4695129 0.09415718
## 2 0.100 1.00 0.1338629 0.4529462 0.09623309
## 3 0.001 0.25 0.1420077 0.3536443 0.11005671
## 3 0.001 0.50 0.1386497 0.3765766 0.10642087
## 3 0.001 1.00 0.1360640 0.3957793 0.10366283
## 3 0.010 0.25 0.1266100 0.4759776 0.09308000
## 3 0.010 0.50 0.1246848 0.4912165 0.09101488
## 3 0.010 1.00 0.1229134 0.5055751 0.08932194
## 3 0.100 0.25 0.1539002 0.4097589 0.10515589
## 3 0.100 0.50 0.1758659 0.3464989 0.11759279
## 3 0.100 1.00 0.2013096 0.2898199 0.13210774
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were degree = 3, scale = 0.01 and C = 1.
#varImp(svmModel2)
svmPred2 <- predict(svmModel2, newdata = testX)
postResample(pred = svmPred2, obs = testY)## RMSE Rsquared MAE
## 0.11311496 0.54858494 0.08316103
svmModel3 <- train(
trainX,
trainY,
method = "svmLinear",
preProc = c("center", "scale"),
tuneGrid = data.frame(.C = c(.25, .5, 1)),
trControl = ctrl
)
svmModel3## Support Vector Machines with Linear Kernel
##
## 2055 samples
## 36 predictor
##
## Pre-processing: centered (36), scaled (36)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1850, 1851, 1849, 1850, 1849, 1848, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.1369470 0.3866762 0.1033249
## 0.50 0.1370046 0.3863261 0.1033465
## 1.00 0.1370257 0.3861863 0.1033687
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was C = 0.25.
#varImp(svmModel3)
svmPred3 <- predict(svmModel3, newdata = testX)
postResample(pred = svmPred3, obs = testY)## RMSE Rsquared MAE
## 0.12628329 0.44245728 0.09855517
### KNN: K-Nearest Neighbors
## Create training and test split on the data
trainX <- icStudentData[trainRows,-colY]
trainY <- icStudentData[trainRows, colY]
testX <- icStudentData[-trainRows,-colY]
testY <- icStudentData[-trainRows, colY]
#trainX$`Brand Code` <- as.numeric(trainX$`Brand Code`)
#testX$`Brand Code` <- as.numeric(testX$`Brand Code`)
knnModel <- train(
trainX,
trainY,
method = "knn",
preProcess = c("center", "scale"),
tuneGrid = data.frame(.k = 1:10),
trControl = ctrl
)
knnModel## k-Nearest Neighbors
##
## 2055 samples
## 36 predictor
##
## Pre-processing: centered (36), scaled (36)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1850, 1851, 1849, 1850, 1849, 1849, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 0.1545756 0.3689879 0.10446069
## 2 0.1304151 0.4739943 0.09210069
## 3 0.1246529 0.5028337 0.08991473
## 4 0.1224450 0.5150609 0.08906787
## 5 0.1208514 0.5232809 0.08846839
## 6 0.1203018 0.5257272 0.08891881
## 7 0.1203391 0.5256418 0.08953277
## 8 0.1207709 0.5217261 0.09040865
## 9 0.1210737 0.5200849 0.09058111
## 10 0.1215810 0.5164079 0.09144834
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 6.
#varImp(knnModel)
knnPred <- predict(knnModel, newdata = testX)
postResample(pred = knnPred, obs = testY)## RMSE Rsquared MAE
## 0.12260286 0.47914702 0.09095145
### Random Forest
library(randomForest)
## Create training and test split on the data
trainX <- icStudentData[trainRows,-colY]
trainY <- icStudentData[trainRows, colY]
testX <- icStudentData[-trainRows,-colY]
testY <- icStudentData[-trainRows, colY]
rf_model <- randomForest(x = trainX, y = trainY, ntree = 300)
rf_predicted <- predict(rf_model, testX)
postResample(pred = rf_predicted, obs = testY)## RMSE Rsquared MAE
## 0.09167835 0.71312650 0.06710012
library(vip)
rfImp1 <- rf_model$importance
vip(rf_model, color = 'red', fill='green') +
ggtitle('Random Forest Var Imp')gbmGrid = expand.grid(interaction.depth = seq(1,5, by=2), n.trees = seq(100, 1000, by = 100), shrinkage = 0.1, n.minobsinnode = 5)
gbm_model <- train(trainX, trainY, tuneGrid = gbmGrid, verbose = FALSE, method = 'gbm' )
gbm_predicted <- predict(gbm_model, testX)
postResample(pred = rf_predicted, obs = testY)## RMSE Rsquared MAE
## 0.09167835 0.71312650 0.06710012
rowNamesPerf <-
c("Linear",
"Linear-adjusted",
"Linear-Filtered",
"PLS",
"PCR",
"Ridge Regression",
"ElasticNet",
"KNN",
"svmRadial",
"svmPoly",
"svmLinear",
"MARS",
"nnet",
"avNNet",
"RandomForest",
"BoostedTrees")
print("Testing Performance")## [1] "Testing Performance"
testPerf <- rbind(
postResample(pred = lmtune0Pred, obs = testY),
postResample(pred = lmtune0UPred, obs = testY),
postResample(pred = lmtuneFilteredPred, obs = testY),
postResample(pred = plstunePred, obs = testY),
postResample(pred = pcrtunePred, obs = testY),
postResample(pred = ridgetunePred, obs = testY),
postResample(pred = enettunePred, obs = testY),
postResample(pred = knnPred, obs = testY),
postResample(pred = svmPred, obs = testY),
postResample(pred = svmPred2, obs = testY),
postResample(pred = svmPred3, obs = testY),
postResample(pred = marsPred, obs = testY),
postResample(pred = nnetPred, obs = testY),
postResample(pred = nnetPred2, obs = testY),
postResample(pred = rf_predicted, obs = testY),
postResample(pred = gbm_predicted, obs = testY)
)
rownames(testPerf) <- rowNamesPerf
testPerf <- as.data.frame(testPerf)
testPerf[order(testPerf$RMSE), ]## RMSE Rsquared MAE
## RandomForest 0.09167835 0.7131265 0.06710012
## BoostedTrees 0.10352400 0.6250535 0.07448248
## svmRadial 0.10966532 0.5798530 0.08138952
## svmPoly 0.11311496 0.5485849 0.08316103
## avNNet 0.11331097 0.5448224 0.08613199
## MARS 0.11684379 0.5139449 0.08950574
## nnet 0.12064541 0.5080065 0.09202389
## KNN 0.12260286 0.4791470 0.09095145
## Linear 0.12611736 0.4334206 0.10059604
## Linear-adjusted 0.12611736 0.4334206 0.10059604
## svmLinear 0.12628329 0.4424573 0.09855517
## PLS 0.12633283 0.4315677 0.10127744
## Ridge Regression 0.12689443 0.4265123 0.10150803
## PCR 0.12690331 0.4264014 0.10231110
## ElasticNet 0.12706741 0.4248886 0.10166890
## Linear-Filtered 0.13045894 0.3943139 0.10413425
## [1] "Training Performance"
rf_trainpredicted <- predict(rf_model, trainX)
gbm_trainpredicted <- predict(gbm_model, trainX)
postResample(pred = rf_predicted, obs = testY)## RMSE Rsquared MAE
## 0.09167835 0.71312650 0.06710012
trainPerf <-
rbind(
lmTune0$results[rownames(lmTune0$bestTune), c("RMSE","Rsquared","MAE")],
lmTune0U$results[rownames(lmTune0U$bestTune), c("RMSE","Rsquared","MAE")],
lmTuneFiltered$results[rownames(lmTuneFiltered$bestTune), c("RMSE","Rsquared","MAE")],
plsTune$results[rownames(plsTune$bestTune), c("RMSE","Rsquared","MAE")],
pcrTune$results[rownames(pcrTune$bestTune), c("RMSE","Rsquared","MAE")],
ridgeTune$results[rownames(ridgeTune$bestTune), c("RMSE","Rsquared","MAE")],
enetTune$results[rownames(enetTune$bestTune), c("RMSE","Rsquared","MAE")],
knnModel$results[rownames(knnModel$bestTune), c("RMSE","Rsquared","MAE")],
svmModel$results[rownames(svmModel$bestTune), c("RMSE","Rsquared","MAE")],
svmModel2$results[rownames(svmModel2$bestTune), c("RMSE","Rsquared","MAE")],
svmModel3$results[rownames(svmModel3$bestTune), c("RMSE","Rsquared","MAE")],
marsModel$results[rownames(marsModel$bestTune), c("RMSE","Rsquared","MAE")],
nnetModel$results[rownames(nnetModel$bestTune), c("RMSE","Rsquared","MAE")],
nnetModel2$results[rownames(nnetModel2$bestTune), c("RMSE","Rsquared","MAE")],
postResample(pred = rf_trainpredicted, obs = testY),
postResample(pred = gbm_trainpredicted, obs = testY)
)
rownames(trainPerf) <- rowNamesPerf
trainPerf[order(trainPerf$RMSE), ]## RMSE Rsquared MAE
## avNNet 0.1142952 0.56795243 0.08690076
## svmRadial 0.1155326 0.56193112 0.08445268
## KNN 0.1203018 0.52572719 0.08891881
## svmPoly 0.1229134 0.50557509 0.08932194
## nnet 0.1246336 0.50101113 0.09545923
## MARS 0.1253622 0.48458870 0.09345948
## ElasticNet 0.1339298 0.40755503 0.10399464
## Ridge Regression 0.1339974 0.40714480 0.10408783
## PLS 0.1348922 0.39910952 0.10393752
## PCR 0.1348934 0.39919046 0.10423705
## Linear 0.1351187 0.39728394 0.10410596
## Linear-adjusted 0.1351187 0.39728394 0.10410596
## Linear-Filtered 0.1367765 0.38248502 0.10641648
## svmLinear 0.1369470 0.38667622 0.10332491
## RandomForest NA 0.06783655 NA
## BoostedTrees NA 0.06489719 NA
studentEval <- read_xlsx("StudentEvaluation.xlsx")
#str(studentEval)
# We need to re-set colY ("PH") from 25 back to 26
colY <- which(colnames(studentData) == "PH")
(naCols <- names(which(sapply(studentEval[,-colY], anyNA))))## [1] "Brand Code" "Carb Volume" "Fill Ounces"
## [4] "PC Volume" "Carb Temp" "PSC"
## [7] "PSC Fill" "PSC CO2" "Carb Pressure1"
## [10] "Fill Pressure" "Hyd Pressure2" "Hyd Pressure3"
## [13] "Hyd Pressure4" "Filler Level" "Filler Speed"
## [16] "Temperature" "Usage cont" "Density"
## [19] "MFR" "Balling" "Pressure Vacuum"
## [22] "Oxygen Filler" "Bowl Setpoint" "Pressure Setpoint"
## [25] "Air Pressurer" "Alch Rel" "Carb Rel"
print(paste(
"There are",
length(naCols),
"columns with missing values out of",
ncol(studentEval[,-colY])
))## [1] "There are 27 columns with missing values out of 32"
print(paste(
"There are",
sum(complete.cases(studentEval[,-colY])),
"complete rows out of total",
nrow(studentEval)
))## [1] "There are 200 complete rows out of total 267"
## tabulation of "Brand Code" in the evaluation dataset
table(studentEval$`Brand Code`,useNA = "ifany")##
## A B C D <NA>
## 35 129 31 64 8
## There are eight "NA" values.
## assign "U" to unknown Brand Codes
studentEval[is.na(studentEval$`Brand Code`),"Brand Code"] <- "U"
## Convert `Brand Code` to factors
studentEval$`Brand Code` <- as.factor(studentEval$`Brand Code`)
## revised tabulation of "Brand Code" in the evaluation dataset
table(studentEval$`Brand Code`,useNA = "ifany")##
## A B C D U
## 35 129 31 64 8
cn <- colnames(studentEval)
icStudentEval <- studentEval
## temporarily adjust the column names by eliminating spaces
colnames(icStudentEval) <- sub(" ", "_", cn)
## Use **MICE**: "Multivariate Imputation by Chained Equations" to impute missing values
icdf <- mice(
icStudentEval,
m = 2,
maxit = 10,
meth = 'pmm',
seed = 500,
print = FALSE
)
icStudentEval <- complete(icdf)
rm(icdf)
## restore the original column names
colnames(icStudentEval) <- cnvarImpSorted <- function(dfVarImp) {
varImpRows <- order(abs(dfVarImp$Overall), decreasing = TRUE)
dfResult <- data.frame(dfVarImp[varImpRows, 1],
row.names = rownames(dfVarImp)[varImpRows])
colnames(dfResult) <- colnames(dfVarImp)
return(dfResult)
}
(VarImpObj <- varImp(nnetModel2))## loess r-squared variable importance
##
## only 20 most important variables shown (out of 36)
##
## Overall
## Mnf Flow 100.000
## Usage cont 70.848
## Bowl Setpoint 53.692
## Filler Level 45.044
## Pressure Setpoint 44.091
## Carb Flow 41.117
## Brand Code_C 38.403
## Hyd Pressure3 26.180
## Pressure Vacuum 21.803
## Fill Pressure 19.391
## Hyd Pressure2 17.554
## Brand Code_D 16.180
## Oxygen Filler 15.779
## Carb Rel 13.282
## Alch Rel 9.771
## Hyd Pressure4 9.265
## Temperature 6.706
## Brand Code_B 6.408
## Fill Ounces 5.343
## Brand Code_A 4.689
dfVarImp <- varImpSorted(VarImpObj$importance)
## Feature Plots
featurePlot(
x = icStudentEval[, rownames(dfVarImp)[1:2]],
y = studentEval$PH,
between = list(x = 1, y = 1),
type = c("g", "p", "smooth")
)featurePlot(
x = icStudentEval[, rownames(dfVarImp)[3:8]],
y = studentEval$PH,
between = list(x = 1, y = 1),
type = c("g", "p", "smooth")
)Given that the downside to using a non-linear model is that we lack interpretability, it is hard to suggest how the predictor variables need to be manipulated in order to get the desired results for the response variable. However as shown above the model allows to identify which variables were mostly influential (important) in determining the optimal prediction for the outcome. Perhaps this list of variables can be an important input to the subject matter experts and help them illuminate which variables to manipulate and how. It would also be helpful to domain experts if the feature plots, shown above would indicate a clear linear relationship between the predictor variables and the target variable. Unfortunately, in this case, the relationships are non-linear and complex and therefore hard to interpret.
Perhaps, as a possible workaround for this dilemma of interpretability, would be to come up with an optimal but most importantly with a more interpretable model, so that if the same top-influential variables are identified their influence can readily be identified in order to know how the outcome can best be changed to achieve desirable results.