# Training workbook. Spaces in the column headers are replaced with dots.
fp <- read_excel(
  path = "StudentData.xlsx",
  .name_repair = function(header) gsub(" ", ".", header)
)
# Evaluation workbook, loaded with the same header repair.
final_eval_data <- read_excel(
  path = "StudentEvaluation.xlsx",
  .name_repair = function(header) gsub(" ", ".", header)
)
There are 33 fields with 2571 observations. The brand code field is the only categorical field; all the others are numeric, so brand code can be converted to a factor. Every column except air pressure and pressure vacuum has missing data. Alch Rel, Carb Rel and Balling Lvl are all highly correlated with each other and with other fields, so those three fields can be removed. According to the Shapiro-Wilk test, none of the six fields that looked approximately normal in the histograms is actually normally distributed (every p-value is far below 0.05). Transformations such as Box-Cox might be needed.
# Per-column summary statistics (including NA counts) for the training data.
summary(fp)
## Brand.Code Carb.Volume Fill.Ounces PC.Volume
## Length:2571 Min. :5.040 Min. :23.63 Min. :0.07933
## Class :character 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23917
## Mode :character Median :5.347 Median :23.97 Median :0.27133
## Mean :5.370 Mean :23.97 Mean :0.27712
## 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31200
## Max. :5.700 Max. :24.32 Max. :0.47800
## NA's :10 NA's :38 NA's :39
## Carb.Pressure Carb.Temp PSC PSC.Fill
## Min. :57.00 Min. :128.6 Min. :0.00200 Min. :0.0000
## 1st Qu.:65.60 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.1000
## Median :68.20 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.19 Mean :141.1 Mean :0.08457 Mean :0.1954
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :79.40 Max. :154.0 Max. :0.27000 Max. :0.6200
## NA's :27 NA's :26 NA's :33 NA's :23
## PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure
## Min. :0.00000 Min. :-100.20 Min. :105.6 Min. :34.60
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.00
## Median :0.04000 Median : 65.20 Median :123.2 Median :46.40
## Mean :0.05641 Mean : 24.57 Mean :122.6 Mean :47.92
## 3rd Qu.:0.08000 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00
## Max. :0.24000 Max. : 229.40 Max. :140.2 Max. :60.40
## NA's :39 NA's :2 NA's :32 NA's :22
## Hyd.Pressure1 Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4
## Min. :-0.80 Min. : 0.00 Min. :-1.20 Min. : 52.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00
## Median :11.40 Median :28.60 Median :27.60 Median : 96.00
## Mean :12.44 Mean :20.96 Mean :20.46 Mean : 96.29
## 3rd Qu.:20.20 3rd Qu.:34.60 3rd Qu.:33.40 3rd Qu.:102.00
## Max. :58.00 Max. :59.40 Max. :50.00 Max. :142.00
## NA's :11 NA's :15 NA's :15 NA's :30
## Filler.Level Filler.Speed Temperature Usage.cont Carb.Flow
## Min. : 55.8 Min. : 998 Min. :63.60 Min. :12.08 Min. : 26
## 1st Qu.: 98.3 1st Qu.:3888 1st Qu.:65.20 1st Qu.:18.36 1st Qu.:1144
## Median :118.4 Median :3982 Median :65.60 Median :21.79 Median :3028
## Mean :109.3 Mean :3687 Mean :65.97 Mean :20.99 Mean :2468
## 3rd Qu.:120.0 3rd Qu.:3998 3rd Qu.:66.40 3rd Qu.:23.75 3rd Qu.:3186
## Max. :161.2 Max. :4030 Max. :76.20 Max. :25.90 Max. :5104
## NA's :20 NA's :57 NA's :14 NA's :5 NA's :2
## Density MFR Balling Pressure.Vacuum
## Min. :0.240 Min. : 31.4 Min. :-0.170 Min. :-6.600
## 1st Qu.:0.900 1st Qu.:706.3 1st Qu.: 1.496 1st Qu.:-5.600
## Median :0.980 Median :724.0 Median : 1.648 Median :-5.400
## Mean :1.174 Mean :704.0 Mean : 2.198 Mean :-5.216
## 3rd Qu.:1.620 3rd Qu.:731.0 3rd Qu.: 3.292 3rd Qu.:-5.000
## Max. :1.920 Max. :868.6 Max. : 4.012 Max. :-3.600
## NA's :1 NA's :212 NA's :1
## PH Oxygen.Filler Bowl.Setpoint Pressure.Setpoint
## Min. :7.880 Min. :0.00240 Min. : 70.0 Min. :44.00
## 1st Qu.:8.440 1st Qu.:0.02200 1st Qu.:100.0 1st Qu.:46.00
## Median :8.540 Median :0.03340 Median :120.0 Median :46.00
## Mean :8.546 Mean :0.04684 Mean :109.3 Mean :47.62
## 3rd Qu.:8.680 3rd Qu.:0.06000 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :9.360 Max. :0.40000 Max. :140.0 Max. :52.00
## NA's :4 NA's :12 NA's :2 NA's :12
## Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
## Min. :140.8 Min. :5.280 Min. :4.960 Min. :0.00
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.340 1st Qu.:1.38
## Median :142.6 Median :6.560 Median :5.400 Median :1.48
## Mean :142.8 Mean :6.897 Mean :5.437 Mean :2.05
## 3rd Qu.:143.0 3rd Qu.:7.240 3rd Qu.:5.540 3rd Qu.:3.14
## Max. :148.2 Max. :8.620 Max. :6.060 Max. :3.66
## NA's :9 NA's :10 NA's :1
# Per-column summary statistics (including NA counts) for the evaluation data.
summary(final_eval_data)
## Brand.Code Carb.Volume Fill.Ounces PC.Volume
## Length:267 Min. :5.147 Min. :23.75 Min. :0.09867
## Class :character 1st Qu.:5.287 1st Qu.:23.92 1st Qu.:0.23333
## Mode :character Median :5.340 Median :23.97 Median :0.27533
## Mean :5.369 Mean :23.97 Mean :0.27769
## 3rd Qu.:5.465 3rd Qu.:24.01 3rd Qu.:0.32200
## Max. :5.667 Max. :24.20 Max. :0.46400
## NA's :1 NA's :6 NA's :4
## Carb.Pressure Carb.Temp PSC PSC.Fill
## Min. :60.20 Min. :130.0 Min. :0.00400 Min. :0.0200
## 1st Qu.:65.30 1st Qu.:138.4 1st Qu.:0.04450 1st Qu.:0.1000
## Median :68.00 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.25 Mean :141.2 Mean :0.08545 Mean :0.1903
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :77.60 Max. :154.0 Max. :0.24600 Max. :0.6200
## NA's :1 NA's :5 NA's :3
## PSC.CO2 Mnf.Flow Carb.Pressure1 Fill.Pressure
## Min. :0.00000 Min. :-100.20 Min. :113.0 Min. :37.80
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:120.2 1st Qu.:46.00
## Median :0.04000 Median : 0.20 Median :123.4 Median :47.80
## Mean :0.05107 Mean : 21.03 Mean :123.0 Mean :48.14
## 3rd Qu.:0.06000 3rd Qu.: 141.30 3rd Qu.:125.5 3rd Qu.:50.20
## Max. :0.24000 Max. : 220.40 Max. :136.0 Max. :60.20
## NA's :5 NA's :4 NA's :2
## Hyd.Pressure1 Hyd.Pressure2 Hyd.Pressure3 Hyd.Pressure4
## Min. :-50.00 Min. :-50.00 Min. :-50.00 Min. : 68.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 90.00
## Median : 10.40 Median : 26.80 Median : 27.70 Median : 98.00
## Mean : 12.01 Mean : 20.11 Mean : 19.61 Mean : 97.84
## 3rd Qu.: 20.40 3rd Qu.: 34.80 3rd Qu.: 33.00 3rd Qu.:104.00
## Max. : 50.00 Max. : 61.40 Max. : 49.20 Max. :140.00
## NA's :1 NA's :1 NA's :4
## Filler.Level Filler.Speed Temperature Usage.cont Carb.Flow
## Min. : 69.2 Min. :1006 Min. :63.80 Min. :12.90 Min. : 0
## 1st Qu.:100.6 1st Qu.:3812 1st Qu.:65.40 1st Qu.:18.12 1st Qu.:1083
## Median :118.6 Median :3978 Median :65.80 Median :21.44 Median :3038
## Mean :110.3 Mean :3581 Mean :66.23 Mean :20.90 Mean :2409
## 3rd Qu.:120.2 3rd Qu.:3996 3rd Qu.:66.60 3rd Qu.:23.74 3rd Qu.:3215
## Max. :153.2 Max. :4020 Max. :75.40 Max. :24.60 Max. :3858
## NA's :2 NA's :10 NA's :2 NA's :2
## Density MFR Balling Pressure.Vacuum
## Min. :0.060 Min. : 15.6 Min. :0.902 Min. :-6.400
## 1st Qu.:0.920 1st Qu.:707.0 1st Qu.:1.498 1st Qu.:-5.600
## Median :0.980 Median :724.6 Median :1.648 Median :-5.200
## Mean :1.177 Mean :697.8 Mean :2.203 Mean :-5.174
## 3rd Qu.:1.600 3rd Qu.:731.5 3rd Qu.:3.242 3rd Qu.:-4.800
## Max. :1.840 Max. :784.8 Max. :3.788 Max. :-3.600
## NA's :1 NA's :31 NA's :1 NA's :1
## PH Oxygen.Filler Bowl.Setpoint Pressure.Setpoint
## Mode:logical Min. :0.00240 Min. : 70.0 Min. :44.00
## NA's:267 1st Qu.:0.01960 1st Qu.:100.0 1st Qu.:46.00
## Median :0.03370 Median :120.0 Median :46.00
## Mean :0.04666 Mean :109.6 Mean :47.73
## 3rd Qu.:0.05440 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :0.39800 Max. :130.0 Max. :52.00
## NA's :3 NA's :1 NA's :2
## Air.Pressurer Alch.Rel Carb.Rel Balling.Lvl
## Min. :141.2 Min. :6.400 Min. :5.18 Min. :0.000
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.34 1st Qu.:1.380
## Median :142.6 Median :6.580 Median :5.40 Median :1.480
## Mean :142.8 Mean :6.907 Mean :5.44 Mean :2.051
## 3rd Qu.:142.8 3rd Qu.:7.180 3rd Qu.:5.56 3rd Qu.:3.080
## Max. :147.2 Max. :7.820 Max. :5.74 Max. :3.420
## NA's :1 NA's :3 NA's :2
# One histogram per numeric column: the numeric columns are stacked into
# key/value form so that facet_wrap() can draw a panel for each variable.
ggplot(
  data = gather(keep(fp, is.numeric)),
  mapping = aes(value)
) +
  geom_histogram(bins = 15) +
  facet_wrap(~key, scales = "free") +
  ggtitle("Histograms of Manufacturing Processes")

# Distinct values of the brand code column (NA included).
unique(fp[["Brand.Code"]])
## [1] "B" "A" "C" "D" NA
# Count the missing values in every column, keep only the columns that have
# at least one, and show them in a scrollable table, largest count first.
fp %>%
  summarise_all(~ sum(is.na(.))) %>%
  gather(key = variable, value = value) %>%
  filter(value != 0) %>%
  arrange(desc(value)) %>%
  kable() %>%
  kable_styling() %>%
  scroll_box(width = "100%", height = "300px")
| variable | value |
|---|---|
| MFR | 212 |
| Brand.Code | 120 |
| Filler.Speed | 57 |
| PC.Volume | 39 |
| PSC.CO2 | 39 |
| Fill.Ounces | 38 |
| PSC | 33 |
| Carb.Pressure1 | 32 |
| Hyd.Pressure4 | 30 |
| Carb.Pressure | 27 |
| Carb.Temp | 26 |
| PSC.Fill | 23 |
| Fill.Pressure | 22 |
| Filler.Level | 20 |
| Hyd.Pressure2 | 15 |
| Hyd.Pressure3 | 15 |
| Temperature | 14 |
| Oxygen.Filler | 12 |
| Pressure.Setpoint | 12 |
| Hyd.Pressure1 | 11 |
| Carb.Volume | 10 |
| Carb.Rel | 10 |
| Alch.Rel | 9 |
| Usage.cont | 5 |
| PH | 4 |
| Mnf.Flow | 2 |
| Carb.Flow | 2 |
| Bowl.Setpoint | 2 |
| Density | 1 |
| Balling | 1 |
| Balling.Lvl | 1 |
# Correlation plot over complete rows only. Brand.Code and the two setpoint
# columns are left out of the correlation matrix.
fp %>%
  select(-Brand.Code, -Bowl.Setpoint, -Pressure.Setpoint) %>%
  na.omit() %>%
  cor() %>%
  corrplot()

# Column positions in `fp` that caret flags as near-zero variance.
nearZeroVar(fp)
## [1] 13
# Shapiro-Wilk normality test on the response, PH.
shapiro.test(fp$PH)
##
## Shapiro-Wilk normality test
##
## data: fp$PH
## W = 0.98885, p-value = 2.823e-13
# Shapiro-Wilk normality test on Carb.Pressure.
shapiro.test(fp$Carb.Pressure)
##
## Shapiro-Wilk normality test
##
## data: fp$Carb.Pressure
## W = 0.99681, p-value = 3.582e-05
# Shapiro-Wilk normality test on Fill.Ounces.
shapiro.test(fp$Fill.Ounces)
##
## Shapiro-Wilk normality test
##
## data: fp$Fill.Ounces
## W = 0.99317, p-value = 1.622e-09
# Shapiro-Wilk normality test on Carb.Temp.
shapiro.test(fp$Carb.Temp)
##
## Shapiro-Wilk normality test
##
## data: fp$Carb.Temp
## W = 0.99469, p-value = 6.316e-08
# Shapiro-Wilk normality test on Pressure.Vacuum.
shapiro.test(fp$Pressure.Vacuum)
##
## Shapiro-Wilk normality test
##
## data: fp$Pressure.Vacuum
## W = 0.96387, p-value < 2.2e-16
# Shapiro-Wilk normality test on PC.Volume.
shapiro.test(fp$PC.Volume)
##
## Shapiro-Wilk normality test
##
## data: fp$PC.Volume
## W = 0.98309, p-value < 2.2e-16
All the missing values (NAs) will have to be imputed. Predictive mean matching imputation will probably be the best method to use. Also, brand code will need to be converted to a factor. The fields Alch Rel, Carb Rel and Balling Lvl all need to be removed because they are highly correlated. The Hyd Pressure1 field will also have to be removed because it has near-zero variance. The data will also be centered and scaled during model pre-processing.
# Drop the three predictors flagged as highly correlated from both data sets.
fp <- fp %>% select(-c('Alch.Rel','Carb.Rel','Balling.Lvl'))
final_eval_data <- final_eval_data %>% select(-c('Alch.Rel','Carb.Rel','Balling.Lvl'))

# Near-zero-variance columns are identified ONCE, on the training data, and
# then removed by name from both data sets. Calling nearZeroVar(fp) a second
# time after `fp` has already been filtered returns positions for the reduced
# frame, so the evaluation data would not lose the same column.
nzv_cols <- nearZeroVar(fp, names = TRUE)
fp <- fp[, setdiff(names(fp), nzv_cols), drop = FALSE]
final_eval_data <- final_eval_data[, setdiff(names(final_eval_data), nzv_cols), drop = FALSE]

# Brand.Code is the only categorical column. The evaluation data reuses the
# training levels so both frames encode the factor identically; a brand code
# that never occurs in the training data becomes NA.
fp$Brand.Code <- as.factor(fp$Brand.Code)
final_eval_data$Brand.Code <- factor(
  final_eval_data$Brand.Code,
  levels = levels(fp$Brand.Code)
)
# Bar chart of how many rows fall under each brand code.
ggplot(data = fp) +
  geom_bar(mapping = aes(x = Brand.Code)) +
  ggtitle("Frequency of Brand Codes")
# Single predictive-mean-matching imputation of each data set. The seed is
# set once, so the two mice() calls must stay in this order to reproduce.
set.seed(41549)
fp_imp <- mice::complete(
  mice(fp, m = 1, method = "pmm", printFlag = FALSE)
)
final_eval_data_imp <- mice::complete(
  mice(final_eval_data, m = 1, method = "pmm", printFlag = FALSE)
)

# Confirm that no NA counts remain in the imputed training data.
summary(fp_imp)
## Brand.Code Carb.Volume Fill.Ounces PC.Volume Carb.Pressure
## A: 319 Min. :5.040 Min. :23.63 Min. :0.07933 Min. :57.00
## B:1288 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23867 1st Qu.:65.60
## C: 326 Median :5.347 Median :23.97 Median :0.27133 Median :68.20
## D: 638 Mean :5.370 Mean :23.97 Mean :0.27738 Mean :68.22
## 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31233 3rd Qu.:70.60
## Max. :5.700 Max. :24.32 Max. :0.47800 Max. :79.40
## Carb.Temp PSC PSC.Fill PSC.CO2
## Min. :128.6 Min. :0.00200 Min. :0.000 Min. :0.00000
## 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.100 1st Qu.:0.02000
## Median :140.8 Median :0.07800 Median :0.180 Median :0.04000
## Mean :141.1 Mean :0.08481 Mean :0.196 Mean :0.05642
## 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.260 3rd Qu.:0.08000
## Max. :154.0 Max. :0.27000 Max. :0.620 Max. :0.24000
## Mnf.Flow Carb.Pressure1 Fill.Pressure Hyd.Pressure2
## Min. :-100.20 Min. :105.6 Min. :34.60 Min. : 0.00
## 1st Qu.:-100.00 1st Qu.:118.8 1st Qu.:46.00 1st Qu.: 0.00
## Median : 65.20 Median :123.2 Median :46.40 Median :28.60
## Mean : 24.56 Mean :122.5 Mean :47.92 Mean :20.99
## 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00 3rd Qu.:34.70
## Max. : 229.40 Max. :140.2 Max. :60.40 Max. :59.40
## Hyd.Pressure3 Hyd.Pressure4 Filler.Level Filler.Speed
## Min. :-1.20 Min. : 52.00 Min. : 55.8 Min. : 998
## 1st Qu.: 0.00 1st Qu.: 86.00 1st Qu.: 97.7 1st Qu.:3815
## Median :27.60 Median : 96.00 Median :118.4 Median :3980
## Mean :20.46 Mean : 96.54 Mean :109.2 Mean :3635
## 3rd Qu.:33.30 3rd Qu.:102.00 3rd Qu.:120.0 3rd Qu.:3996
## Max. :50.00 Max. :142.00 Max. :161.2 Max. :4030
## Temperature Usage.cont Carb.Flow Density MFR
## Min. :63.60 Min. :12.08 Min. : 26 Min. :0.240 Min. : 31.4
## 1st Qu.:65.20 1st Qu.:18.36 1st Qu.:1151 1st Qu.:0.900 1st Qu.:695.0
## Median :65.60 Median :21.78 Median :3028 Median :0.980 Median :721.4
## Mean :65.97 Mean :20.99 Mean :2470 Mean :1.174 Mean :672.3
## 3rd Qu.:66.40 3rd Qu.:23.75 3rd Qu.:3188 3rd Qu.:1.620 3rd Qu.:730.4
## Max. :76.20 Max. :25.90 Max. :5104 Max. :1.920 Max. :868.6
## Balling Pressure.Vacuum PH Oxygen.Filler
## Min. :-0.170 Min. :-6.600 Min. :7.880 Min. :0.00240
## 1st Qu.: 1.496 1st Qu.:-5.600 1st Qu.:8.440 1st Qu.:0.02200
## Median : 1.648 Median :-5.400 Median :8.540 Median :0.03340
## Mean : 2.197 Mean :-5.216 Mean :8.545 Mean :0.04708
## 3rd Qu.: 3.292 3rd Qu.:-5.000 3rd Qu.:8.680 3rd Qu.:0.06000
## Max. : 4.012 Max. :-3.600 Max. :9.360 Max. :0.40000
## Bowl.Setpoint Pressure.Setpoint Air.Pressurer
## Min. : 70.0 Min. :44.00 Min. :140.8
## 1st Qu.:100.0 1st Qu.:46.00 1st Qu.:142.2
## Median :120.0 Median :46.00 Median :142.6
## Mean :109.3 Mean :47.61 Mean :142.8
## 3rd Qu.:120.0 3rd Qu.:50.00 3rd Qu.:143.0
## Max. :140.0 Max. :52.00 Max. :148.2
Each model type will be fitted with three different pre-processing set-ups: without any pre-processing, with centering and scaling, and with centering and scaling followed by principal component analysis.
# Start one worker per detected core and register the cluster as the
# parallel backend.
cl <- makeCluster(spec = detectCores())
registerDoParallel(cl)
# Separate the response (PH) from the predictors.
only_ph <- select(fp_imp, PH)
fp_imp <- select(fp_imp, -PH)

# 70/30 partition of the rows, based on PH; the seed makes the split
# reproducible.
set.seed(5482)
training <- createDataPartition(y = only_ph$PH, p = 0.7, list = FALSE)

# Rows selected by the partition are used for fitting ...
X_training <- fp_imp[training, ]
y_training <- only_ph$PH[training]
# ... and the remaining rows are held out for testing.
X_testing <- fp_imp[-training, ]
y_testing <- only_ph$PH[-training]
# Seeds handed to trainControl(): ten vectors of 54 integers drawn from
# 1..1000, followed by a single integer for the last model.
seeds <- c(
  lapply(seq_len(10), function(i) sample.int(n = 1000, 54)),
  # for the last model
  list(sample.int(1000, 1))
)
# Shared resampling set-up for every model below: 10-fold cross-validation
# (caret's default fold count for "cv", written out here) with fixed seeds.
myControl <- trainControl(method = "cv", number = 10, seeds = seeds)
# GLM, no pre-processing: fit on the training rows, score on the held-out rows.
glm_model_1 <- train(
  x = X_training,
  y = y_training,
  method = "glm",
  trControl = myControl
)
glm_pred_1 <- predict(glm_model_1, newdata = X_testing)
postResample(pred = glm_pred_1, obs = y_testing)
## RMSE Rsquared MAE
## 0.1333059 0.4058569 0.1033084
# GLM with centered and scaled predictors.
glm_model_2 <- train(
  x = X_training,
  y = y_training,
  method = "glm",
  preProcess = c("center", "scale"),
  trControl = myControl
)
glm_pred_2 <- predict(glm_model_2, newdata = X_testing)
postResample(pred = glm_pred_2, obs = y_testing)
## RMSE Rsquared MAE
## 0.1333059 0.4058569 0.1033084
# GLM with centering, scaling and PCA.
glm_model_3 <- train(
  x = X_training,
  y = y_training,
  method = "glm",
  preProcess = c("center", "scale", "pca"),
  trControl = myControl
)
glm_pred_3 <- predict(glm_model_3, newdata = X_testing)
postResample(pred = glm_pred_3, obs = y_testing)
## RMSE Rsquared MAE
## 0.1394719 0.3496153 0.1107603
# Inputs for the SVM fits: the predictors without the Brand.Code factor.
svm_training <- select(X_training, -Brand.Code)
svm_testing <- select(X_testing, -Brand.Code)
# Radial-kernel SVM, no caret pre-processing; 14 candidate tuning values.
svm_model_1 <- train(
  x = svm_training,
  y = y_training,
  method = "svmRadial",
  trControl = myControl,
  tuneLength = 14
)
svm_pred_1 <- predict(svm_model_1, newdata = svm_testing)
svm_pr_1 <- postResample(pred = svm_pred_1, obs = y_testing)
svm_pr_1
## RMSE Rsquared MAE
## 0.12934548 0.44594874 0.09337816
# Radial-kernel SVM with centered and scaled predictors.
svm_model_2 <- train(
  x = svm_training,
  y = y_training,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  trControl = myControl,
  tuneLength = 14
)
svm_pred_2 <- predict(svm_model_2, newdata = svm_testing)
svm_pr_2 <- postResample(pred = svm_pred_2, obs = y_testing)
svm_pr_2
## RMSE Rsquared MAE
## 0.12934548 0.44594874 0.09337816
# Radial-kernel SVM with centering, scaling and PCA.
svm_model_3 <- train(
  x = svm_training,
  y = y_training,
  method = "svmRadial",
  preProcess = c("center", "scale", "pca"),
  trControl = myControl,
  tuneLength = 14
)
svm_pred_3 <- predict(svm_model_3, newdata = svm_testing)
svm_pr_3 <- postResample(pred = svm_pred_3, obs = y_testing)
svm_pr_3
## RMSE Rsquared MAE
## 0.1385561 0.3702451 0.1001749
# Predictors whose pairwise correlation exceeds 0.75 are removed before the
# neural-network fits. `hc` holds column POSITIONS within `svm_training`,
# which has no Brand.Code column; X_training/X_testing still carry Brand.Code,
# so the positions do not line up there. They are therefore translated to
# column names and the columns are removed by name.
hc <- findCorrelation(cor(svm_training), cutoff = .75)
hc_names <- colnames(svm_training)[hc]
nn_training <- X_training[, setdiff(names(X_training), hc_names), drop = FALSE]
nn_testing <- X_testing[, setdiff(names(X_testing), hc_names), drop = FALSE]
# Tuning grid for nnet: three weight-decay values crossed with 1-10 hidden units.
nn_grid <- expand.grid(
  .decay = c(0, 0.01, .1),
  .size = 1:10
)

# Neural network, no pre-processing. linout = TRUE requests a linear output
# unit; MaxNWts is sized from the largest grid entry (10 hidden units).
nn_model_1 <- train(
  x = nn_training,
  y = y_training,
  method = "nnet",
  trControl = myControl,
  tuneGrid = nn_grid,
  linout = TRUE,
  trace = FALSE,
  maxit = 500,
  MaxNWts = 10 * (ncol(nn_training) + 1) + 10 + 1
)
nn_pred_1 <- predict(nn_model_1, newdata = nn_testing)
net_pr_1 <- postResample(pred = nn_pred_1, obs = y_testing)
net_pr_1
## RMSE Rsquared MAE
## 0.1301211 0.4403960 0.1005378
# Neural network with centered and scaled predictors.
nn_model_2 <- train(
  x = nn_training,
  y = y_training,
  method = "nnet",
  preProcess = c("center", "scale"),
  trControl = myControl,
  tuneGrid = nn_grid,
  linout = TRUE,
  trace = FALSE,
  maxit = 500,
  MaxNWts = 10 * (ncol(nn_training) + 1) + 10 + 1
)
nn_pred_2 <- predict(nn_model_2, newdata = nn_testing)
net_pr_2 <- postResample(pred = nn_pred_2, obs = y_testing)
net_pr_2
## RMSE Rsquared MAE
## 0.1281222 0.4561019 0.1002195
# Neural network with centering, scaling AND PCA. This is the third
# pre-processing variant, matching every other *_3 model; previously the
# "pca" step was missing, which made this fit a duplicate of nn_model_2.
# NOTE(review): MaxNWts counts ncol(nn_training) inputs; if a factor column is
# expanded into several dummy inputs the bound may be too small - verify.
nn_model_3 <- train(nn_training, y_training,
                    method = "nnet",
                    tuneGrid = nn_grid,
                    trControl = myControl,
                    preProc = c("center", "scale", "pca"),
                    linout = TRUE,
                    trace = FALSE,
                    MaxNWts = 10 * (ncol(nn_training) + 1) + 10 + 1,
                    maxit = 500)
nn_pred_3 <- predict(nn_model_3, nn_testing)
net_pr_3 <- postResample(nn_pred_3, y_testing)
net_pr_3
## RMSE Rsquared MAE
## 0.12363483 0.49738114 0.09481436
# MARS tuning grid: additive and two-way interaction models, 2-38 retained terms.
mars_grid <- expand.grid(
  .degree = 1:2,
  .nprune = 2:38
)

# MARS (earth), no pre-processing.
mars_model_1 <- train(
  x = X_training,
  y = y_training,
  method = "earth",
  trControl = myControl,
  tuneGrid = mars_grid
)
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
# Score the un-pre-processed MARS fit on the held-out rows.
mars_pred_1 <- predict(mars_model_1, newdata = X_testing)
mars_pr_1 <- postResample(pred = mars_pred_1, obs = y_testing)
mars_pr_1
## RMSE Rsquared MAE
## 0.1220192 0.5040312 0.0932989
# MARS with centered and scaled predictors.
mars_model_2 <- train(
  x = X_training,
  y = y_training,
  method = "earth",
  preProcess = c("center", "scale"),
  trControl = myControl,
  tuneGrid = mars_grid
)
mars_pred_2 <- predict(mars_model_2, newdata = X_testing)
mars_pr_2 <- postResample(pred = mars_pred_2, obs = y_testing)
mars_pr_2
## RMSE Rsquared MAE
## 0.12500189 0.47949814 0.09557829
# MARS with centering, scaling and PCA.
mars_model_3 <- train(
  x = X_training,
  y = y_training,
  method = "earth",
  preProcess = c("center", "scale", "pca"),
  trControl = myControl,
  tuneGrid = mars_grid
)
mars_pred_3 <- predict(mars_model_3, newdata = X_testing)
mars_pr_3 <- postResample(pred = mars_pred_3, obs = y_testing)
mars_pr_3
## RMSE Rsquared MAE
## 0.1415068 0.3376479 0.1096952
# Regression-tree grid: maximum depth from 1 to 10.
rp_grid <- expand.grid(maxdepth = seq(from = 1, to = 10, by = 1))

# Single regression tree (rpart2), no pre-processing; the depth is chosen by
# cross-validated R-squared.
rp_1 <- train(
  x = X_training,
  y = y_training,
  method = "rpart2",
  metric = "Rsquared",
  trControl = myControl,
  tuneGrid = rp_grid
)
rp_pred_1 <- predict(rp_1, newdata = X_testing)
postResample(pred = rp_pred_1, obs = y_testing)
## RMSE Rsquared MAE
## 0.12941476 0.44262146 0.09892606
# Regression tree with centered and scaled predictors.
rp_2 <- train(
  x = X_training,
  y = y_training,
  method = "rpart2",
  metric = "Rsquared",
  preProcess = c("center", "scale"),
  trControl = myControl,
  tuneGrid = rp_grid
)
rp_pred_2 <- predict(rp_2, newdata = X_testing)
postResample(pred = rp_pred_2, obs = y_testing)
## RMSE Rsquared MAE
## 0.12941476 0.44262146 0.09892606
# Regression tree with centering, scaling and PCA.
rp_3 <- train(
  x = X_training,
  y = y_training,
  method = "rpart2",
  metric = "Rsquared",
  preProcess = c("center", "scale", "pca"),
  trControl = myControl,
  tuneGrid = rp_grid
)
rp_pred_3 <- predict(rp_3, newdata = X_testing)
postResample(pred = rp_pred_3, obs = y_testing)
## RMSE Rsquared MAE
## 0.1514582 0.2475779 0.1180886
# Random forest, no pre-processing. importance = TRUE keeps the variable
# importance measures; mtry is chosen by cross-validated R-squared.
rf_1 <- train(
  x = X_training,
  y = y_training,
  method = "rf",
  metric = "Rsquared",
  trControl = myControl,
  importance = TRUE
)
rf_1
## Random Forest
##
## 1802 samples
## 28 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1622, 1621, 1622, 1620, 1623, 1622, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 0.1162643 0.5911493 0.08820726
## 15 0.1055360 0.6369331 0.07708953
## 28 0.1049778 0.6339149 0.07605223
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 15.
# Held-out performance of the un-pre-processed random forest.
rf_pred_1 <- predict(
  rf_1,
  newdata = X_testing
)
rf_pr_1 <- postResample(
  pred = rf_pred_1,
  obs = y_testing
)
rf_pr_1
## RMSE Rsquared MAE
## 0.10275368 0.65585741 0.07511312
# Random forest with centered and scaled predictors.
rf_2 <- train(
  x = X_training,
  y = y_training,
  method = "rf",
  metric = "Rsquared",
  preProcess = c("center", "scale"),
  trControl = myControl,
  importance = TRUE
)
rf_2
## Random Forest
##
## 1802 samples
## 28 predictor
##
## Pre-processing: centered (27), scaled (27), ignore (1)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1623, 1622, 1623, 1621, 1622, 1621, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 0.1162981 0.5899948 0.08843142
## 15 0.1054704 0.6378505 0.07666289
## 28 0.1053502 0.6311450 0.07586421
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 15.
# Held-out performance of the centered and scaled random forest.
rf_pred_2 <- predict(
  rf_2,
  newdata = X_testing
)
rf_pr_2 <- postResample(
  pred = rf_pred_2,
  obs = y_testing
)
rf_pr_2
## RMSE Rsquared MAE
## 0.10261326 0.65836287 0.07484114
# Random forest with centering, scaling and PCA.
rf_3 <- train(
  x = X_training,
  y = y_training,
  method = "rf",
  metric = "Rsquared",
  preProcess = c("center", "scale", "pca"),
  trControl = myControl,
  importance = TRUE
)
rf_3
## Random Forest
##
## 1802 samples
## 28 predictor
##
## Pre-processing: centered (27), scaled (27), principal component
## signal extraction (27), ignore (1)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1621, 1621, 1621, 1622, 1621, 1622, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 0.1328086 0.4571750 0.10356189
## 15 0.1256317 0.4767968 0.09585190
## 28 0.1259945 0.4716839 0.09604997
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 15.
# Held-out performance of the PCA random forest.
rf_pred_3 <- predict(
  rf_3,
  newdata = X_testing
)
rf_pr_3 <- postResample(
  pred = rf_pred_3,
  obs = y_testing
)
rf_pr_3
## RMSE Rsquared MAE
## 0.1285497 0.4494957 0.1001038
# Variable importance for the centered and scaled random forest (rf_2).
varImp(rf_2)
## rf variable importance
##
## only 20 most important variables shown (out of 28)
##
## Overall
## Brand.Code 100.00
## Mnf.Flow 98.15
## Usage.cont 73.91
## Air.Pressurer 72.75
## Pressure.Vacuum 72.68
## Oxygen.Filler 70.37
## Temperature 59.22
## Carb.Flow 57.73
## Filler.Speed 49.64
## Density 49.07
## Balling 47.42
## Carb.Pressure1 46.89
## Hyd.Pressure3 44.59
## Bowl.Setpoint 42.18
## Filler.Level 38.87
## Hyd.Pressure2 35.06
## Carb.Volume 34.34
## Fill.Pressure 31.91
## MFR 30.62
## Hyd.Pressure4 29.79
# Boosted-tree grid: depth 1-6, four ensemble sizes, four learning rates,
# and a fixed minimum node size of 5.
gbm_grid <- expand.grid(
  interaction.depth = seq(from = 1, to = 6, by = 1),
  n.trees = c(25, 50, 100, 200),
  shrinkage = c(0.01, 0.05, 0.1, 0.2),
  n.minobsinnode = 5
)

# Gradient boosting, no pre-processing; tuned on cross-validated R-squared.
gbm_model_1 <- train(
  x = X_training,
  y = y_training,
  method = "gbm",
  metric = "Rsquared",
  trControl = myControl,
  tuneGrid = gbm_grid,
  verbose = FALSE
)
gbm_pred_1 <- predict(gbm_model_1, newdata = X_testing)
postResample(pred = gbm_pred_1, obs = y_testing)
## RMSE Rsquared MAE
## 0.11247955 0.57788115 0.08458466
# Gradient boosting with centered and scaled predictors.
gbm_model_2 <- train(
  x = X_training,
  y = y_training,
  method = "gbm",
  metric = "Rsquared",
  preProcess = c("center", "scale"),
  trControl = myControl,
  tuneGrid = gbm_grid,
  verbose = FALSE
)
gbm_pred_2 <- predict(gbm_model_2, newdata = X_testing)
postResample(pred = gbm_pred_2, obs = y_testing)
## RMSE Rsquared MAE
## 0.11250502 0.57723568 0.08500179
# Gradient boosting with centering, scaling and PCA.
gbm_model_3 <- train(
  x = X_training,
  y = y_training,
  method = "gbm",
  metric = "Rsquared",
  preProcess = c("center", "scale", "pca"),
  trControl = myControl,
  tuneGrid = gbm_grid,
  verbose = FALSE
)
gbm_pred_3 <- predict(gbm_model_3, newdata = X_testing)
postResample(pred = gbm_pred_3, obs = y_testing)
## RMSE Rsquared MAE
## 0.1299454 0.4359685 0.1016842
# Cubist grid: six committee counts crossed with five neighbor settings.
cubist_grid <- expand.grid(
  committees = c(1, 5, 10, 20, 50, 100),
  neighbors = c(0, 1, 3, 5, 7)
)

# Cubist, no pre-processing; tuned on cross-validated R-squared.
cubist_model_1 <- train(
  x = X_training,
  y = y_training,
  method = "cubist",
  metric = "Rsquared",
  trControl = myControl,
  tuneGrid = cubist_grid,
  verbose = FALSE
)
cubist_model_1
## Cubist
##
## 1802 samples
## 28 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1622, 1621, 1621, 1623, 1622, 1623, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 0.1273663 0.4908706 0.08826589
## 1 1 0.1363477 0.4774167 0.09259667
## 1 3 0.1240827 0.5264088 0.08436014
## 1 5 0.1223021 0.5326267 0.08366981
## 1 7 0.1223406 0.5302389 0.08373808
## 5 0 0.1116847 0.5842280 0.08134019
## 5 1 0.1209963 0.5459323 0.08454001
## 5 3 0.1091482 0.6052547 0.07780360
## 5 5 0.1071771 0.6161355 0.07748832
## 5 7 0.1072125 0.6161048 0.07755008
## 10 0 0.1099088 0.5998913 0.08066633
## 10 1 0.1191491 0.5556846 0.08312558
## 10 3 0.1066494 0.6211240 0.07660612
## 10 5 0.1047047 0.6324162 0.07638614
## 10 7 0.1046495 0.6335070 0.07638693
## 20 0 0.1094656 0.6039771 0.08056172
## 20 1 0.1188834 0.5560592 0.08241650
## 20 3 0.1061055 0.6240364 0.07591867
## 20 5 0.1043015 0.6345679 0.07579066
## 20 7 0.1041732 0.6361819 0.07590810
## 50 0 0.1086538 0.6108751 0.07973632
## 50 1 0.1182001 0.5585016 0.08135855
## 50 3 0.1053006 0.6285589 0.07509608
## 50 5 0.1035110 0.6397797 0.07483159
## 50 7 0.1033684 0.6418257 0.07502496
## 100 0 0.1078543 0.6170464 0.07918160
## 100 1 0.1172716 0.5639263 0.08094949
## 100 3 0.1044941 0.6340846 0.07449825
## 100 5 0.1027109 0.6454470 0.07416579
## 100 7 0.1025872 0.6474842 0.07438974
##
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were committees = 100 and neighbors = 7.
# Held-out performance of the un-pre-processed cubist model.
cube_pred_1 <- predict(
  cubist_model_1,
  newdata = X_testing
)
postResample(
  pred = cube_pred_1,
  obs = y_testing
)
## RMSE Rsquared MAE
## 0.09695122 0.68719791 0.07138919
# Cubist with centered and scaled predictors.
cubist_model_2 <- train(
  x = X_training,
  y = y_training,
  method = "cubist",
  metric = "Rsquared",
  preProcess = c("center", "scale"),
  trControl = myControl,
  tuneGrid = cubist_grid,
  verbose = FALSE
)
cubist_model_2
## Cubist
##
## 1802 samples
## 28 predictor
##
## Pre-processing: centered (27), scaled (27), ignore (1)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1623, 1621, 1621, 1622, 1622, 1622, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 0.1242896 0.4984192 0.08897713
## 1 1 0.1331345 0.4889133 0.09267512
## 1 3 0.1179645 0.5560905 0.08346708
## 1 5 0.1171815 0.5566964 0.08276298
## 1 7 0.1169635 0.5556474 0.08292352
## 5 0 0.1121262 0.5802066 0.08199476
## 5 1 0.1215782 0.5466975 0.08544868
## 5 3 0.1066976 0.6215938 0.07744282
## 5 5 0.1063062 0.6223022 0.07713355
## 5 7 0.1063036 0.6219187 0.07736846
## 10 0 0.1083741 0.6126583 0.08014536
## 10 1 0.1186651 0.5633830 0.08327584
## 10 3 0.1031213 0.6452639 0.07488344
## 10 5 0.1026406 0.6477432 0.07483450
## 10 7 0.1026714 0.6478303 0.07509403
## 20 0 0.1087473 0.6110551 0.07988068
## 20 1 0.1185372 0.5623978 0.08292433
## 20 3 0.1034084 0.6428908 0.07467458
## 20 5 0.1027935 0.6466155 0.07454143
## 20 7 0.1028064 0.6470532 0.07483428
## 50 0 0.1081944 0.6169975 0.07914356
## 50 1 0.1175492 0.5662031 0.08196167
## 50 3 0.1027286 0.6471555 0.07373253
## 50 5 0.1021640 0.6510516 0.07356067
## 50 7 0.1021566 0.6521042 0.07386078
## 100 0 0.1078864 0.6203771 0.07871757
## 100 1 0.1173689 0.5665918 0.08178573
## 100 3 0.1024915 0.6485229 0.07359627
## 100 5 0.1018568 0.6531600 0.07330673
## 100 7 0.1018269 0.6545421 0.07356565
##
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were committees = 100 and neighbors = 7.
# Held-out performance of the centered and scaled cubist model.
cube_pred_2 <- predict(
  cubist_model_2,
  newdata = X_testing
)
postResample(
  pred = cube_pred_2,
  obs = y_testing
)
## RMSE Rsquared MAE
## 0.09787625 0.68127930 0.07222664
# Cubist with centering, scaling and PCA.
cubist_model_3 <- train(
  x = X_training,
  y = y_training,
  method = "cubist",
  metric = "Rsquared",
  preProcess = c("center", "scale", "pca"),
  trControl = myControl,
  tuneGrid = cubist_grid,
  verbose = FALSE
)
cubist_model_3
## Cubist
##
## 1802 samples
## 28 predictor
##
## Pre-processing: centered (27), scaled (27), principal component
## signal extraction (27), ignore (1)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1623, 1621, 1621, 1622, 1622, 1622, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 0.1373751 0.3744694 0.10403132
## 1 1 0.1549994 0.3514342 0.11085591
## 1 3 0.1352904 0.4230820 0.10025820
## 1 5 0.1303613 0.4497338 0.09775357
## 1 7 0.1290616 0.4559081 0.09693412
## 5 0 0.1336379 0.4050726 0.10193762
## 5 1 0.1484059 0.3796090 0.10635896
## 5 3 0.1307794 0.4488124 0.09730608
## 5 5 0.1262449 0.4759476 0.09537076
## 5 7 0.1253144 0.4806406 0.09495184
## 10 0 0.1333622 0.4079848 0.10212662
## 10 1 0.1477283 0.3797964 0.10568987
## 10 3 0.1302686 0.4499682 0.09674427
## 10 5 0.1256787 0.4785757 0.09492369
## 10 7 0.1246728 0.4842751 0.09441571
## 20 0 0.1327538 0.4125045 0.10125513
## 20 1 0.1477100 0.3789349 0.10551892
## 20 3 0.1302892 0.4493285 0.09670153
## 20 5 0.1256525 0.4783580 0.09496917
## 20 7 0.1245583 0.4847937 0.09438866
## 50 0 0.1320641 0.4199292 0.10061747
## 50 1 0.1473236 0.3809904 0.10495943
## 50 3 0.1295675 0.4540696 0.09598592
## 50 5 0.1248361 0.4841960 0.09414075
## 50 7 0.1237011 0.4911406 0.09360104
## 100 0 0.1318770 0.4218837 0.10047085
## 100 1 0.1470721 0.3823609 0.10475341
## 100 3 0.1292150 0.4563985 0.09573629
## 100 5 0.1244939 0.4867957 0.09384289
## 100 7 0.1234062 0.4933567 0.09341076
##
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were committees = 100 and neighbors = 7.
# Held-out performance of the PCA cubist model.
cube_pred_3 <- predict(
  cubist_model_3,
  newdata = X_testing
)
postResample(
  pred = cube_pred_3,
  obs = y_testing
)
## RMSE Rsquared MAE
## 0.1248926 0.4797407 0.0949015
# All model fitting is finished, so shut down the parallel workers.
stopCluster(cl)
# Variable importance for the centered and scaled cubist model.
# NOTE(review): the final prediction further down uses cubist_model_1 (no
# pre-processing), not cubist_model_2 - confirm which model's importance
# should be reported here.
varImp(cubist_model_2)
## cubist variable importance
##
## only 20 most important variables shown (out of 28)
##
## Overall
## Mnf.Flow 100.00
## Balling 81.34
## Density 67.91
## Air.Pressurer 55.22
## Pressure.Vacuum 55.22
## Temperature 52.99
## Hyd.Pressure3 47.01
## Oxygen.Filler 45.52
## Usage.cont 42.54
## Carb.Pressure1 42.54
## Hyd.Pressure2 39.55
## Carb.Flow 38.81
## Bowl.Setpoint 34.33
## Filler.Level 34.33
## Carb.Volume 34.33
## Filler.Speed 33.58
## Brand.Code 31.34
## Carb.Pressure 27.61
## Carb.Temp 26.12
## Pressure.Setpoint 22.39
For the most part, the models performed rather poorly. Only two model types (random forest and cubist) reached an R-squared above 0.6 on the testing data, and only three (adding gradient boosting) exceeded 0.55. Of those, the cubist model with no pre-processing performed best on the testing data, with an R-squared of 0.687, roughly 0.04 higher than its cross-validated R-squared of 0.65 on the training data.
The random forest models, both without pre-processing and with centering and scaling, had a cross-validated R-squared of about 0.64 on the training data, slightly lower than the cubist model’s 0.65. On the testing data, the best random forest model only improved to an R-squared of 0.658.
Since the cubist model performed virtually the same as the random forest on the training data, but had a much larger performance gain on the testing data, the cubist model will be the final model.
# Predict PH for the evaluation set with the selected cubist model.
# The predictors come from the IMPUTED evaluation data created earlier
# (final_eval_data_imp); rebuilding this object from the raw final_eval_data
# would throw the imputation away and feed NAs to the model. PH is dropped
# because it is empty in the evaluation set and is the value being predicted.
final_eval_data_imp <- final_eval_data_imp %>% select(-PH)
final_prediction <- predict(cubist_model_1, final_eval_data_imp)
final_prediction
## [1] 8.663651 8.399857 8.548253 8.588914 8.468299 8.546074 8.464340 8.544967
## [9] 8.573155 8.551621 8.450832 8.457115 8.448719 8.661861 8.247412 8.633602
## [17] 8.555671 8.545568 8.487899 8.696609 8.691767 8.634740 8.570895 8.569745
## [25] 8.675952 8.410020 8.410580 8.624118 8.690289 8.691509 8.605254 8.731082
## [33] 8.660496 8.657748 8.731308 8.561165 8.522634 8.541997 8.645558 8.823889
## [41] 8.816323 8.669717 8.228788 8.245699 8.722853 8.753989 8.759417 8.636842
## [49] 8.759039 8.789128 8.673086 8.682339 8.689214 8.796692 8.791013 8.738265
## [57] 8.367038 8.539260 8.658771 8.803403 8.727114 8.753772 8.772833 8.820044
## [65] 8.755528 8.806726 8.580817 8.538203 8.550093 8.480567 8.441368 8.324424
## [73] 8.587729 8.511843 8.494021 8.457296 8.595482 8.717562 8.689876 8.690001
## [81] 8.785590 8.784991 8.794956 8.741295 8.834356 8.760874 8.742410 8.649231
## [89] 8.517745 8.716171 8.643930 8.555835 8.496687 8.408738 8.518122 8.663733
## [97] 8.706146 8.667054 8.708802 8.700249 8.653530 8.675000 8.711729 8.676590
## [105] 8.813308 8.835199 8.550488 8.489166 8.511027 8.600323 8.668372 8.666607
## [113] 8.699860 8.718661 8.610188 8.759213 8.749774 8.731980 8.751470 8.752671
## [121] 8.727777 8.744652 8.702955 8.647503 8.586506 8.455987 8.488049 8.278573
## [129] 8.460218 8.440671 8.565418 8.557616 8.422834 8.485707 8.448220 8.475329
## [137] 8.506094 8.445242 8.425882 8.485176 8.453637 8.505466 8.438625 8.491728
## [145] 8.410774 8.550174 8.503317 8.589645 8.532893 8.590325 8.590317 8.590549
## [153] 8.486816 8.684231 8.452705 8.476883 8.464428 8.536860 8.539461 8.581436
## [161] 8.610895 8.637177 8.604809 8.553144 8.553486 8.484779 8.415937 8.789887
## [169] 8.722962 8.781662 8.345147 8.496157 8.302920 8.511931 8.516747 8.558492
## [177] 8.429007 8.466746 8.509178 8.443894 8.431644 8.506138 8.505123 8.496730
## [185] 8.504001 8.306504 8.308646 8.482499 8.430278 8.396166 8.341457 8.481395
## [193] 8.425842 8.516177 8.414750 8.341685 8.302737 8.208286 8.254679 8.475475
## [201] 8.407339 8.426421 8.468726 8.486006 8.314820 8.487294 8.517469 8.476567
## [209] 8.459014 8.297743 8.386098 8.354876 8.454288 8.500471 8.478093 8.523907
## [217] 8.373122 8.380192 8.503963 8.488920 8.484028 8.527408 8.449262 8.436918
## [225] 8.558197 8.488687 8.402719 8.400624 8.502196 8.455499 8.521846 8.512491
## [233] 8.505167 8.570243 8.563753 8.537703 8.639412 8.661979 8.436349 8.474943
## [241] 8.444898 8.550615 8.466645 8.553781 8.445525 8.439879 8.434119 8.495939
## [249] 8.540469 8.487692 8.398073 8.462302 8.476020 8.519601 8.529755 8.376858
## [257] 8.542078 8.669650 8.511080 8.516471 8.680401 8.637736 8.493894 8.607294
## [265] 8.281431 8.359053 8.207158
#library(xlsx)
#write.xlsx(list('PH_Predictions' = final_prediction), file = 'final_predictions.xlsx')