library(kableExtra)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(psych)
library(caret)
library(mice)
library(randomForest)
library(caTools)
library(corrplot)
library(class)
library(rpart)
library(AppliedPredictiveModeling)
library(naniar)
library(xgboost)
library(DiagrammeR)
library(readxl)
library(writexl)
library(DataExplorer)
library(elasticnet)
library(glmnet)
library(rattle)This is role playing. I am your new boss. I am in charge of production at ABC Beverage and you are a team of data scientists reporting to me. My leadership has told me that new regulations are requiring us to understand our manufacturing process, the predictive factors and be able to report to them our predictive model of PH.
Please use the historical data set I am providing. Build and report the factors in BOTH a technical and non-technical report. I like to use Word and Excel. Please provide your non-technical report in a business friendly readable document and your predictions in an Excel readable format. The technical report should show clearly the models you tested and how you selected your final approach.
Please submit both Rpubs links and .rmd files or other readable formats for technical and non-technical reports. Also submit the excel file showing the prediction of your models for pH.
To accomplish the goal of the assignment, these are the steps we will take.
df <- read_xlsx('StudentData.xlsx')
head(df)%>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
scroll_box(width="100%",height="300px")| Brand Code | Carb Volume | Fill Ounces | PC Volume | Carb Pressure | Carb Temp | PSC | PSC Fill | PSC CO2 | Mnf Flow | Carb Pressure1 | Fill Pressure | Hyd Pressure1 | Hyd Pressure2 | Hyd Pressure3 | Hyd Pressure4 | Filler Level | Filler Speed | Temperature | Usage cont | Carb Flow | Density | MFR | Balling | Pressure Vacuum | PH | Oxygen Filler | Bowl Setpoint | Pressure Setpoint | Air Pressurer | Alch Rel | Carb Rel | Balling Lvl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| B | 5.340000 | 23.96667 | 0.2633333 | 68.2 | 141.2 | 0.104 | 0.26 | 0.04 | -100 | 118.8 | 46.0 | 0 | NA | NA | 118 | 121.2 | 4002 | 66.0 | 16.18 | 2932 | 0.88 | 725.0 | 1.398 | -4.0 | 8.36 | 0.022 | 120 | 46.4 | 142.6 | 6.58 | 5.32 | 1.48 |
| A | 5.426667 | 24.00667 | 0.2386667 | 68.4 | 139.6 | 0.124 | 0.22 | 0.04 | -100 | 121.6 | 46.0 | 0 | NA | NA | 106 | 118.6 | 3986 | 67.6 | 19.90 | 3144 | 0.92 | 726.8 | 1.498 | -4.0 | 8.26 | 0.026 | 120 | 46.8 | 143.0 | 6.56 | 5.30 | 1.56 |
| B | 5.286667 | 24.06000 | 0.2633333 | 70.8 | 144.8 | 0.090 | 0.34 | 0.16 | -100 | 120.2 | 46.0 | 0 | NA | NA | 82 | 120.0 | 4020 | 67.0 | 17.76 | 2914 | 1.58 | 735.0 | 3.142 | -3.8 | 8.94 | 0.024 | 120 | 46.6 | 142.0 | 7.66 | 5.84 | 3.28 |
| A | 5.440000 | 24.00667 | 0.2933333 | 63.0 | 132.6 | NA | 0.42 | 0.04 | -100 | 115.2 | 46.4 | 0 | 0 | 0 | 92 | 117.8 | 4012 | 65.6 | 17.42 | 3062 | 1.54 | 730.6 | 3.042 | -4.4 | 8.24 | 0.030 | 120 | 46.0 | 146.2 | 7.14 | 5.42 | 3.04 |
| A | 5.486667 | 24.31333 | 0.1113333 | 67.2 | 136.8 | 0.026 | 0.16 | 0.12 | -100 | 118.4 | 45.8 | 0 | 0 | 0 | 92 | 118.6 | 4010 | 65.6 | 17.68 | 3054 | 1.54 | 722.8 | 3.042 | -4.4 | 8.26 | 0.030 | 120 | 46.0 | 146.2 | 7.14 | 5.44 | 3.04 |
| A | 5.380000 | 23.92667 | 0.2693333 | 66.6 | 138.4 | 0.090 | 0.24 | 0.04 | -100 | 119.6 | 45.6 | 0 | 0 | 0 | 116 | 120.2 | 4014 | 66.2 | 23.82 | 2948 | 1.52 | 738.8 | 2.992 | -4.4 | 8.32 | 0.024 | 120 | 46.0 | 146.6 | 7.16 | 5.44 | 3.02 |
summary(df)%>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="400px")| Brand Code | Carb Volume | Fill Ounces | PC Volume | Carb Pressure | Carb Temp | PSC | PSC Fill | PSC CO2 | Mnf Flow | Carb Pressure1 | Fill Pressure | Hyd Pressure1 | Hyd Pressure2 | Hyd Pressure3 | Hyd Pressure4 | Filler Level | Filler Speed | Temperature | Usage cont | Carb Flow | Density | MFR | Balling | Pressure Vacuum | PH | Oxygen Filler | Bowl Setpoint | Pressure Setpoint | Air Pressurer | Alch Rel | Carb Rel | Balling Lvl | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Length:2571 | Min. :5.040 | Min. :23.63 | Min. :0.07933 | Min. :57.00 | Min. :128.6 | Min. :0.00200 | Min. :0.0000 | Min. :0.00000 | Min. :-100.20 | Min. :105.6 | Min. :34.60 | Min. :-0.80 | Min. : 0.00 | Min. :-1.20 | Min. : 52.00 | Min. : 55.8 | Min. : 998 | Min. :63.60 | Min. :12.08 | Min. : 26 | Min. :0.240 | Min. : 31.4 | Min. :-0.170 | Min. :-6.600 | Min. :7.880 | Min. :0.00240 | Min. : 70.0 | Min. :44.00 | Min. :140.8 | Min. :5.280 | Min. :4.960 | Min. :0.00 | |
| Class :character | 1st Qu.:5.293 | 1st Qu.:23.92 | 1st Qu.:0.23917 | 1st Qu.:65.60 | 1st Qu.:138.4 | 1st Qu.:0.04800 | 1st Qu.:0.1000 | 1st Qu.:0.02000 | 1st Qu.:-100.00 | 1st Qu.:119.0 | 1st Qu.:46.00 | 1st Qu.: 0.00 | 1st Qu.: 0.00 | 1st Qu.: 0.00 | 1st Qu.: 86.00 | 1st Qu.: 98.3 | 1st Qu.:3888 | 1st Qu.:65.20 | 1st Qu.:18.36 | 1st Qu.:1144 | 1st Qu.:0.900 | 1st Qu.:706.3 | 1st Qu.: 1.496 | 1st Qu.:-5.600 | 1st Qu.:8.440 | 1st Qu.:0.02200 | 1st Qu.:100.0 | 1st Qu.:46.00 | 1st Qu.:142.2 | 1st Qu.:6.540 | 1st Qu.:5.340 | 1st Qu.:1.38 | |
| Mode :character | Median :5.347 | Median :23.97 | Median :0.27133 | Median :68.20 | Median :140.8 | Median :0.07600 | Median :0.1800 | Median :0.04000 | Median : 65.20 | Median :123.2 | Median :46.40 | Median :11.40 | Median :28.60 | Median :27.60 | Median : 96.00 | Median :118.4 | Median :3982 | Median :65.60 | Median :21.79 | Median :3028 | Median :0.980 | Median :724.0 | Median : 1.648 | Median :-5.400 | Median :8.540 | Median :0.03340 | Median :120.0 | Median :46.00 | Median :142.6 | Median :6.560 | Median :5.400 | Median :1.48 | |
| NA | Mean :5.370 | Mean :23.97 | Mean :0.27712 | Mean :68.19 | Mean :141.1 | Mean :0.08457 | Mean :0.1954 | Mean :0.05641 | Mean : 24.57 | Mean :122.6 | Mean :47.92 | Mean :12.44 | Mean :20.96 | Mean :20.46 | Mean : 96.29 | Mean :109.3 | Mean :3687 | Mean :65.97 | Mean :20.99 | Mean :2468 | Mean :1.174 | Mean :704.0 | Mean : 2.198 | Mean :-5.216 | Mean :8.546 | Mean :0.04684 | Mean :109.3 | Mean :47.62 | Mean :142.8 | Mean :6.897 | Mean :5.437 | Mean :2.05 | |
| NA | 3rd Qu.:5.453 | 3rd Qu.:24.03 | 3rd Qu.:0.31200 | 3rd Qu.:70.60 | 3rd Qu.:143.8 | 3rd Qu.:0.11200 | 3rd Qu.:0.2600 | 3rd Qu.:0.08000 | 3rd Qu.: 140.80 | 3rd Qu.:125.4 | 3rd Qu.:50.00 | 3rd Qu.:20.20 | 3rd Qu.:34.60 | 3rd Qu.:33.40 | 3rd Qu.:102.00 | 3rd Qu.:120.0 | 3rd Qu.:3998 | 3rd Qu.:66.40 | 3rd Qu.:23.75 | 3rd Qu.:3186 | 3rd Qu.:1.620 | 3rd Qu.:731.0 | 3rd Qu.: 3.292 | 3rd Qu.:-5.000 | 3rd Qu.:8.680 | 3rd Qu.:0.06000 | 3rd Qu.:120.0 | 3rd Qu.:50.00 | 3rd Qu.:143.0 | 3rd Qu.:7.240 | 3rd Qu.:5.540 | 3rd Qu.:3.14 | |
| NA | Max. :5.700 | Max. :24.32 | Max. :0.47800 | Max. :79.40 | Max. :154.0 | Max. :0.27000 | Max. :0.6200 | Max. :0.24000 | Max. : 229.40 | Max. :140.2 | Max. :60.40 | Max. :58.00 | Max. :59.40 | Max. :50.00 | Max. :142.00 | Max. :161.2 | Max. :4030 | Max. :76.20 | Max. :25.90 | Max. :5104 | Max. :1.920 | Max. :868.6 | Max. : 4.012 | Max. :-3.600 | Max. :9.360 | Max. :0.40000 | Max. :140.0 | Max. :52.00 | Max. :148.2 | Max. :8.620 | Max. :6.060 | Max. :3.66 | |
| NA | NA’s :10 | NA’s :38 | NA’s :39 | NA’s :27 | NA’s :26 | NA’s :33 | NA’s :23 | NA’s :39 | NA’s :2 | NA’s :32 | NA’s :22 | NA’s :11 | NA’s :15 | NA’s :15 | NA’s :30 | NA’s :20 | NA’s :57 | NA’s :14 | NA’s :5 | NA’s :2 | NA’s :1 | NA’s :212 | NA’s :1 | NA | NA’s :4 | NA’s :12 | NA’s :2 | NA’s :12 | NA | NA’s :9 | NA’s :10 | NA’s :1 |
Before performing the imputation, we need to change the categorical variable into a factor and clean up white spaces from column names.
# transforming the Brand Code variable into a factor for imputation
df <- df %>%
mutate(`Brand Code` = as.factor(`Brand Code`))
# cleaning up the column names for the imputation function
colNamesNoSpace <- colnames(df) %>%
str_remove_all(' ')
# transforming the column names
colnames(df) <- colNamesNoSpaceWe also checked for presence of any de-generate variables. Here, we see that there is one degenerate variable, HydPressure1.
# near zero variance
# capturing the degenerate variables
dim(df)## [1] 2571 33
nzv <- nearZeroVar(df)
# identifying them
(remove<-colnames(df)[nzv])## [1] "HydPressure1"
# removing from the dataset
df <- df[,!(colnames(df) == remove)]
dim(df)## [1] 2571 32
Based on the above descriptive data summary, there are quite a few variables with missing values. So we conducted an analysis of all missing values in various attributes to identify proper imputation technique.
## Counts of missing data per feature
dataset_missing_counts <- data.frame(apply(df, 2, function(x) length(which(is.na(x)))))
dataset_missing_pct <- data.frame(apply(df, 2,function(x) {sum(is.na(x)) / length(x) * 100}))
dataset_missing_counts <- cbind(Feature = rownames(dataset_missing_counts), dataset_missing_counts, dataset_missing_pct)
colnames(dataset_missing_counts) <- c('Feature','NA_Count','NA_Percentage')
rownames(dataset_missing_counts) <- NULL
dataset_missing_counts <- dataset_missing_counts %>% filter(`NA_Count` != 0) %>% arrange(desc(`NA_Count`))
dataset_missing_counts %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")| Feature | NA_Count | NA_Percentage |
|---|---|---|
| MFR | 212 | 8.2458187 |
| BrandCode | 120 | 4.6674446 |
| FillerSpeed | 57 | 2.2170362 |
| PCVolume | 39 | 1.5169195 |
| PSCCO2 | 39 | 1.5169195 |
| FillOunces | 38 | 1.4780241 |
| PSC | 33 | 1.2835473 |
| CarbPressure1 | 32 | 1.2446519 |
| HydPressure4 | 30 | 1.1668611 |
| CarbPressure | 27 | 1.0501750 |
| CarbTemp | 26 | 1.0112797 |
| PSCFill | 23 | 0.8945935 |
| FillPressure | 22 | 0.8556982 |
| FillerLevel | 20 | 0.7779074 |
| HydPressure2 | 15 | 0.5834306 |
| HydPressure3 | 15 | 0.5834306 |
| Temperature | 14 | 0.5445352 |
| OxygenFiller | 12 | 0.4667445 |
| PressureSetpoint | 12 | 0.4667445 |
| CarbVolume | 10 | 0.3889537 |
| CarbRel | 10 | 0.3889537 |
| AlchRel | 9 | 0.3500583 |
| Usagecont | 5 | 0.1944769 |
| PH | 4 | 0.1555815 |
| MnfFlow | 2 | 0.0777907 |
| CarbFlow | 2 | 0.0777907 |
| BowlSetpoint | 2 | 0.0777907 |
| Density | 1 | 0.0388954 |
| Balling | 1 | 0.0388954 |
| BallingLvl | 1 | 0.0388954 |
ggplot(dataset_missing_counts, aes(x = NA_Count, y = reorder(Feature, NA_Count))) +
geom_bar(stat = 'identity', fill = 'steelblue') +
geom_label(aes(label = NA_Count)) +
labs(title = 'Missing Counts') +
theme(plot.title = element_text(hjust = 0.5), axis.title.y = element_blank(), axis.title.x = element_blank())To avoid risk of imputing the incorrect Brand Code, we remove the NA rows, a total 120 rows.
#remove missing brand => df3
df <- df[!is.na(df$BrandCode), ]
dim(df)## [1] 2451 32
Once done, we can perform the imputation using the random forest method of the mice package
#imputation by using the random forest method ('rf')
init <- mice(df, maxit = 0)
predM <- init$predictorMatrix
set.seed(123)
imputed <- mice(df, method = 'rf', predictorMatrix = predM, m=1, silent = TRUE)##
## iter imp variable
## 1 1 CarbVolume FillOunces PCVolume CarbPressure CarbTemp PSC PSCFill PSCCO2 MnfFlow CarbPressure1 FillPressure HydPressure2 HydPressure3 HydPressure4 FillerLevel FillerSpeed Temperature Usagecont CarbFlow Density MFR Balling PH OxygenFiller BowlSetpoint PressureSetpoint AlchRel CarbRel
## 2 1 CarbVolume FillOunces PCVolume CarbPressure CarbTemp PSC PSCFill PSCCO2 MnfFlow CarbPressure1 FillPressure HydPressure2 HydPressure3 HydPressure4 FillerLevel FillerSpeed Temperature Usagecont CarbFlow Density MFR Balling PH OxygenFiller BowlSetpoint PressureSetpoint AlchRel CarbRel
## 3 1 CarbVolume FillOunces PCVolume CarbPressure CarbTemp PSC PSCFill PSCCO2 MnfFlow CarbPressure1 FillPressure HydPressure2 HydPressure3 HydPressure4 FillerLevel FillerSpeed Temperature Usagecont CarbFlow Density MFR Balling PH OxygenFiller BowlSetpoint PressureSetpoint AlchRel CarbRel
## 4 1 CarbVolume FillOunces PCVolume CarbPressure CarbTemp PSC PSCFill PSCCO2 MnfFlow CarbPressure1 FillPressure HydPressure2 HydPressure3 HydPressure4 FillerLevel FillerSpeed Temperature Usagecont CarbFlow Density MFR Balling PH OxygenFiller BowlSetpoint PressureSetpoint AlchRel CarbRel
## 5 1 CarbVolume FillOunces PCVolume CarbPressure CarbTemp PSC PSCFill PSCCO2 MnfFlow CarbPressure1 FillPressure HydPressure2 HydPressure3 HydPressure4 FillerLevel FillerSpeed Temperature Usagecont CarbFlow Density MFR Balling PH OxygenFiller BowlSetpoint PressureSetpoint AlchRel CarbRel
df <- complete(imputed, silent = TRUE)
summary(df)## BrandCode CarbVolume FillOunces PCVolume CarbPressure
## A: 293 Min. :5.040 Min. :23.63 Min. :0.08667 Min. :57.00
## B:1239 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23933 1st Qu.:65.80
## C: 304 Median :5.347 Median :23.97 Median :0.27133 Median :68.20
## D: 615 Mean :5.374 Mean :23.97 Mean :0.27700 Mean :68.31
## 3rd Qu.:5.460 3rd Qu.:24.03 3rd Qu.:0.31133 3rd Qu.:70.60
## Max. :5.700 Max. :24.32 Max. :0.47067 Max. :79.40
## CarbTemp PSC PSCFill PSCCO2
## Min. :128.6 Min. :0.00200 Min. :0.0000 Min. :0.00000
## 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.1000 1st Qu.:0.02000
## Median :140.8 Median :0.07800 Median :0.1800 Median :0.04000
## Mean :141.2 Mean :0.08462 Mean :0.1958 Mean :0.05672
## 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600 3rd Qu.:0.08000
## Max. :154.0 Max. :0.27000 Max. :0.6200 Max. :0.24000
## MnfFlow CarbPressure1 FillPressure HydPressure2
## Min. :-100.20 Min. :105.6 Min. :34.60 Min. : 0.00
## 1st Qu.:-100.00 1st Qu.:118.8 1st Qu.:46.00 1st Qu.: 0.00
## Median : 65.20 Median :123.2 Median :46.40 Median :28.60
## Mean : 24.42 Mean :122.5 Mean :47.86 Mean :21.03
## 3rd Qu.: 141.20 3rd Qu.:125.4 3rd Qu.:50.00 3rd Qu.:34.80
## Max. : 216.20 Max. :140.2 Max. :60.40 Max. :59.40
## HydPressure3 HydPressure4 FillerLevel FillerSpeed
## Min. :-1.20 Min. : 52.00 Min. : 55.8 Min. : 998
## 1st Qu.: 0.00 1st Qu.: 88.00 1st Qu.: 97.3 1st Qu.:3867
## Median :27.40 Median : 96.00 Median :118.2 Median :3982
## Mean :20.41 Mean : 96.55 Mean :109.1 Mean :3645
## 3rd Qu.:33.20 3rd Qu.:102.00 3rd Qu.:120.0 3rd Qu.:3998
## Max. :50.00 Max. :142.00 Max. :161.2 Max. :4030
## Temperature Usagecont CarbFlow Density MFR
## Min. :63.60 Min. :12.08 Min. : 26 Min. :0.240 Min. : 31.4
## 1st Qu.:65.20 1st Qu.:18.39 1st Qu.:1125 1st Qu.:0.900 1st Qu.:697.2
## Median :65.60 Median :21.80 Median :3028 Median :0.980 Median :722.2
## Mean :65.93 Mean :21.00 Mean :2464 Mean :1.184 Mean :675.2
## 3rd Qu.:66.40 3rd Qu.:23.76 3rd Qu.:3184 3rd Qu.:1.640 3rd Qu.:730.6
## Max. :76.20 Max. :25.90 Max. :5104 Max. :1.920 Max. :868.6
## Balling PressureVacuum PH OxygenFiller
## Min. :-0.170 Min. :-6.60 Min. :7.880 Min. :0.00240
## 1st Qu.: 1.496 1st Qu.:-5.60 1st Qu.:8.440 1st Qu.:0.02200
## Median : 1.648 Median :-5.40 Median :8.540 Median :0.03340
## Mean : 2.224 Mean :-5.21 Mean :8.548 Mean :0.04701
## 3rd Qu.: 3.340 3rd Qu.:-5.00 3rd Qu.:8.680 3rd Qu.:0.06000
## Max. : 4.012 Max. :-3.60 Max. :9.360 Max. :0.40000
## BowlSetpoint PressureSetpoint AirPressurer AlchRel
## Min. : 70.0 Min. :44.0 Min. :140.8 Min. :6.240
## 1st Qu.:100.0 1st Qu.:46.0 1st Qu.:142.2 1st Qu.:6.540
## Median :120.0 Median :46.0 Median :142.6 Median :6.580
## Mean :109.2 Mean :47.6 Mean :142.8 Mean :6.909
## 3rd Qu.:120.0 3rd Qu.:50.0 3rd Qu.:143.0 3rd Qu.:7.620
## Max. :140.0 Max. :52.0 Max. :148.2 Max. :8.560
## CarbRel BallingLvl
## Min. :4.960 Min. :0.000
## 1st Qu.:5.340 1st Qu.:1.380
## Median :5.400 Median :1.480
## Mean :5.441 Mean :2.078
## 3rd Qu.:5.560 3rd Qu.:3.160
## Max. :6.060 Max. :3.660
# Categorical
boxplot(df$PH ~df$BrandCode, main="PH by BrandCode", ylab="
ph", cex.axis=.5, xlab="",las=2) # numerical
# make dataset long to place data in boxplots
vars <- df %>%
select(-BrandCode) %>%
gather(key = 'variables', value = 'value')
# boxplot
vars %>%
ggplot(aes(x = variables, y = value)) +
geom_boxplot() +
labs(title = 'Boxplot of Numerical Variables') +
theme(plot.title = element_text(hjust = 0.5)) +
coord_flip()# histogram of variables
DataExplorer::plot_histogram(df)From the histograms above, we can classify the variables into the distribution categories:
Unfortunately, we were not provided a data dictionary or access to an advisor. We noted that HydPressure2 and HydPressure3 have a large proportion of 0 values. Is this feasible or could these be missing values? Could the multimodal distributed values represent discrete machine settings and maybe better represented as a categorical or ordinal variable? Without a data dictionary or expert advice, we will assume the variables are continuous variables.
corrMatrix <- round(cor(df[,-1]),4)
corrMatrix %>% corrplot(., method = "color", outline = T, addgrid.col = "darkgray", order="hclust", addrect = 4, rect.col = "black", rect.lwd = 5,cl.pos = "b", tl.col = "indianred4", tl.cex = 1.0, cl.cex = 1.0, addCoef.col = "white", number.digits = 2, number.cex = 0.5, col = colorRampPalette(c("darkred","white","dodgerblue4"))(100))We will first split the training and test data into a 75/25 split. We will then create a dummy variable for the factor variable, Brand Code, in order to utilize all models. Next, the independent and dependent will be identified as X and y, respectively.
Once this is done we can, we will perform 2 models of each regression family type: Linear, Non Linear, Tree and Rules.
We will take a look at the variable importance of each model and capture the performance on the hold out dataset.
Lastly, we will take the model with the highest Rsquared as our best model and perform a prediction on the evaluation dataset.
set.seed(123)
sample <- sample.split(df$PH, SplitRatio = 0.75)
train <- subset(df, sample == TRUE)
train <- model.matrix( ~ .-1, train)
test <- subset(df, sample == FALSE)
test <- model.matrix( ~ .-1, test)
y_train <- train[,'PH']
y_test <- test[,'PH']
X_train <- train[, !(colnames(train) == 'PH')]
X_test <- test[, !(colnames(test) == 'PH')]lmFit <- train(X_train, y_train,
method = 'lm',
preProc = c('center','scale'),
trControl = trainControl(method = 'cv')
)
lmFit## Linear Regression
##
## 1838 samples
## 34 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1654, 1653, 1655, 1654, 1653, 1655, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1357171 0.3845636 0.1054333
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
plot(lmFit$finalModel)The diagnostic plots reveal some concern over the Residuals vs Fitted. This plot should exhibit no patterns. The QQ plot should follow a straight line and there is a departure on the left side. The scale-location plot line is not straight and drifts downward. The Leverage plot does exhibit concerning points as well.
varimp <- varImp(lmFit)
varimp$importance %>%
arrange(desc(Overall)) %>%
kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="400px")| Overall | |
|---|---|
| MnfFlow | 100.000000 |
| CarbPressure1 | 63.187685 |
| HydPressure3 | 37.111454 |
| Temperature | 33.655251 |
| Usagecont | 33.368963 |
| BowlSetpoint | 32.223744 |
| Density | 26.203536 |
| PressureSetpoint | 24.150945 |
| OxygenFiller | 23.749691 |
| BrandCodeC | 22.693949 |
| BallingLvl | 22.323489 |
| AlchRel | 18.562396 |
| FillOunces | 17.860508 |
| PCVolume | 16.130986 |
| Balling | 15.321388 |
| PSCFill | 15.231630 |
| CarbFlow | 13.889807 |
| MFR | 13.778058 |
| BrandCodeA | 13.040450 |
| PSCCO2 | 12.482412 |
| HydPressure2 | 10.766061 |
| FillerSpeed | 10.705281 |
| FillerLevel | 8.989604 |
| AirPressurer | 8.336756 |
| PressureVacuum | 7.811428 |
| FillPressure | 7.257734 |
| CarbRel | 4.738633 |
| BrandCodeB | 4.702300 |
| CarbVolume | 4.296034 |
| PSC | 3.768463 |
| HydPressure4 | 1.009254 |
| CarbTemp | 0.771186 |
| CarbPressure | 0.000000 |
lm_Pred <- predict(lmFit, newdata = X_test)
lmPredPerf <- postResample(pred = lm_Pred, obs = y_test)
lmPredPerf['Family'] <- 'Linear'
# saving model prediction performance
modelPredPerf <- data.frame(rbind(lm = lmPredPerf))enetGrid <- expand.grid(.lambda = c(0,0.01,0.1),
.fraction = seq(0.05,1,length = 20))
set.seed(213)
enetTune <- train(X_train, y_train,
method = 'enet',
preProc = c('center','scale'),
tuneGrid = enetGrid,
trControl = trainControl(method = 'cv')
)
enetTune## Elasticnet
##
## 1838 samples
## 34 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1655, 1653, 1655, 1653, 1653, 1655, ...
## Resampling results across tuning parameters:
##
## lambda fraction RMSE Rsquared MAE
## 0.00 0.05 0.1594181 0.2421211 0.1274611
## 0.00 0.10 0.1508963 0.2870210 0.1201409
## 0.00 0.15 0.1461062 0.3085183 0.1160208
## 0.00 0.20 0.1428295 0.3304116 0.1131066
## 0.00 0.25 0.1403685 0.3481322 0.1109448
## 0.00 0.30 0.1385503 0.3620985 0.1092429
## 0.00 0.35 0.1374238 0.3701068 0.1080466
## 0.00 0.40 0.1366838 0.3751849 0.1071298
## 0.00 0.45 0.1362466 0.3783356 0.1065466
## 0.00 0.50 0.1358365 0.3816027 0.1060263
## 0.00 0.55 0.1355623 0.3838794 0.1056378
## 0.00 0.60 0.1353530 0.3858426 0.1053413
## 0.00 0.65 0.1352604 0.3867479 0.1051351
## 0.00 0.70 0.1352622 0.3868130 0.1050523
## 0.00 0.75 0.1352964 0.3866307 0.1050440
## 0.00 0.80 0.1352858 0.3869250 0.1050263
## 0.00 0.85 0.1352691 0.3872454 0.1049957
## 0.00 0.90 0.1352811 0.3873364 0.1049717
## 0.00 0.95 0.1353159 0.3872630 0.1049626
## 0.00 1.00 0.1353448 0.3872084 0.1049774
## 0.01 0.05 0.1617693 0.2206851 0.1295299
## 0.01 0.10 0.1540843 0.2740077 0.1228096
## 0.01 0.15 0.1488478 0.2921870 0.1183467
## 0.01 0.20 0.1453349 0.3139255 0.1153309
## 0.01 0.25 0.1427644 0.3325691 0.1131085
## 0.01 0.30 0.1408343 0.3459918 0.1114360
## 0.01 0.35 0.1392048 0.3588129 0.1099309
## 0.01 0.40 0.1380570 0.3666159 0.1087011
## 0.01 0.45 0.1372182 0.3723264 0.1077154
## 0.01 0.50 0.1366867 0.3762296 0.1070444
## 0.01 0.55 0.1362247 0.3798518 0.1064836
## 0.01 0.60 0.1358642 0.3826399 0.1060187
## 0.01 0.65 0.1356359 0.3843241 0.1057066
## 0.01 0.70 0.1354491 0.3858379 0.1054316
## 0.01 0.75 0.1353647 0.3864451 0.1052511
## 0.01 0.80 0.1353286 0.3867642 0.1051733
## 0.01 0.85 0.1352806 0.3872687 0.1050921
## 0.01 0.90 0.1352243 0.3878375 0.1050043
## 0.01 0.95 0.1351704 0.3883726 0.1049474
## 0.01 1.00 0.1351440 0.3886737 0.1049184
## 0.10 0.05 0.1642013 0.1890429 0.1316803
## 0.10 0.10 0.1579334 0.2540368 0.1262179
## 0.10 0.15 0.1528831 0.2804125 0.1217122
## 0.10 0.20 0.1491187 0.2926903 0.1185522
## 0.10 0.25 0.1464422 0.3062337 0.1162621
## 0.10 0.30 0.1442695 0.3191597 0.1143883
## 0.10 0.35 0.1425440 0.3316946 0.1129034
## 0.10 0.40 0.1411423 0.3412816 0.1116739
## 0.10 0.45 0.1400476 0.3486849 0.1106672
## 0.10 0.50 0.1390805 0.3561940 0.1096901
## 0.10 0.55 0.1383856 0.3612655 0.1089433
## 0.10 0.60 0.1378768 0.3649947 0.1083449
## 0.10 0.65 0.1375146 0.3678612 0.1078842
## 0.10 0.70 0.1371749 0.3706499 0.1074605
## 0.10 0.75 0.1368877 0.3730726 0.1070972
## 0.10 0.80 0.1366540 0.3751016 0.1067685
## 0.10 0.85 0.1364312 0.3770946 0.1064839
## 0.10 0.90 0.1362413 0.3788108 0.1062394
## 0.10 0.95 0.1361149 0.3800147 0.1060677
## 0.10 1.00 0.1360143 0.3810034 0.1059430
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 1 and lambda = 0.01.
varimp <- varImp(enetTune)
varimp$importance %>%
arrange(desc(Overall)) %>%
kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="400px")| Overall | |
|---|---|
| MnfFlow | 100.0000000 |
| Usagecont | 69.2766428 |
| BowlSetpoint | 57.2437485 |
| FillerLevel | 50.8583228 |
| PressureSetpoint | 44.1772076 |
| BrandCodeC | 43.7880379 |
| CarbFlow | 38.2422547 |
| HydPressure3 | 24.7297152 |
| PressureVacuum | 23.3682750 |
| FillPressure | 21.8618080 |
| HydPressure2 | 16.2828184 |
| BrandCodeD | 14.1425615 |
| OxygenFiller | 10.5003903 |
| CarbRel | 10.2468468 |
| Temperature | 9.0620329 |
| HydPressure4 | 8.8632613 |
| AlchRel | 8.5201559 |
| BrandCodeB | 7.4073347 |
| BrandCodeA | 5.1468240 |
| FillOunces | 4.9357578 |
| PSC | 4.1257338 |
| PSCCO2 | 3.3972054 |
| PCVolume | 3.2846205 |
| BallingLvl | 3.0449649 |
| Density | 1.9110063 |
| PSCFill | 1.8130365 |
| MFR | 1.7614789 |
| CarbVolume | 1.4719802 |
| CarbPressure1 | 1.1273341 |
| Balling | 1.0176082 |
| CarbPressure | 0.8775290 |
| FillerSpeed | 0.1823979 |
| CarbTemp | 0.0117359 |
| AirPressurer | 0.0000000 |
enetPred <- predict(enetTune, newdata = X_test)
enetPredPerf <- postResample(pred = enetPred, obs = y_test)
enetPredPerf['Family'] <- 'Linear'
# saving model prediction performance
modelPredPerf <- rbind(modelPredPerf,elastic_net = enetPredPerf)svmRTuned <- train(X_train, y_train,
method = 'svmRadial',
preProc = c('center','scale'),
tuneLength = 14,
trControl = trainControl(method = 'cv'))
svmRTuned## Support Vector Machines with Radial Basis Function Kernel
##
## 1838 samples
## 34 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1655, 1656, 1654, 1653, 1655, 1654, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.1266445 0.4721916 0.09401414
## 0.50 0.1235081 0.4946179 0.09092216
## 1.00 0.1209978 0.5132580 0.08870062
## 2.00 0.1193126 0.5265046 0.08750538
## 4.00 0.1175024 0.5404473 0.08635614
## 8.00 0.1169993 0.5460900 0.08652649
## 16.00 0.1178836 0.5447947 0.08737787
## 32.00 0.1227274 0.5212937 0.09032960
## 64.00 0.1289240 0.4939767 0.09483409
## 128.00 0.1373354 0.4579998 0.10110100
## 256.00 0.1446848 0.4306996 0.10672403
## 512.00 0.1513160 0.4079947 0.11171768
## 1024.00 0.1543055 0.3957866 0.11399901
## 2048.00 0.1543055 0.3957866 0.11399901
##
## Tuning parameter 'sigma' was held constant at a value of 0.0206858
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.0206858 and C = 8.
varimp <- varImp(svmRTuned)
varimp$importance %>%
arrange(desc(Overall)) %>%
kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="400px")| Overall | |
|---|---|
| MnfFlow | 100.0000000 |
| Usagecont | 69.2766428 |
| BowlSetpoint | 57.2437485 |
| FillerLevel | 50.8583228 |
| PressureSetpoint | 44.1772076 |
| BrandCodeC | 43.7880379 |
| CarbFlow | 38.2422547 |
| HydPressure3 | 24.7297152 |
| PressureVacuum | 23.3682750 |
| FillPressure | 21.8618080 |
| HydPressure2 | 16.2828184 |
| BrandCodeD | 14.1425615 |
| OxygenFiller | 10.5003903 |
| CarbRel | 10.2468468 |
| Temperature | 9.0620329 |
| HydPressure4 | 8.8632613 |
| AlchRel | 8.5201559 |
| BrandCodeB | 7.4073347 |
| BrandCodeA | 5.1468240 |
| FillOunces | 4.9357578 |
| PSC | 4.1257338 |
| PSCCO2 | 3.3972054 |
| PCVolume | 3.2846205 |
| BallingLvl | 3.0449649 |
| Density | 1.9110063 |
| PSCFill | 1.8130365 |
| MFR | 1.7614789 |
| CarbVolume | 1.4719802 |
| CarbPressure1 | 1.1273341 |
| Balling | 1.0176082 |
| CarbPressure | 0.8775290 |
| FillerSpeed | 0.1823979 |
| CarbTemp | 0.0117359 |
| AirPressurer | 0.0000000 |
svmRPred <- predict(svmRTuned, newdata = X_test)
svmRPredPerf <- postResample(pred = svmRPred, obs = y_test)
svmRPredPerf['Family'] <- 'NonLinear'
# saving model prediction performance
modelPredPerf <- rbind(modelPredPerf,svm_radial = svmRPredPerf)# neural network
nnetGrid <- expand.grid(.decay = c(0,0.01,.1),
.size = c(1:5),
.bag = FALSE)
nnetFit <- train(X_train, y_train,
method = 'avNNet',
preProc = c('center','scale'),
tuneGrid = nnetGrid,
linout = TRUE,
trace = FALSE,
MaxNWts = 5 * (ncol(X_train) + 1 + 5 + 1),
maxit = 100
)
nnetFit## Model Averaged Neural Network
##
## 1838 samples
## 34 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 1838, 1838, 1838, 1838, 1838, 1838, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 0.1933439 0.1864359 0.12151840
## 0.00 2 0.2671241 0.1522010 0.12375482
## 0.00 3 0.2131091 0.2824768 0.10771429
## 0.00 4 0.1293532 0.4502129 0.09580797
## 0.00 5 0.1259340 0.4777182 0.09317359
## 0.01 1 0.1460670 0.3232886 0.10856975
## 0.01 2 0.1777732 0.2308782 0.11101188
## 0.01 3 0.1472257 0.3812926 0.09961886
## 0.01 4 0.1249013 0.4837255 0.09376834
## 0.01 5 0.1256862 0.4831068 0.09328069
## 0.10 1 0.1372620 0.3807262 0.10511896
## 0.10 2 0.1468081 0.3107366 0.10982136
## 0.10 3 0.1548665 0.3495874 0.10316415
## 0.10 4 0.1260535 0.4745737 0.09553511
## 0.10 5 0.1248377 0.4848205 0.09441037
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 5, decay = 0.1 and bag = FALSE.
varimp <- varImp(nnetFit)
varimp$importance %>%
arrange(desc(Overall)) %>%
kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="400px")| Overall | |
|---|---|
| MnfFlow | 100.0000000 |
| Usagecont | 69.2766428 |
| BowlSetpoint | 57.2437485 |
| FillerLevel | 50.8583228 |
| PressureSetpoint | 44.1772076 |
| BrandCodeC | 43.7880379 |
| CarbFlow | 38.2422547 |
| HydPressure3 | 24.7297152 |
| PressureVacuum | 23.3682750 |
| FillPressure | 21.8618080 |
| HydPressure2 | 16.2828184 |
| BrandCodeD | 14.1425615 |
| OxygenFiller | 10.5003903 |
| CarbRel | 10.2468468 |
| Temperature | 9.0620329 |
| HydPressure4 | 8.8632613 |
| AlchRel | 8.5201559 |
| BrandCodeB | 7.4073347 |
| BrandCodeA | 5.1468240 |
| FillOunces | 4.9357578 |
| PSC | 4.1257338 |
| PSCCO2 | 3.3972054 |
| PCVolume | 3.2846205 |
| BallingLvl | 3.0449649 |
| Density | 1.9110063 |
| PSCFill | 1.8130365 |
| MFR | 1.7614789 |
| CarbVolume | 1.4719802 |
| CarbPressure1 | 1.1273341 |
| Balling | 1.0176082 |
| CarbPressure | 0.8775290 |
| FillerSpeed | 0.1823979 |
| CarbTemp | 0.0117359 |
| AirPressurer | 0.0000000 |
nnetPred <- predict(nnetFit, newdata = X_test)
nnetPredPerf <- postResample(pred = nnetPred, obs = y_test)
nnetPredPerf['Family'] <- 'NonLinear'
# saving model prediction performance
modelPredPerf <- rbind(modelPredPerf,neural_net = nnetPredPerf)rfmodel <- train(X_train, y_train,
method = 'rf',
preProc = c('center','scale'),
trControl = trainControl(method = 'cv'))
rfmodel## Random Forest
##
## 1838 samples
## 34 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1654, 1654, 1653, 1653, 1656, 1654, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 0.1143616 0.6080012 0.08631204
## 18 0.1006756 0.6729194 0.07251130
## 34 0.1000516 0.6701545 0.07167922
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 34.
varimp <- varImp(rfmodel)
varimp$importance %>%
arrange(desc(Overall)) %>%
kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="400px")| Overall | |
|---|---|
| MnfFlow | 100.0000000 |
| BrandCodeC | 32.9791581 |
| OxygenFiller | 25.1211507 |
| AlchRel | 23.0500172 |
| AirPressurer | 21.4981865 |
| PressureVacuum | 18.4859680 |
| BallingLvl | 17.4312097 |
| CarbRel | 15.7352773 |
| CarbPressure1 | 13.8219323 |
| Temperature | 12.2853810 |
| Usagecont | 12.0620345 |
| CarbFlow | 11.8643196 |
| HydPressure3 | 10.4302866 |
| FillerSpeed | 10.1567336 |
| Density | 8.6051131 |
| Balling | 8.2544745 |
| FillerLevel | 7.3460338 |
| PCVolume | 7.2139284 |
| FillOunces | 6.7377716 |
| CarbVolume | 6.6267703 |
| FillPressure | 6.6175996 |
| MFR | 6.5439064 |
| HydPressure2 | 5.7663144 |
| BowlSetpoint | 5.2854966 |
| PSC | 5.2668915 |
| PSCFill | 4.2538893 |
| HydPressure4 | 4.0522002 |
| CarbPressure | 3.6348795 |
| CarbTemp | 3.1968389 |
| PSCCO2 | 2.9732825 |
| PressureSetpoint | 1.5325994 |
| BrandCodeA | 1.5117571 |
| BrandCodeB | 0.4281065 |
| BrandCodeD | 0.0000000 |
rfPred <- predict(rfmodel, newdata = X_test)
rfPredPerf <- postResample(pred = rfPred, obs = y_test)
rfPredPerf['Family'] <- 'TreesAndRules'
# saving model prediction performance
modelPredPerf <- rbind(modelPredPerf, rf = rfPredPerf)set.seed(123)
cubist_Model <- train(x = X_train, y = y_train,
method = "cubist",
preProc = c('center','scale'),
trControl = trainControl(method = 'cv'))
cubist_Model## Cubist
##
## 1838 samples
## 34 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1653, 1654, 1654, 1655, 1654, 1656, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 0.1213716 0.5316270 0.08440854
## 1 5 0.1191605 0.5595143 0.08131312
## 1 9 0.1184966 0.5587677 0.08104720
## 10 0 0.1050136 0.6347628 0.07697248
## 10 5 0.1023614 0.6521372 0.07278536
## 10 9 0.1015647 0.6558187 0.07273128
## 20 0 0.1043489 0.6413283 0.07604857
## 20 5 0.1013335 0.6583605 0.07176922
## 20 9 0.1006968 0.6613563 0.07180303
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 9.
cubistPred <- predict(cubist_Model, newdata = X_test)
cubistPredPerf <- postResample(pred = cubistPred, obs = y_test)
cubistPredPerf['Family'] <- 'TreesAndRules'
# saving model prediction performance
modelPredPerf <- rbind(modelPredPerf,cubist = cubistPredPerf)varimp <- varImp(cubist_Model)
varimp$importance %>%
arrange(desc(Overall)) %>%
kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="400px")| Overall | |
|---|---|
| MnfFlow | 100.000000 |
| AlchRel | 74.825175 |
| BallingLvl | 72.727273 |
| Density | 62.237762 |
| PressureVacuum | 55.944056 |
| Balling | 55.244755 |
| AirPressurer | 53.146853 |
| FillerSpeed | 46.153846 |
| Temperature | 44.755245 |
| CarbPressure1 | 44.755245 |
| OxygenFiller | 43.356643 |
| BowlSetpoint | 42.657343 |
| Usagecont | 40.559441 |
| HydPressure3 | 40.559441 |
| CarbFlow | 38.461539 |
| BrandCodeC | 37.762238 |
| CarbRel | 36.363636 |
| HydPressure2 | 29.370629 |
| MFR | 27.272727 |
| FillerLevel | 21.678322 |
| CarbPressure | 20.279720 |
| HydPressure4 | 17.482518 |
| PCVolume | 15.384615 |
| CarbVolume | 14.685315 |
| CarbTemp | 13.986014 |
| PressureSetpoint | 11.188811 |
| FillPressure | 10.489510 |
| PSCFill | 8.391608 |
| BrandCodeB | 6.993007 |
| FillOunces | 5.594406 |
| BrandCodeD | 3.496504 |
| PSC | 2.797203 |
| PSCCO2 | 2.797203 |
| BrandCodeA | 0.000000 |
Comparing the Rsquared, RSME, and MAE metric of each model’s prediction on the hold out data. We will use the the model with the highest Rsquared value as this will explains the most about the variance of the predicted value.
Here, we see that the Trees and Rules models outperforms the Non Linear and Linear regression models.
modelPredPerf %>%
mutate(RMSE = round(as.numeric(RMSE),3),
Rsquared = round(as.numeric(Rsquared),3),
MAE = round(as.numeric(MAE),3),) %>%
arrange(desc(Rsquared)) %>%
kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="250px")| RMSE | Rsquared | MAE | Family | |
|---|---|---|---|---|
| rf | 0.089 | 0.738 | 0.067 | TreesAndRules |
| cubist | 0.089 | 0.728 | 0.065 | TreesAndRules |
| svm_radial | 0.109 | 0.593 | 0.081 | NonLinear |
| neural_net | 0.114 | 0.552 | 0.088 | NonLinear |
| lm | 0.128 | 0.435 | 0.100 | Linear |
| elastic_net | 0.128 | 0.433 | 0.100 | Linear |
Each model generated indicates that the most important variable used is the MnfFlow in predicting the PH value. In the majority of models, this variable was followed by UsageCont, BowlSetpoint, and FillerLevel. However, this is not the case in the Random Forest model, which is our best performing model.
We recommend the top performing model, the random forest with top 5 predictor variables: MnfFlow, BrandCodeC, OxygenFiller, AlchRel, AirPressurer.
Lastly, we will use our best performing model, Random Forest, on the evaluation dataset. In order to use the model, we need to perform the transformations that we used on our dataset.
# reading the evaluation dataset
eval_df <- read_xlsx('StudentEvaluation.xlsx')
# change Brand Code into a factor
eval_df <- eval_df %>%
mutate(`Brand Code` = as.factor(`Brand Code`))
# remove white spaces in column names
colNamesNoSpace <- colnames(eval_df) %>%
str_remove_all(' ')
colnames(eval_df) <- colNamesNoSpace
# remove degenerate variable
eval_df <- eval_df[,!(colnames(eval_df) == remove)]
# looking at NA values
(countNA <- colSums(is.na(eval_df)))## BrandCode CarbVolume FillOunces PCVolume
## 8 1 6 4
## CarbPressure CarbTemp PSC PSCFill
## 0 1 5 3
## PSCCO2 MnfFlow CarbPressure1 FillPressure
## 5 0 4 2
## HydPressure2 HydPressure3 HydPressure4 FillerLevel
## 1 1 4 2
## FillerSpeed Temperature Usagecont CarbFlow
## 10 2 2 0
## Density MFR Balling PressureVacuum
## 1 31 1 1
## PH OxygenFiller BowlSetpoint PressureSetpoint
## 267 3 1 2
## AirPressurer AlchRel CarbRel BallingLvl
## 1 3 2 0
We can see that the dataset has missing values. The generated Random Forest model needs values in each entry to make a prediction on the entry. Thus, imputation is needed prior to making predictions. We will use the same technique as before.
#imputation by using the random forest method ('rf')
init <- mice(eval_df, maxit = 0)
predM <- init$predictorMatrix
set.seed(123)
imputed <- mice(eval_df, method = 'rf', predictorMatrix = predM, m=1, silent = TRUE)##
## iter imp variable
## 1 1 BrandCode CarbVolume FillOunces PCVolume CarbTemp PSC PSCFill PSCCO2 CarbPressure1 FillPressure HydPressure2 HydPressure3 HydPressure4 FillerLevel FillerSpeed Temperature Usagecont Density MFR Balling PressureVacuum OxygenFiller BowlSetpoint PressureSetpoint AirPressurer AlchRel CarbRel
## 2 1 BrandCode CarbVolume FillOunces PCVolume CarbTemp PSC PSCFill PSCCO2 CarbPressure1 FillPressure HydPressure2 HydPressure3 HydPressure4 FillerLevel FillerSpeed Temperature Usagecont Density MFR Balling PressureVacuum OxygenFiller BowlSetpoint PressureSetpoint AirPressurer AlchRel CarbRel
## 3 1 BrandCode CarbVolume FillOunces PCVolume CarbTemp PSC PSCFill PSCCO2 CarbPressure1 FillPressure HydPressure2 HydPressure3 HydPressure4 FillerLevel FillerSpeed Temperature Usagecont Density MFR Balling PressureVacuum OxygenFiller BowlSetpoint PressureSetpoint AirPressurer AlchRel CarbRel
## 4 1 BrandCode CarbVolume FillOunces PCVolume CarbTemp PSC PSCFill PSCCO2 CarbPressure1 FillPressure HydPressure2 HydPressure3 HydPressure4 FillerLevel FillerSpeed Temperature Usagecont Density MFR Balling PressureVacuum OxygenFiller BowlSetpoint PressureSetpoint AirPressurer AlchRel CarbRel
## 5 1 BrandCode CarbVolume FillOunces PCVolume CarbTemp PSC PSCFill PSCCO2 CarbPressure1 FillPressure HydPressure2 HydPressure3 HydPressure4 FillerLevel FillerSpeed Temperature Usagecont Density MFR Balling PressureVacuum OxygenFiller BowlSetpoint PressureSetpoint AirPressurer AlchRel CarbRel
eval_df2 <- complete(imputed, silent = TRUE)Next, we need to drop the PH in the evaluation dataset to transform it into a matrix model that includes dummy variables for the factor data type.
# remove PH variable
eval_df2 <- eval_df2[,!(colnames(eval_df2) == 'PH')]
# create model matrix
eval_modelmatrix <- model.matrix( ~ .-1, eval_df2)Now the Random Forest model is ready to be used on the evaluation dataset.
# Predicting on evaluation dataset
rfPred <- predict(rfmodel, newdata = eval_modelmatrix)
# Inputting the prediction values into dataset
eval_df$PH <- rfPredFinally, we can write the dataset set with our predictions into an excel spreadsheet.
eval_df %>%
head %>%
kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="250px")| BrandCode | CarbVolume | FillOunces | PCVolume | CarbPressure | CarbTemp | PSC | PSCFill | PSCCO2 | MnfFlow | CarbPressure1 | FillPressure | HydPressure2 | HydPressure3 | HydPressure4 | FillerLevel | FillerSpeed | Temperature | Usagecont | CarbFlow | Density | MFR | Balling | PressureVacuum | PH | OxygenFiller | BowlSetpoint | PressureSetpoint | AirPressurer | AlchRel | CarbRel | BallingLvl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| D | 5.480000 | 24.03333 | 0.2700000 | 65.4 | 134.6 | 0.236 | 0.40 | 0.04 | -100 | 116.6 | 46.0 | NA | NA | 96 | 129.4 | 3986 | 66.0 | 21.66 | 2950 | 0.88 | 727.6 | 1.398 | -3.8 | 8.538019 | 0.022 | 130 | 45.2 | 142.6 | 6.56 | 5.34 | 1.48 |
| A | 5.393333 | 23.95333 | 0.2266667 | 63.2 | 135.0 | 0.042 | 0.22 | 0.08 | -100 | 118.8 | 46.2 | 0 | 0 | 112 | 120.0 | 4012 | 65.6 | 17.60 | 2916 | 1.50 | 735.8 | 2.942 | -4.4 | 8.437569 | 0.030 | 120 | 46.0 | 147.2 | 7.14 | 5.58 | 3.04 |
| B | 5.293333 | 23.92000 | 0.3033333 | 66.4 | 140.4 | 0.068 | 0.10 | 0.02 | -100 | 120.2 | 45.8 | 0 | 0 | 98 | 119.4 | 4010 | 65.6 | 24.18 | 3056 | 0.90 | 734.8 | 1.448 | -4.2 | 8.516417 | 0.046 | 120 | 46.0 | 146.6 | 6.52 | 5.34 | 1.46 |
| B | 5.266667 | 23.94000 | 0.1860000 | 64.8 | 139.0 | 0.004 | 0.20 | 0.02 | -100 | 124.8 | 40.0 | 0 | 0 | 132 | 120.2 | NA | 74.4 | 18.12 | 28 | 0.74 | NA | 1.056 | -4.0 | 8.536727 | NA | 120 | 46.0 | 146.4 | 6.48 | 5.50 | 1.48 |
| B | 5.406667 | 24.20000 | 0.1600000 | 69.4 | 142.2 | 0.040 | 0.30 | 0.06 | -100 | 115.0 | 51.4 | 0 | 0 | 94 | 116.0 | 4018 | 66.4 | 21.32 | 3214 | 0.88 | 752.0 | 1.398 | -4.0 | 8.472045 | 0.082 | 120 | 50.0 | 145.8 | 6.50 | 5.38 | 1.46 |
| B | 5.286667 | 24.10667 | 0.2120000 | 73.4 | 147.2 | 0.078 | 0.22 | NA | -100 | 118.6 | 46.4 | 0 | 0 | 94 | 120.4 | 4010 | 66.6 | 18.00 | 3064 | 0.84 | 732.0 | 1.298 | -3.8 | 8.550307 | 0.064 | 120 | 46.0 | 146.0 | 6.50 | 5.42 | 1.44 |
write_xlsx(eval_df, 'StudentEvaluation2.xlsx')