This is role playing. I am your new boss. I am in charge of production at ABC Beverage and you are a team of data scientists reporting to me. My leadership has told me that new regulations are requiring us to understand our manufacturing process, the predictive factors and be able to report to them our predictive model of PH.
Please use the historical data set I am providing. Build and report the factors in BOTH a technical and non-technical report. I like to use Word and Excel. Please provide your non-technical report in a business friendly readable document and your predictions in an Excel readable format. The technical report should show clearly the models you tested and how you selected your final approach.
Please submit both Rpubs links and .rmd files or other readable formats for technical and non-technical reports. Also submit the excel file showing the prediction of your models for pH.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(imputeTS)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(readxl)
library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(e1071)
library(readxl)
library(corrplot)
## corrplot 0.92 loaded
library(writexl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ randomForest::combine() masks dplyr::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
training_data <- read_excel("/Users/briansingh/Desktop/CUNY/Spring 2024/Data 624/Final Project/StudentData.xlsx")
testing_data <- read_excel("/Users/briansingh/Desktop/CUNY/Spring 2024/Data 624/Final Project/StudentEvaluation.xlsx")
#summary of data
summary(training_data)
## Brand Code Carb Volume Fill Ounces PC Volume
## Length:2571 Min. :5.040 Min. :23.63 Min. :0.07933
## Class :character 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23917
## Mode :character Median :5.347 Median :23.97 Median :0.27133
## Mean :5.370 Mean :23.97 Mean :0.27712
## 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31200
## Max. :5.700 Max. :24.32 Max. :0.47800
## NA's :10 NA's :38 NA's :39
## Carb Pressure Carb Temp PSC PSC Fill
## Min. :57.00 Min. :128.6 Min. :0.00200 Min. :0.0000
## 1st Qu.:65.60 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.1000
## Median :68.20 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.19 Mean :141.1 Mean :0.08457 Mean :0.1954
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :79.40 Max. :154.0 Max. :0.27000 Max. :0.6200
## NA's :27 NA's :26 NA's :33 NA's :23
## PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure
## Min. :0.00000 Min. :-100.20 Min. :105.6 Min. :34.60
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.00
## Median :0.04000 Median : 65.20 Median :123.2 Median :46.40
## Mean :0.05641 Mean : 24.57 Mean :122.6 Mean :47.92
## 3rd Qu.:0.08000 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00
## Max. :0.24000 Max. : 229.40 Max. :140.2 Max. :60.40
## NA's :39 NA's :2 NA's :32 NA's :22
## Hyd Pressure1 Hyd Pressure2 Hyd Pressure3 Hyd Pressure4
## Min. :-0.80 Min. : 0.00 Min. :-1.20 Min. : 52.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00
## Median :11.40 Median :28.60 Median :27.60 Median : 96.00
## Mean :12.44 Mean :20.96 Mean :20.46 Mean : 96.29
## 3rd Qu.:20.20 3rd Qu.:34.60 3rd Qu.:33.40 3rd Qu.:102.00
## Max. :58.00 Max. :59.40 Max. :50.00 Max. :142.00
## NA's :11 NA's :15 NA's :15 NA's :30
## Filler Level Filler Speed Temperature Usage cont Carb Flow
## Min. : 55.8 Min. : 998 Min. :63.60 Min. :12.08 Min. : 26
## 1st Qu.: 98.3 1st Qu.:3888 1st Qu.:65.20 1st Qu.:18.36 1st Qu.:1144
## Median :118.4 Median :3982 Median :65.60 Median :21.79 Median :3028
## Mean :109.3 Mean :3687 Mean :65.97 Mean :20.99 Mean :2468
## 3rd Qu.:120.0 3rd Qu.:3998 3rd Qu.:66.40 3rd Qu.:23.75 3rd Qu.:3186
## Max. :161.2 Max. :4030 Max. :76.20 Max. :25.90 Max. :5104
## NA's :20 NA's :57 NA's :14 NA's :5 NA's :2
## Density MFR Balling Pressure Vacuum
## Min. :0.240 Min. : 31.4 Min. :-0.170 Min. :-6.600
## 1st Qu.:0.900 1st Qu.:706.3 1st Qu.: 1.496 1st Qu.:-5.600
## Median :0.980 Median :724.0 Median : 1.648 Median :-5.400
## Mean :1.174 Mean :704.0 Mean : 2.198 Mean :-5.216
## 3rd Qu.:1.620 3rd Qu.:731.0 3rd Qu.: 3.292 3rd Qu.:-5.000
## Max. :1.920 Max. :868.6 Max. : 4.012 Max. :-3.600
## NA's :1 NA's :212 NA's :1
## PH Oxygen Filler Bowl Setpoint Pressure Setpoint
## Min. :7.880 Min. :0.00240 Min. : 70.0 Min. :44.00
## 1st Qu.:8.440 1st Qu.:0.02200 1st Qu.:100.0 1st Qu.:46.00
## Median :8.540 Median :0.03340 Median :120.0 Median :46.00
## Mean :8.546 Mean :0.04684 Mean :109.3 Mean :47.62
## 3rd Qu.:8.680 3rd Qu.:0.06000 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :9.360 Max. :0.40000 Max. :140.0 Max. :52.00
## NA's :4 NA's :12 NA's :2 NA's :12
## Air Pressurer Alch Rel Carb Rel Balling Lvl
## Min. :140.8 Min. :5.280 Min. :4.960 Min. :0.00
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.340 1st Qu.:1.38
## Median :142.6 Median :6.560 Median :5.400 Median :1.48
## Mean :142.8 Mean :6.897 Mean :5.437 Mean :2.05
## 3rd Qu.:143.0 3rd Qu.:7.240 3rd Qu.:5.540 3rd Qu.:3.14
## Max. :148.2 Max. :8.620 Max. :6.060 Max. :3.66
## NA's :9 NA's :10 NA's :1
#column names appear as `Brand Code`, instead of Brand_Code, fix the names
colnames(training_data) <- c("Brand_Code", "Carb_Volume", "Fill_Ounces", "PC_Volume", "Carb_Pressure", "Carb_Temp", "PSC", "PSC_Fill", "PSC_C02", "Mnf_Flow", "Carb_Pressure1", "Fill_Pressure", "Hyd_Pressure1", "Hyd_Pressure2", "Hyd_Pressure3", "Hyd_Pressure4", "Filler_Level", "Filler_Speed", "Temperature", "Usage_cont", "Carb_Flow","Density", "MFR", "Balling", "Pressure_Vacuum", "PH","Oxygen_Filler", "Bowl_Setpoint", "Pressure_Setpoint","Air_Pressure", "Alch_Rel", "Carb_Rel", "Balling_Lvl")
#check for missing values
missing_values <- colSums(is.na(training_data))
missing_values_df <- data.frame(Variable = names(missing_values), Missing_Values = missing_values)
missing_values_df
## Variable Missing_Values
## Brand_Code Brand_Code 120
## Carb_Volume Carb_Volume 10
## Fill_Ounces Fill_Ounces 38
## PC_Volume PC_Volume 39
## Carb_Pressure Carb_Pressure 27
## Carb_Temp Carb_Temp 26
## PSC PSC 33
## PSC_Fill PSC_Fill 23
## PSC_C02 PSC_C02 39
## Mnf_Flow Mnf_Flow 2
## Carb_Pressure1 Carb_Pressure1 32
## Fill_Pressure Fill_Pressure 22
## Hyd_Pressure1 Hyd_Pressure1 11
## Hyd_Pressure2 Hyd_Pressure2 15
## Hyd_Pressure3 Hyd_Pressure3 15
## Hyd_Pressure4 Hyd_Pressure4 30
## Filler_Level Filler_Level 20
## Filler_Speed Filler_Speed 57
## Temperature Temperature 14
## Usage_cont Usage_cont 5
## Carb_Flow Carb_Flow 2
## Density Density 1
## MFR MFR 212
## Balling Balling 1
## Pressure_Vacuum Pressure_Vacuum 0
## PH PH 4
## Oxygen_Filler Oxygen_Filler 12
## Bowl_Setpoint Bowl_Setpoint 2
## Pressure_Setpoint Pressure_Setpoint 12
## Air_Pressure Air_Pressure 0
## Alch_Rel Alch_Rel 9
## Carb_Rel Carb_Rel 10
## Balling_Lvl Balling_Lvl 1
#Brand_Code is categorical, we can't impute with averages, we'll use the mode and fill in the 120 values
table(training_data$Brand_Code)
##
## A B C D
## 293 1239 304 615
mode_brand_code <- training_data %>%
group_by(Brand_Code) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
slice(1) %>%
pull(Brand_Code)
training_data_imputed <- training_data %>%
mutate(Brand_Code= ifelse(is.na(training_data$Brand_Code), mode_brand_code, training_data$Brand_Code))
sum(is.na(training_data_imputed$Brand_Code))
## [1] 0
summary(training_data_imputed)
## Brand_Code Carb_Volume Fill_Ounces PC_Volume
## Length:2571 Min. :5.040 Min. :23.63 Min. :0.07933
## Class :character 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23917
## Mode :character Median :5.347 Median :23.97 Median :0.27133
## Mean :5.370 Mean :23.97 Mean :0.27712
## 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31200
## Max. :5.700 Max. :24.32 Max. :0.47800
## NA's :10 NA's :38 NA's :39
## Carb_Pressure Carb_Temp PSC PSC_Fill
## Min. :57.00 Min. :128.6 Min. :0.00200 Min. :0.0000
## 1st Qu.:65.60 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.1000
## Median :68.20 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.19 Mean :141.1 Mean :0.08457 Mean :0.1954
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :79.40 Max. :154.0 Max. :0.27000 Max. :0.6200
## NA's :27 NA's :26 NA's :33 NA's :23
## PSC_C02 Mnf_Flow Carb_Pressure1 Fill_Pressure
## Min. :0.00000 Min. :-100.20 Min. :105.6 Min. :34.60
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.00
## Median :0.04000 Median : 65.20 Median :123.2 Median :46.40
## Mean :0.05641 Mean : 24.57 Mean :122.6 Mean :47.92
## 3rd Qu.:0.08000 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00
## Max. :0.24000 Max. : 229.40 Max. :140.2 Max. :60.40
## NA's :39 NA's :2 NA's :32 NA's :22
## Hyd_Pressure1 Hyd_Pressure2 Hyd_Pressure3 Hyd_Pressure4
## Min. :-0.80 Min. : 0.00 Min. :-1.20 Min. : 52.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00
## Median :11.40 Median :28.60 Median :27.60 Median : 96.00
## Mean :12.44 Mean :20.96 Mean :20.46 Mean : 96.29
## 3rd Qu.:20.20 3rd Qu.:34.60 3rd Qu.:33.40 3rd Qu.:102.00
## Max. :58.00 Max. :59.40 Max. :50.00 Max. :142.00
## NA's :11 NA's :15 NA's :15 NA's :30
## Filler_Level Filler_Speed Temperature Usage_cont Carb_Flow
## Min. : 55.8 Min. : 998 Min. :63.60 Min. :12.08 Min. : 26
## 1st Qu.: 98.3 1st Qu.:3888 1st Qu.:65.20 1st Qu.:18.36 1st Qu.:1144
## Median :118.4 Median :3982 Median :65.60 Median :21.79 Median :3028
## Mean :109.3 Mean :3687 Mean :65.97 Mean :20.99 Mean :2468
## 3rd Qu.:120.0 3rd Qu.:3998 3rd Qu.:66.40 3rd Qu.:23.75 3rd Qu.:3186
## Max. :161.2 Max. :4030 Max. :76.20 Max. :25.90 Max. :5104
## NA's :20 NA's :57 NA's :14 NA's :5 NA's :2
## Density MFR Balling Pressure_Vacuum
## Min. :0.240 Min. : 31.4 Min. :-0.170 Min. :-6.600
## 1st Qu.:0.900 1st Qu.:706.3 1st Qu.: 1.496 1st Qu.:-5.600
## Median :0.980 Median :724.0 Median : 1.648 Median :-5.400
## Mean :1.174 Mean :704.0 Mean : 2.198 Mean :-5.216
## 3rd Qu.:1.620 3rd Qu.:731.0 3rd Qu.: 3.292 3rd Qu.:-5.000
## Max. :1.920 Max. :868.6 Max. : 4.012 Max. :-3.600
## NA's :1 NA's :212 NA's :1
## PH Oxygen_Filler Bowl_Setpoint Pressure_Setpoint
## Min. :7.880 Min. :0.00240 Min. : 70.0 Min. :44.00
## 1st Qu.:8.440 1st Qu.:0.02200 1st Qu.:100.0 1st Qu.:46.00
## Median :8.540 Median :0.03340 Median :120.0 Median :46.00
## Mean :8.546 Mean :0.04684 Mean :109.3 Mean :47.62
## 3rd Qu.:8.680 3rd Qu.:0.06000 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :9.360 Max. :0.40000 Max. :140.0 Max. :52.00
## NA's :4 NA's :12 NA's :2 NA's :12
## Air_Pressure Alch_Rel Carb_Rel Balling_Lvl
## Min. :140.8 Min. :5.280 Min. :4.960 Min. :0.00
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.340 1st Qu.:1.38
## Median :142.6 Median :6.560 Median :5.400 Median :1.48
## Mean :142.8 Mean :6.897 Mean :5.437 Mean :2.05
## 3rd Qu.:143.0 3rd Qu.:7.240 3rd Qu.:5.540 3rd Qu.:3.14
## Max. :148.2 Max. :8.620 Max. :6.060 Max. :3.66
## NA's :9 NA's :10 NA's :1
#impute numerical values with mean of the feature
for (i in colnames(training_data_imputed)[sapply(training_data_imputed, is.numeric)]) {
training_data_imputed[[i]] <- na_mean(training_data_imputed[[i]])
}
#check for any remaning missing values
any(is.na(training_data_imputed))
## [1] FALSE
#distribution of PH
ggplot(training_data_imputed, aes(x = PH)) +
geom_histogram() +
labs(title = "pH distribution", x = "pH", y = "Freq")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#distribution of other numeric features
training_data_imputed %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
geom_histogram(bins = 10) +
facet_wrap(~key, scales = "free")
#correlations
correlation_matrix <- cor(training_data_imputed[, -1])
corrplot(correlation_matrix, method = "color", type = "upper", tl.col = "black", tl.srt = 45, tl.cex = 0.6)
#training control to use in model building
train_control <- trainControl(method = "cv", number = 10)
#linear regression, random forest and support vector machine models
lm_mod <- train(PH ~ ., data = training_data_imputed, method = "lm", trControl = train_control)
rf_mod <- train(PH ~ ., data = training_data_imputed, method = "rf", trControl = train_control)
svm_mod <- train(PH ~ ., data = training_data_imputed, method = "svmRadial", trControl = train_control)
#evaluations
lm_results <- summary(lm_mod)
rf_results <- rf_mod$results[which.max(rf_mod$results$Rsquared), ]
svm_results <- svm_mod$results[which.max(svm_mod$results$Rsquared), ]
lm_results
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.51684 -0.07726 0.01138 0.08826 0.72663
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.066e+01 9.346e-01 11.408 < 2e-16 ***
## Brand_CodeB 6.352e-02 2.216e-02 2.866 0.004186 **
## Brand_CodeC -7.033e-02 2.222e-02 -3.165 0.001572 **
## Brand_CodeD 5.453e-02 1.520e-02 3.587 0.000341 ***
## Carb_Volume -5.979e-02 5.846e-02 -1.023 0.306497
## Fill_Ounces -8.393e-02 3.232e-02 -2.597 0.009464 **
## PC_Volume -1.012e-01 5.354e-02 -1.890 0.058922 .
## Carb_Pressure -7.018e-04 2.085e-03 -0.337 0.736490
## Carb_Temp 1.723e-03 1.670e-03 1.032 0.302177
## PSC -9.946e-02 5.714e-02 -1.741 0.081879 .
## PSC_Fill -2.948e-02 2.343e-02 -1.258 0.208523
## PSC_C02 -1.218e-01 6.333e-02 -1.924 0.054526 .
## Mnf_Flow -7.210e-04 4.593e-05 -15.696 < 2e-16 ***
## Carb_Pressure1 6.359e-03 6.988e-04 9.100 < 2e-16 ***
## Fill_Pressure 1.992e-03 1.216e-03 1.638 0.101561
## Hyd_Pressure1 -2.310e-06 3.662e-04 -0.006 0.994968
## Hyd_Pressure2 -7.774e-04 5.321e-04 -1.461 0.144101
## Hyd_Pressure3 3.025e-03 5.807e-04 5.208 2.06e-07 ***
## Hyd_Pressure4 2.611e-05 2.984e-04 0.087 0.930283
## Filler_Level -1.061e-03 5.180e-04 -2.049 0.040593 *
## Filler_Speed -1.422e-06 5.498e-06 -0.259 0.795864
## Temperature -1.702e-02 2.230e-03 -7.632 3.25e-14 ***
## Usage_cont -6.671e-03 1.141e-03 -5.847 5.66e-09 ***
## Carb_Flow 1.095e-05 3.618e-06 3.027 0.002495 **
## Density -1.304e-01 2.799e-02 -4.660 3.33e-06 ***
## MFR 1.304e-05 4.634e-05 0.281 0.778356
## Balling -7.214e-02 2.265e-02 -3.185 0.001465 **
## Pressure_Vacuum -1.671e-02 7.415e-03 -2.253 0.024328 *
## Oxygen_Filler -2.759e-01 6.818e-02 -4.046 5.36e-05 ***
## Bowl_Setpoint 3.071e-03 5.419e-04 5.666 1.62e-08 ***
## Pressure_Setpoint -8.224e-03 1.953e-03 -4.212 2.62e-05 ***
## Air_Pressure -1.676e-03 2.370e-03 -0.707 0.479476
## Alch_Rel 6.350e-02 2.083e-02 3.049 0.002322 **
## Carb_Rel 6.564e-02 4.631e-02 1.417 0.156540
## Balling_Lvl 1.037e-01 2.081e-02 4.985 6.61e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.133 on 2536 degrees of freedom
## Multiple R-squared: 0.4122, Adjusted R-squared: 0.4043
## F-statistic: 52.3 on 34 and 2536 DF, p-value: < 2.2e-16
rf_results
## mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 2 18 0.09334958 0.7219972 0.06759072 0.007549756 0.03511768 0.004079897
svm_results
## sigma C RMSE Rsquared MAE RMSESD RsquaredSD
## 3 0.02045062 1 0.1183709 0.5330265 0.08671873 0.007912032 0.04676892
## MAESD
## 3 0.006055283
#look at most important variables in the random forest model for predicting PH
rf_imp <- varImp(rf_mod, scale = FALSE)
plot(rf_imp)
#fix the evaluation data column names as we did with the training data
colnames(testing_data) <- c("Brand_Code", "Carb_Volume", "Fill_Ounces", "PC_Volume", "Carb_Pressure", "Carb_Temp", "PSC", "PSC_Fill", "PSC_C02", "Mnf_Flow", "Carb_Pressure1", "Fill_Pressure", "Hyd_Pressure1", "Hyd_Pressure2", "Hyd_Pressure3", "Hyd_Pressure4", "Filler_Level", "Filler_Speed", "Temperature", "Usage_cont", "Carb_Flow","Density", "MFR", "Balling", "Pressure_Vacuum", "PH","Oxygen_Filler", "Bowl_Setpoint", "Pressure_Setpoint","Air_Pressure", "Alch_Rel", "Carb_Rel", "Balling_Lvl")
#check for NAs
summary(testing_data)
## Brand_Code Carb_Volume Fill_Ounces PC_Volume
## Length:267 Min. :5.147 Min. :23.75 Min. :0.09867
## Class :character 1st Qu.:5.287 1st Qu.:23.92 1st Qu.:0.23333
## Mode :character Median :5.340 Median :23.97 Median :0.27533
## Mean :5.369 Mean :23.97 Mean :0.27769
## 3rd Qu.:5.465 3rd Qu.:24.01 3rd Qu.:0.32200
## Max. :5.667 Max. :24.20 Max. :0.46400
## NA's :1 NA's :6 NA's :4
## Carb_Pressure Carb_Temp PSC PSC_Fill
## Min. :60.20 Min. :130.0 Min. :0.00400 Min. :0.0200
## 1st Qu.:65.30 1st Qu.:138.4 1st Qu.:0.04450 1st Qu.:0.1000
## Median :68.00 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.25 Mean :141.2 Mean :0.08545 Mean :0.1903
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :77.60 Max. :154.0 Max. :0.24600 Max. :0.6200
## NA's :1 NA's :5 NA's :3
## PSC_C02 Mnf_Flow Carb_Pressure1 Fill_Pressure
## Min. :0.00000 Min. :-100.20 Min. :113.0 Min. :37.80
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:120.2 1st Qu.:46.00
## Median :0.04000 Median : 0.20 Median :123.4 Median :47.80
## Mean :0.05107 Mean : 21.03 Mean :123.0 Mean :48.14
## 3rd Qu.:0.06000 3rd Qu.: 141.30 3rd Qu.:125.5 3rd Qu.:50.20
## Max. :0.24000 Max. : 220.40 Max. :136.0 Max. :60.20
## NA's :5 NA's :4 NA's :2
## Hyd_Pressure1 Hyd_Pressure2 Hyd_Pressure3 Hyd_Pressure4
## Min. :-50.00 Min. :-50.00 Min. :-50.00 Min. : 68.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 90.00
## Median : 10.40 Median : 26.80 Median : 27.70 Median : 98.00
## Mean : 12.01 Mean : 20.11 Mean : 19.61 Mean : 97.84
## 3rd Qu.: 20.40 3rd Qu.: 34.80 3rd Qu.: 33.00 3rd Qu.:104.00
## Max. : 50.00 Max. : 61.40 Max. : 49.20 Max. :140.00
## NA's :1 NA's :1 NA's :4
## Filler_Level Filler_Speed Temperature Usage_cont Carb_Flow
## Min. : 69.2 Min. :1006 Min. :63.80 Min. :12.90 Min. : 0
## 1st Qu.:100.6 1st Qu.:3812 1st Qu.:65.40 1st Qu.:18.12 1st Qu.:1083
## Median :118.6 Median :3978 Median :65.80 Median :21.44 Median :3038
## Mean :110.3 Mean :3581 Mean :66.23 Mean :20.90 Mean :2409
## 3rd Qu.:120.2 3rd Qu.:3996 3rd Qu.:66.60 3rd Qu.:23.74 3rd Qu.:3215
## Max. :153.2 Max. :4020 Max. :75.40 Max. :24.60 Max. :3858
## NA's :2 NA's :10 NA's :2 NA's :2
## Density MFR Balling Pressure_Vacuum
## Min. :0.060 Min. : 15.6 Min. :0.902 Min. :-6.400
## 1st Qu.:0.920 1st Qu.:707.0 1st Qu.:1.498 1st Qu.:-5.600
## Median :0.980 Median :724.6 Median :1.648 Median :-5.200
## Mean :1.177 Mean :697.8 Mean :2.203 Mean :-5.174
## 3rd Qu.:1.600 3rd Qu.:731.5 3rd Qu.:3.242 3rd Qu.:-4.800
## Max. :1.840 Max. :784.8 Max. :3.788 Max. :-3.600
## NA's :1 NA's :31 NA's :1 NA's :1
## PH Oxygen_Filler Bowl_Setpoint Pressure_Setpoint
## Mode:logical Min. :0.00240 Min. : 70.0 Min. :44.00
## NA's:267 1st Qu.:0.01960 1st Qu.:100.0 1st Qu.:46.00
## Median :0.03370 Median :120.0 Median :46.00
## Mean :0.04666 Mean :109.6 Mean :47.73
## 3rd Qu.:0.05440 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :0.39800 Max. :130.0 Max. :52.00
## NA's :3 NA's :1 NA's :2
## Air_Pressure Alch_Rel Carb_Rel Balling_Lvl
## Min. :141.2 Min. :6.400 Min. :5.18 Min. :0.000
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.34 1st Qu.:1.380
## Median :142.6 Median :6.580 Median :5.40 Median :1.480
## Mean :142.8 Mean :6.907 Mean :5.44 Mean :2.051
## 3rd Qu.:142.8 3rd Qu.:7.180 3rd Qu.:5.56 3rd Qu.:3.080
## Max. :147.2 Max. :7.820 Max. :5.74 Max. :3.420
## NA's :1 NA's :3 NA's :2
#impute with mean as we did for training data
testing_data_imputed <- testing_data
for (i in colnames(testing_data_imputed)[sapply(testing_data_imputed, is.numeric)]) {
testing_data_imputed[[i]] <- na_mean(testing_data_imputed[[i]])
}
#predict PH using the randomforest model
predictions <- predict(rf_mod, newdata = testing_data_imputed)
hist(predictions, xlab = "Predicted pH")
mean_training_data_ph <- mean(training_data_imputed$PH) #8.5456
mean_predictions_ph <- mean(predictions) #8.5498
#use rf model to on the training_data_imputed data to get it to 259 rows to plot for comparison
training_predictions <- predict(rf_mod, newdata = training_data_imputed)
#residuals and plot
residuals <- training_data_imputed$PH - training_predictions
plot(training_predictions, residuals, xlab = "rf predicted pH", ylab = "residuals") + abline(h = 0, col = "red")
## integer(0)
#create a data frame with actual PH values and predicted PH values to plot against each other as another way to visualize model performance and plot against each other
predictions_df <- data.frame(Actual = training_data_imputed$PH, Predicted = training_predictions)
plot(predictions_df$Actual, predictions_df$Predicted, xlab = "Actual PH", ylab = "Predicted pH")+abline(0, 1, col = "red")
## integer(0)
#export predictions
#write_xlsx(data.frame(Predicted_PH = pH_predictions), "BrianSingh_PH_Predictions.xlsx")