Data 624 Final Project

This is role playing. I am your new boss. I am in charge of production at ABC Beverage and you are a team of data scientists reporting to me. My leadership has told me that new regulations are requiring us to understand our manufacturing process, the predictive factors and be able to report to them our predictive model of PH.

Please use the historical data set I am providing. Build and report the factors in BOTH a technical and non-technical report. I like to use Word and Excel. Please provide your non-technical report in a business friendly readable document and your predictions in an Excel readable format. The technical report should show clearly the models you tested and how you selected your final approach.

Please submit both Rpubs links and .rmd files or other readable formats for technical and non-technical reports. Also submit the excel file showing the prediction of your models for pH.

Load libraries and data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(imputeTS)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(readxl)
library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(e1071)
library(readxl)
library(corrplot)
## corrplot 0.92 loaded
library(writexl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ randomForest::combine() masks dplyr::combine()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::lift()           masks caret::lift()
## ✖ randomForest::margin()  masks ggplot2::margin()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
training_data <- read_excel("/Users/briansingh/Desktop/CUNY/Spring 2024/Data 624/Final Project/StudentData.xlsx")
testing_data <- read_excel("/Users/briansingh/Desktop/CUNY/Spring 2024/Data 624/Final Project/StudentEvaluation.xlsx")

Tidying

#summary of data
summary(training_data)
##   Brand Code         Carb Volume     Fill Ounces      PC Volume      
##  Length:2571        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23917  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27712  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##                     NA's   :10      NA's   :38      NA's   :39       
##  Carb Pressure     Carb Temp          PSC             PSC Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08457   Mean   :0.1954  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##  NA's   :27      NA's   :26      NA's   :33        NA's   :23      
##     PSC CO2           Mnf Flow       Carb Pressure1  Fill Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  65.20   Median :123.2   Median :46.40  
##  Mean   :0.05641   Mean   :  24.57   Mean   :122.6   Mean   :47.92  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  NA's   :39        NA's   :2         NA's   :32      NA's   :22     
##  Hyd Pressure1   Hyd Pressure2   Hyd Pressure3   Hyd Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 52.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.60   Median : 96.00  
##  Mean   :12.44   Mean   :20.96   Mean   :20.46   Mean   : 96.29  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.40   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##  NA's   :11      NA's   :15      NA's   :15      NA's   :30      
##   Filler Level    Filler Speed   Temperature      Usage cont      Carb Flow   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08   Min.   :  26  
##  1st Qu.: 98.3   1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.36   1st Qu.:1144  
##  Median :118.4   Median :3982   Median :65.60   Median :21.79   Median :3028  
##  Mean   :109.3   Mean   :3687   Mean   :65.97   Mean   :20.99   Mean   :2468  
##  3rd Qu.:120.0   3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.75   3rd Qu.:3186  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90   Max.   :5104  
##  NA's   :20      NA's   :57     NA's   :14      NA's   :5       NA's   :2     
##     Density           MFR           Balling       Pressure Vacuum 
##  Min.   :0.240   Min.   : 31.4   Min.   :-0.170   Min.   :-6.600  
##  1st Qu.:0.900   1st Qu.:706.3   1st Qu.: 1.496   1st Qu.:-5.600  
##  Median :0.980   Median :724.0   Median : 1.648   Median :-5.400  
##  Mean   :1.174   Mean   :704.0   Mean   : 2.198   Mean   :-5.216  
##  3rd Qu.:1.620   3rd Qu.:731.0   3rd Qu.: 3.292   3rd Qu.:-5.000  
##  Max.   :1.920   Max.   :868.6   Max.   : 4.012   Max.   :-3.600  
##  NA's   :1       NA's   :212     NA's   :1                        
##        PH        Oxygen Filler     Bowl Setpoint   Pressure Setpoint
##  Min.   :7.880   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0   1st Qu.:46.00    
##  Median :8.540   Median :0.03340   Median :120.0   Median :46.00    
##  Mean   :8.546   Mean   :0.04684   Mean   :109.3   Mean   :47.62    
##  3rd Qu.:8.680   3rd Qu.:0.06000   3rd Qu.:120.0   3rd Qu.:50.00    
##  Max.   :9.360   Max.   :0.40000   Max.   :140.0   Max.   :52.00    
##  NA's   :4       NA's   :12        NA's   :2       NA's   :12       
##  Air Pressurer      Alch Rel        Carb Rel      Balling Lvl  
##  Min.   :140.8   Min.   :5.280   Min.   :4.960   Min.   :0.00  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340   1st Qu.:1.38  
##  Median :142.6   Median :6.560   Median :5.400   Median :1.48  
##  Mean   :142.8   Mean   :6.897   Mean   :5.437   Mean   :2.05  
##  3rd Qu.:143.0   3rd Qu.:7.240   3rd Qu.:5.540   3rd Qu.:3.14  
##  Max.   :148.2   Max.   :8.620   Max.   :6.060   Max.   :3.66  
##                  NA's   :9       NA's   :10      NA's   :1
#column names appear as `Brand Code`, instead of Brand_Code, fix the names
colnames(training_data) <- c("Brand_Code", "Carb_Volume", "Fill_Ounces", "PC_Volume", "Carb_Pressure", "Carb_Temp", "PSC", "PSC_Fill", "PSC_C02", "Mnf_Flow", "Carb_Pressure1", "Fill_Pressure", "Hyd_Pressure1", "Hyd_Pressure2", "Hyd_Pressure3", "Hyd_Pressure4", "Filler_Level", "Filler_Speed", "Temperature", "Usage_cont", "Carb_Flow","Density", "MFR", "Balling", "Pressure_Vacuum", "PH","Oxygen_Filler", "Bowl_Setpoint", "Pressure_Setpoint","Air_Pressure", "Alch_Rel", "Carb_Rel", "Balling_Lvl")

#check for missing values
missing_values <- colSums(is.na(training_data))
missing_values_df <- data.frame(Variable = names(missing_values), Missing_Values = missing_values)
missing_values_df
##                            Variable Missing_Values
## Brand_Code               Brand_Code            120
## Carb_Volume             Carb_Volume             10
## Fill_Ounces             Fill_Ounces             38
## PC_Volume                 PC_Volume             39
## Carb_Pressure         Carb_Pressure             27
## Carb_Temp                 Carb_Temp             26
## PSC                             PSC             33
## PSC_Fill                   PSC_Fill             23
## PSC_C02                     PSC_C02             39
## Mnf_Flow                   Mnf_Flow              2
## Carb_Pressure1       Carb_Pressure1             32
## Fill_Pressure         Fill_Pressure             22
## Hyd_Pressure1         Hyd_Pressure1             11
## Hyd_Pressure2         Hyd_Pressure2             15
## Hyd_Pressure3         Hyd_Pressure3             15
## Hyd_Pressure4         Hyd_Pressure4             30
## Filler_Level           Filler_Level             20
## Filler_Speed           Filler_Speed             57
## Temperature             Temperature             14
## Usage_cont               Usage_cont              5
## Carb_Flow                 Carb_Flow              2
## Density                     Density              1
## MFR                             MFR            212
## Balling                     Balling              1
## Pressure_Vacuum     Pressure_Vacuum              0
## PH                               PH              4
## Oxygen_Filler         Oxygen_Filler             12
## Bowl_Setpoint         Bowl_Setpoint              2
## Pressure_Setpoint Pressure_Setpoint             12
## Air_Pressure           Air_Pressure              0
## Alch_Rel                   Alch_Rel              9
## Carb_Rel                   Carb_Rel             10
## Balling_Lvl             Balling_Lvl              1
#Brand_Code is categorical, we can't impute with averages, we'll use the mode and fill in the 120 values 
table(training_data$Brand_Code)
## 
##    A    B    C    D 
##  293 1239  304  615
mode_brand_code <- training_data %>% 
  group_by(Brand_Code) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  slice(1) %>%
  pull(Brand_Code)

training_data_imputed <- training_data %>%
  mutate(Brand_Code= ifelse(is.na(training_data$Brand_Code), mode_brand_code, training_data$Brand_Code))

sum(is.na(training_data_imputed$Brand_Code))
## [1] 0
summary(training_data_imputed)
##   Brand_Code         Carb_Volume     Fill_Ounces      PC_Volume      
##  Length:2571        Min.   :5.040   Min.   :23.63   Min.   :0.07933  
##  Class :character   1st Qu.:5.293   1st Qu.:23.92   1st Qu.:0.23917  
##  Mode  :character   Median :5.347   Median :23.97   Median :0.27133  
##                     Mean   :5.370   Mean   :23.97   Mean   :0.27712  
##                     3rd Qu.:5.453   3rd Qu.:24.03   3rd Qu.:0.31200  
##                     Max.   :5.700   Max.   :24.32   Max.   :0.47800  
##                     NA's   :10      NA's   :38      NA's   :39       
##  Carb_Pressure     Carb_Temp          PSC             PSC_Fill     
##  Min.   :57.00   Min.   :128.6   Min.   :0.00200   Min.   :0.0000  
##  1st Qu.:65.60   1st Qu.:138.4   1st Qu.:0.04800   1st Qu.:0.1000  
##  Median :68.20   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.19   Mean   :141.1   Mean   :0.08457   Mean   :0.1954  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :79.40   Max.   :154.0   Max.   :0.27000   Max.   :0.6200  
##  NA's   :27      NA's   :26      NA's   :33        NA's   :23      
##     PSC_C02           Mnf_Flow       Carb_Pressure1  Fill_Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :105.6   Min.   :34.60  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:119.0   1st Qu.:46.00  
##  Median :0.04000   Median :  65.20   Median :123.2   Median :46.40  
##  Mean   :0.05641   Mean   :  24.57   Mean   :122.6   Mean   :47.92  
##  3rd Qu.:0.08000   3rd Qu.: 140.80   3rd Qu.:125.4   3rd Qu.:50.00  
##  Max.   :0.24000   Max.   : 229.40   Max.   :140.2   Max.   :60.40  
##  NA's   :39        NA's   :2         NA's   :32      NA's   :22     
##  Hyd_Pressure1   Hyd_Pressure2   Hyd_Pressure3   Hyd_Pressure4   
##  Min.   :-0.80   Min.   : 0.00   Min.   :-1.20   Min.   : 52.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 86.00  
##  Median :11.40   Median :28.60   Median :27.60   Median : 96.00  
##  Mean   :12.44   Mean   :20.96   Mean   :20.46   Mean   : 96.29  
##  3rd Qu.:20.20   3rd Qu.:34.60   3rd Qu.:33.40   3rd Qu.:102.00  
##  Max.   :58.00   Max.   :59.40   Max.   :50.00   Max.   :142.00  
##  NA's   :11      NA's   :15      NA's   :15      NA's   :30      
##   Filler_Level    Filler_Speed   Temperature      Usage_cont      Carb_Flow   
##  Min.   : 55.8   Min.   : 998   Min.   :63.60   Min.   :12.08   Min.   :  26  
##  1st Qu.: 98.3   1st Qu.:3888   1st Qu.:65.20   1st Qu.:18.36   1st Qu.:1144  
##  Median :118.4   Median :3982   Median :65.60   Median :21.79   Median :3028  
##  Mean   :109.3   Mean   :3687   Mean   :65.97   Mean   :20.99   Mean   :2468  
##  3rd Qu.:120.0   3rd Qu.:3998   3rd Qu.:66.40   3rd Qu.:23.75   3rd Qu.:3186  
##  Max.   :161.2   Max.   :4030   Max.   :76.20   Max.   :25.90   Max.   :5104  
##  NA's   :20      NA's   :57     NA's   :14      NA's   :5       NA's   :2     
##     Density           MFR           Balling       Pressure_Vacuum 
##  Min.   :0.240   Min.   : 31.4   Min.   :-0.170   Min.   :-6.600  
##  1st Qu.:0.900   1st Qu.:706.3   1st Qu.: 1.496   1st Qu.:-5.600  
##  Median :0.980   Median :724.0   Median : 1.648   Median :-5.400  
##  Mean   :1.174   Mean   :704.0   Mean   : 2.198   Mean   :-5.216  
##  3rd Qu.:1.620   3rd Qu.:731.0   3rd Qu.: 3.292   3rd Qu.:-5.000  
##  Max.   :1.920   Max.   :868.6   Max.   : 4.012   Max.   :-3.600  
##  NA's   :1       NA's   :212     NA's   :1                        
##        PH        Oxygen_Filler     Bowl_Setpoint   Pressure_Setpoint
##  Min.   :7.880   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  1st Qu.:8.440   1st Qu.:0.02200   1st Qu.:100.0   1st Qu.:46.00    
##  Median :8.540   Median :0.03340   Median :120.0   Median :46.00    
##  Mean   :8.546   Mean   :0.04684   Mean   :109.3   Mean   :47.62    
##  3rd Qu.:8.680   3rd Qu.:0.06000   3rd Qu.:120.0   3rd Qu.:50.00    
##  Max.   :9.360   Max.   :0.40000   Max.   :140.0   Max.   :52.00    
##  NA's   :4       NA's   :12        NA's   :2       NA's   :12       
##   Air_Pressure      Alch_Rel        Carb_Rel      Balling_Lvl  
##  Min.   :140.8   Min.   :5.280   Min.   :4.960   Min.   :0.00  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.340   1st Qu.:1.38  
##  Median :142.6   Median :6.560   Median :5.400   Median :1.48  
##  Mean   :142.8   Mean   :6.897   Mean   :5.437   Mean   :2.05  
##  3rd Qu.:143.0   3rd Qu.:7.240   3rd Qu.:5.540   3rd Qu.:3.14  
##  Max.   :148.2   Max.   :8.620   Max.   :6.060   Max.   :3.66  
##                  NA's   :9       NA's   :10      NA's   :1
#impute numerical values with mean of the feature
for (i in colnames(training_data_imputed)[sapply(training_data_imputed, is.numeric)]) {
  training_data_imputed[[i]] <- na_mean(training_data_imputed[[i]])
}

#check for any remaning missing values
any(is.na(training_data_imputed))
## [1] FALSE

Data Exploration

#distribution of PH
ggplot(training_data_imputed, aes(x = PH)) +
  geom_histogram() +
  labs(title = "pH distribution", x = "pH", y = "Freq")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#distribution of other numeric features
training_data_imputed %>%
  keep(is.numeric) %>%
  gather() %>%
  ggplot(aes(value)) + 
  geom_histogram(bins = 10) + 
  facet_wrap(~key, scales = "free") 

#correlations
correlation_matrix <- cor(training_data_imputed[, -1])

corrplot(correlation_matrix, method = "color", type = "upper", tl.col = "black", tl.srt = 45, tl.cex = 0.6)

Model Building

#training control to use in model building
train_control <- trainControl(method = "cv", number = 10)

#linear regression, random forest and support vector machine models
lm_mod <- train(PH ~ ., data = training_data_imputed, method = "lm", trControl = train_control)
rf_mod <- train(PH ~ ., data = training_data_imputed, method = "rf", trControl = train_control)
svm_mod <- train(PH ~ ., data = training_data_imputed, method = "svmRadial", trControl = train_control)

#evaluations
lm_results <- summary(lm_mod)
rf_results <- rf_mod$results[which.max(rf_mod$results$Rsquared), ]
svm_results <- svm_mod$results[which.max(svm_mod$results$Rsquared), ]

lm_results
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.51684 -0.07726  0.01138  0.08826  0.72663 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.066e+01  9.346e-01  11.408  < 2e-16 ***
## Brand_CodeB        6.352e-02  2.216e-02   2.866 0.004186 ** 
## Brand_CodeC       -7.033e-02  2.222e-02  -3.165 0.001572 ** 
## Brand_CodeD        5.453e-02  1.520e-02   3.587 0.000341 ***
## Carb_Volume       -5.979e-02  5.846e-02  -1.023 0.306497    
## Fill_Ounces       -8.393e-02  3.232e-02  -2.597 0.009464 ** 
## PC_Volume         -1.012e-01  5.354e-02  -1.890 0.058922 .  
## Carb_Pressure     -7.018e-04  2.085e-03  -0.337 0.736490    
## Carb_Temp          1.723e-03  1.670e-03   1.032 0.302177    
## PSC               -9.946e-02  5.714e-02  -1.741 0.081879 .  
## PSC_Fill          -2.948e-02  2.343e-02  -1.258 0.208523    
## PSC_C02           -1.218e-01  6.333e-02  -1.924 0.054526 .  
## Mnf_Flow          -7.210e-04  4.593e-05 -15.696  < 2e-16 ***
## Carb_Pressure1     6.359e-03  6.988e-04   9.100  < 2e-16 ***
## Fill_Pressure      1.992e-03  1.216e-03   1.638 0.101561    
## Hyd_Pressure1     -2.310e-06  3.662e-04  -0.006 0.994968    
## Hyd_Pressure2     -7.774e-04  5.321e-04  -1.461 0.144101    
## Hyd_Pressure3      3.025e-03  5.807e-04   5.208 2.06e-07 ***
## Hyd_Pressure4      2.611e-05  2.984e-04   0.087 0.930283    
## Filler_Level      -1.061e-03  5.180e-04  -2.049 0.040593 *  
## Filler_Speed      -1.422e-06  5.498e-06  -0.259 0.795864    
## Temperature       -1.702e-02  2.230e-03  -7.632 3.25e-14 ***
## Usage_cont        -6.671e-03  1.141e-03  -5.847 5.66e-09 ***
## Carb_Flow          1.095e-05  3.618e-06   3.027 0.002495 ** 
## Density           -1.304e-01  2.799e-02  -4.660 3.33e-06 ***
## MFR                1.304e-05  4.634e-05   0.281 0.778356    
## Balling           -7.214e-02  2.265e-02  -3.185 0.001465 ** 
## Pressure_Vacuum   -1.671e-02  7.415e-03  -2.253 0.024328 *  
## Oxygen_Filler     -2.759e-01  6.818e-02  -4.046 5.36e-05 ***
## Bowl_Setpoint      3.071e-03  5.419e-04   5.666 1.62e-08 ***
## Pressure_Setpoint -8.224e-03  1.953e-03  -4.212 2.62e-05 ***
## Air_Pressure      -1.676e-03  2.370e-03  -0.707 0.479476    
## Alch_Rel           6.350e-02  2.083e-02   3.049 0.002322 ** 
## Carb_Rel           6.564e-02  4.631e-02   1.417 0.156540    
## Balling_Lvl        1.037e-01  2.081e-02   4.985 6.61e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.133 on 2536 degrees of freedom
## Multiple R-squared:  0.4122, Adjusted R-squared:  0.4043 
## F-statistic:  52.3 on 34 and 2536 DF,  p-value: < 2.2e-16
rf_results
##   mtry       RMSE  Rsquared        MAE      RMSESD RsquaredSD       MAESD
## 2   18 0.09334958 0.7219972 0.06759072 0.007549756 0.03511768 0.004079897
svm_results
##        sigma C      RMSE  Rsquared        MAE      RMSESD RsquaredSD
## 3 0.02045062 1 0.1183709 0.5330265 0.08671873 0.007912032 0.04676892
##         MAESD
## 3 0.006055283

Model Selection and predictions

#look at most important variables in the random forest model for predicting PH
rf_imp <- varImp(rf_mod, scale = FALSE)
plot(rf_imp)

#fix the evaluation data column names as we did with the training data
colnames(testing_data) <- c("Brand_Code", "Carb_Volume", "Fill_Ounces", "PC_Volume", "Carb_Pressure", "Carb_Temp", "PSC", "PSC_Fill", "PSC_C02", "Mnf_Flow", "Carb_Pressure1", "Fill_Pressure", "Hyd_Pressure1", "Hyd_Pressure2", "Hyd_Pressure3", "Hyd_Pressure4", "Filler_Level", "Filler_Speed", "Temperature", "Usage_cont", "Carb_Flow","Density", "MFR", "Balling", "Pressure_Vacuum", "PH","Oxygen_Filler", "Bowl_Setpoint", "Pressure_Setpoint","Air_Pressure", "Alch_Rel", "Carb_Rel", "Balling_Lvl")

#check for NAs
summary(testing_data)
##   Brand_Code         Carb_Volume     Fill_Ounces      PC_Volume      
##  Length:267         Min.   :5.147   Min.   :23.75   Min.   :0.09867  
##  Class :character   1st Qu.:5.287   1st Qu.:23.92   1st Qu.:0.23333  
##  Mode  :character   Median :5.340   Median :23.97   Median :0.27533  
##                     Mean   :5.369   Mean   :23.97   Mean   :0.27769  
##                     3rd Qu.:5.465   3rd Qu.:24.01   3rd Qu.:0.32200  
##                     Max.   :5.667   Max.   :24.20   Max.   :0.46400  
##                     NA's   :1       NA's   :6       NA's   :4        
##  Carb_Pressure     Carb_Temp          PSC             PSC_Fill     
##  Min.   :60.20   Min.   :130.0   Min.   :0.00400   Min.   :0.0200  
##  1st Qu.:65.30   1st Qu.:138.4   1st Qu.:0.04450   1st Qu.:0.1000  
##  Median :68.00   Median :140.8   Median :0.07600   Median :0.1800  
##  Mean   :68.25   Mean   :141.2   Mean   :0.08545   Mean   :0.1903  
##  3rd Qu.:70.60   3rd Qu.:143.8   3rd Qu.:0.11200   3rd Qu.:0.2600  
##  Max.   :77.60   Max.   :154.0   Max.   :0.24600   Max.   :0.6200  
##                  NA's   :1       NA's   :5         NA's   :3       
##     PSC_C02           Mnf_Flow       Carb_Pressure1  Fill_Pressure  
##  Min.   :0.00000   Min.   :-100.20   Min.   :113.0   Min.   :37.80  
##  1st Qu.:0.02000   1st Qu.:-100.00   1st Qu.:120.2   1st Qu.:46.00  
##  Median :0.04000   Median :   0.20   Median :123.4   Median :47.80  
##  Mean   :0.05107   Mean   :  21.03   Mean   :123.0   Mean   :48.14  
##  3rd Qu.:0.06000   3rd Qu.: 141.30   3rd Qu.:125.5   3rd Qu.:50.20  
##  Max.   :0.24000   Max.   : 220.40   Max.   :136.0   Max.   :60.20  
##  NA's   :5                           NA's   :4       NA's   :2      
##  Hyd_Pressure1    Hyd_Pressure2    Hyd_Pressure3    Hyd_Pressure4   
##  Min.   :-50.00   Min.   :-50.00   Min.   :-50.00   Min.   : 68.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.: 90.00  
##  Median : 10.40   Median : 26.80   Median : 27.70   Median : 98.00  
##  Mean   : 12.01   Mean   : 20.11   Mean   : 19.61   Mean   : 97.84  
##  3rd Qu.: 20.40   3rd Qu.: 34.80   3rd Qu.: 33.00   3rd Qu.:104.00  
##  Max.   : 50.00   Max.   : 61.40   Max.   : 49.20   Max.   :140.00  
##                   NA's   :1        NA's   :1        NA's   :4       
##   Filler_Level    Filler_Speed   Temperature      Usage_cont      Carb_Flow   
##  Min.   : 69.2   Min.   :1006   Min.   :63.80   Min.   :12.90   Min.   :   0  
##  1st Qu.:100.6   1st Qu.:3812   1st Qu.:65.40   1st Qu.:18.12   1st Qu.:1083  
##  Median :118.6   Median :3978   Median :65.80   Median :21.44   Median :3038  
##  Mean   :110.3   Mean   :3581   Mean   :66.23   Mean   :20.90   Mean   :2409  
##  3rd Qu.:120.2   3rd Qu.:3996   3rd Qu.:66.60   3rd Qu.:23.74   3rd Qu.:3215  
##  Max.   :153.2   Max.   :4020   Max.   :75.40   Max.   :24.60   Max.   :3858  
##  NA's   :2       NA's   :10     NA's   :2       NA's   :2                     
##     Density           MFR           Balling      Pressure_Vacuum 
##  Min.   :0.060   Min.   : 15.6   Min.   :0.902   Min.   :-6.400  
##  1st Qu.:0.920   1st Qu.:707.0   1st Qu.:1.498   1st Qu.:-5.600  
##  Median :0.980   Median :724.6   Median :1.648   Median :-5.200  
##  Mean   :1.177   Mean   :697.8   Mean   :2.203   Mean   :-5.174  
##  3rd Qu.:1.600   3rd Qu.:731.5   3rd Qu.:3.242   3rd Qu.:-4.800  
##  Max.   :1.840   Max.   :784.8   Max.   :3.788   Max.   :-3.600  
##  NA's   :1       NA's   :31      NA's   :1       NA's   :1       
##     PH          Oxygen_Filler     Bowl_Setpoint   Pressure_Setpoint
##  Mode:logical   Min.   :0.00240   Min.   : 70.0   Min.   :44.00    
##  NA's:267       1st Qu.:0.01960   1st Qu.:100.0   1st Qu.:46.00    
##                 Median :0.03370   Median :120.0   Median :46.00    
##                 Mean   :0.04666   Mean   :109.6   Mean   :47.73    
##                 3rd Qu.:0.05440   3rd Qu.:120.0   3rd Qu.:50.00    
##                 Max.   :0.39800   Max.   :130.0   Max.   :52.00    
##                 NA's   :3         NA's   :1       NA's   :2        
##   Air_Pressure      Alch_Rel        Carb_Rel     Balling_Lvl   
##  Min.   :141.2   Min.   :6.400   Min.   :5.18   Min.   :0.000  
##  1st Qu.:142.2   1st Qu.:6.540   1st Qu.:5.34   1st Qu.:1.380  
##  Median :142.6   Median :6.580   Median :5.40   Median :1.480  
##  Mean   :142.8   Mean   :6.907   Mean   :5.44   Mean   :2.051  
##  3rd Qu.:142.8   3rd Qu.:7.180   3rd Qu.:5.56   3rd Qu.:3.080  
##  Max.   :147.2   Max.   :7.820   Max.   :5.74   Max.   :3.420  
##  NA's   :1       NA's   :3       NA's   :2
#impute with mean as we did for training data
testing_data_imputed <- testing_data
for (i in colnames(testing_data_imputed)[sapply(testing_data_imputed, is.numeric)]) {
  testing_data_imputed[[i]] <- na_mean(testing_data_imputed[[i]])
}

#predict PH using the randomforest model
predictions <- predict(rf_mod, newdata = testing_data_imputed)
hist(predictions, xlab = "Predicted pH")

mean_training_data_ph <- mean(training_data_imputed$PH) #8.5456
mean_predictions_ph <- mean(predictions) #8.5498

#use rf model to on the training_data_imputed data to get it to 259 rows to plot for comparison
training_predictions <- predict(rf_mod, newdata = training_data_imputed)

#residuals and plot
residuals <- training_data_imputed$PH - training_predictions

plot(training_predictions, residuals, xlab = "rf predicted pH", ylab = "residuals") + abline(h = 0, col = "red")

## integer(0)
#create a data frame with actual PH values and predicted PH values to plot against each other as another way to visualize model performance and plot against each other
predictions_df <- data.frame(Actual = training_data_imputed$PH, Predicted = training_predictions)

plot(predictions_df$Actual, predictions_df$Predicted, xlab = "Actual PH", ylab = "Predicted pH")+abline(0, 1, col = "red")

## integer(0)
#export predictions
#write_xlsx(data.frame(Predicted_PH = pH_predictions), "BrianSingh_PH_Predictions.xlsx")