&^& Add transformations %$% Run models

LOAD PACKAGES

library(e1071)
library(dplyr)
library(tidyr)
library(ggplot2)
library(VIM)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2

LOAD DATA

Loading data from GitHub:

# open file
path <- ("https://raw.githubusercontent.com/kennygfm/fall624-group3/master/StudentData.csv")
con <- file(path, open="r")

# "Student" soft drink data
soda <- read.csv(con, header=T, sep=",", stringsAsFactors = F)

# close file
close(con)

soda[ , c(16,18,21,28)] <- sapply(soda[, c(16,18,21,28)], as.numeric)  # get rid of pesky integer values

Dataset is 32 predictors + 1 target variable (Brand.Code), with 2571 observations:

dim(soda) 
## [1] 2571   33

With the exception of the Brand.Code, all variables are either numeric or integers:

str(soda)
## 'data.frame':    2571 obs. of  33 variables:
##  $ Brand.Code       : chr  "B" "A" "B" "A" ...
##  $ Carb.Volume      : num  5.34 5.43 5.29 5.44 5.49 ...
##  $ Fill.Ounces      : num  24 24 24.1 24 24.3 ...
##  $ PC.Volume        : num  0.263 0.239 0.263 0.293 0.111 ...
##  $ Carb.Pressure    : num  68.2 68.4 70.8 63 67.2 66.6 64.2 67.6 64.2 72 ...
##  $ Carb.Temp        : num  141 140 145 133 137 ...
##  $ PSC              : num  0.104 0.124 0.09 NA 0.026 0.09 0.128 0.154 0.132 0.014 ...
##  $ PSC.Fill         : num  0.26 0.22 0.34 0.42 0.16 0.24 0.4 0.34 0.12 0.24 ...
##  $ PSC.CO2          : num  0.04 0.04 0.16 0.04 0.12 0.04 0.04 0.04 0.14 0.06 ...
##  $ Mnf.Flow         : num  -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 ...
##  $ Carb.Pressure1   : num  119 122 120 115 118 ...
##  $ Fill.Pressure    : num  46 46 46 46.4 45.8 45.6 51.8 46.8 46 45.2 ...
##  $ Hyd.Pressure1    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Hyd.Pressure2    : num  NA NA NA 0 0 0 0 0 0 0 ...
##  $ Hyd.Pressure3    : num  NA NA NA 0 0 0 0 0 0 0 ...
##  $ Hyd.Pressure4    : num  118 106 82 92 92 116 124 132 90 108 ...
##  $ Filler.Level     : num  121 119 120 118 119 ...
##  $ Filler.Speed     : num  4002 3986 4020 4012 4010 ...
##  $ Temperature      : num  66 67.6 67 65.6 65.6 66.2 65.8 65.2 65.4 66.6 ...
##  $ Usage.cont       : num  16.2 19.9 17.8 17.4 17.7 ...
##  $ Carb.Flow        : num  2932 3144 2914 3062 3054 ...
##  $ Density          : num  0.88 0.92 1.58 1.54 1.54 1.52 0.84 0.84 0.9 0.9 ...
##  $ MFR              : num  725 727 735 731 723 ...
##  $ Balling          : num  1.4 1.5 3.14 3.04 3.04 ...
##  $ Pressure.Vacuum  : num  -4 -4 -3.8 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 ...
##  $ PH               : num  8.36 8.26 8.94 8.24 8.26 8.32 8.4 8.38 8.38 8.5 ...
##  $ Oxygen.Filler    : num  0.022 0.026 0.024 0.03 0.03 0.024 0.066 0.046 0.064 0.022 ...
##  $ Bowl.Setpoint    : num  120 120 120 120 120 120 120 120 120 120 ...
##  $ Pressure.Setpoint: num  46.4 46.8 46.6 46 46 46 46 46 46 46 ...
##  $ Air.Pressurer    : num  143 143 142 146 146 ...
##  $ Alch.Rel         : num  6.58 6.56 7.66 7.14 7.14 7.16 6.54 6.52 6.52 6.54 ...
##  $ Carb.Rel         : num  5.32 5.3 5.84 5.42 5.44 5.44 5.38 5.34 5.34 5.34 ...
##  $ Balling.Lvl      : num  1.48 1.56 3.28 3.04 3.04 3.02 1.44 1.44 1.44 1.38 ...

EXPLORE DATA

Summary Table:

There are too many predictors for the standard summary function to produce an easy-to-read output. A more useful summary with skewness, correlations to PH, and NA counts is below. The variable Brand.Code, because it is a character vector, is left out:

means <- sapply(soda[-1], function(y) mean(y, na.rm = TRUE))
medians <- sapply(soda[-1], function(y) median(y, na.rm = TRUE))
IQRs <- sapply(soda[-1], function(y) IQR(y, na.rm = TRUE))
vars <- sapply(soda[-1], function(y) var(y, na.rm = TRUE))
skews <- sapply(soda[-1], function(y) skewness(as.numeric(y), na.rm = TRUE))
cors <- as.vector(cor(soda$PH, soda[,2:ncol(soda)], use = "complete.obs"))
NAs <- sapply(soda[-1], function(y) sum(length(which(is.na(y)))))

soda_summary <- data.frame(means, medians, IQRs, vars, skews, cors, NAs)
colnames(soda_summary) <- c("MEAN", "MEDIAN", "IQR", "Var", "SKEW", "$r_{PH}$", "NAs")
soda_summary <- round(soda_summary, 2)

soda_summary
##                      MEAN  MEDIAN     IQR        Var  SKEW $r_{PH}$ NAs
## Carb.Volume          5.37    5.35    0.16       0.01  0.39     0.07  10
## Fill.Ounces         23.97   23.97    0.11       0.01 -0.02    -0.12  38
## PC.Volume            0.28    0.27    0.07       0.00  0.34     0.10  39
## Carb.Pressure       68.19   68.20    5.00      12.52  0.18     0.08  27
## Carb.Temp          141.09  140.80    5.40      16.30  0.25     0.03  26
## PSC                  0.08    0.08    0.06       0.00  0.85    -0.07  33
## PSC.Fill             0.20    0.18    0.16       0.01  0.93    -0.02  23
## PSC.CO2              0.06    0.04    0.06       0.00  1.73    -0.09  39
## Mnf.Flow            24.57   65.20  240.80   14275.74  0.00    -0.46   2
## Carb.Pressure1     122.59  123.20    6.40      22.49  0.05    -0.12  32
## Fill.Pressure       47.92   46.40    4.00      10.10  0.55    -0.32  22
## Hyd.Pressure1       12.44   11.40   20.20     154.59  0.78    -0.05  11
## Hyd.Pressure2       20.96   28.60   34.60     268.51 -0.30    -0.22  15
## Hyd.Pressure3       20.46   27.60   33.40     255.22 -0.32    -0.27  15
## Hyd.Pressure4       96.29   96.00   16.00     172.20  0.55    -0.17  30
## Filler.Level       109.25  118.40   21.70     246.44 -0.85     0.35  20
## Filler.Speed      3687.20 3982.00  110.00  594163.50 -2.87    -0.04  57
## Temperature         65.97   65.60    1.20       1.91  2.39    -0.18  14
## Usage.cont          20.99   21.79    5.39       8.87 -0.54    -0.36   5
## Carb.Flow         2468.35 3028.00 2042.00 1152824.12 -0.99     0.23   2
## Density              1.17    0.98    0.72       0.14  0.53     0.10   1
## MFR                704.05  724.00   24.70    5460.96 -5.09    -0.05 212
## Balling              2.20    1.65    1.80       0.87  0.59     0.08   1
## Pressure.Vacuum     -5.22   -5.40    0.60       0.32  0.53     0.22   0
## PH                   8.55    8.54    0.24       0.03 -0.29     1.00   4
## Oxygen.Filler        0.05    0.03    0.04       0.00  2.66     0.16  12
## Bowl.Setpoint      109.33  120.00   20.00     234.19 -0.97     0.36   2
## Pressure.Setpoint   47.62   46.00    4.00       4.16  0.20    -0.31  12
## Air.Pressurer      142.83  142.60    0.80       1.47  2.25    -0.01   0
## Alch.Rel             6.90    6.56    0.70       0.26  0.88     0.17   9
## Carb.Rel             5.44    5.40    0.20       0.02  0.50     0.20  10
## Balling.Lvl          2.05    1.48    1.76       0.76  0.59     0.11   1

The table above shows some useful information regarding the mean and median for each predictor, and comparing the differences between the two, skewness of some variables is already apparent. Using the skewness function from e1071, the predictors MFR, Filler.Speed, Carb.Flow, and Bowl.Setpoint are shown to be the most negatively skewed. On the other end of the spectrum, Oxygen.Filler, Temperature, Air.Pressurer, PSC.Fill, and PSC.CO2 are all positively skewed. Many of these predictors have to do with gasses and air pressure, and we may want to consider scaling and centering these and other predictors before fitting any models.

Boxplots of these predictors are provided below:

Boxplots:

skewed_preds <- soda[, c(8,9,18,19,21:23,27,28,30)]

s <- gather(skewed_preds, "predictor", "value")

ggplot(s, aes(predictor, value)) + geom_boxplot(aes(fill = predictor)) + facet_wrap(~predictor, scale="free") + scale_fill_discrete(guide=FALSE) + scale_y_continuous('', labels = NULL, breaks = NULL) +
  scale_x_discrete('') + ggtitle("Most Skewed Predictors")
## Warning: Removed 362 rows containing non-finite values (stat_boxplot).

Histograms

Aside from the skewed predictors, histograms of all 32 numeric predictors are created in order to get a quick overview of the distributions, as well as to spot any odd patterns:

par(mfrow=c(4,4))

for (i in c(2:17)){
  hist(soda[ ,i], main = paste(names(soda[i]), "Distribution"), 
       xlab = names(soda)[i], cex.main = 0.9, cex.axis = 0.7,
       col=rgb(64, 224, 208, max = 255, alpha = 70, names = "grays"), pch=19)
}

par(mfrow=c(1,1))

Many of the first 16 predictors (as ordered in the dataset) have normal or somewhat-normal distributions, and appear to be continuous variables. A few of the skewed distributions follow an almost chi-squared or log-normal distribution, so these are predictors where transformations should be considered (PSC.Distribution and PSC.C02.Distribution).

Another insteresting pattern are the number of 0 values in the Hyd.Pressure(1-3) variables. The \(4^{th}\) Hyd.Pressure does not follow this same pattern, so depending on the relationship between these variables and the target and/or other variables, they may be removed outright.

The next set of histograms for variables 17-32 are below:

par(mfrow=c(4,4))

for (i in c(18:ncol(soda))){
  hist(soda[ ,i], main = paste(names(soda[i]), "Distribution"), 
       xlab = names(soda)[i], cex.main = 0.9, cex.axis = 0.7,
       col=rgb(64, 224, 208, max = 255, alpha = 70, names = "grays"), pch=19)
}

par(mfrow=c(1,1))

These next predictors follow some different patterns. A few are normal or near-normal distributions (Pressure.Vacuum and PH, the target), and some are highly skewed (Filler.Speed, Oxygen.Filler). Another set of predictors appear to not be continuous, but discrete distributions (Pressure.Setpoint, Alch.Rel), however, as we’ll see in later plots, many of these variables are just constrained to a few values, but are still continuous.

Out of all the variables, Bowl.Setpoint does seem to be a discrete distribution:

table(soda$Bowl.Setpoint)
## 
##   70   80   90  100  110  120  122  126  130  134  140 
##   99   96  434  112  437 1307    1   10   51    2   20
hist(soda$Bowl.Setpoint, main="Bowl.Setpoint", col='grey', xlab='Value')

Correlated Predictors:

Aside from skewed variables, several variables have high correlations with each other:

cors_all_preds <- round(cor(soda[-1], use="complete.obs"), 2)

cors_all_preds_df <- as.data.frame(cors_all_preds)

corrplot(as.matrix(cors_all_preds_df), method="color", tl.cex=.5, tl.col=colors()[598])

As the columns are organized in the data, some interesting patterns are present in the correlogram. Two areas show distinct positive correlations - these are the predictors that have something to do with carbonation, and another area where different pressure levels correlate with each other. Another set of variables are negatively correlated with these pressure predictors, these have to do with the filling of the bottles, so this makes sense (Oxygen.Filler, Bowl.Setpoint, Pressure.Setpoint).

Some of these same precictors are also correlated well with the target PH variable:

vars <- rownames(cors_all_preds)

top_ph_cors <- cors_all_preds_df$PH

top_ph_cors <- as.data.frame(cbind(vars, as.numeric(as.character(top_ph_cors))))

top_ph_cors$V2 <- as.numeric(as.character(top_ph_cors$V2))

top_ph_cors_neg <- top_ph_cors %>%
  arrange(V2)

top_ph_cors_pos <- top_ph_cors %>%
  arrange(desc(V2))

Below are the top variables that are positively correlated with the target PH variable:

top_ph_cors_pos[2:11, ]
##               vars   V2
## 2    Bowl.Setpoint 0.36
## 3     Filler.Level 0.35
## 4        Carb.Flow 0.23
## 5  Pressure.Vacuum 0.22
## 6         Carb.Rel 0.20
## 7         Alch.Rel 0.17
## 8    Oxygen.Filler 0.16
## 9      Balling.Lvl 0.11
## 10       PC.Volume 0.10
## 11         Density 0.10

Without transforming predictors, many of the highly correlated predictors to PH have to do with the bottle filling process (Fillers, Flow, Vacuums, etc.). In addition, the predictors that have the highest negative correlation to PH are below:

top_ph_cors_neg[1:11, ]
##                 vars    V2
## 1           Mnf.Flow -0.46
## 2         Usage.cont -0.36
## 3      Fill.Pressure -0.32
## 4  Pressure.Setpoint -0.31
## 5      Hyd.Pressure3 -0.27
## 6      Hyd.Pressure2 -0.22
## 7        Temperature -0.18
## 8      Hyd.Pressure4 -0.17
## 9        Fill.Ounces -0.12
## 10    Carb.Pressure1 -0.12
## 11           PSC.CO2 -0.09

Again, the same pattern as the positively correlated predictors is apparent. It may be worth constructing some models only using these predictors, as using all 32 variables may prove to introduce a lot of noise into the model.

Intercorrelated Predictors

In addition, some of the variables are highly correlated with each other. The following pairings seem to have something to do with each other in the bottle filling process:

cors_0_diag <- cors_all_preds  # create duplicate matrix

cors_0_diag[lower.tri(cors_0_diag, diag = TRUE)] <- NA  # prevent duplicates by taking upper triangle of matrix

cors_0_diag_df <- as.data.frame(cors_0_diag)


inter_cor <- reshape2::melt(as.matrix(cors_0_diag_df))  # melt into single col of correlations

colnames(inter_cor)[3] <- "Correlation" 

neg_cors <- inter_cor %>%
  arrange(Correlation)

pos_cors <- inter_cor %>%
  arrange(desc(Correlation))

The following predictors have high negative correlations with each other

neg_cors[1:15, ]
##             Var1            Var2 Correlation
## 1  Hyd.Pressure3 Pressure.Vacuum       -0.67
## 2  Hyd.Pressure4        Alch.Rel       -0.64
## 3  Hyd.Pressure2 Pressure.Vacuum       -0.63
## 4       Mnf.Flow    Filler.Level       -0.60
## 5       Mnf.Flow   Bowl.Setpoint       -0.60
## 6  Hyd.Pressure4         Density       -0.56
## 7  Hyd.Pressure4         Balling       -0.56
## 8       Mnf.Flow Pressure.Vacuum       -0.55
## 9       Mnf.Flow   Oxygen.Filler       -0.55
## 10 Hyd.Pressure4        Carb.Rel       -0.54
## 11 Hyd.Pressure4     Balling.Lvl       -0.54
## 12   Carb.Volume   Hyd.Pressure4       -0.53
## 13 Hyd.Pressure3    Filler.Level       -0.52
## 14 Hyd.Pressure3   Bowl.Setpoint       -0.52
## 15      Mnf.Flow       Carb.Flow       -0.49

Conversely, the following predictors are postively correlated with each other. On this end of the spectrum, many of the predictors are almost perfectly correlated with one another. We may want to consider removing some of these redundant variables, perhaps the ones that are less correlated to the target variable:

pos_cors[1:15, ]
##             Var1          Var2 Correlation
## 1        Balling   Balling.Lvl        0.99
## 2   Filler.Level Bowl.Setpoint        0.98
## 3        Density   Balling.Lvl        0.96
## 4   Filler.Speed           MFR        0.95
## 5        Density       Balling        0.95
## 6        Balling      Alch.Rel        0.94
## 7       Alch.Rel   Balling.Lvl        0.94
## 8  Hyd.Pressure2 Hyd.Pressure3        0.92
## 9        Density      Alch.Rel        0.92
## 10      Alch.Rel      Carb.Rel        0.88
## 11      Carb.Rel   Balling.Lvl        0.87
## 12       Density      Carb.Rel        0.85
## 13       Balling      Carb.Rel        0.85
## 14   Carb.Volume      Carb.Rel        0.84
## 15 Carb.Pressure     Carb.Temp        0.83

Bi-Variate Exploration

Scatterplots Between PH and Other Predictors

Before moving on to modeling, we’ll look further into the relationship between the PH target variable and the other predictors. The first predictors we’ll examine against PH are the most highly correlated (\(+\) or \(-\)):

par(mfrow=c(2,3))
for (i in c(10,12,17,20,28,29)){
  plot(soda[ ,i], soda$PH, main = paste(names(soda[i]), "vs. PH"), 
       xlab = names(soda)[i], ylab = "Ph", cex.main = 0.9, cex.lab = 0.7,
       col=rgb(112, 128, 144, max = 255, alpha = 70, names = "grays"), pch=19)
}

par(mfrow=c(1,1))
pairs(soda[ ,c(10,12,17,20,28,29)], main='Variables Highly Correlated with Target', 
      col=rgb(0, 0, 255, max = 255, alpha = 95, names = "blue"), pch=16)

The remaining predictors are plotted against the PH variable to see if any relevant patterns emerge:

par(mfrow=c(2,4))

for (i in c(2:9)){
  plot(soda[ ,i], soda$PH, main = paste(names(soda[i]), "vs. PH"), 
       xlab = names(soda)[i], ylab = "Ph",  cex.main = 0.9, cex.lab = 0.7,
       col=rgb(255, 99, 71, max = 255, alpha = 70, names = "reds"), pch=19)
}

par(mfrow=c(1,1))
par(mfrow=c(2,4))

# c(10,12,17,20,28,29) already used

for (i in c(11,13:16,18,19,21)){
  plot(soda[ ,i], soda$PH, main = paste(names(soda[i]), "vs. PH"), 
       xlab = names(soda)[i], ylab = "Ph",  cex.main = 0.9, cex.lab = 0.7,
       col=rgb(64, 224, 208, max = 255, alpha = 70, names = "grays"), pch=19)
}

par(mfrow=c(1,1))
par(mfrow=c(2,4))

for (i in c(22:25, 27, 30, 31, 32)){
  plot(soda[ ,i], soda$PH, main = paste(names(soda[i]), "vs. PH"), 
       xlab = names(soda)[i], ylab = "Ph", cex.main = 0.9, cex.lab = 0.7,
       col=rgb(221, 160, 221, max = 255, alpha = 70, names = "reds"), pch=19)
}

par(mfrow=c(1,1))

While no strong patters emerge, there is some directionality in the scatterplots for a few predictors, such as Pressure.Vacuum, Bowl.Setpoint, Air.Pressurer and Alch.Rel. While only one of these variables is from the set highly correlated with the predictor, all of these should probably be considered for any model.

Missing Values:

One of the predictors (MFR) contains a considerable amount of missing values (~\(8 \%\) of the cases). This variable is also highly skewed, so imputing using only the median/mean should be done with care, or other methods investigated:

sort(NAs, decreasing = TRUE)
##               MFR      Filler.Speed         PC.Volume           PSC.CO2 
##               212                57                39                39 
##       Fill.Ounces               PSC    Carb.Pressure1     Hyd.Pressure4 
##                38                33                32                30 
##     Carb.Pressure         Carb.Temp          PSC.Fill     Fill.Pressure 
##                27                26                23                22 
##      Filler.Level     Hyd.Pressure2     Hyd.Pressure3       Temperature 
##                20                15                15                14 
##     Oxygen.Filler Pressure.Setpoint     Hyd.Pressure1       Carb.Volume 
##                12                12                11                10 
##          Carb.Rel          Alch.Rel        Usage.cont                PH 
##                10                 9                 5                 4 
##          Mnf.Flow         Carb.Flow     Bowl.Setpoint           Density 
##                 2                 2                 2                 1 
##           Balling       Balling.Lvl   Pressure.Vacuum     Air.Pressurer 
##                 1                 1                 0                 0
hist(soda$MFR, col='Blue', main='Distribution of MFR Predictor')

Most of the other predictors are missing values for a marginal number of cases (\(< 3\%\)), but a few cases are missing data across a number of the predictors. Instead of removing entire variables or imputing them all, we may want to remove some of the cases from the data instead. The plot below shows the ratio and location of the missing values within the dataset:

aggr(soda, col=c('navyblue', 'red'), numbers=TRUE, cex.numbers=.75, cex.axis=0.5)

Counts of the different brand codes are below:

table(soda$Brand.Code)
## 
##         A    B    C    D 
##  120  293 1239  304  615

Most of the cases (nearly half) are brand “B”; the next most common is brand “D”, followed by “C”, and “A” closely behind that. 120 (\(> 5 \%\)) of the cases in the soda dataset contain no brand name. Whether these are all the same brand will require some exploration.

brand_unknown <- soda[soda$Brand.Code == '', ]

The same method used to create the overall summary for all brands is used for the separated “unknown” or blank brand name. The minimum and maximum have been added to show the range in addition to the IQR for each predictor within this subset:

means <- sapply(brand_unknown[-1], function(y) mean(y, na.rm = TRUE))
medians <- sapply(brand_unknown[-1], function(y) median(y, na.rm = TRUE))
IQRs <- sapply(brand_unknown[-1], function(y) IQR(y, na.rm = TRUE))
skews <- sapply(brand_unknown[-1], function(y) skewness(as.numeric(y), na.rm = TRUE))
NAs <- sapply(brand_unknown[-1], function(y) sum(length(which(is.na(y)))))

brand_unknown_summary <- data.frame(means, medians, IQRs, skews, NAs)
colnames(brand_unknown_summary) <- c("MEAN", "MEDIAN", "IQR", "SKEW", "NAs")
brand_unknown_summary <- round(brand_unknown_summary, 2)

brand_unknown_summary
##                      MEAN  MEDIAN    IQR  SKEW NAs
## Carb.Volume          5.29    5.29   0.11  0.38   0
## Fill.Ounces         24.00   23.99   0.12  0.37   4
## PC.Volume            0.29    0.28   0.11 -0.02   4
## Carb.Pressure       66.26   66.00   4.30  0.60   1
## Carb.Temp          140.52  140.20   6.00  0.43   3
## PSC                  0.09    0.08   0.07  0.77   3
## PSC.Fill             0.19    0.18   0.11  0.78   4
## PSC.CO2              0.06    0.04   0.04  1.79   2
## Mnf.Flow            27.15   71.00 235.00 -0.04   0
## Carb.Pressure1     123.46  123.40   5.70  0.54   1
## Fill.Pressure       48.78   49.60   4.20  0.42   3
## Hyd.Pressure1       10.89    8.80  17.50  1.06   0
## Hyd.Pressure2       18.44   22.80  32.70 -0.16   0
## Hyd.Pressure3       19.79   27.00  33.60 -0.30   0
## Hyd.Pressure4       95.98   86.00  30.50  0.70   4
## Filler.Level       112.15  119.10  20.00 -0.49   2
## Filler.Speed      3517.83 3908.00 392.00 -2.11   3
## Temperature         67.23   67.00   1.60  1.90   1
## Usage.cont          20.73   21.51   5.96 -0.47   0
## Carb.Flow         2557.43 3027.00 897.50 -1.22   0
## Density              0.97    0.98   0.08 -1.30   0
## MFR                685.10  711.20  53.00 -3.71  13
## Balling              1.67    1.70   0.21  1.23   0
## Pressure.Vacuum     -5.33   -5.40   0.80  0.36   0
## PH                   8.49    8.51   0.26  0.09   0
## Oxygen.Filler        0.05    0.03   0.05  1.28   1
## Bowl.Setpoint      112.17  120.00  20.00 -0.41   0
## Pressure.Setpoint   47.80   48.00   4.00 -0.08   1
## Air.Pressurer      142.67  142.60   0.60  0.03   0
## Alch.Rel             6.64    6.56   0.04  3.47   1
## Carb.Rel             5.34    5.34   0.12  0.31   2
## Balling.Lvl          1.47    1.46   0.18  3.58   1

Examining the results, there is not too much variance in the predictors that actually rely on the makeup (chemical or otherwise) of the soda itself. These would be predictors like Fill.Ounces, PH, and Carb.Volume. The predictors that have more variance have to do with temperature and pressure, and given the changes in these across all predictors, it is probably safe to assume that the unknown brand is just un-named.

Just to make the case, we’ll look at the same stats for another brand (“D”), just to see if the same predictors stay consistent, or if we should expect more variance:

brand_d <- soda[soda$Brand.Code == 'D', ]
mins <- sapply(brand_d[-1], function(y) min(y, na.rm = TRUE))
means <- sapply(brand_d[-1], function(y) mean(y, na.rm = TRUE))
medians <- sapply(brand_d[-1], function(y) median(y, na.rm = TRUE))
IQRs <- sapply(brand_d[-1], function(y) IQR(y, na.rm = TRUE))
maxs <- sapply(brand_d[-1], function(y) max(y, na.rm = TRUE))
skews <- sapply(brand_d[-1], function(y) skewness(as.numeric(y), na.rm = TRUE))
cors <- as.vector(cor(brand_d$PH, brand_d[,2:ncol(brand_d)], use = "complete.obs"))
NAs <- sapply(brand_d[-1], function(y) sum(length(which(is.na(y)))))

brand_d_summary <- data.frame(mins, means, medians, IQRs, maxs, skews, cors, NAs)
colnames(brand_d_summary) <- c("MIN", "MEAN", "MEDIAN", "IQR", "MAX", "SKEW", "$r_{PH}$", "NAs")
brand_d_summary <- round(brand_d_summary, 2)

brand_d_summary
##                       MIN    MEAN  MEDIAN     IQR     MAX  SKEW $r_{PH}$
## Carb.Volume          5.26    5.51    5.51    0.07    5.70 -0.33    -0.05
## Fill.Ounces         23.69   23.96   23.96    0.10   24.32  0.02    -0.03
## PC.Volume            0.09    0.26    0.25    0.07    0.46  0.69     0.09
## Carb.Pressure       60.40   70.70   70.60    3.70   79.40  0.13     0.01
## Carb.Temp          129.20  141.13  140.80    4.60  153.80  0.27     0.02
## PSC                  0.00    0.08    0.07    0.06    0.26  0.87    -0.08
## PSC.Fill             0.00    0.19    0.18    0.16    0.60  0.94     0.01
## PSC.CO2              0.00    0.05    0.04    0.04    0.24  1.62    -0.06
## Mnf.Flow          -100.20   25.75   84.20  241.00  204.60  0.00    -0.40
## Carb.Pressure1     105.60  122.72  123.40    6.80  140.20 -0.04    -0.01
## Fill.Pressure       37.80   46.96   46.20    2.20   60.00  0.99    -0.04
## Hyd.Pressure1       -0.80   12.40   10.80   21.00   52.20  0.81    -0.10
## Hyd.Pressure2        0.00   21.97   31.60   35.60   46.20 -0.45    -0.26
## Hyd.Pressure3       -1.20   21.59   28.40   34.60   50.00 -0.36    -0.23
## Hyd.Pressure4       62.00   83.45   80.00    6.00  140.00  2.21    -0.06
## Filler.Level        55.80  110.42  118.20   19.20  151.80 -0.92     0.16
## Filler.Speed       998.00 3688.98 3982.00  107.50 4026.00 -2.86    -0.03
## Temperature         63.60   65.46   65.20    1.40   73.60  2.09    -0.08
## Usage.cont          12.46   21.01   21.78    5.40   25.78 -0.54    -0.38
## Carb.Flow           26.00 2437.18 3020.00 1965.00 3846.00 -0.97     0.18
## Density              0.72    1.68    1.72    0.12    1.92 -2.56    -0.16
## MFR                112.60  703.98  725.40   25.20  814.60 -4.95    -0.03
## Balling              1.00    3.49    3.49    0.30    4.01 -3.91    -0.16
## Pressure.Vacuum     -6.60   -5.26   -5.40    0.60   -3.60  0.66     0.17
## PH                   8.16    8.60    8.62    0.20    8.92 -0.10     1.00
## Oxygen.Filler        0.00    0.04    0.03    0.03    0.32  2.45     0.16
## Bowl.Setpoint       70.00  110.89  120.00   10.00  140.00 -1.08     0.19
## Pressure.Setpoint   44.00   46.65   46.00    0.00   52.00  1.03     0.07
## Air.Pressurer      141.00  142.69  142.60    0.60  147.40  2.72    -0.06
## Alch.Rel             6.50    7.69    7.72    0.06    8.20 -6.13     0.07
## Carb.Rel             4.96    5.61    5.60    0.06    5.80 -2.28     0.06
## Balling.Lvl          0.90    3.23    3.28    0.16    3.66 -5.53    -0.04
##                   NAs
## Carb.Volume         4
## Fill.Ounces         6
## PC.Volume           7
## Carb.Pressure      12
## Carb.Temp           5
## PSC                 7
## PSC.Fill            5
## PSC.CO2             7
## Mnf.Flow            0
## Carb.Pressure1     10
## Fill.Pressure       6
## Hyd.Pressure1       7
## Hyd.Pressure2       8
## Hyd.Pressure3       8
## Hyd.Pressure4       1
## Filler.Level        5
## Filler.Speed       13
## Temperature         3
## Usage.cont          0
## Carb.Flow           0
## Density             0
## MFR                50
## Balling             0
## Pressure.Vacuum     0
## PH                  0
## Oxygen.Filler       1
## Bowl.Setpoint       0
## Pressure.Setpoint   2
## Air.Pressurer       0
## Alch.Rel            0
## Carb.Rel            0
## Balling.Lvl         0

Brand and PH Correlation

Before removing the Brand.Code predictor, we’ll check for any correlation between the brand name/type and the target variable (PH). The character data is replaced with arbitrary numerical values:

soda$Brand.Code[which(soda$Brand.Code == "")] <- 1
soda$Brand.Code[which(soda$Brand.Code == "A")] <- 2
soda$Brand.Code[which(soda$Brand.Code == "B")] <- 3
soda$Brand.Code[which(soda$Brand.Code == "C")] <- 4
soda$Brand.Code[which(soda$Brand.Code == "D")] <- 5

soda$Brand.Code <- as.numeric(soda$Brand.Code)

After replacing the characters, the correlation of the brands to PH level is calculated. There is very little correlation, either positive or negative, between the brand of soda and the Ph level.

brand_ph <- as.data.frame(cbind(soda$Brand.Code, soda$PH))

colnames(brand_ph) <- c("BrandCode", "PH")

cors_brand_ph <- round(cor(brand_ph, use="complete.obs"), 2)

cors_brand_ph <- as.data.frame(cors_brand_ph)

corrplot(as.matrix(cors_brand_ph), method="color", tl.cex=.75, tl.col=colors()[598])