&^& Add transformations %$% Run models
library(e1071)
library(dplyr)
library(tidyr)
library(ggplot2)
library(VIM)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
Loading data from GitHub:
# open file
path <- ("https://raw.githubusercontent.com/kennygfm/fall624-group3/master/StudentData.csv")
con <- file(path, open="r")
# "Student" soft drink data
soda <- read.csv(con, header=T, sep=",", stringsAsFactors = F)
# close file
close(con)
soda[ , c(16,18,21,28)] <- sapply(soda[, c(16,18,21,28)], as.numeric) # get rid of pesky integer values
Dataset is 32 predictors + 1 target variable (Brand.Code
), with 2571 observations:
dim(soda)
## [1] 2571 33
With the exception of the Brand.Code
, all variables are either numeric or integers:
str(soda)
## 'data.frame': 2571 obs. of 33 variables:
## $ Brand.Code : chr "B" "A" "B" "A" ...
## $ Carb.Volume : num 5.34 5.43 5.29 5.44 5.49 ...
## $ Fill.Ounces : num 24 24 24.1 24 24.3 ...
## $ PC.Volume : num 0.263 0.239 0.263 0.293 0.111 ...
## $ Carb.Pressure : num 68.2 68.4 70.8 63 67.2 66.6 64.2 67.6 64.2 72 ...
## $ Carb.Temp : num 141 140 145 133 137 ...
## $ PSC : num 0.104 0.124 0.09 NA 0.026 0.09 0.128 0.154 0.132 0.014 ...
## $ PSC.Fill : num 0.26 0.22 0.34 0.42 0.16 0.24 0.4 0.34 0.12 0.24 ...
## $ PSC.CO2 : num 0.04 0.04 0.16 0.04 0.12 0.04 0.04 0.04 0.14 0.06 ...
## $ Mnf.Flow : num -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 ...
## $ Carb.Pressure1 : num 119 122 120 115 118 ...
## $ Fill.Pressure : num 46 46 46 46.4 45.8 45.6 51.8 46.8 46 45.2 ...
## $ Hyd.Pressure1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Hyd.Pressure2 : num NA NA NA 0 0 0 0 0 0 0 ...
## $ Hyd.Pressure3 : num NA NA NA 0 0 0 0 0 0 0 ...
## $ Hyd.Pressure4 : num 118 106 82 92 92 116 124 132 90 108 ...
## $ Filler.Level : num 121 119 120 118 119 ...
## $ Filler.Speed : num 4002 3986 4020 4012 4010 ...
## $ Temperature : num 66 67.6 67 65.6 65.6 66.2 65.8 65.2 65.4 66.6 ...
## $ Usage.cont : num 16.2 19.9 17.8 17.4 17.7 ...
## $ Carb.Flow : num 2932 3144 2914 3062 3054 ...
## $ Density : num 0.88 0.92 1.58 1.54 1.54 1.52 0.84 0.84 0.9 0.9 ...
## $ MFR : num 725 727 735 731 723 ...
## $ Balling : num 1.4 1.5 3.14 3.04 3.04 ...
## $ Pressure.Vacuum : num -4 -4 -3.8 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 ...
## $ PH : num 8.36 8.26 8.94 8.24 8.26 8.32 8.4 8.38 8.38 8.5 ...
## $ Oxygen.Filler : num 0.022 0.026 0.024 0.03 0.03 0.024 0.066 0.046 0.064 0.022 ...
## $ Bowl.Setpoint : num 120 120 120 120 120 120 120 120 120 120 ...
## $ Pressure.Setpoint: num 46.4 46.8 46.6 46 46 46 46 46 46 46 ...
## $ Air.Pressurer : num 143 143 142 146 146 ...
## $ Alch.Rel : num 6.58 6.56 7.66 7.14 7.14 7.16 6.54 6.52 6.52 6.54 ...
## $ Carb.Rel : num 5.32 5.3 5.84 5.42 5.44 5.44 5.38 5.34 5.34 5.34 ...
## $ Balling.Lvl : num 1.48 1.56 3.28 3.04 3.04 3.02 1.44 1.44 1.44 1.38 ...
There are too many predictors for the standard summary
function to produce an easy-to-read output. A more useful summary with skewness, correlations to PH
, and NA
counts is below. The variable Brand.Code
, because it is a character vector, is left out:
means <- sapply(soda[-1], function(y) mean(y, na.rm = TRUE))
medians <- sapply(soda[-1], function(y) median(y, na.rm = TRUE))
IQRs <- sapply(soda[-1], function(y) IQR(y, na.rm = TRUE))
vars <- sapply(soda[-1], function(y) var(y, na.rm = TRUE))
skews <- sapply(soda[-1], function(y) skewness(as.numeric(y), na.rm = TRUE))
cors <- as.vector(cor(soda$PH, soda[,2:ncol(soda)], use = "complete.obs"))
NAs <- sapply(soda[-1], function(y) sum(length(which(is.na(y)))))
soda_summary <- data.frame(means, medians, IQRs, vars, skews, cors, NAs)
colnames(soda_summary) <- c("MEAN", "MEDIAN", "IQR", "Var", "SKEW", "$r_{PH}$", "NAs")
soda_summary <- round(soda_summary, 2)
soda_summary
## MEAN MEDIAN IQR Var SKEW $r_{PH}$ NAs
## Carb.Volume 5.37 5.35 0.16 0.01 0.39 0.07 10
## Fill.Ounces 23.97 23.97 0.11 0.01 -0.02 -0.12 38
## PC.Volume 0.28 0.27 0.07 0.00 0.34 0.10 39
## Carb.Pressure 68.19 68.20 5.00 12.52 0.18 0.08 27
## Carb.Temp 141.09 140.80 5.40 16.30 0.25 0.03 26
## PSC 0.08 0.08 0.06 0.00 0.85 -0.07 33
## PSC.Fill 0.20 0.18 0.16 0.01 0.93 -0.02 23
## PSC.CO2 0.06 0.04 0.06 0.00 1.73 -0.09 39
## Mnf.Flow 24.57 65.20 240.80 14275.74 0.00 -0.46 2
## Carb.Pressure1 122.59 123.20 6.40 22.49 0.05 -0.12 32
## Fill.Pressure 47.92 46.40 4.00 10.10 0.55 -0.32 22
## Hyd.Pressure1 12.44 11.40 20.20 154.59 0.78 -0.05 11
## Hyd.Pressure2 20.96 28.60 34.60 268.51 -0.30 -0.22 15
## Hyd.Pressure3 20.46 27.60 33.40 255.22 -0.32 -0.27 15
## Hyd.Pressure4 96.29 96.00 16.00 172.20 0.55 -0.17 30
## Filler.Level 109.25 118.40 21.70 246.44 -0.85 0.35 20
## Filler.Speed 3687.20 3982.00 110.00 594163.50 -2.87 -0.04 57
## Temperature 65.97 65.60 1.20 1.91 2.39 -0.18 14
## Usage.cont 20.99 21.79 5.39 8.87 -0.54 -0.36 5
## Carb.Flow 2468.35 3028.00 2042.00 1152824.12 -0.99 0.23 2
## Density 1.17 0.98 0.72 0.14 0.53 0.10 1
## MFR 704.05 724.00 24.70 5460.96 -5.09 -0.05 212
## Balling 2.20 1.65 1.80 0.87 0.59 0.08 1
## Pressure.Vacuum -5.22 -5.40 0.60 0.32 0.53 0.22 0
## PH 8.55 8.54 0.24 0.03 -0.29 1.00 4
## Oxygen.Filler 0.05 0.03 0.04 0.00 2.66 0.16 12
## Bowl.Setpoint 109.33 120.00 20.00 234.19 -0.97 0.36 2
## Pressure.Setpoint 47.62 46.00 4.00 4.16 0.20 -0.31 12
## Air.Pressurer 142.83 142.60 0.80 1.47 2.25 -0.01 0
## Alch.Rel 6.90 6.56 0.70 0.26 0.88 0.17 9
## Carb.Rel 5.44 5.40 0.20 0.02 0.50 0.20 10
## Balling.Lvl 2.05 1.48 1.76 0.76 0.59 0.11 1
The table above shows some useful information regarding the mean and median for each predictor, and comparing the differences between the two, skewness of some variables is already apparent. Using the skewness
function from e1071
, the predictors MFR
, Filler.Speed
, Carb.Flow
, and Bowl.Setpoint
are shown to be the most negatively skewed. On the other end of the spectrum, Oxygen.Filler
, Temperature
, Air.Pressurer
, PSC.Fill
, and PSC.CO2
are all positively skewed. Many of these predictors have to do with gasses and air pressure, and we may want to consider scaling and centering these and other predictors before fitting any models.
Boxplots of these predictors are provided below:
skewed_preds <- soda[, c(8,9,18,19,21:23,27,28,30)]
s <- gather(skewed_preds, "predictor", "value")
ggplot(s, aes(predictor, value)) + geom_boxplot(aes(fill = predictor)) + facet_wrap(~predictor, scale="free") + scale_fill_discrete(guide=FALSE) + scale_y_continuous('', labels = NULL, breaks = NULL) +
scale_x_discrete('') + ggtitle("Most Skewed Predictors")
## Warning: Removed 362 rows containing non-finite values (stat_boxplot).
Aside from the skewed predictors, histograms of all 32 numeric predictors are created in order to get a quick overview of the distributions, as well as to spot any odd patterns:
par(mfrow=c(4,4))
for (i in c(2:17)){
hist(soda[ ,i], main = paste(names(soda[i]), "Distribution"),
xlab = names(soda)[i], cex.main = 0.9, cex.axis = 0.7,
col=rgb(64, 224, 208, max = 255, alpha = 70, names = "grays"), pch=19)
}
par(mfrow=c(1,1))
Many of the first 16 predictors (as ordered in the dataset) have normal or somewhat-normal distributions, and appear to be continuous variables. A few of the skewed distributions follow an almost chi-squared or log-normal distribution, so these are predictors where transformations should be considered (PSC.Distribution
and PSC.C02.Distribution
).
Another insteresting pattern are the number of 0 values in the Hyd.Pressure
(1-3) variables. The \(4^{th}\) Hyd.Pressure
does not follow this same pattern, so depending on the relationship between these variables and the target and/or other variables, they may be removed outright.
The next set of histograms for variables 17-32 are below:
par(mfrow=c(4,4))
for (i in c(18:ncol(soda))){
hist(soda[ ,i], main = paste(names(soda[i]), "Distribution"),
xlab = names(soda)[i], cex.main = 0.9, cex.axis = 0.7,
col=rgb(64, 224, 208, max = 255, alpha = 70, names = "grays"), pch=19)
}
par(mfrow=c(1,1))
These next predictors follow some different patterns. A few are normal or near-normal distributions (Pressure.Vacuum
and PH
, the target), and some are highly skewed (Filler.Speed
, Oxygen.Filler
). Another set of predictors appear to not be continuous, but discrete distributions (Pressure.Setpoint
, Alch.Rel
), however, as we’ll see in later plots, many of these variables are just constrained to a few values, but are still continuous.
Out of all the variables, Bowl.Setpoint
does seem to be a discrete distribution:
table(soda$Bowl.Setpoint)
##
## 70 80 90 100 110 120 122 126 130 134 140
## 99 96 434 112 437 1307 1 10 51 2 20
hist(soda$Bowl.Setpoint, main="Bowl.Setpoint", col='grey', xlab='Value')
Scatterplots Between PH and Other Predictors
Before moving on to modeling, we’ll look further into the relationship between the PH
target variable and the other predictors. The first predictors we’ll examine against PH
are the most highly correlated (\(+\) or \(-\)):
par(mfrow=c(2,3))
for (i in c(10,12,17,20,28,29)){
plot(soda[ ,i], soda$PH, main = paste(names(soda[i]), "vs. PH"),
xlab = names(soda)[i], ylab = "Ph", cex.main = 0.9, cex.lab = 0.7,
col=rgb(112, 128, 144, max = 255, alpha = 70, names = "grays"), pch=19)
}
par(mfrow=c(1,1))
pairs(soda[ ,c(10,12,17,20,28,29)], main='Variables Highly Correlated with Target',
col=rgb(0, 0, 255, max = 255, alpha = 95, names = "blue"), pch=16)
The remaining predictors are plotted against the PH
variable to see if any relevant patterns emerge:
par(mfrow=c(2,4))
for (i in c(2:9)){
plot(soda[ ,i], soda$PH, main = paste(names(soda[i]), "vs. PH"),
xlab = names(soda)[i], ylab = "Ph", cex.main = 0.9, cex.lab = 0.7,
col=rgb(255, 99, 71, max = 255, alpha = 70, names = "reds"), pch=19)
}
par(mfrow=c(1,1))
par(mfrow=c(2,4))
# c(10,12,17,20,28,29) already used
for (i in c(11,13:16,18,19,21)){
plot(soda[ ,i], soda$PH, main = paste(names(soda[i]), "vs. PH"),
xlab = names(soda)[i], ylab = "Ph", cex.main = 0.9, cex.lab = 0.7,
col=rgb(64, 224, 208, max = 255, alpha = 70, names = "grays"), pch=19)
}
par(mfrow=c(1,1))
par(mfrow=c(2,4))
for (i in c(22:25, 27, 30, 31, 32)){
plot(soda[ ,i], soda$PH, main = paste(names(soda[i]), "vs. PH"),
xlab = names(soda)[i], ylab = "Ph", cex.main = 0.9, cex.lab = 0.7,
col=rgb(221, 160, 221, max = 255, alpha = 70, names = "reds"), pch=19)
}
par(mfrow=c(1,1))
While no strong patters emerge, there is some directionality in the scatterplots for a few predictors, such as Pressure.Vacuum
, Bowl.Setpoint
, Air.Pressurer
and Alch.Rel
. While only one of these variables is from the set highly correlated with the predictor, all of these should probably be considered for any model.
One of the predictors (MFR
) contains a considerable amount of missing values (~\(8 \%\) of the cases). This variable is also highly skewed, so imputing using only the median/mean should be done with care, or other methods investigated:
sort(NAs, decreasing = TRUE)
## MFR Filler.Speed PC.Volume PSC.CO2
## 212 57 39 39
## Fill.Ounces PSC Carb.Pressure1 Hyd.Pressure4
## 38 33 32 30
## Carb.Pressure Carb.Temp PSC.Fill Fill.Pressure
## 27 26 23 22
## Filler.Level Hyd.Pressure2 Hyd.Pressure3 Temperature
## 20 15 15 14
## Oxygen.Filler Pressure.Setpoint Hyd.Pressure1 Carb.Volume
## 12 12 11 10
## Carb.Rel Alch.Rel Usage.cont PH
## 10 9 5 4
## Mnf.Flow Carb.Flow Bowl.Setpoint Density
## 2 2 2 1
## Balling Balling.Lvl Pressure.Vacuum Air.Pressurer
## 1 1 0 0
hist(soda$MFR, col='Blue', main='Distribution of MFR Predictor')
Most of the other predictors are missing values for a marginal number of cases (\(< 3\%\)), but a few cases are missing data across a number of the predictors. Instead of removing entire variables or imputing them all, we may want to remove some of the cases from the data instead. The plot below shows the ratio and location of the missing values within the dataset:
aggr(soda, col=c('navyblue', 'red'), numbers=TRUE, cex.numbers=.75, cex.axis=0.5)
Counts of the different brand codes are below:
table(soda$Brand.Code)
##
## A B C D
## 120 293 1239 304 615
Most of the cases (nearly half) are brand “B”; the next most common is brand “D”, followed by “C”, and “A” closely behind that. 120 (\(> 5 \%\)) of the cases in the soda dataset contain no brand name. Whether these are all the same brand will require some exploration.
brand_unknown <- soda[soda$Brand.Code == '', ]
The same method used to create the overall summary for all brands is used for the separated “unknown” or blank brand name. The minimum and maximum have been added to show the range in addition to the IQR for each predictor within this subset:
means <- sapply(brand_unknown[-1], function(y) mean(y, na.rm = TRUE))
medians <- sapply(brand_unknown[-1], function(y) median(y, na.rm = TRUE))
IQRs <- sapply(brand_unknown[-1], function(y) IQR(y, na.rm = TRUE))
skews <- sapply(brand_unknown[-1], function(y) skewness(as.numeric(y), na.rm = TRUE))
NAs <- sapply(brand_unknown[-1], function(y) sum(length(which(is.na(y)))))
brand_unknown_summary <- data.frame(means, medians, IQRs, skews, NAs)
colnames(brand_unknown_summary) <- c("MEAN", "MEDIAN", "IQR", "SKEW", "NAs")
brand_unknown_summary <- round(brand_unknown_summary, 2)
brand_unknown_summary
## MEAN MEDIAN IQR SKEW NAs
## Carb.Volume 5.29 5.29 0.11 0.38 0
## Fill.Ounces 24.00 23.99 0.12 0.37 4
## PC.Volume 0.29 0.28 0.11 -0.02 4
## Carb.Pressure 66.26 66.00 4.30 0.60 1
## Carb.Temp 140.52 140.20 6.00 0.43 3
## PSC 0.09 0.08 0.07 0.77 3
## PSC.Fill 0.19 0.18 0.11 0.78 4
## PSC.CO2 0.06 0.04 0.04 1.79 2
## Mnf.Flow 27.15 71.00 235.00 -0.04 0
## Carb.Pressure1 123.46 123.40 5.70 0.54 1
## Fill.Pressure 48.78 49.60 4.20 0.42 3
## Hyd.Pressure1 10.89 8.80 17.50 1.06 0
## Hyd.Pressure2 18.44 22.80 32.70 -0.16 0
## Hyd.Pressure3 19.79 27.00 33.60 -0.30 0
## Hyd.Pressure4 95.98 86.00 30.50 0.70 4
## Filler.Level 112.15 119.10 20.00 -0.49 2
## Filler.Speed 3517.83 3908.00 392.00 -2.11 3
## Temperature 67.23 67.00 1.60 1.90 1
## Usage.cont 20.73 21.51 5.96 -0.47 0
## Carb.Flow 2557.43 3027.00 897.50 -1.22 0
## Density 0.97 0.98 0.08 -1.30 0
## MFR 685.10 711.20 53.00 -3.71 13
## Balling 1.67 1.70 0.21 1.23 0
## Pressure.Vacuum -5.33 -5.40 0.80 0.36 0
## PH 8.49 8.51 0.26 0.09 0
## Oxygen.Filler 0.05 0.03 0.05 1.28 1
## Bowl.Setpoint 112.17 120.00 20.00 -0.41 0
## Pressure.Setpoint 47.80 48.00 4.00 -0.08 1
## Air.Pressurer 142.67 142.60 0.60 0.03 0
## Alch.Rel 6.64 6.56 0.04 3.47 1
## Carb.Rel 5.34 5.34 0.12 0.31 2
## Balling.Lvl 1.47 1.46 0.18 3.58 1
Examining the results, there is not too much variance in the predictors that actually rely on the makeup (chemical or otherwise) of the soda itself. These would be predictors like Fill.Ounces
, PH
, and Carb.Volume
. The predictors that have more variance have to do with temperature and pressure, and given the changes in these across all predictors, it is probably safe to assume that the unknown brand is just un-named.
Just to make the case, we’ll look at the same stats for another brand (“D”), just to see if the same predictors stay consistent, or if we should expect more variance:
brand_d <- soda[soda$Brand.Code == 'D', ]
mins <- sapply(brand_d[-1], function(y) min(y, na.rm = TRUE))
means <- sapply(brand_d[-1], function(y) mean(y, na.rm = TRUE))
medians <- sapply(brand_d[-1], function(y) median(y, na.rm = TRUE))
IQRs <- sapply(brand_d[-1], function(y) IQR(y, na.rm = TRUE))
maxs <- sapply(brand_d[-1], function(y) max(y, na.rm = TRUE))
skews <- sapply(brand_d[-1], function(y) skewness(as.numeric(y), na.rm = TRUE))
cors <- as.vector(cor(brand_d$PH, brand_d[,2:ncol(brand_d)], use = "complete.obs"))
NAs <- sapply(brand_d[-1], function(y) sum(length(which(is.na(y)))))
brand_d_summary <- data.frame(mins, means, medians, IQRs, maxs, skews, cors, NAs)
colnames(brand_d_summary) <- c("MIN", "MEAN", "MEDIAN", "IQR", "MAX", "SKEW", "$r_{PH}$", "NAs")
brand_d_summary <- round(brand_d_summary, 2)
brand_d_summary
## MIN MEAN MEDIAN IQR MAX SKEW $r_{PH}$
## Carb.Volume 5.26 5.51 5.51 0.07 5.70 -0.33 -0.05
## Fill.Ounces 23.69 23.96 23.96 0.10 24.32 0.02 -0.03
## PC.Volume 0.09 0.26 0.25 0.07 0.46 0.69 0.09
## Carb.Pressure 60.40 70.70 70.60 3.70 79.40 0.13 0.01
## Carb.Temp 129.20 141.13 140.80 4.60 153.80 0.27 0.02
## PSC 0.00 0.08 0.07 0.06 0.26 0.87 -0.08
## PSC.Fill 0.00 0.19 0.18 0.16 0.60 0.94 0.01
## PSC.CO2 0.00 0.05 0.04 0.04 0.24 1.62 -0.06
## Mnf.Flow -100.20 25.75 84.20 241.00 204.60 0.00 -0.40
## Carb.Pressure1 105.60 122.72 123.40 6.80 140.20 -0.04 -0.01
## Fill.Pressure 37.80 46.96 46.20 2.20 60.00 0.99 -0.04
## Hyd.Pressure1 -0.80 12.40 10.80 21.00 52.20 0.81 -0.10
## Hyd.Pressure2 0.00 21.97 31.60 35.60 46.20 -0.45 -0.26
## Hyd.Pressure3 -1.20 21.59 28.40 34.60 50.00 -0.36 -0.23
## Hyd.Pressure4 62.00 83.45 80.00 6.00 140.00 2.21 -0.06
## Filler.Level 55.80 110.42 118.20 19.20 151.80 -0.92 0.16
## Filler.Speed 998.00 3688.98 3982.00 107.50 4026.00 -2.86 -0.03
## Temperature 63.60 65.46 65.20 1.40 73.60 2.09 -0.08
## Usage.cont 12.46 21.01 21.78 5.40 25.78 -0.54 -0.38
## Carb.Flow 26.00 2437.18 3020.00 1965.00 3846.00 -0.97 0.18
## Density 0.72 1.68 1.72 0.12 1.92 -2.56 -0.16
## MFR 112.60 703.98 725.40 25.20 814.60 -4.95 -0.03
## Balling 1.00 3.49 3.49 0.30 4.01 -3.91 -0.16
## Pressure.Vacuum -6.60 -5.26 -5.40 0.60 -3.60 0.66 0.17
## PH 8.16 8.60 8.62 0.20 8.92 -0.10 1.00
## Oxygen.Filler 0.00 0.04 0.03 0.03 0.32 2.45 0.16
## Bowl.Setpoint 70.00 110.89 120.00 10.00 140.00 -1.08 0.19
## Pressure.Setpoint 44.00 46.65 46.00 0.00 52.00 1.03 0.07
## Air.Pressurer 141.00 142.69 142.60 0.60 147.40 2.72 -0.06
## Alch.Rel 6.50 7.69 7.72 0.06 8.20 -6.13 0.07
## Carb.Rel 4.96 5.61 5.60 0.06 5.80 -2.28 0.06
## Balling.Lvl 0.90 3.23 3.28 0.16 3.66 -5.53 -0.04
## NAs
## Carb.Volume 4
## Fill.Ounces 6
## PC.Volume 7
## Carb.Pressure 12
## Carb.Temp 5
## PSC 7
## PSC.Fill 5
## PSC.CO2 7
## Mnf.Flow 0
## Carb.Pressure1 10
## Fill.Pressure 6
## Hyd.Pressure1 7
## Hyd.Pressure2 8
## Hyd.Pressure3 8
## Hyd.Pressure4 1
## Filler.Level 5
## Filler.Speed 13
## Temperature 3
## Usage.cont 0
## Carb.Flow 0
## Density 0
## MFR 50
## Balling 0
## Pressure.Vacuum 0
## PH 0
## Oxygen.Filler 1
## Bowl.Setpoint 0
## Pressure.Setpoint 2
## Air.Pressurer 0
## Alch.Rel 0
## Carb.Rel 0
## Balling.Lvl 0
Before removing the Brand.Code
predictor, we’ll check for any correlation between the brand name/type and the target variable (PH
). The character data is replaced with arbitrary numerical values:
soda$Brand.Code[which(soda$Brand.Code == "")] <- 1
soda$Brand.Code[which(soda$Brand.Code == "A")] <- 2
soda$Brand.Code[which(soda$Brand.Code == "B")] <- 3
soda$Brand.Code[which(soda$Brand.Code == "C")] <- 4
soda$Brand.Code[which(soda$Brand.Code == "D")] <- 5
soda$Brand.Code <- as.numeric(soda$Brand.Code)
After replacing the characters, the correlation of the brands to PH
level is calculated. There is very little correlation, either positive or negative, between the brand of soda and the Ph level.
brand_ph <- as.data.frame(cbind(soda$Brand.Code, soda$PH))
colnames(brand_ph) <- c("BrandCode", "PH")
cors_brand_ph <- round(cor(brand_ph, use="complete.obs"), 2)
cors_brand_ph <- as.data.frame(cors_brand_ph)
corrplot(as.matrix(cors_brand_ph), method="color", tl.cex=.75, tl.col=colors()[598])