library(tidyverse)
library(cowplot)
library(lattice)
library(reshape2)
library(corrplot)
library(randomForest)
library(caTools)
library(nnet)
library(caret)
library(party)
library(rpart)
library(rpart.plot)
library(pROC)
library(tree)
library(stringr)
library(Hmisc)

Data Content and Structure

Data

For this assignment I have chosen to work with the Wine Quality data set. This data set can be accessed from http://archive.ics.uci.edu/ml/datasets/Wine+Quality. This data set contains two sub data sets for Red and White wine respectively. For the purposes of this analysis, I will be working with the Red wine sub set. The goal of this data set was to model the wine quality based on physicochemical test. It contains 12 attributes as listed below.

1 - fixed acidity 2 - volatile acidity 3 - citric acid 4 - residual sugar 5 - chlorides 6 - free sulfur dioxide 7 - total sulfur dioxide 8 - density 9 - pH 10 - sulphates 11 - alcohol Output variable (based on sensory data): 12 - quality (score between 0 and 10)

For this analysis, I will be attempting to model the quality of the wine based on a different combinations of attributes.

Data Setup

wine <- as.data.frame(read.csv('winequality-red.csv'))
wine[1,]
## [1] "7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5"
colnames(wine) <- c('col')

wine[c('Fixed_Acidity', 'Volatile_Acidity', 'Citric_Acid', 'Residual_Sugar', 'Chlorides', 'Free_Sulfer_Dioxide', 'Total_Sulfur_Dioxide', 'Density', 'pH', 'Sulphates', 'Alchohol', 'Quality')] <- str_split_fixed(wine$col, ';', 12)

wine <- wine %>% select(!col)

head(wine)
##   Fixed_Acidity Volatile_Acidity Citric_Acid Residual_Sugar Chlorides
## 1           7.4              0.7           0            1.9     0.076
## 2           7.8             0.88           0            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4              0.7           0            1.9     0.076
## 6           7.4             0.66           0            1.8     0.075
##   Free_Sulfer_Dioxide Total_Sulfur_Dioxide Density   pH Sulphates Alchohol
## 1                  11                   34  0.9978 3.51      0.56      9.4
## 2                  25                   67  0.9968  3.2      0.68      9.8
## 3                  15                   54   0.997 3.26      0.65      9.8
## 4                  17                   60   0.998 3.16      0.58      9.8
## 5                  11                   34  0.9978 3.51      0.56      9.4
## 6                  13                   40  0.9978 3.51      0.56      9.4
##   Quality
## 1       5
## 2       5
## 3       5
## 4       6
## 5       5
## 6       5

Exploratary Analysis

Glimpse

As the first step of exploratory analysis we will take a glimpse at the data.

head(wine,3)
##   Fixed_Acidity Volatile_Acidity Citric_Acid Residual_Sugar Chlorides
## 1           7.4              0.7           0            1.9     0.076
## 2           7.8             0.88           0            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
##   Free_Sulfer_Dioxide Total_Sulfur_Dioxide Density   pH Sulphates Alchohol
## 1                  11                   34  0.9978 3.51      0.56      9.4
## 2                  25                   67  0.9968  3.2      0.68      9.8
## 3                  15                   54   0.997 3.26      0.65      9.8
##   Quality
## 1       5
## 2       5
## 3       5

Convert Column Datatype

Looking at the datatypes of the columns, they are all of type character. They will be converted to type numeric.

wine$Fixed_Acidity <- as.numeric(wine$Fixed_Acidity)
wine$Volatile_Acidity <- as.numeric(wine$Volatile_Acidity)
wine$Citric_Acid <- as.numeric(wine$Citric_Acid)
wine$Residual_Sugar <- as.numeric(wine$Residual_Sugar)
wine$Chlorides <- as.numeric(wine$Chlorides)
wine$Free_Sulfer_Dioxide <- as.numeric(wine$Free_Sulfer_Dioxide)
wine$Total_Sulfur_Dioxide <- as.numeric(wine$Total_Sulfur_Dioxide)
wine$Density <- as.numeric(wine$Density)
wine$pH <- as.numeric(wine$pH)
wine$Sulphates <- as.numeric(wine$Sulphates)
wine$Alchohol <- as.numeric(wine$Alchohol)
wine$Quality <- as.numeric(wine$Quality)

NA

In order to make sure we have complete data we will do a check for any missing values.

sum(is.na(wine))
## [1] 0

Now that we have confirmed that no missing values exist in the data and have properly converted the column type, we can start with the statistical and visual summaries.

###Summary

Numerical

summary(wine)
##  Fixed_Acidity   Volatile_Acidity  Citric_Acid    Residual_Sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    Chlorides       Free_Sulfer_Dioxide Total_Sulfur_Dioxide    Density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          Sulphates         Alchohol        Quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000

Boxplot

#create boxplot
meltD <- melt(wine)
p <- ggplot(meltD, aes(factor(variable), value)) 
p + geom_boxplot() + facet_wrap(~variable, scale="free")

Histogram

hist.data.frame(wine)

Correlation Plot

corrplot(cor(wine),
         method = 'circle', order = 'alphabet', type = 'lower', diag = FALSE, number.cex = 0.75, tl.cex = 0.5,  col=colorRampPalette(c("blue","white","red"))(200))

Models

For the purposes of this analysis, I will be creating two decision tree models and one random forest model.

Tracker

The tracker data frame will contain the name, RMSE, and R-Square for each model

tracker <- data.frame(matrix(vector(), 0, 3,
                dimnames=list(c(), c("Name", "RMSE", "R-Squared"))),
                stringsAsFactors=F)

Data Split

The dataset is split into a training and testing sets on a 85:15 ratio.

set.seed(123)
train_ind <- sample(seq_len(nrow(wine)), size = floor(0.85 * nrow(wine)))

train <- wine[train_ind, ]
test <- wine[-train_ind, ]

Decision Tree One

For the first decision tree model, I will be looking a the relationship between the target variable ‘Quality’ and the independent variables ‘Alcohol’ and ‘pH’.

dtree1 <- rpart(formula = Quality ~ Alchohol + pH,
      data = train)

dtree1
## n= 1359 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1359 912.97720 5.650478  
##    2) Alchohol< 10.525 823 368.64400 5.364520  
##      4) Alchohol< 9.975 566 206.33920 5.257951 *
##      5) Alchohol>=9.975 257 141.71980 5.599222 *
##    3) Alchohol>=10.525 536 373.70150 6.089552  
##      6) Alchohol< 11.55 313 209.62940 5.869010  
##       12) pH>=3.495 45  35.24444 5.288889 *
##       13) pH< 3.495 268 156.69780 5.966418 *
##      7) Alchohol>=11.55 223 127.47980 6.399103  
##       14) pH>=3.365 90  57.12222 6.144444 *
##       15) pH< 3.365 133  60.57143 6.571429 *
printcp(dtree1)
## 
## Regression tree:
## rpart(formula = Quality ~ Alchohol + pH, data = train)
## 
## Variables actually used in tree construction:
## [1] Alchohol pH      
## 
## Root node error: 912.98/1359 = 0.6718
## 
## n= 1359 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.186896      0   1.00000 1.00210 0.040898
## 2 0.040080      1   0.81310 0.84053 0.039165
## 3 0.022547      2   0.77302 0.78732 0.036813
## 4 0.019373      3   0.75048 0.78125 0.036770
## 5 0.010719      4   0.73110 0.75810 0.035123
## 6 0.010000      5   0.72038 0.75832 0.035191
# Visualize tree
rpart.plot(dtree1)

rsq.rpart(dtree1)
## 
## Regression tree:
## rpart(formula = Quality ~ Alchohol + pH, data = train)
## 
## Variables actually used in tree construction:
## [1] Alchohol pH      
## 
## Root node error: 912.98/1359 = 0.6718
## 
## n= 1359 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.186896      0   1.00000 1.00210 0.040898
## 2 0.040080      1   0.81310 0.84053 0.039165
## 3 0.022547      2   0.77302 0.78732 0.036813
## 4 0.019373      3   0.75048 0.78125 0.036770
## 5 0.010719      4   0.73110 0.75810 0.035123
## 6 0.010000      5   0.72038 0.75832 0.035191

dtpred1 <- predict(dtree1,test)
# RMSE
rmse <- sqrt(mean((test$Quality - dtpred1)^2))
#R-Square
rs <- (cor(test$Quality, dtpred1))^2

tracker[nrow(tracker)+1,] <- c('Decision Tree', rmse, rs)

Decision Tree Two

For the second decision tree model, I will be looking a the relationship between the target variable ‘Quality’ and the independent variables ‘Alcohol’, ‘pH’, ‘Density’, and ‘Citric_Acid’.

dtree2 <- rpart(formula = Quality ~ Alchohol + pH + Density + Citric_Acid,
      data = train)

dtree2
## n= 1359 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1359 912.97720 5.650478  
##    2) Alchohol< 10.525 823 368.64400 5.364520  
##      4) Alchohol< 9.975 566 206.33920 5.257951 *
##      5) Alchohol>=9.975 257 141.71980 5.599222 *
##    3) Alchohol>=10.525 536 373.70150 6.089552  
##      6) Citric_Acid< 0.295 238 168.91180 5.794118  
##       12) Alchohol< 11.55 153  96.67974 5.601307 *
##       13) Alchohol>=11.55 85  56.30588 6.141176 *
##      7) Citric_Acid>=0.295 298 167.42620 6.325503  
##       14) Alchohol< 11.55 160  91.50000 6.125000 *
##       15) Alchohol>=11.55 138  62.03623 6.557971 *
# Visualize tree
rpart.plot(dtree2)

printcp(dtree2)
## 
## Regression tree:
## rpart(formula = Quality ~ Alchohol + pH + Density + Citric_Acid, 
##     data = train)
## 
## Variables actually used in tree construction:
## [1] Alchohol    Citric_Acid
## 
## Root node error: 912.98/1359 = 0.6718
## 
## n= 1359 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.186896      0   1.00000 1.00221 0.040876
## 2 0.040925      1   0.81310 0.82795 0.038443
## 3 0.022547      2   0.77218 0.80094 0.036344
## 4 0.017444      3   0.74963 0.78168 0.035784
## 5 0.015214      4   0.73219 0.76706 0.035483
## 6 0.010000      5   0.71697 0.76027 0.035098
rsq.rpart(dtree1)
## 
## Regression tree:
## rpart(formula = Quality ~ Alchohol + pH, data = train)
## 
## Variables actually used in tree construction:
## [1] Alchohol pH      
## 
## Root node error: 912.98/1359 = 0.6718
## 
## n= 1359 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.186896      0   1.00000 1.00210 0.040898
## 2 0.040080      1   0.81310 0.84053 0.039165
## 3 0.022547      2   0.77302 0.78732 0.036813
## 4 0.019373      3   0.75048 0.78125 0.036770
## 5 0.010719      4   0.73110 0.75810 0.035123
## 6 0.010000      5   0.72038 0.75832 0.035191

# Predict
dtpred2 <- predict(dtree2, test)
# RMSE
rmse <- sqrt(mean((test$Quality - dtpred2)^2))
#R-Square
rs <- (cor(test$Quality, dtpred2))^2

tracker[nrow(tracker)+1,] <- c('Decision Tree 2', rmse, rs)

Random Forest

For the Random Forest model, I will be looking at the relationship between the target variable ‘Sales.Channel’ and the independent variables ‘Units.Sold’, ‘Item.Type’, ‘Region’, ‘Order.Priority’.

randForest <- randomForest(Quality ~ Alchohol + pH + Density + Citric_Acid,
                         train, proximity = TRUE)
randForest
## 
## Call:
##  randomForest(formula = Quality ~ Alchohol + pH + Density + Citric_Acid,      data = train, proximity = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.4270051
##                     % Var explained: 36.44
plot(randForest)

rfPred <- predict(randForest, test)
# RMSE
rmse <- sqrt(mean((test$Quality - rfPred)^2))
#R-Square
rs <- (cor(test$Quality, rfPred))^2

tracker[nrow(tracker)+1,] <- c('Random Forest', rmse, rs)

Comparision

tracker
##              Name              RMSE         R.Squared
## 1   Decision Tree 0.652901901024141 0.207676650780274
## 2 Decision Tree 2 0.637373087979101 0.239013227294275
## 3   Random Forest 0.573764622517034 0.382788017617649

Of all the models, the Random Forest model seems to be the best fit for the data. The second model also is an improvement from the initial decision tree. For this analysis, only a percentage of the attributes were used. It can be said that we expect the models to be better once more attributes are included.