library(tidyverse)
library(cowplot)
library(lattice)
library(reshape2)
library(corrplot)
library(randomForest)
library(caTools)
library(nnet)
library(caret)
library(party)
library(rpart)
library(rpart.plot)
library(pROC)
library(tree)
library(stringr)
library(Hmisc)
For this assignment I have chosen to work with the Wine Quality data set. This data set can be accessed from http://archive.ics.uci.edu/ml/datasets/Wine+Quality. This data set contains two sub data sets for Red and White wine respectively. For the purposes of this analysis, I will be working with the Red wine sub set. The goal of this data set was to model the wine quality based on physicochemical test. It contains 12 attributes as listed below.
1 - fixed acidity 2 - volatile acidity 3 - citric acid 4 - residual sugar 5 - chlorides 6 - free sulfur dioxide 7 - total sulfur dioxide 8 - density 9 - pH 10 - sulphates 11 - alcohol Output variable (based on sensory data): 12 - quality (score between 0 and 10)
For this analysis, I will be attempting to model the quality of the wine based on a different combinations of attributes.
wine <- as.data.frame(read.csv('winequality-red.csv'))
wine[1,]
## [1] "7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5"
colnames(wine) <- c('col')
wine[c('Fixed_Acidity', 'Volatile_Acidity', 'Citric_Acid', 'Residual_Sugar', 'Chlorides', 'Free_Sulfer_Dioxide', 'Total_Sulfur_Dioxide', 'Density', 'pH', 'Sulphates', 'Alchohol', 'Quality')] <- str_split_fixed(wine$col, ';', 12)
wine <- wine %>% select(!col)
head(wine)
## Fixed_Acidity Volatile_Acidity Citric_Acid Residual_Sugar Chlorides
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## Free_Sulfer_Dioxide Total_Sulfur_Dioxide Density pH Sulphates Alchohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.2 0.68 9.8
## 3 15 54 0.997 3.26 0.65 9.8
## 4 17 60 0.998 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## Quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
As the first step of exploratory analysis we will take a glimpse at the data.
head(wine,3)
## Fixed_Acidity Volatile_Acidity Citric_Acid Residual_Sugar Chlorides
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## Free_Sulfer_Dioxide Total_Sulfur_Dioxide Density pH Sulphates Alchohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.2 0.68 9.8
## 3 15 54 0.997 3.26 0.65 9.8
## Quality
## 1 5
## 2 5
## 3 5
Looking at the datatypes of the columns, they are all of type character. They will be converted to type numeric.
wine$Fixed_Acidity <- as.numeric(wine$Fixed_Acidity)
wine$Volatile_Acidity <- as.numeric(wine$Volatile_Acidity)
wine$Citric_Acid <- as.numeric(wine$Citric_Acid)
wine$Residual_Sugar <- as.numeric(wine$Residual_Sugar)
wine$Chlorides <- as.numeric(wine$Chlorides)
wine$Free_Sulfer_Dioxide <- as.numeric(wine$Free_Sulfer_Dioxide)
wine$Total_Sulfur_Dioxide <- as.numeric(wine$Total_Sulfur_Dioxide)
wine$Density <- as.numeric(wine$Density)
wine$pH <- as.numeric(wine$pH)
wine$Sulphates <- as.numeric(wine$Sulphates)
wine$Alchohol <- as.numeric(wine$Alchohol)
wine$Quality <- as.numeric(wine$Quality)
In order to make sure we have complete data we will do a check for any missing values.
sum(is.na(wine))
## [1] 0
Now that we have confirmed that no missing values exist in the data and have properly converted the column type, we can start with the statistical and visual summaries.
###Summary
summary(wine)
## Fixed_Acidity Volatile_Acidity Citric_Acid Residual_Sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## Chlorides Free_Sulfer_Dioxide Total_Sulfur_Dioxide Density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH Sulphates Alchohol Quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
#create boxplot
meltD <- melt(wine)
p <- ggplot(meltD, aes(factor(variable), value))
p + geom_boxplot() + facet_wrap(~variable, scale="free")
hist.data.frame(wine)
corrplot(cor(wine),
method = 'circle', order = 'alphabet', type = 'lower', diag = FALSE, number.cex = 0.75, tl.cex = 0.5, col=colorRampPalette(c("blue","white","red"))(200))
For the purposes of this analysis, I will be creating two decision tree models and one random forest model.
The tracker data frame will contain the name, RMSE, and R-Square for each model
tracker <- data.frame(matrix(vector(), 0, 3,
dimnames=list(c(), c("Name", "RMSE", "R-Squared"))),
stringsAsFactors=F)
The dataset is split into a training and testing sets on a 85:15 ratio.
set.seed(123)
train_ind <- sample(seq_len(nrow(wine)), size = floor(0.85 * nrow(wine)))
train <- wine[train_ind, ]
test <- wine[-train_ind, ]
For the first decision tree model, I will be looking a the relationship between the target variable ‘Quality’ and the independent variables ‘Alcohol’ and ‘pH’.
dtree1 <- rpart(formula = Quality ~ Alchohol + pH,
data = train)
dtree1
## n= 1359
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1359 912.97720 5.650478
## 2) Alchohol< 10.525 823 368.64400 5.364520
## 4) Alchohol< 9.975 566 206.33920 5.257951 *
## 5) Alchohol>=9.975 257 141.71980 5.599222 *
## 3) Alchohol>=10.525 536 373.70150 6.089552
## 6) Alchohol< 11.55 313 209.62940 5.869010
## 12) pH>=3.495 45 35.24444 5.288889 *
## 13) pH< 3.495 268 156.69780 5.966418 *
## 7) Alchohol>=11.55 223 127.47980 6.399103
## 14) pH>=3.365 90 57.12222 6.144444 *
## 15) pH< 3.365 133 60.57143 6.571429 *
printcp(dtree1)
##
## Regression tree:
## rpart(formula = Quality ~ Alchohol + pH, data = train)
##
## Variables actually used in tree construction:
## [1] Alchohol pH
##
## Root node error: 912.98/1359 = 0.6718
##
## n= 1359
##
## CP nsplit rel error xerror xstd
## 1 0.186896 0 1.00000 1.00210 0.040898
## 2 0.040080 1 0.81310 0.84053 0.039165
## 3 0.022547 2 0.77302 0.78732 0.036813
## 4 0.019373 3 0.75048 0.78125 0.036770
## 5 0.010719 4 0.73110 0.75810 0.035123
## 6 0.010000 5 0.72038 0.75832 0.035191
# Visualize tree
rpart.plot(dtree1)
rsq.rpart(dtree1)
##
## Regression tree:
## rpart(formula = Quality ~ Alchohol + pH, data = train)
##
## Variables actually used in tree construction:
## [1] Alchohol pH
##
## Root node error: 912.98/1359 = 0.6718
##
## n= 1359
##
## CP nsplit rel error xerror xstd
## 1 0.186896 0 1.00000 1.00210 0.040898
## 2 0.040080 1 0.81310 0.84053 0.039165
## 3 0.022547 2 0.77302 0.78732 0.036813
## 4 0.019373 3 0.75048 0.78125 0.036770
## 5 0.010719 4 0.73110 0.75810 0.035123
## 6 0.010000 5 0.72038 0.75832 0.035191
dtpred1 <- predict(dtree1,test)
# RMSE
rmse <- sqrt(mean((test$Quality - dtpred1)^2))
#R-Square
rs <- (cor(test$Quality, dtpred1))^2
tracker[nrow(tracker)+1,] <- c('Decision Tree', rmse, rs)
For the second decision tree model, I will be looking a the relationship between the target variable ‘Quality’ and the independent variables ‘Alcohol’, ‘pH’, ‘Density’, and ‘Citric_Acid’.
dtree2 <- rpart(formula = Quality ~ Alchohol + pH + Density + Citric_Acid,
data = train)
dtree2
## n= 1359
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1359 912.97720 5.650478
## 2) Alchohol< 10.525 823 368.64400 5.364520
## 4) Alchohol< 9.975 566 206.33920 5.257951 *
## 5) Alchohol>=9.975 257 141.71980 5.599222 *
## 3) Alchohol>=10.525 536 373.70150 6.089552
## 6) Citric_Acid< 0.295 238 168.91180 5.794118
## 12) Alchohol< 11.55 153 96.67974 5.601307 *
## 13) Alchohol>=11.55 85 56.30588 6.141176 *
## 7) Citric_Acid>=0.295 298 167.42620 6.325503
## 14) Alchohol< 11.55 160 91.50000 6.125000 *
## 15) Alchohol>=11.55 138 62.03623 6.557971 *
# Visualize tree
rpart.plot(dtree2)
printcp(dtree2)
##
## Regression tree:
## rpart(formula = Quality ~ Alchohol + pH + Density + Citric_Acid,
## data = train)
##
## Variables actually used in tree construction:
## [1] Alchohol Citric_Acid
##
## Root node error: 912.98/1359 = 0.6718
##
## n= 1359
##
## CP nsplit rel error xerror xstd
## 1 0.186896 0 1.00000 1.00221 0.040876
## 2 0.040925 1 0.81310 0.82795 0.038443
## 3 0.022547 2 0.77218 0.80094 0.036344
## 4 0.017444 3 0.74963 0.78168 0.035784
## 5 0.015214 4 0.73219 0.76706 0.035483
## 6 0.010000 5 0.71697 0.76027 0.035098
rsq.rpart(dtree1)
##
## Regression tree:
## rpart(formula = Quality ~ Alchohol + pH, data = train)
##
## Variables actually used in tree construction:
## [1] Alchohol pH
##
## Root node error: 912.98/1359 = 0.6718
##
## n= 1359
##
## CP nsplit rel error xerror xstd
## 1 0.186896 0 1.00000 1.00210 0.040898
## 2 0.040080 1 0.81310 0.84053 0.039165
## 3 0.022547 2 0.77302 0.78732 0.036813
## 4 0.019373 3 0.75048 0.78125 0.036770
## 5 0.010719 4 0.73110 0.75810 0.035123
## 6 0.010000 5 0.72038 0.75832 0.035191
# Predict
dtpred2 <- predict(dtree2, test)
# RMSE
rmse <- sqrt(mean((test$Quality - dtpred2)^2))
#R-Square
rs <- (cor(test$Quality, dtpred2))^2
tracker[nrow(tracker)+1,] <- c('Decision Tree 2', rmse, rs)
For the Random Forest model, I will be looking at the relationship between the target variable ‘Sales.Channel’ and the independent variables ‘Units.Sold’, ‘Item.Type’, ‘Region’, ‘Order.Priority’.
randForest <- randomForest(Quality ~ Alchohol + pH + Density + Citric_Acid,
train, proximity = TRUE)
randForest
##
## Call:
## randomForest(formula = Quality ~ Alchohol + pH + Density + Citric_Acid, data = train, proximity = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.4270051
## % Var explained: 36.44
plot(randForest)
rfPred <- predict(randForest, test)
# RMSE
rmse <- sqrt(mean((test$Quality - rfPred)^2))
#R-Square
rs <- (cor(test$Quality, rfPred))^2
tracker[nrow(tracker)+1,] <- c('Random Forest', rmse, rs)
tracker
## Name RMSE R.Squared
## 1 Decision Tree 0.652901901024141 0.207676650780274
## 2 Decision Tree 2 0.637373087979101 0.239013227294275
## 3 Random Forest 0.573764622517034 0.382788017617649
Of all the models, the Random Forest model seems to be the best fit for the data. The second model also is an improvement from the initial decision tree. For this analysis, only a percentage of the attributes were used. It can be said that we expect the models to be better once more attributes are included.