Quality analysis of Redwine

For starters, this Project kernel is to perform Exploratory Data Analysis on the dataset “Red Wine Quality” and to analyze which independent variables influences the most on the Quality factor (Response Variable) of Red wine.

Firstly, Importing the dataset “Redwine” and performing simple analysis.

library(readr)

## Warning: package 'readr' was built under R version 3.6.3

redwine = read_csv("C:/Users/Kowshik Kumar B/Documents/HU/506/Project/redwine.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   `fixed acidity` = col_double(),
##   `volatile acidity` = col_double(),
##   `citric acid` = col_double(),
##   `residual sugar` = col_double(),
##   chlorides = col_double(),
##   `free sulfur dioxide` = col_double(),
##   `total sulfur dioxide` = col_double(),
##   density = col_double(),
##   pH = col_double(),
##   sulphates = col_double(),
##   alcohol = col_double(),
##   quality = col_double()
## )

head(redwine)

## # A tibble: 6 x 12
##   `fixed acidity` `volatile acidi~ `citric acid` `residual sugar` chlorides
##             <dbl>            <dbl>         <dbl>            <dbl>     <dbl>
## 1             7.4             0.7           0                 1.9     0.076
## 2             7.8             0.88          0                 2.6     0.098
## 3             7.8             0.76          0.04              2.3     0.092
## 4            11.2             0.28          0.56              1.9     0.075
## 5             7.4             0.7           0                 1.9     0.076
## 6             7.4             0.66          0                 1.8     0.075
## # ... with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## #   dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## #   quality <dbl>

tail(redwine)

## # A tibble: 6 x 12
##   `fixed acidity` `volatile acidi~ `citric acid` `residual sugar` chlorides
##             <dbl>            <dbl>         <dbl>            <dbl>     <dbl>
## 1             6.8            0.62           0.08              1.9     0.068
## 2             6.2            0.6            0.08              2       0.09 
## 3             5.9            0.55           0.1               2.2     0.062
## 4             6.3            0.51           0.13              2.3     0.076
## 5             5.9            0.645          0.12              2       0.075
## 6             6              0.31           0.47              3.6     0.067
## # ... with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## #   dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## #   quality <dbl>

str(redwine)

## tibble [1,599 x 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ fixed acidity       : num [1:1599] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile acidity    : num [1:1599] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric acid         : num [1:1599] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual sugar      : num [1:1599] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num [1:1599] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free sulfur dioxide : num [1:1599] 11 25 15 17 11 13 15 15 9 17 ...
##  $ total sulfur dioxide: num [1:1599] 34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num [1:1599] 0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num [1:1599] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num [1:1599] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num [1:1599] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : num [1:1599] 5 5 5 6 5 5 5 7 7 5 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `fixed acidity` = col_double(),
##   ..   `volatile acidity` = col_double(),
##   ..   `citric acid` = col_double(),
##   ..   `residual sugar` = col_double(),
##   ..   chlorides = col_double(),
##   ..   `free sulfur dioxide` = col_double(),
##   ..   `total sulfur dioxide` = col_double(),
##   ..   density = col_double(),
##   ..   pH = col_double(),
##   ..   sulphates = col_double(),
##   ..   alcohol = col_double(),
##   ..   quality = col_double()
##   .. )

dim(redwine)

## [1] 1599   12

We can clearly interpret that the dataset has 12 Columns and 1599 Rows alongside all the independent variables are Numerical data and the response variable can be featured as Categorial data.

Then Performing Summary Statistics and Plotting the datapoints for undestanding or estimating the variables of the dataset “RedWine”

summary(redwine)

##  fixed acidity   volatile acidity  citric acid    residual sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free sulfur dioxide total sulfur dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000

plot(redwine)

Checking for total number of NA variables in our dataset.

NA_values=data.frame(no_of_na_values=colSums(is.na(redwine)))
head(NA_values,12)

##                      no_of_na_values
## fixed acidity                      0
## volatile acidity                   0
## citric acid                        0
## residual sugar                     0
## chlorides                          0
## free sulfur dioxide                0
## total sulfur dioxide               0
## density                            0
## pH                                 0
## sulphates                          0
## alcohol                            0
## quality                            0

We can clearly see that we do not have any NA Values.

Further, Checking the Correlation between the varaibles of our dataset and visualizing the correlation matrix.

cor(redwine)

##                      fixed acidity volatile acidity citric acid residual sugar
## fixed acidity           1.00000000     -0.256130895  0.67170343    0.114776724
## volatile acidity       -0.25613089      1.000000000 -0.55249568    0.001917882
## citric acid             0.67170343     -0.552495685  1.00000000    0.143577162
## residual sugar          0.11477672      0.001917882  0.14357716    1.000000000
## chlorides               0.09370519      0.061297772  0.20382291    0.055609535
## free sulfur dioxide    -0.15379419     -0.010503827 -0.06097813    0.187048995
## total sulfur dioxide   -0.11318144      0.076470005  0.03553302    0.203027882
## density                 0.66804729      0.022026232  0.36494718    0.355283371
## pH                     -0.68297819      0.234937294 -0.54190414   -0.085652422
## sulphates               0.18300566     -0.260986685  0.31277004    0.005527121
## alcohol                -0.06166827     -0.202288027  0.10990325    0.042075437
## quality                 0.12405165     -0.390557780  0.22637251    0.013731637
##                         chlorides free sulfur dioxide total sulfur dioxide
## fixed acidity         0.093705186        -0.153794193          -0.11318144
## volatile acidity      0.061297772        -0.010503827           0.07647000
## citric acid           0.203822914        -0.060978129           0.03553302
## residual sugar        0.055609535         0.187048995           0.20302788
## chlorides             1.000000000         0.005562147           0.04740047
## free sulfur dioxide   0.005562147         1.000000000           0.66766645
## total sulfur dioxide  0.047400468         0.667666450           1.00000000
## density               0.200632327        -0.021945831           0.07126948
## pH                   -0.265026131         0.070377499          -0.06649456
## sulphates             0.371260481         0.051657572           0.04294684
## alcohol              -0.221140545        -0.069408354          -0.20565394
## quality              -0.128906560        -0.050656057          -0.18510029
##                          density          pH    sulphates     alcohol
## fixed acidity         0.66804729 -0.68297819  0.183005664 -0.06166827
## volatile acidity      0.02202623  0.23493729 -0.260986685 -0.20228803
## citric acid           0.36494718 -0.54190414  0.312770044  0.10990325
## residual sugar        0.35528337 -0.08565242  0.005527121  0.04207544
## chlorides             0.20063233 -0.26502613  0.371260481 -0.22114054
## free sulfur dioxide  -0.02194583  0.07037750  0.051657572 -0.06940835
## total sulfur dioxide  0.07126948 -0.06649456  0.042946836 -0.20565394
## density               1.00000000 -0.34169933  0.148506412 -0.49617977
## pH                   -0.34169933  1.00000000 -0.196647602  0.20563251
## sulphates             0.14850641 -0.19664760  1.000000000  0.09359475
## alcohol              -0.49617977  0.20563251  0.093594750  1.00000000
## quality              -0.17491923 -0.05773139  0.251397079  0.47616632
##                          quality
## fixed acidity         0.12405165
## volatile acidity     -0.39055778
## citric acid           0.22637251
## residual sugar        0.01373164
## chlorides            -0.12890656
## free sulfur dioxide  -0.05065606
## total sulfur dioxide -0.18510029
## density              -0.17491923
## pH                   -0.05773139
## sulphates             0.25139708
## alcohol               0.47616632
## quality               1.00000000

library(corrplot)

## Warning: package 'corrplot' was built under R version 3.6.1

## corrplot 0.84 loaded

corrplot.mixed(cor(redwine),lower.col = "black", number.cex = .7)

Assigning a new variable “bestquality” as 1 for good if the “quality” level of redwine is above 6 and vice-versa.

redwine$bestquality=ifelse(redwine$quality>6,1,0)
table(redwine$bestquality)

## 
##    0    1 
## 1382  217

Plotting the Response variables(Quality of Redwine) Distribution Plot.

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.6.3

b1=ggplot(redwine,aes(x=quality))+
  geom_bar(stat = "count")+
  ggtitle("Distribution of Red Wine Quality Ratings")+
  scale_x_continuous(breaks = seq(3,8,1))+
   theme(plot.title = element_text(hjust = 0.5))+
    labs(x = "Quality Ratings")
b2=ggplot(redwine,aes(x=bestquality,fill=factor(bestquality)))+
  geom_bar(stat = "count")+
  ggtitle("Distribution of Red Wine Quality Ratings")+
  scale_x_continuous(breaks = seq(0,1,1))+
   theme(plot.title = element_text(hjust = 0.5))+
    labs(x = "0=Bad Quality 1=Good Quality")
b1

b2

Further, Plotting the datapoints density and Histogram plots of each independent variable to check how it is influencing the Quality of the Redwine.

library(ggplot2)
d1=ggplot(redwine,aes(x=redwine$`fixed acidity`,fill=factor(bestquality)))+
  geom_density(alpha=0.25)+
  scale_x_continuous(breaks = seq(5,15,1))+
  xlab(label = "Fixed Acidity Level")+
  ggtitle("Density Distribution of Fixed Acidity Levels")+
 theme(plot.title = element_text(hjust = 0.5))
d2=ggplot(redwine,aes(x=redwine$`volatile acidity`,fill=factor(bestquality)))+
  geom_density(alpha=0.25)+
  scale_x_continuous(breaks = seq(5,15,1))+
  xlab(label = "Volatile Acidity Level")+
  ggtitle("Density Distribution of Volatile Acidity Levels")+
 theme(plot.title = element_text(hjust = 0.5))
d3=ggplot(redwine,aes(x=redwine$`citric acid`,fill=factor(bestquality)))+
  geom_density(alpha=0.25)+
  scale_x_continuous(breaks = seq(5,15,1))+
  xlab(label = "Citric Acid Level")+
  ggtitle("Density Distribution of Citric Acid Levels")+
 theme(plot.title = element_text(hjust = 0.5))
d4=ggplot(redwine,aes(x=redwine$`residual sugar`,fill=factor(bestquality)))+
  geom_density(alpha=0.25)+
  scale_x_continuous(breaks = seq(5,15,1))+
  xlab(label = "Residual Sugar Level")+
  ggtitle("Density Distribution of Residual Sugar Levels")+
 theme(plot.title = element_text(hjust = 0.5))
d5=ggplot(redwine,aes(x=redwine$chlorides,fill=factor(bestquality)))+
  geom_density(alpha=0.25)+
  scale_x_continuous(breaks = seq(5,15,1))+
  xlab(label = "Chlorides present")+
  ggtitle("Density Distribution of Chlorides Levels")+
 theme(plot.title = element_text(hjust = 0.5))
d6=ggplot(redwine,aes(x=redwine$`free sulfur dioxide`,fill=factor(bestquality)))+
  geom_density(alpha=0.25)+
  scale_x_continuous(breaks = seq(5,15,1))+
  xlab(label = "Free sulfur Level")+
  ggtitle("Density Distribution of Free Sulfar Levels")+
 theme(plot.title = element_text(hjust = 0.5))
d7=ggplot(redwine,aes(x=redwine$`total sulfur dioxide`,fill=factor(bestquality)))+
  geom_density(alpha=0.25)+
  scale_x_continuous(breaks = seq(5,15,1))+
  xlab(label = "Total sulfur Level")+
  ggtitle("Density Distribution of Total Sulfar Levels")+
 theme(plot.title = element_text(hjust = 0.5))
d8=ggplot(redwine,aes(x=redwine$density,fill=factor(bestquality)))+
  geom_density(alpha=0.25)+
  scale_x_continuous(breaks = seq(5,15,1))+
  xlab(label = "Density")+
  ggtitle("Density Distribution of Density")+
 theme(plot.title = element_text(hjust = 0.5))
d9=ggplot(redwine,aes(x=redwine$sulphates,fill=factor(bestquality)))+
  geom_density(alpha=0.25)+
  scale_x_continuous(breaks = seq(5,15,1))+
  xlab(label = "Sulphates Present")+
  ggtitle("Density Distribution of Sulphates")+
 theme(plot.title = element_text(hjust = 0.5))
d10=ggplot(redwine,aes(x=redwine$pH,fill=factor(bestquality)))+
  geom_density(alpha=0.25)+
  scale_x_continuous(breaks = seq(5,15,1))+
  xlab(label = "pH levels")+
  ggtitle("Density Distribution of pH levels")+
 theme(plot.title = element_text(hjust = 0.5))
d11=ggplot(redwine,aes(x=redwine$alcohol,fill=factor(bestquality)))+
  geom_density(alpha=0.25)+
  scale_x_continuous(breaks = seq(5,15,1))+
  xlab(label = "Alchol levels")+
  ggtitle("Density Distribution of Alchol levels")+
 theme(plot.title = element_text(hjust = 0.5))
d1

d2

d3

d4

d5

d6

d7

d8

d9

d10

d11

library("ggplot2")
h1=ggplot(data = redwine, mapping = aes(x = redwine$`fixed acidity`)) +
  geom_histogram( bins=100, fill = "steelblue") +
  labs(x = "Fixed Acidity levels") +
  ggtitle("Histogram of Fixed Acidity levels") +
  theme(plot.title = element_text(hjust = 0.5)) 
h2=ggplot(data = redwine, mapping = aes(x = redwine$`volatile acidity`)) +
  geom_histogram( bins=30, fill = "darkolivegreen") +
  labs(x = "Volatile Acidity levels") +
  ggtitle("Histogram of Volatile Acidity levels ") +
  theme(plot.title = element_text(hjust = 0.5)) 
h3=ggplot(data = redwine, mapping = aes(x = redwine$`citric acid`)) +
  geom_histogram( bins=30, fill = "green1") +
  labs(x = "Citric acid levels") +
  ggtitle("Histogram of Citric acid levels ") +
  theme(plot.title = element_text(hjust = 0.5)) 
h4=ggplot(data = redwine, mapping = aes(x = redwine$`residual sugar`)) +
  geom_histogram( bins=30, fill = "tomato1") +
  labs(x = "Residual Sugar levels") +
  ggtitle("Histogram of Residual Sugar levels ") +
  theme(plot.title = element_text(hjust = 0.5)) 
h5=ggplot(data = redwine, mapping = aes(x = redwine$chlorides)) +
  geom_histogram( bins=30, fill = "red1") +
  labs(x = "Chlorides present") +
  ggtitle("Histogram of Chlorides present ") +
  theme(plot.title = element_text(hjust = 0.5)) 
h6=ggplot(data = redwine, mapping = aes(x = redwine$`free sulfur dioxide`)) +
  geom_histogram( bins=30, fill = "brown1") +
  labs(x = "Free sulfur dioxide levels") +
  ggtitle("Histogram of Free sulfur dioxide levles ") +
  theme(plot.title = element_text(hjust = 0.5)) 
h7=ggplot(data = redwine, mapping = aes(x = redwine$`total sulfur dioxide`)) +
  geom_histogram( bins=30, fill = "grey1") +
  labs(x = "Total sulfur dioxide levels") +
  ggtitle("Histogram of Total sulfur dioxide levles ") +
  theme(plot.title = element_text(hjust = 0.5)) 
h8=ggplot(data = redwine, mapping = aes(x = redwine$density)) +
  geom_histogram( bins=30, fill = "orange1") +
  labs(x = "Density") +
  ggtitle("Histogram of Density ") +
  theme(plot.title = element_text(hjust = 0.5)) 
h9=ggplot(data = redwine, mapping = aes(x = redwine$pH)) +
  geom_histogram( bins=30, fill = "orange1") +
  labs(x = "pH levels") +
  ggtitle("Histogram of  ") +
  theme(plot.title = element_text(hjust = 0.5)) 
h10=ggplot(data = redwine, mapping = aes(x = redwine$sulphates)) +
  geom_histogram( bins=30, fill = "orange1") +
  labs(x = "Sulphates present") +
  ggtitle("Histogram of Sulphates present") +
  theme(plot.title = element_text(hjust = 0.5)) 
h11=ggplot(data = redwine, mapping = aes(x = redwine$alcohol)) +
  geom_histogram( bins=30, fill = "orange1") +
  labs(x = "Alcohol levels") +
  ggtitle("Histogram of Alcohol levels ") +
  theme(plot.title = element_text(hjust = 0.5)) 
h1

h2

h3

h4

h5

h6

h7

h8

h9

h10

h11

library("GGally")

## Warning: package 'GGally' was built under R version 3.6.3

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

ggpairs(redwine)

Creating a Regressor to perform Modelling for checking the fit of the model and obtain performance results.

regressor=factor(redwine$bestquality)~redwine$`fixed acidity`+redwine$`volatile acidity`+redwine$`citric acid`+redwine$`residual sugar`+redwine$chlorides+redwine$`free sulfur dioxide`+redwine$`total sulfur dioxide`+redwine$density+redwine$pH+redwine$sulphates+redwine$alcohol

Performing RandomForest model to our dataset “Redwine” to check how the Quality factor is getting influenced by the other independent variables.

library(randomForest)

## Warning: package 'randomForest' was built under R version 3.6.1

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

set.seed(12345)
redwine_RandomForests<-randomForest(regressor,redwine,ntree=150,nodesize = 10, proximity = T)
redwine_RandomForests

## 
## Call:
##  randomForest(formula = regressor, data = redwine, ntree = 150,      nodesize = 10, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 150
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 9.57%
## Confusion matrix:
##      0  1 class.error
## 0 1354 28  0.02026049
## 1  125 92  0.57603687

varImpPlot(redwine_RandomForests)

We can see that the Performance from our model is approximately 91.5% alongide we can clearly see that the Alchol, Sulphates, Volatile Acidity and Density are the most important factors influencing the quality of the Redwine.

Performing Logistic Regression to our dataset to check the “Quality”(Categorical) factor is getting influenced by the other independent variables.

redwine_logit=glm(regressor, data = redwine, family = "binomial")
summary(redwine_logit)

## 
## Call:
## glm(formula = regressor, family = "binomial", data = redwine)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9878  -0.4351  -0.2207  -0.1222   2.9869  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     2.428e+02  1.081e+02   2.247 0.024660 *  
## redwine$`fixed acidity`         2.750e-01  1.253e-01   2.195 0.028183 *  
## redwine$`volatile acidity`     -2.581e+00  7.843e-01  -3.291 0.000999 ***
## redwine$`citric acid`           5.678e-01  8.385e-01   0.677 0.498313    
## redwine$`residual sugar`        2.395e-01  7.373e-02   3.248 0.001163 ** 
## redwine$chlorides              -8.816e+00  3.365e+00  -2.620 0.008788 ** 
## redwine$`free sulfur dioxide`   1.082e-02  1.223e-02   0.884 0.376469    
## redwine$`total sulfur dioxide` -1.653e-02  4.894e-03  -3.378 0.000731 ***
## redwine$density                -2.578e+02  1.104e+02  -2.335 0.019536 *  
## redwine$pH                      2.242e-01  9.984e-01   0.225 0.822327    
## redwine$sulphates               3.750e+00  5.416e-01   6.924 4.39e-12 ***
## redwine$alcohol                 7.533e-01  1.316e-01   5.724 1.04e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1269.92  on 1598  degrees of freedom
## Residual deviance:  870.86  on 1587  degrees of freedom
## AIC: 894.86
## 
## Number of Fisher Scoring iterations: 6

The Logistic model clearly states that most of the variables are statistically significant except two variables i.e Citric acid and pH. Alongside the Volatile Acidity, Sulfur dioxide and Alcohol levels are most significant or important to our model which means the Quality factor is highly influenced by the abvove three variables.

Then checking the confidence interval wherein the datapoints are significant or influencing the model.

confint(redwine_logit)

## Waiting for profiling to be done...

##                                        2.5 %        97.5 %
## (Intercept)                      31.39072397 455.477987722
## redwine$`fixed acidity`           0.02920528   0.520924990
## redwine$`volatile acidity`       -4.14912796  -1.075526938
## redwine$`citric acid`            -1.08534075   2.203320201
## redwine$`residual sugar`          0.08873459   0.380410083
## redwine$chlorides               -16.17020457  -2.978181607
## redwine$`free sulfur dioxide`    -0.01326636   0.034821389
## redwine$`total sulfur dioxide`   -0.02664137  -0.007429704
## redwine$density                -475.10890589 -41.834834603
## redwine$pH                       -1.74982855   2.167994149
## redwine$sulphates                 2.68719771   4.819706731
## redwine$alcohol                   0.49736265   1.014132717

Furthermore, generating the Odds Ratio’s and its Confidence interval so as to know with what ratio the Quality factor gets influenced which can be seen below:

exp(cbind(OR = coef(redwine_logit), confint(redwine_logit)))

## Waiting for profiling to be done...

##                                           OR         2.5 %        97.5 %
## (Intercept)                    2.694155e+105  4.293567e+13 6.480025e+197
## redwine$`fixed acidity`         1.316469e+00  1.029636e+00  1.683584e+00
## redwine$`volatile acidity`      7.569811e-02  1.577817e-02  3.411180e-01
## redwine$`citric acid`           1.764371e+00  3.377867e-01  9.055028e+00
## redwine$`residual sugar`        1.270568e+00  1.092791e+00  1.462884e+00
## redwine$chlorides               1.482863e-04  9.492255e-08  5.088528e-02
## redwine$`free sulfur dioxide`   1.010879e+00  9.868212e-01  1.035435e+00
## redwine$`total sulfur dioxide`  9.836053e-01  9.737104e-01  9.925978e-01
## redwine$density                1.096312e-112 4.600699e-207  6.782069e-19
## redwine$pH                      1.251303e+00  1.738037e-01  8.740734e+00
## redwine$sulphates               4.251593e+01  1.469045e+01  1.239287e+02
## redwine$alcohol                 2.124081e+00  1.644379e+00  2.756971e+00

redwine_prob=predict(redwine_logit, redwine, type="response")
redwine_pred = ifelse(redwine_prob>= 0.5, "Good", "Bad")
table(redwine_pred,redwine$bestquality)

##             
## redwine_pred    0    1
##         Bad  1339  142
##         Good   43   75

redwine_logit_accuracy=(1339+75)/(1339+142+43+75)
redwine_logit_accuracy

## [1] 0.8843027

From above we can say that accurracy of our Logistkc model predictions is around 88.5%.

Here checking for the mulitcollinearity between the variables by performing VIF

library(car)

## Warning: package 'car' was built under R version 3.6.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 3.6.3

vif(redwine_logit)

##        redwine$`fixed acidity`     redwine$`volatile acidity` 
##                       8.548951                       1.789987 
##          redwine$`citric acid`       redwine$`residual sugar` 
##                       3.455691                       1.714940 
##              redwine$chlorides  redwine$`free sulfur dioxide` 
##                       1.252465                       2.086780 
## redwine$`total sulfur dioxide`                redwine$density 
##                       2.305224                       7.948391 
##                     redwine$pH              redwine$sulphates 
##                       3.409585                       1.319694 
##                redwine$alcohol 
##                       2.536675

We have two factor(Fixed Acidity and Density) that are having VIF more than 5 which states that the multicollinearity is high but it is acceptable in our case.

Then, Checking for Normaility assumptions.

plot(redwine_logit)

We can clearly see the dispersion of Categorical datapoints scattered and all the assumptions looks good with few outliers which is negligible.

Finally, Performing the Principal Component Analysis for our dataset “Redwine”.

redwine_pca <- prcomp(redwine[,c(1:11)], center = TRUE,scale. = TRUE)
summary(redwine_pca)

## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.7604 1.3878 1.2452 1.1015 0.97943 0.81216 0.76406
## Proportion of Variance 0.2817 0.1751 0.1410 0.1103 0.08721 0.05996 0.05307
## Cumulative Proportion  0.2817 0.4568 0.5978 0.7081 0.79528 0.85525 0.90832
##                            PC8     PC9    PC10    PC11
## Standard deviation     0.65035 0.58706 0.42583 0.24405
## Proportion of Variance 0.03845 0.03133 0.01648 0.00541
## Cumulative Proportion  0.94677 0.97810 0.99459 1.00000

From above we can clearly see that the first 7 Pricipal Components are contributing to 90% of the total variance.

ggbiplot2=function(pcobj, choices = 1:2, scale = 1, pc.biplot = TRUE, 
         obs.scale = 1 - scale, var.scale = scale, 
         groups = NULL, ellipse = FALSE, ellipse.prob = 0.68, 
         labels = NULL, labels.size = 3, alpha = 1, 
         var.axes = TRUE, 
         circle = FALSE, circle.prob = 0.69, 
         varname.size = 3, varname.adjust = 1.5, 
         varname.abbrev = FALSE, ...)
{
  library(ggplot2)
  library(plyr)
  library(scales)
  library(grid)

  stopifnot(length(choices) == 2)

  # Recover the SVD
  if(inherits(pcobj, 'prcomp')){
    nobs.factor <- sqrt(nrow(pcobj$x) - 1)
    d <- pcobj$sdev
    u <- sweep(pcobj$x, 2, 1 / (d * nobs.factor), FUN = '*')
    v <- pcobj$rotation
  } else if(inherits(pcobj, 'princomp')) {
    nobs.factor <- sqrt(pcobj$n.obs)
    d <- pcobj$sdev
    u <- sweep(pcobj$scores, 2, 1 / (d * nobs.factor), FUN = '*')
    v <- pcobj$loadings
  } else if(inherits(pcobj, 'PCA')) {
    nobs.factor <- sqrt(nrow(pcobj$call$X))
    d <- unlist(sqrt(pcobj$eig)[1])
    u <- sweep(pcobj$ind$coord, 2, 1 / (d * nobs.factor), FUN = '*')
    v <- sweep(pcobj$var$coord,2,sqrt(pcobj$eig[1:ncol(pcobj$var$coord),1]),FUN="/")
  } else {
    stop('Expected a object of class prcomp, princomp or PCA')
  }

  # Scores
  df.u <- as.data.frame(sweep(u[,choices], 2, d[choices]^obs.scale, FUN='*'))

  # Directions
  v <- sweep(v, 2, d^var.scale, FUN='*')
  df.v <- as.data.frame(v[, choices])

  names(df.u) <- c('xvar', 'yvar')
  names(df.v) <- names(df.u)

  if(pc.biplot) {
    df.u <- df.u * nobs.factor
  }

  # Scale the radius of the correlation circle so that it corresponds to 
  # a data ellipse for the standardized PC scores
  r <- 1

  # Scale directions
  v.scale <- rowSums(v^2)
  df.v <- df.v / sqrt(max(v.scale))

  ## Scale Scores
  r.scale=sqrt(max(df.u[,1]^2+df.u[,2]^2))
  df.u=.99*df.u/r.scale

  # Change the labels for the axes
  if(obs.scale == 0) {
    u.axis.labs <- paste('standardized PC', choices, sep='')
  } else {
    u.axis.labs <- paste('PC', choices, sep='')
  }

  # Append the proportion of explained variance to the axis labels
  u.axis.labs <- paste(u.axis.labs, 
                       sprintf('(%0.1f%% explained var.)', 
                               100 * pcobj$sdev[choices]^2/sum(pcobj$sdev^2)))

  # Score Labels
  if(!is.null(labels)) {
    df.u$labels <- labels
  }

  # Grouping variable
  if(!is.null(groups)) {
    df.u$groups <- groups
  }

  # Variable Names
  if(varname.abbrev) {
    df.v$varname <- abbreviate(rownames(v))
  } else {
    df.v$varname <- rownames(v)
  }

  # Variables for text label placement
  df.v$angle <- with(df.v, (180/pi) * atan(yvar / xvar))
  df.v$hjust = with(df.v, (1 - varname.adjust * sign(xvar)) / 2)

  # Base plot
  g <- ggplot(data = df.u, aes(x = xvar, y = yvar)) + 
    xlab(u.axis.labs[1]) + ylab(u.axis.labs[2]) + coord_equal()

  if(var.axes) {
    # Draw circle
    if(circle) 
    {
      theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
      circle <- data.frame(xvar = r * cos(theta), yvar = r * sin(theta))
      g <- g + geom_path(data = circle, color = muted('white'), 
                         size = 1/2, alpha = 1/3)
    }

    # Draw directions
    g <- g +
      geom_segment(data = df.v,
                   aes(x = 0, y = 0, xend = xvar, yend = yvar),
                   arrow = arrow(length = unit(1/2, 'picas')), 
                   color = muted('red'))
  }

  # Draw either labels or points
  if(!is.null(df.u$labels)) {
    if(!is.null(df.u$groups)) {
      g <- g + geom_text(aes(label = labels, color = groups), 
                         size = labels.size)
    } else {
      g <- g + geom_text(aes(label = labels), size = labels.size)      
    }
  } else {
    if(!is.null(df.u$groups)) {
      g <- g + geom_point(aes(color = groups), alpha = alpha)
    } else {
      g <- g + geom_point(alpha = alpha)      
    }
  }

  # Overlay a concentration ellipse if there are groups
  if(!is.null(df.u$groups) && ellipse) {
    theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
    circle <- cbind(cos(theta), sin(theta))

    ell <- ddply(df.u, 'groups', function(x) {
      if(nrow(x) < 2) {
        return(NULL)
      } else if(nrow(x) == 2) {
        sigma <- var(cbind(x$xvar, x$yvar))
      } else {
        sigma <- diag(c(var(x$xvar), var(x$yvar)))
      }
      mu <- c(mean(x$xvar), mean(x$yvar))
      ed <- sqrt(qchisq(ellipse.prob, df = 2))
      data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = '+'), 
                 groups = x$groups[1])
    })
    names(ell)[1:2] <- c('xvar', 'yvar')
    g <- g + geom_path(data = ell, aes(color = groups, group = groups))
  }

  # Label the variable axes
  if(var.axes) {
    g <- g + 
      geom_text(data = df.v, 
                aes(label = varname, x = xvar, y = yvar, 
                    angle = angle, hjust = hjust), 
                color = 'darkred', size = varname.size)
  }
  # Change the name of the legend for groups
  # if(!is.null(groups)) {
  #   g <- g + scale_color_brewer(name = deparse(substitute(groups)), 
  #                               palette = 'Dark2')
  # }

  # TODO: Add a second set of axes

  return(g)
}

ggbiplot2(redwine_pca)

## Warning: package 'plyr' was built under R version 3.6.3

## Warning: package 'scales' was built under R version 3.6.3

## 
## Attaching package: 'scales'

## The following object is masked from 'package:readr':
## 
##     col_factor

In conclusion, Volatile Acidity, Sulfur dioxide and Alcohol levels are the three important factors which are significantly influencing the Quality of the Redwine while Citric Acid and pH does not have any effect on the Quality of the Redwine. Alongside, RandomForest regression model predicts with better performance or accuracy over the Logistic Regression model.

Quality analysis of Redwine

Kowshik Kumar Bandameedipalli

3/23/2020