For starters, this Project kernel is to perform Exploratory Data Analysis on the dataset “Red Wine Quality” and to analyze which independent variables influences the most on the Quality factor (Response Variable) of Red wine.
Firstly, Importing the dataset “Redwine” and performing simple analysis.
library(readr)
## Warning: package 'readr' was built under R version 3.6.3
redwine = read_csv("C:/Users/Kowshik Kumar B/Documents/HU/506/Project/redwine.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## `fixed acidity` = col_double(),
## `volatile acidity` = col_double(),
## `citric acid` = col_double(),
## `residual sugar` = col_double(),
## chlorides = col_double(),
## `free sulfur dioxide` = col_double(),
## `total sulfur dioxide` = col_double(),
## density = col_double(),
## pH = col_double(),
## sulphates = col_double(),
## alcohol = col_double(),
## quality = col_double()
## )
head(redwine)
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi~ `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## # ... with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
tail(redwine)
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi~ `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.8 0.62 0.08 1.9 0.068
## 2 6.2 0.6 0.08 2 0.09
## 3 5.9 0.55 0.1 2.2 0.062
## 4 6.3 0.51 0.13 2.3 0.076
## 5 5.9 0.645 0.12 2 0.075
## 6 6 0.31 0.47 3.6 0.067
## # ... with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
str(redwine)
## tibble [1,599 x 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ fixed acidity : num [1:1599] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile acidity : num [1:1599] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric acid : num [1:1599] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual sugar : num [1:1599] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num [1:1599] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free sulfur dioxide : num [1:1599] 11 25 15 17 11 13 15 15 9 17 ...
## $ total sulfur dioxide: num [1:1599] 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num [1:1599] 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num [1:1599] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num [1:1599] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num [1:1599] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : num [1:1599] 5 5 5 6 5 5 5 7 7 5 ...
## - attr(*, "spec")=
## .. cols(
## .. `fixed acidity` = col_double(),
## .. `volatile acidity` = col_double(),
## .. `citric acid` = col_double(),
## .. `residual sugar` = col_double(),
## .. chlorides = col_double(),
## .. `free sulfur dioxide` = col_double(),
## .. `total sulfur dioxide` = col_double(),
## .. density = col_double(),
## .. pH = col_double(),
## .. sulphates = col_double(),
## .. alcohol = col_double(),
## .. quality = col_double()
## .. )
dim(redwine)
## [1] 1599 12
We can clearly interpret that the dataset has 12 Columns and 1599 Rows alongside all the independent variables are Numerical data and the response variable can be featured as Categorial data.
Then Performing Summary Statistics and Plotting the datapoints for undestanding or estimating the variables of the dataset “RedWine”
summary(redwine)
## fixed acidity volatile acidity citric acid residual sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
plot(redwine)
Checking for total number of NA variables in our dataset.
NA_values=data.frame(no_of_na_values=colSums(is.na(redwine)))
head(NA_values,12)
## no_of_na_values
## fixed acidity 0
## volatile acidity 0
## citric acid 0
## residual sugar 0
## chlorides 0
## free sulfur dioxide 0
## total sulfur dioxide 0
## density 0
## pH 0
## sulphates 0
## alcohol 0
## quality 0
We can clearly see that we do not have any NA Values.
Further, Checking the Correlation between the varaibles of our dataset and visualizing the correlation matrix.
cor(redwine)
## fixed acidity volatile acidity citric acid residual sugar
## fixed acidity 1.00000000 -0.256130895 0.67170343 0.114776724
## volatile acidity -0.25613089 1.000000000 -0.55249568 0.001917882
## citric acid 0.67170343 -0.552495685 1.00000000 0.143577162
## residual sugar 0.11477672 0.001917882 0.14357716 1.000000000
## chlorides 0.09370519 0.061297772 0.20382291 0.055609535
## free sulfur dioxide -0.15379419 -0.010503827 -0.06097813 0.187048995
## total sulfur dioxide -0.11318144 0.076470005 0.03553302 0.203027882
## density 0.66804729 0.022026232 0.36494718 0.355283371
## pH -0.68297819 0.234937294 -0.54190414 -0.085652422
## sulphates 0.18300566 -0.260986685 0.31277004 0.005527121
## alcohol -0.06166827 -0.202288027 0.10990325 0.042075437
## quality 0.12405165 -0.390557780 0.22637251 0.013731637
## chlorides free sulfur dioxide total sulfur dioxide
## fixed acidity 0.093705186 -0.153794193 -0.11318144
## volatile acidity 0.061297772 -0.010503827 0.07647000
## citric acid 0.203822914 -0.060978129 0.03553302
## residual sugar 0.055609535 0.187048995 0.20302788
## chlorides 1.000000000 0.005562147 0.04740047
## free sulfur dioxide 0.005562147 1.000000000 0.66766645
## total sulfur dioxide 0.047400468 0.667666450 1.00000000
## density 0.200632327 -0.021945831 0.07126948
## pH -0.265026131 0.070377499 -0.06649456
## sulphates 0.371260481 0.051657572 0.04294684
## alcohol -0.221140545 -0.069408354 -0.20565394
## quality -0.128906560 -0.050656057 -0.18510029
## density pH sulphates alcohol
## fixed acidity 0.66804729 -0.68297819 0.183005664 -0.06166827
## volatile acidity 0.02202623 0.23493729 -0.260986685 -0.20228803
## citric acid 0.36494718 -0.54190414 0.312770044 0.10990325
## residual sugar 0.35528337 -0.08565242 0.005527121 0.04207544
## chlorides 0.20063233 -0.26502613 0.371260481 -0.22114054
## free sulfur dioxide -0.02194583 0.07037750 0.051657572 -0.06940835
## total sulfur dioxide 0.07126948 -0.06649456 0.042946836 -0.20565394
## density 1.00000000 -0.34169933 0.148506412 -0.49617977
## pH -0.34169933 1.00000000 -0.196647602 0.20563251
## sulphates 0.14850641 -0.19664760 1.000000000 0.09359475
## alcohol -0.49617977 0.20563251 0.093594750 1.00000000
## quality -0.17491923 -0.05773139 0.251397079 0.47616632
## quality
## fixed acidity 0.12405165
## volatile acidity -0.39055778
## citric acid 0.22637251
## residual sugar 0.01373164
## chlorides -0.12890656
## free sulfur dioxide -0.05065606
## total sulfur dioxide -0.18510029
## density -0.17491923
## pH -0.05773139
## sulphates 0.25139708
## alcohol 0.47616632
## quality 1.00000000
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.1
## corrplot 0.84 loaded
corrplot.mixed(cor(redwine),lower.col = "black", number.cex = .7)
Assigning a new variable “bestquality” as 1 for good if the “quality” level of redwine is above 6 and vice-versa.
redwine$bestquality=ifelse(redwine$quality>6,1,0)
table(redwine$bestquality)
##
## 0 1
## 1382 217
Plotting the Response variables(Quality of Redwine) Distribution Plot.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
b1=ggplot(redwine,aes(x=quality))+
geom_bar(stat = "count")+
ggtitle("Distribution of Red Wine Quality Ratings")+
scale_x_continuous(breaks = seq(3,8,1))+
theme(plot.title = element_text(hjust = 0.5))+
labs(x = "Quality Ratings")
b2=ggplot(redwine,aes(x=bestquality,fill=factor(bestquality)))+
geom_bar(stat = "count")+
ggtitle("Distribution of Red Wine Quality Ratings")+
scale_x_continuous(breaks = seq(0,1,1))+
theme(plot.title = element_text(hjust = 0.5))+
labs(x = "0=Bad Quality 1=Good Quality")
b1
b2
Further, Plotting the datapoints density and Histogram plots of each independent variable to check how it is influencing the Quality of the Redwine.
library(ggplot2)
d1=ggplot(redwine,aes(x=redwine$`fixed acidity`,fill=factor(bestquality)))+
geom_density(alpha=0.25)+
scale_x_continuous(breaks = seq(5,15,1))+
xlab(label = "Fixed Acidity Level")+
ggtitle("Density Distribution of Fixed Acidity Levels")+
theme(plot.title = element_text(hjust = 0.5))
d2=ggplot(redwine,aes(x=redwine$`volatile acidity`,fill=factor(bestquality)))+
geom_density(alpha=0.25)+
scale_x_continuous(breaks = seq(5,15,1))+
xlab(label = "Volatile Acidity Level")+
ggtitle("Density Distribution of Volatile Acidity Levels")+
theme(plot.title = element_text(hjust = 0.5))
d3=ggplot(redwine,aes(x=redwine$`citric acid`,fill=factor(bestquality)))+
geom_density(alpha=0.25)+
scale_x_continuous(breaks = seq(5,15,1))+
xlab(label = "Citric Acid Level")+
ggtitle("Density Distribution of Citric Acid Levels")+
theme(plot.title = element_text(hjust = 0.5))
d4=ggplot(redwine,aes(x=redwine$`residual sugar`,fill=factor(bestquality)))+
geom_density(alpha=0.25)+
scale_x_continuous(breaks = seq(5,15,1))+
xlab(label = "Residual Sugar Level")+
ggtitle("Density Distribution of Residual Sugar Levels")+
theme(plot.title = element_text(hjust = 0.5))
d5=ggplot(redwine,aes(x=redwine$chlorides,fill=factor(bestquality)))+
geom_density(alpha=0.25)+
scale_x_continuous(breaks = seq(5,15,1))+
xlab(label = "Chlorides present")+
ggtitle("Density Distribution of Chlorides Levels")+
theme(plot.title = element_text(hjust = 0.5))
d6=ggplot(redwine,aes(x=redwine$`free sulfur dioxide`,fill=factor(bestquality)))+
geom_density(alpha=0.25)+
scale_x_continuous(breaks = seq(5,15,1))+
xlab(label = "Free sulfur Level")+
ggtitle("Density Distribution of Free Sulfar Levels")+
theme(plot.title = element_text(hjust = 0.5))
d7=ggplot(redwine,aes(x=redwine$`total sulfur dioxide`,fill=factor(bestquality)))+
geom_density(alpha=0.25)+
scale_x_continuous(breaks = seq(5,15,1))+
xlab(label = "Total sulfur Level")+
ggtitle("Density Distribution of Total Sulfar Levels")+
theme(plot.title = element_text(hjust = 0.5))
d8=ggplot(redwine,aes(x=redwine$density,fill=factor(bestquality)))+
geom_density(alpha=0.25)+
scale_x_continuous(breaks = seq(5,15,1))+
xlab(label = "Density")+
ggtitle("Density Distribution of Density")+
theme(plot.title = element_text(hjust = 0.5))
d9=ggplot(redwine,aes(x=redwine$sulphates,fill=factor(bestquality)))+
geom_density(alpha=0.25)+
scale_x_continuous(breaks = seq(5,15,1))+
xlab(label = "Sulphates Present")+
ggtitle("Density Distribution of Sulphates")+
theme(plot.title = element_text(hjust = 0.5))
d10=ggplot(redwine,aes(x=redwine$pH,fill=factor(bestquality)))+
geom_density(alpha=0.25)+
scale_x_continuous(breaks = seq(5,15,1))+
xlab(label = "pH levels")+
ggtitle("Density Distribution of pH levels")+
theme(plot.title = element_text(hjust = 0.5))
d11=ggplot(redwine,aes(x=redwine$alcohol,fill=factor(bestquality)))+
geom_density(alpha=0.25)+
scale_x_continuous(breaks = seq(5,15,1))+
xlab(label = "Alchol levels")+
ggtitle("Density Distribution of Alchol levels")+
theme(plot.title = element_text(hjust = 0.5))
d1
d2
d3
d4
d5
d6
d7
d8
d9
d10
d11
library("ggplot2")
h1=ggplot(data = redwine, mapping = aes(x = redwine$`fixed acidity`)) +
geom_histogram( bins=100, fill = "steelblue") +
labs(x = "Fixed Acidity levels") +
ggtitle("Histogram of Fixed Acidity levels") +
theme(plot.title = element_text(hjust = 0.5))
h2=ggplot(data = redwine, mapping = aes(x = redwine$`volatile acidity`)) +
geom_histogram( bins=30, fill = "darkolivegreen") +
labs(x = "Volatile Acidity levels") +
ggtitle("Histogram of Volatile Acidity levels ") +
theme(plot.title = element_text(hjust = 0.5))
h3=ggplot(data = redwine, mapping = aes(x = redwine$`citric acid`)) +
geom_histogram( bins=30, fill = "green1") +
labs(x = "Citric acid levels") +
ggtitle("Histogram of Citric acid levels ") +
theme(plot.title = element_text(hjust = 0.5))
h4=ggplot(data = redwine, mapping = aes(x = redwine$`residual sugar`)) +
geom_histogram( bins=30, fill = "tomato1") +
labs(x = "Residual Sugar levels") +
ggtitle("Histogram of Residual Sugar levels ") +
theme(plot.title = element_text(hjust = 0.5))
h5=ggplot(data = redwine, mapping = aes(x = redwine$chlorides)) +
geom_histogram( bins=30, fill = "red1") +
labs(x = "Chlorides present") +
ggtitle("Histogram of Chlorides present ") +
theme(plot.title = element_text(hjust = 0.5))
h6=ggplot(data = redwine, mapping = aes(x = redwine$`free sulfur dioxide`)) +
geom_histogram( bins=30, fill = "brown1") +
labs(x = "Free sulfur dioxide levels") +
ggtitle("Histogram of Free sulfur dioxide levles ") +
theme(plot.title = element_text(hjust = 0.5))
h7=ggplot(data = redwine, mapping = aes(x = redwine$`total sulfur dioxide`)) +
geom_histogram( bins=30, fill = "grey1") +
labs(x = "Total sulfur dioxide levels") +
ggtitle("Histogram of Total sulfur dioxide levles ") +
theme(plot.title = element_text(hjust = 0.5))
h8=ggplot(data = redwine, mapping = aes(x = redwine$density)) +
geom_histogram( bins=30, fill = "orange1") +
labs(x = "Density") +
ggtitle("Histogram of Density ") +
theme(plot.title = element_text(hjust = 0.5))
h9=ggplot(data = redwine, mapping = aes(x = redwine$pH)) +
geom_histogram( bins=30, fill = "orange1") +
labs(x = "pH levels") +
ggtitle("Histogram of ") +
theme(plot.title = element_text(hjust = 0.5))
h10=ggplot(data = redwine, mapping = aes(x = redwine$sulphates)) +
geom_histogram( bins=30, fill = "orange1") +
labs(x = "Sulphates present") +
ggtitle("Histogram of Sulphates present") +
theme(plot.title = element_text(hjust = 0.5))
h11=ggplot(data = redwine, mapping = aes(x = redwine$alcohol)) +
geom_histogram( bins=30, fill = "orange1") +
labs(x = "Alcohol levels") +
ggtitle("Histogram of Alcohol levels ") +
theme(plot.title = element_text(hjust = 0.5))
h1
h2
h3
h4
h5
h6
h7
h8
h9
h10
h11
library("GGally")
## Warning: package 'GGally' was built under R version 3.6.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(redwine)
Creating a Regressor to perform Modelling for checking the fit of the model and obtain performance results.
regressor=factor(redwine$bestquality)~redwine$`fixed acidity`+redwine$`volatile acidity`+redwine$`citric acid`+redwine$`residual sugar`+redwine$chlorides+redwine$`free sulfur dioxide`+redwine$`total sulfur dioxide`+redwine$density+redwine$pH+redwine$sulphates+redwine$alcohol
Performing RandomForest model to our dataset “Redwine” to check how the Quality factor is getting influenced by the other independent variables.
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.1
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(12345)
redwine_RandomForests<-randomForest(regressor,redwine,ntree=150,nodesize = 10, proximity = T)
redwine_RandomForests
##
## Call:
## randomForest(formula = regressor, data = redwine, ntree = 150, nodesize = 10, proximity = T)
## Type of random forest: classification
## Number of trees: 150
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 9.57%
## Confusion matrix:
## 0 1 class.error
## 0 1354 28 0.02026049
## 1 125 92 0.57603687
varImpPlot(redwine_RandomForests)
We can see that the Performance from our model is approximately 91.5% alongide we can clearly see that the Alchol, Sulphates, Volatile Acidity and Density are the most important factors influencing the quality of the Redwine.
Performing Logistic Regression to our dataset to check the “Quality”(Categorical) factor is getting influenced by the other independent variables.
redwine_logit=glm(regressor, data = redwine, family = "binomial")
summary(redwine_logit)
##
## Call:
## glm(formula = regressor, family = "binomial", data = redwine)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9878 -0.4351 -0.2207 -0.1222 2.9869
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.428e+02 1.081e+02 2.247 0.024660 *
## redwine$`fixed acidity` 2.750e-01 1.253e-01 2.195 0.028183 *
## redwine$`volatile acidity` -2.581e+00 7.843e-01 -3.291 0.000999 ***
## redwine$`citric acid` 5.678e-01 8.385e-01 0.677 0.498313
## redwine$`residual sugar` 2.395e-01 7.373e-02 3.248 0.001163 **
## redwine$chlorides -8.816e+00 3.365e+00 -2.620 0.008788 **
## redwine$`free sulfur dioxide` 1.082e-02 1.223e-02 0.884 0.376469
## redwine$`total sulfur dioxide` -1.653e-02 4.894e-03 -3.378 0.000731 ***
## redwine$density -2.578e+02 1.104e+02 -2.335 0.019536 *
## redwine$pH 2.242e-01 9.984e-01 0.225 0.822327
## redwine$sulphates 3.750e+00 5.416e-01 6.924 4.39e-12 ***
## redwine$alcohol 7.533e-01 1.316e-01 5.724 1.04e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1269.92 on 1598 degrees of freedom
## Residual deviance: 870.86 on 1587 degrees of freedom
## AIC: 894.86
##
## Number of Fisher Scoring iterations: 6
The Logistic model clearly states that most of the variables are statistically significant except two variables i.e Citric acid and pH. Alongside the Volatile Acidity, Sulfur dioxide and Alcohol levels are most significant or important to our model which means the Quality factor is highly influenced by the abvove three variables.
Then checking the confidence interval wherein the datapoints are significant or influencing the model.
confint(redwine_logit)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 31.39072397 455.477987722
## redwine$`fixed acidity` 0.02920528 0.520924990
## redwine$`volatile acidity` -4.14912796 -1.075526938
## redwine$`citric acid` -1.08534075 2.203320201
## redwine$`residual sugar` 0.08873459 0.380410083
## redwine$chlorides -16.17020457 -2.978181607
## redwine$`free sulfur dioxide` -0.01326636 0.034821389
## redwine$`total sulfur dioxide` -0.02664137 -0.007429704
## redwine$density -475.10890589 -41.834834603
## redwine$pH -1.74982855 2.167994149
## redwine$sulphates 2.68719771 4.819706731
## redwine$alcohol 0.49736265 1.014132717
Furthermore, generating the Odds Ratio’s and its Confidence interval so as to know with what ratio the Quality factor gets influenced which can be seen below:
exp(cbind(OR = coef(redwine_logit), confint(redwine_logit)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 2.694155e+105 4.293567e+13 6.480025e+197
## redwine$`fixed acidity` 1.316469e+00 1.029636e+00 1.683584e+00
## redwine$`volatile acidity` 7.569811e-02 1.577817e-02 3.411180e-01
## redwine$`citric acid` 1.764371e+00 3.377867e-01 9.055028e+00
## redwine$`residual sugar` 1.270568e+00 1.092791e+00 1.462884e+00
## redwine$chlorides 1.482863e-04 9.492255e-08 5.088528e-02
## redwine$`free sulfur dioxide` 1.010879e+00 9.868212e-01 1.035435e+00
## redwine$`total sulfur dioxide` 9.836053e-01 9.737104e-01 9.925978e-01
## redwine$density 1.096312e-112 4.600699e-207 6.782069e-19
## redwine$pH 1.251303e+00 1.738037e-01 8.740734e+00
## redwine$sulphates 4.251593e+01 1.469045e+01 1.239287e+02
## redwine$alcohol 2.124081e+00 1.644379e+00 2.756971e+00
redwine_prob=predict(redwine_logit, redwine, type="response")
redwine_pred = ifelse(redwine_prob>= 0.5, "Good", "Bad")
table(redwine_pred,redwine$bestquality)
##
## redwine_pred 0 1
## Bad 1339 142
## Good 43 75
redwine_logit_accuracy=(1339+75)/(1339+142+43+75)
redwine_logit_accuracy
## [1] 0.8843027
From above we can say that accurracy of our Logistkc model predictions is around 88.5%.
Here checking for the mulitcollinearity between the variables by performing VIF
library(car)
## Warning: package 'car' was built under R version 3.6.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.6.3
vif(redwine_logit)
## redwine$`fixed acidity` redwine$`volatile acidity`
## 8.548951 1.789987
## redwine$`citric acid` redwine$`residual sugar`
## 3.455691 1.714940
## redwine$chlorides redwine$`free sulfur dioxide`
## 1.252465 2.086780
## redwine$`total sulfur dioxide` redwine$density
## 2.305224 7.948391
## redwine$pH redwine$sulphates
## 3.409585 1.319694
## redwine$alcohol
## 2.536675
We have two factor(Fixed Acidity and Density) that are having VIF more than 5 which states that the multicollinearity is high but it is acceptable in our case.
Then, Checking for Normaility assumptions.
plot(redwine_logit)
We can clearly see the dispersion of Categorical datapoints scattered and all the assumptions looks good with few outliers which is negligible.
Finally, Performing the Principal Component Analysis for our dataset “Redwine”.
redwine_pca <- prcomp(redwine[,c(1:11)], center = TRUE,scale. = TRUE)
summary(redwine_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.7604 1.3878 1.2452 1.1015 0.97943 0.81216 0.76406
## Proportion of Variance 0.2817 0.1751 0.1410 0.1103 0.08721 0.05996 0.05307
## Cumulative Proportion 0.2817 0.4568 0.5978 0.7081 0.79528 0.85525 0.90832
## PC8 PC9 PC10 PC11
## Standard deviation 0.65035 0.58706 0.42583 0.24405
## Proportion of Variance 0.03845 0.03133 0.01648 0.00541
## Cumulative Proportion 0.94677 0.97810 0.99459 1.00000
From above we can clearly see that the first 7 Pricipal Components are contributing to 90% of the total variance.
ggbiplot2=function(pcobj, choices = 1:2, scale = 1, pc.biplot = TRUE,
obs.scale = 1 - scale, var.scale = scale,
groups = NULL, ellipse = FALSE, ellipse.prob = 0.68,
labels = NULL, labels.size = 3, alpha = 1,
var.axes = TRUE,
circle = FALSE, circle.prob = 0.69,
varname.size = 3, varname.adjust = 1.5,
varname.abbrev = FALSE, ...)
{
library(ggplot2)
library(plyr)
library(scales)
library(grid)
stopifnot(length(choices) == 2)
# Recover the SVD
if(inherits(pcobj, 'prcomp')){
nobs.factor <- sqrt(nrow(pcobj$x) - 1)
d <- pcobj$sdev
u <- sweep(pcobj$x, 2, 1 / (d * nobs.factor), FUN = '*')
v <- pcobj$rotation
} else if(inherits(pcobj, 'princomp')) {
nobs.factor <- sqrt(pcobj$n.obs)
d <- pcobj$sdev
u <- sweep(pcobj$scores, 2, 1 / (d * nobs.factor), FUN = '*')
v <- pcobj$loadings
} else if(inherits(pcobj, 'PCA')) {
nobs.factor <- sqrt(nrow(pcobj$call$X))
d <- unlist(sqrt(pcobj$eig)[1])
u <- sweep(pcobj$ind$coord, 2, 1 / (d * nobs.factor), FUN = '*')
v <- sweep(pcobj$var$coord,2,sqrt(pcobj$eig[1:ncol(pcobj$var$coord),1]),FUN="/")
} else {
stop('Expected a object of class prcomp, princomp or PCA')
}
# Scores
df.u <- as.data.frame(sweep(u[,choices], 2, d[choices]^obs.scale, FUN='*'))
# Directions
v <- sweep(v, 2, d^var.scale, FUN='*')
df.v <- as.data.frame(v[, choices])
names(df.u) <- c('xvar', 'yvar')
names(df.v) <- names(df.u)
if(pc.biplot) {
df.u <- df.u * nobs.factor
}
# Scale the radius of the correlation circle so that it corresponds to
# a data ellipse for the standardized PC scores
r <- 1
# Scale directions
v.scale <- rowSums(v^2)
df.v <- df.v / sqrt(max(v.scale))
## Scale Scores
r.scale=sqrt(max(df.u[,1]^2+df.u[,2]^2))
df.u=.99*df.u/r.scale
# Change the labels for the axes
if(obs.scale == 0) {
u.axis.labs <- paste('standardized PC', choices, sep='')
} else {
u.axis.labs <- paste('PC', choices, sep='')
}
# Append the proportion of explained variance to the axis labels
u.axis.labs <- paste(u.axis.labs,
sprintf('(%0.1f%% explained var.)',
100 * pcobj$sdev[choices]^2/sum(pcobj$sdev^2)))
# Score Labels
if(!is.null(labels)) {
df.u$labels <- labels
}
# Grouping variable
if(!is.null(groups)) {
df.u$groups <- groups
}
# Variable Names
if(varname.abbrev) {
df.v$varname <- abbreviate(rownames(v))
} else {
df.v$varname <- rownames(v)
}
# Variables for text label placement
df.v$angle <- with(df.v, (180/pi) * atan(yvar / xvar))
df.v$hjust = with(df.v, (1 - varname.adjust * sign(xvar)) / 2)
# Base plot
g <- ggplot(data = df.u, aes(x = xvar, y = yvar)) +
xlab(u.axis.labs[1]) + ylab(u.axis.labs[2]) + coord_equal()
if(var.axes) {
# Draw circle
if(circle)
{
theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
circle <- data.frame(xvar = r * cos(theta), yvar = r * sin(theta))
g <- g + geom_path(data = circle, color = muted('white'),
size = 1/2, alpha = 1/3)
}
# Draw directions
g <- g +
geom_segment(data = df.v,
aes(x = 0, y = 0, xend = xvar, yend = yvar),
arrow = arrow(length = unit(1/2, 'picas')),
color = muted('red'))
}
# Draw either labels or points
if(!is.null(df.u$labels)) {
if(!is.null(df.u$groups)) {
g <- g + geom_text(aes(label = labels, color = groups),
size = labels.size)
} else {
g <- g + geom_text(aes(label = labels), size = labels.size)
}
} else {
if(!is.null(df.u$groups)) {
g <- g + geom_point(aes(color = groups), alpha = alpha)
} else {
g <- g + geom_point(alpha = alpha)
}
}
# Overlay a concentration ellipse if there are groups
if(!is.null(df.u$groups) && ellipse) {
theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
circle <- cbind(cos(theta), sin(theta))
ell <- ddply(df.u, 'groups', function(x) {
if(nrow(x) < 2) {
return(NULL)
} else if(nrow(x) == 2) {
sigma <- var(cbind(x$xvar, x$yvar))
} else {
sigma <- diag(c(var(x$xvar), var(x$yvar)))
}
mu <- c(mean(x$xvar), mean(x$yvar))
ed <- sqrt(qchisq(ellipse.prob, df = 2))
data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = '+'),
groups = x$groups[1])
})
names(ell)[1:2] <- c('xvar', 'yvar')
g <- g + geom_path(data = ell, aes(color = groups, group = groups))
}
# Label the variable axes
if(var.axes) {
g <- g +
geom_text(data = df.v,
aes(label = varname, x = xvar, y = yvar,
angle = angle, hjust = hjust),
color = 'darkred', size = varname.size)
}
# Change the name of the legend for groups
# if(!is.null(groups)) {
# g <- g + scale_color_brewer(name = deparse(substitute(groups)),
# palette = 'Dark2')
# }
# TODO: Add a second set of axes
return(g)
}
ggbiplot2(redwine_pca)
## Warning: package 'plyr' was built under R version 3.6.3
## Warning: package 'scales' was built under R version 3.6.3
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
In conclusion, Volatile Acidity, Sulfur dioxide and Alcohol levels are the three important factors which are significantly influencing the Quality of the Redwine while Citric Acid and pH does not have any effect on the Quality of the Redwine. Alongside, RandomForest regression model predicts with better performance or accuracy over the Logistic Regression model.