Use the wine quality data set (located in Week 6 files folder) to answer the following questions. Present any figures that assist you in your analysis:
Describe variables that cluster with higher values of wine quality. Describe variables that cluster with lower values of wine quality.
If you want to make a good bottle of wine, then what characteristics are most important according to this analysis?
# Load the wine quality data and drop incomplete rows. The cleaned rows are
# stored back in Wine_data (rather than only in a copy) so that the cluster
# assignments computed from Wine always have exactly one entry per row of
# Wine_data when they are attached to it later on.
Wine_data <- na.omit(read.csv("Data/Wine_data.csv"))

# Standardise every column (centre to mean 0, scale to sd 1) so that no single
# variable dominates the distances used by the clustering.
Wine <- scale(Wine_data)

# Fit k-means for k = 2, 3, 4 and 5. Each fit is seeded separately so it can be
# reproduced on its own, and uses 25 random starts, keeping the best one.
set.seed(2)
w2 <- kmeans(Wine, centers = 2, nstart = 25)
set.seed(3)
w3 <- kmeans(Wine, centers = 3, nstart = 25)
set.seed(4)
w4 <- kmeans(Wine, centers = 4, nstart = 25)
set.seed(5)
w5 <- kmeans(Wine, centers = 5, nstart = 25)

# Reseed the RNG; nothing else in this block draws random numbers.
set.seed(10)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# Build one cluster plot per k-means fit, each titled with its k.
q2 <- fviz_cluster(w2, data = Wine) +
  ggtitle("k = 2")
q3 <- fviz_cluster(w3, data = Wine) +
  ggtitle("k = 3")
q4 <- fviz_cluster(w4, data = Wine) +
  ggtitle("k = 4")
q5 <- fviz_cluster(w5, data = Wine) +
  ggtitle("k = 5")

library(gridExtra)
# Show the four plots side by side in a two-row grid for comparison.
grid.arrange(grobs = list(q2, q3, q4, q5), nrow = 2)
# Elbow plot: within-cluster sum of squares for a range of k, used to choose
# the number of clusters.
# nstart = 25 is forwarded to kmeans() so each k is fitted with the same number
# of random starts as the w2..w5 fits; with the default single start the curve
# can reflect a poor local optimum rather than the true trend.
set.seed(2345)
fviz_nbclust(Wine, kmeans, method = "wss", nstart = 25)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Attach the 3-cluster k-means label to the unscaled data and report the
# per-cluster mean of every variable.
Wine_data %>%
  mutate(Cluster3 = w3$cluster) %>%
  group_by(Cluster3) %>%
  summarise_all(.funs = "mean")
## # A tibble: 3 x 13
## Cluster3 fixed.acidity volatile.acidity citric.acid residual.sugar
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 9.97 0.397 0.467 2.59
## 2 2 8.20 0.537 0.290 3.05
## 3 3 7.19 0.617 0.119 2.22
## # ... with 8 more variables: chlorides <dbl>, free.sulfur.dioxide <dbl>,
## # total.sulfur.dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <dbl>
library(dplyr)
# Attach the 2-cluster k-means label to the unscaled data and report the
# per-cluster mean of every variable.
Wine_data %>%
  mutate(Cluster2 = w2$cluster) %>%
  group_by(Cluster2) %>%
  summarise_all(.funs = "mean")
## # A tibble: 2 x 13
## Cluster2 fixed.acidity volatile.acidity citric.acid residual.sugar
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 9.86 0.397 0.464 2.72
## 2 2 7.43 0.603 0.160 2.44
## # ... with 8 more variables: chlorides <dbl>, free.sulfur.dioxide <dbl>,
## # total.sulfur.dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <dbl>
# List the variables available in the data set.
colnames(Wine_data)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
###Because there is an obvious elbow after k=3, I chose three clusters. The high-quality cluster has a mean quality of around 61, the medium-quality cluster around 55, and the low-quality cluster around 53.
###The variables that cluster with higher values of wine quality include: "alcohol", "sulphates", "density", "chlorides", "residual.sugar", "citric.acid" and "fixed.acidity".
###The variables that cluster with lower values of wine quality include: "total.sulfur.dioxide", "free.sulfur.dioxide", "residual.sugar" and "volatile.acidity".
###The characteristics that show an obvious difference between the quality levels are "free.sulfur.dioxide", "total.sulfur.dioxide", "alcohol" and "fixed.acidity".
Describe variables that cluster with higher values of wine quality. Describe variables that cluster with lower values of wine quality.
If you want to make a good bottle of wine, then what characteristics are most important according to this analysis? Have your conclusions changed using Hierarchical clustering rather than k means clustering? Present any figures that assist you in your analysis.
# Pairwise Euclidean distances between the standardised observations.
dwine <- dist(x = Wine, method = "euclidean")

# Agglomerative hierarchical clustering with complete linkage.
hclust_wine <- hclust(d = dwine, method = "complete")

# Dendrogram of the full tree; hang = -1 lines the leaf labels up at the bottom.
plot(x = hclust_wine, cex = 0.6, hang = -1)

# Cut the tree into three groups, matching the k chosen for k-means.
sub_group <- cutree(tree = hclust_wine, k = 3)
library(dplyr)
# Attach the hierarchical cluster label to the unscaled data and report the
# per-cluster mean of every variable.
Wine_data %>%
  mutate(cluster_Hi = sub_group) %>%
  group_by(cluster_Hi) %>%
  summarise_all(.funs = "mean")
## # A tibble: 3 x 13
## cluster_Hi fixed.acidity volatile.acidity citric.acid residual.sugar
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 8.32 0.529 0.269 2.45
## 2 2 8.45 0.465 0.880 2.60
## 3 3 8.52 0.398 0.421 11.8
## # ... with 8 more variables: chlorides <dbl>, free.sulfur.dioxide <dbl>,
## # total.sulfur.dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <dbl>
# Redraw the dendrogram and outline the three clusters obtained by cutting it.
plot(hclust_wine, cex = 0.6)
# One border colour per cluster: with k = 3 a two-colour vector is recycled,
# which leaves two of the three rectangles sharing a colour.
rect.hclust(hclust_wine, k = 3, border = 2:4)
library(factoextra)
# Cluster plot of the observations using the hierarchical cluster labels.
fviz_cluster(list(data = Wine, cluster = sub_group))
###The variables that cluster with higher values of wine quality include: "alcohol", "total.sulfur.dioxide", "free.sulfur.dioxide", "residual.sugar" and "citric.acid".
###The variables that cluster with lower values of wine quality include: "sulphates", "pH", "density", "chlorides", "volatile.acidity" and "fixed.acidity".
###If I want to make a good bottle of wine, then the most important characteristics, which reflect the difference between quality levels, are "total.sulfur.dioxide", "free.sulfur.dioxide" and "alcohol".
# Principal component analysis of the wine variables. The argument is spelled
# out as `scale.` (its real name) instead of relying on partial matching.
pr.out_wine <- prcomp(Wine, scale. = TRUE)
biplot(pr.out_wine, scale = 0)

# Flip the sign of both loadings and scores and redraw. This only mirrors the
# biplot's orientation; the components themselves are unchanged.
pr.out_wine$rotation <- -pr.out_wine$rotation
pr.out_wine$x <- -pr.out_wine$x
biplot(pr.out_wine, scale = 0)

# Variance of each component and the proportion of total variance it explains.
pr.var_wine <- pr.out_wine$sdev^2
pve_wine <- pr.var_wine / sum(pr.var_wine)

# Scree plot: proportion of variance explained per component.
plot(
  pve_wine,
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained",
  ylim = c(0, 1),
  type = "b"
)

# Cumulative proportion of variance explained.
plot(
  cumsum(pve_wine),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained",
  ylim = c(0, 1),
  type = "b"
)
###Almost 40% of the variation is explained by the first two principal components.