K Means Clustering Project

Usually when dealing with an unsupervised learning problem, it's difficult to get a good measure of how well the model performed. For this project, we will use data from the UCI archive on red and white wines (a very commonly used data set in ML).

We will then add a label to the combined data set; we'll bring this label back later to see how well we can cluster the wine into groups.

Get the Data

Libraries

library(ggplot2)

Data

# Read the red and white wine tables; both files use ";" as the field
# separator.
df1 <- read.csv(file = "winequality-red.csv", sep = ";")
df2 <- read.csv(file = "winequality-white.csv", sep = ";")

# Tag every row with the file it came from so the label survives the merge.
df1[["wine.type"]] <- "red"
df2[["wine.type"]] <- "white"

Check heads of dfs

# Preview the first 10 rows of the red-wine table.
head(df1, n = 10)
##    fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1            7.4             0.70        0.00            1.9     0.076
## 2            7.8             0.88        0.00            2.6     0.098
## 3            7.8             0.76        0.04            2.3     0.092
## 4           11.2             0.28        0.56            1.9     0.075
## 5            7.4             0.70        0.00            1.9     0.076
## 6            7.4             0.66        0.00            1.8     0.075
## 7            7.9             0.60        0.06            1.6     0.069
## 8            7.3             0.65        0.00            1.2     0.065
## 9            7.8             0.58        0.02            2.0     0.073
## 10           7.5             0.50        0.36            6.1     0.071
##    free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                   11                   34  0.9978 3.51      0.56     9.4
## 2                   25                   67  0.9968 3.20      0.68     9.8
## 3                   15                   54  0.9970 3.26      0.65     9.8
## 4                   17                   60  0.9980 3.16      0.58     9.8
## 5                   11                   34  0.9978 3.51      0.56     9.4
## 6                   13                   40  0.9978 3.51      0.56     9.4
## 7                   15                   59  0.9964 3.30      0.46     9.4
## 8                   15                   21  0.9946 3.39      0.47    10.0
## 9                    9                   18  0.9968 3.36      0.57     9.5
## 10                  17                  102  0.9978 3.35      0.80    10.5
##    quality wine.type
## 1        5       red
## 2        5       red
## 3        5       red
## 4        6       red
## 5        5       red
## 6        5       red
## 7        5       red
## 8        7       red
## 9        7       red
## 10       5       red
# Preview the first 10 rows of the white-wine table.
head(df2, n = 10)
##    fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1            7.0             0.27        0.36           20.7     0.045
## 2            6.3             0.30        0.34            1.6     0.049
## 3            8.1             0.28        0.40            6.9     0.050
## 4            7.2             0.23        0.32            8.5     0.058
## 5            7.2             0.23        0.32            8.5     0.058
## 6            8.1             0.28        0.40            6.9     0.050
## 7            6.2             0.32        0.16            7.0     0.045
## 8            7.0             0.27        0.36           20.7     0.045
## 9            6.3             0.30        0.34            1.6     0.049
## 10           8.1             0.22        0.43            1.5     0.044
##    free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                   45                  170  1.0010 3.00      0.45     8.8
## 2                   14                  132  0.9940 3.30      0.49     9.5
## 3                   30                   97  0.9951 3.26      0.44    10.1
## 4                   47                  186  0.9956 3.19      0.40     9.9
## 5                   47                  186  0.9956 3.19      0.40     9.9
## 6                   30                   97  0.9951 3.26      0.44    10.1
## 7                   30                  136  0.9949 3.18      0.47     9.6
## 8                   45                  170  1.0010 3.00      0.45     8.8
## 9                   14                  132  0.9940 3.30      0.49     9.5
## 10                  28                  129  0.9938 3.22      0.45    11.0
##    quality wine.type
## 1        6     white
## 2        6     white
## 3        6     white
## 4        6     white
## 5        6     white
## 6        6     white
## 7        6     white
## 8        6     white
## 9        6     white
## 10       6     white

Combine dataframes

# Stack the red rows on top of the white rows into a single data frame.
wine <- rbind(df1, df2)

# Inspect the combined structure (row count, column names and types).
str(object = wine)
## 'data.frame':    6497 obs. of  13 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...
##  $ wine.type           : chr  "red" "red" "red" "red" ...

EDA

Exploring data

Histogram of residual.sugar, colored by wine type

# Histogram of residual sugar; bars are filled by wine type and outlined in
# black.
ggplot(data = wine, mapping = aes(x = residual.sugar)) +
  geom_histogram(mapping = aes(fill = wine.type), color = "black", alpha = 0.7)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Histogram of citric.acid from wine data. Color by wine.

# Histogram of citric acid; bars are filled by wine type and outlined in
# black.
ggplot(data = wine, mapping = aes(x = citric.acid)) +
  geom_histogram(mapping = aes(fill = wine.type), color = "black", alpha = 0.7)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Histogram of alcohol from the wine data. Color by red and white wines.

# Histogram of alcohol content; bars are filled by wine type and outlined in
# black.
ggplot(data = wine, mapping = aes(x = alcohol)) +
  geom_histogram(mapping = aes(fill = wine.type), color = "black", alpha = 0.7)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Scatterplot of residual.sugar versus citric.acid, color by red and white wine.

# Citric acid on the x axis against residual sugar on the y axis; points are
# colored by wine type and made mostly transparent (alpha = 0.2).
ggplot(data = wine, mapping = aes(x = citric.acid, y = residual.sugar)) +
  geom_point(mapping = aes(color = wine.type), alpha = 0.2, size = 2)

Scatterplot of residual.sugar versus volatile.acidity, color by red and white wine.

# Volatile acidity on the x axis against residual sugar on the y axis; points
# are colored by wine type and made mostly transparent (alpha = 0.2).
ggplot(data = wine, mapping = aes(x = volatile.acidity, y = residual.sugar)) +
  geom_point(mapping = aes(color = wine.type), alpha = 0.2, size = 2)

Subset dataframe without label

# Build the clustering input by dropping the wine.type label. The column is
# removed by name rather than by position (1:12) so the subset stays correct
# if columns are later added or reordered; drop = FALSE guarantees the result
# is still a data frame.
feature.cols <- setdiff(names(wine), "wine.type")
clus.data <- wine[, feature.cols, drop = FALSE]

# Preview the first 10 rows to confirm the label is gone.
head(clus.data, n = 10)
##    fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1            7.4             0.70        0.00            1.9     0.076
## 2            7.8             0.88        0.00            2.6     0.098
## 3            7.8             0.76        0.04            2.3     0.092
## 4           11.2             0.28        0.56            1.9     0.075
## 5            7.4             0.70        0.00            1.9     0.076
## 6            7.4             0.66        0.00            1.8     0.075
## 7            7.9             0.60        0.06            1.6     0.069
## 8            7.3             0.65        0.00            1.2     0.065
## 9            7.8             0.58        0.02            2.0     0.073
## 10           7.5             0.50        0.36            6.1     0.071
##    free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                   11                   34  0.9978 3.51      0.56     9.4
## 2                   25                   67  0.9968 3.20      0.68     9.8
## 3                   15                   54  0.9970 3.26      0.65     9.8
## 4                   17                   60  0.9980 3.16      0.58     9.8
## 5                   11                   34  0.9978 3.51      0.56     9.4
## 6                   13                   40  0.9978 3.51      0.56     9.4
## 7                   15                   59  0.9964 3.30      0.46     9.4
## 8                   15                   21  0.9946 3.39      0.47    10.0
## 9                    9                   18  0.9968 3.36      0.57     9.5
## 10                  17                  102  0.9978 3.35      0.80    10.5
##    quality
## 1        5
## 2        5
## 3        5
## 4        6
## 5        5
## 6        5
## 7        5
## 8        7
## 9        7
## 10       5

Building the clusters

Call the kmeans function on clus.data and assign the results to wine.cluster.

# k-means starts from randomly chosen centres, so fix the RNG seed to make the
# cluster assignments reproducible from run to run.
set.seed(101)

# Partition the unlabelled feature table (clus.data) into 2 clusters.
# nstart = 20 repeats the fit from 20 random starts and keeps the best one,
# which guards against a poor local optimum from a single unlucky start.
# The cluster ids (1/2) are arbitrary and may swap between seeds.
# NOTE(review): features are clustered on their raw scales, so large-valued
# columns such as total.sulfur.dioxide dominate the distance; consider
# clustering on scale(clus.data) instead.
wine.cluster <- kmeans(clus.data, centers = 2, nstart = 20)

# Show the per-feature mean of each cluster.
print(wine.cluster$centers)
##   fixed.acidity volatile.acidity citric.acid residual.sugar  chlorides
## 1      6.904812        0.2871659   0.3397642       7.244809 0.04859257
## 2      7.623219        0.4086378   0.2908725       3.076425 0.06580983
##   free.sulfur.dioxide total.sulfur.dioxide   density       pH sulphates
## 1            39.75590            155.69246 0.9947903 3.190808 0.4999485
## 2            18.39868             63.26318 0.9945736 3.254882 0.5724145
##    alcohol  quality
## 1 10.25932 5.824343
## 2 10.79722 5.810541

Evaluating the clusters

# Cross-tabulate the true wine type (rows) against the assigned cluster
# (columns) to see how well the clusters recover the label.
table(wine[["wine.type"]], wine.cluster[["cluster"]])
##        
##            1    2
##   red     85 1514
##   white 3604 1294