## Homework 3: Clustering

Loading the Data

# Read the UCI wine data straight from the archive. The raw file has no header
# row, so the 14 column names are supplied at read time: the first column is
# the class label, the remaining 13 are the numeric measurements.
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wine_data <- read.csv(
  data_url,
  header = FALSE,
  col.names = c(
    "Class", "Alcohol", "Malic_Acid", "Ash", "Alcalinity_of_Ash",
    "Magnesium", "Total_Phenols", "Flavanoids", "Nonflavanoid_Phenols",
    "Proanthocyanins", "Color_Intensity", "Hue",
    "OD280_OD315_of_Diluted_Wines", "Proline"
  )
)

# Quick structural overview: dimensions, column types, first few values
glimpse(wine_data)
## Rows: 178
## Columns: 14
## $ Class                        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Alcohol                      <dbl> 14.23, 13.20, 13.16, 14.37, 13.24, 14.20,…
## $ Malic_Acid                   <dbl> 1.71, 1.78, 2.36, 1.95, 2.59, 1.76, 1.87,…
## $ Ash                          <dbl> 2.43, 2.14, 2.67, 2.50, 2.87, 2.45, 2.45,…
## $ Alcalinity_of_Ash            <dbl> 15.6, 11.2, 18.6, 16.8, 21.0, 15.2, 14.6,…
## $ Magnesium                    <int> 127, 100, 101, 113, 118, 112, 96, 121, 97…
## $ Total_Phenols                <dbl> 2.80, 2.65, 2.80, 3.85, 2.80, 3.27, 2.50,…
## $ Flavanoids                   <dbl> 3.06, 2.76, 3.24, 3.49, 2.69, 3.39, 2.52,…
## $ Nonflavanoid_Phenols         <dbl> 0.28, 0.26, 0.30, 0.24, 0.39, 0.34, 0.30,…
## $ Proanthocyanins              <dbl> 2.29, 1.28, 2.81, 2.18, 1.82, 1.97, 1.98,…
## $ Color_Intensity              <dbl> 5.64, 4.38, 5.68, 7.80, 4.32, 6.75, 5.25,…
## $ Hue                          <dbl> 1.04, 1.05, 1.03, 0.86, 1.04, 1.05, 1.02,…
## $ OD280_OD315_of_Diluted_Wines <dbl> 3.92, 3.40, 3.17, 3.45, 2.93, 2.85, 3.58,…
## $ Proline                      <int> 1065, 1050, 1185, 1480, 735, 1450, 1290, …

Data Preprocessing (2 points)

# Count NA entries per column (a count of 0 means the column is complete)
wine_data %>%
  is.na() %>%
  colSums()
##                        Class                      Alcohol 
##                            0                            0 
##                   Malic_Acid                          Ash 
##                            0                            0 
##            Alcalinity_of_Ash                    Magnesium 
##                            0                            0 
##                Total_Phenols                   Flavanoids 
##                            0                            0 
##         Nonflavanoid_Phenols              Proanthocyanins 
##                            0                            0 
##              Color_Intensity                          Hue 
##                            0                            0 
## OD280_OD315_of_Diluted_Wines                      Proline 
##                            0                            0
# Drop the Class label so only the 13 measurements are clustered, then
# standardise every column (subtract its mean, divide by its standard
# deviation). The result is a numeric matrix, not a data frame.
wine_data_cluster <- scale(
  select(wine_data, -Class),
  center = TRUE,
  scale = TRUE
)

# Confirm the shape and the centering/scaling attributes of the matrix
glimpse(wine_data_cluster)
##  num [1:178, 1:13] 1.514 0.246 0.196 1.687 0.295 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:13] "Alcohol" "Malic_Acid" "Ash" "Alcalinity_of_Ash" ...
##  - attr(*, "scaled:center")= Named num [1:13] 13 2.34 2.37 19.49 99.74 ...
##   ..- attr(*, "names")= chr [1:13] "Alcohol" "Malic_Acid" "Ash" "Alcalinity_of_Ash" ...
##  - attr(*, "scaled:scale")= Named num [1:13] 0.812 1.117 0.274 3.34 14.282 ...
##   ..- attr(*, "names")= chr [1:13] "Alcohol" "Malic_Acid" "Ash" "Alcalinity_of_Ash" ...

Building the Clustering Model (2 points)

# Fix the RNG state so the randomly chosen starting centers, and therefore
# the fitted clusters, are reproducible between runs
set.seed(123)

# Partition the scaled measurements into 3 clusters, trying 25 random sets of
# starting centers (nstart = 25)
kmeans_result <- kmeans(x = wine_data_cluster, centers = 3, nstart = 25)

# Print cluster sizes, centers, assignments and the sum-of-squares summary
print(kmeans_result)
## K-means clustering with 3 clusters of sizes 51, 62, 65
## 
## Cluster means:
##      Alcohol Malic_Acid        Ash Alcalinity_of_Ash   Magnesium Total_Phenols
## 1  0.1644436  0.8690954  0.1863726         0.5228924 -0.07526047   -0.97657548
## 2  0.8328826 -0.3029551  0.3636801        -0.6084749  0.57596208    0.88274724
## 3 -0.9234669 -0.3929331 -0.4931257         0.1701220 -0.49032869   -0.07576891
##    Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity        Hue
## 1 -1.21182921           0.72402116     -0.77751312       0.9388902 -1.1615122
## 2  0.97506900          -0.56050853      0.57865427       0.1705823  0.4726504
## 3  0.02075402          -0.03343924      0.05810161      -0.8993770  0.4605046
##   OD280_OD315_of_Diluted_Wines    Proline
## 1                   -1.2887761 -0.4059428
## 2                    0.7770551  1.1220202
## 3                    0.2700025 -0.7517257
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 2
##  [75] 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 1 3 3 2 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 326.3537 385.6983 558.6971
##  (between_SS / total_SS =  44.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Model Evaluation and Visualization (3 points)

# Attach each row's k-means assignment to the original (unscaled) data as a
# factor column named Cluster
wine_clustered <- mutate(
  wine_data,
  Cluster = as.factor(kmeans_result[["cluster"]])
)

# Cross-tabulate cluster (rows) against the Class label (columns). Cluster
# numbers are arbitrary, so good agreement shows up as one dominant cell per
# row rather than along the diagonal.
table(wine_clustered[["Cluster"]], wine_clustered[["Class"]])
##    
##      1  2  3
##   1  0  3 48
##   2 59  3  0
##   3  0 65  0
# Scatter plot of two of the raw measurements, coloured by assigned cluster
ggplot(
  data = wine_clustered,
  mapping = aes(x = Alcohol, y = Total_Phenols, color = Cluster)
) +
  geom_point(size = 3) +
  labs(
    title = "Clustering of Wine Dataset",
    x = "Alcohol",
    y = "Total Phenols"
  )