Palmer Station Penguin Data

We will be using the palmerpenguins data set for this lab.

We will also be needing to load the broom package

library(palmerpenguins)
library(broom)
library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──

## ✓ dials        0.0.9      ✓ rsample      0.0.9 
## ✓ dplyr        1.0.6      ✓ tibble       3.1.2 
## ✓ ggplot2      3.3.3      ✓ tidyr        1.1.3 
## ✓ infer        0.5.4      ✓ tune         0.1.5 
## ✓ modeldata    0.1.0      ✓ workflows    0.2.2 
## ✓ parsnip      0.1.5      ✓ workflowsets 0.0.2 
## ✓ purrr        0.3.4      ✓ yardstick    0.0.8 
## ✓ recipes      0.1.16

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x recipes::step()  masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ✓ stringr 1.4.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard()    masks scales::discard()
## x dplyr::filter()     masks stats::filter()
## x stringr::fixed()    masks recipes::fixed()
## x dplyr::lag()        masks stats::lag()
## x readr::spec()       masks yardstick::spec()

# our data set
head(penguins)

## # A tibble: 6 x 8
##   species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex  
##   <fct>   <fct>           <dbl>         <dbl>            <int>       <int> <fct>
## 1 Adelie  Torge…           39.1          18.7              181        3750 male 
## 2 Adelie  Torge…           39.5          17.4              186        3800 fema…
## 3 Adelie  Torge…           40.3          18                195        3250 fema…
## 4 Adelie  Torge…           NA            NA                 NA          NA <NA> 
## 5 Adelie  Torge…           36.7          19.3              193        3450 fema…
## 6 Adelie  Torge…           39.3          20.6              190        3650 male 
## # … with 1 more variable: year <int>

Transform the data set into a matrix using two of the numeric variables

# tidyverse style
penguins %>%
  select(bill_length_mm, bill_depth_mm) %>%
  drop_na() %>%
  as.matrix() -> pen2
pen2 %>%
  nrow() # the matrix has 342 observations

## [1] 342

# Base R style
penn <- as.matrix(penguins[, c("bill_length_mm", "bill_depth_mm")])

3 Clusters

Perform k-means using 3 clusters

The first to third observations is 85, 116, 141, respectively.
The mean of each cluster has shown below.
Each vector aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean.

set.seed(1234)
pen2_kmeans <- kmeans(pen2, centers = 3) # perform k-means clustering on a data matrix
pen2_kmeans

## K-means clustering with 3 clusters of sizes 85, 116, 141
## 
## Cluster means:
##   bill_length_mm bill_depth_mm
## 1       50.90353      17.33647
## 2       45.51379      15.64397
## 3       38.40355      18.27943
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [38] 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3
##  [75] 2 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 2 3 2
## [112] 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3 3 2 1 2 1 2 2 2 2 2 2 2 1 2 2 2 1 2 1 2 1 1 2 2 2 2 2 2 2 2 1 2 2 2 1
## [186] 1 1 2 2 2 1 2 1 2 1 1 2 2 1 2 2 2 2 2 1 2 2 2 2 2 1 2 2 2 1 2 1 2 1 2 1 2
## [223] 2 2 2 2 1 2 1 2 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 2 1 2 1 2 1 2 2 2 1
## [260] 2 2 1 1 2 1 2 1 2 1 2 2 1 2 1 2 1 1 2 1 2 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1
## [297] 2 1 2 1 1 1 2 1 3 1 2 1 1 2 2 1 2 1 1 2 1 2 1 1 1 1 1 1 2 1 2 1 2 1 2 1 1
## [334] 2 1 2 2 1 2 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 617.9859 754.7437 944.4986
##  (between_SS / total_SS =  79.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Look at the result object with summary(), names(), and str().

The summary() shows the elements in pen2_kmeans.

summary(pen2_kmeans)

##              Length Class  Mode   
## cluster      342    -none- numeric
## centers        6    -none- numeric
## totss          1    -none- numeric
## withinss       3    -none- numeric
## tot.withinss   1    -none- numeric
## betweenss      1    -none- numeric
## size           3    -none- numeric
## iter           1    -none- numeric
## ifault         1    -none- numeric

We also can found the attributes by using names().

cluster: A vector of integers (from 1:k) indicating the cluster to which each point is allocated.
centers: A matrix of cluster centers
totss: The total sum of squares.
withinss: Vector of within-cluster sum of squares, one component per cluster.
tot.withinss: Total within-cluster sum of squares, i.e.sum(withinss).
betweenss: The between-cluster sum of squares, i.e.totss-tot.withinss.
size: The number of points in each cluster (observations).
iter: The number of (outer) iterations.
ifault: integer: indicator of a possible algorithm problem – for experts.

names(pen2_kmeans) # between within sum of square

## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

For example, I want to see how many iterations in pen2_kmeans, I will carry out:

pen2_kmeans$iter

## [1] 2

str() displays the specific structure of an object.

str(pen2_kmeans)

## List of 9
##  $ cluster     : int [1:342] 3 3 3 3 3 3 3 3 3 3 ...
##  $ centers     : num [1:3, 1:2] 50.9 45.5 38.4 17.3 15.6 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:2] "bill_length_mm" "bill_depth_mm"
##  $ totss       : num 11494
##  $ withinss    : num [1:3] 618 755 944
##  $ tot.withinss: num 2317
##  $ betweenss   : num 9177
##  $ size        : int [1:3] 85 116 141
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

Use augment(), glance() and tidy() to extract information from the model

tidy() generates a readable output.

tidy(pen2_kmeans)

## # A tibble: 3 x 5
##   bill_length_mm bill_depth_mm  size withinss cluster
##            <dbl>         <dbl> <int>    <dbl> <fct>  
## 1           50.9          17.3    85     618. 1      
## 2           45.5          15.6   116     755. 2      
## 3           38.4          18.3   141     944. 3

glance() shows the simple over view of the model.

glance(pen2_kmeans)

## # A tibble: 1 x 4
##    totss tot.withinss betweenss  iter
##    <dbl>        <dbl>     <dbl> <int>
## 1 11494.        2317.     9177.     2

augment() saves the results in the existing data set, automatically generating a new column.

augment(pen2_kmeans, data = pen2)

## # A tibble: 342 x 3
##    bill_length_mm bill_depth_mm .cluster
##             <dbl>         <dbl> <fct>   
##  1           39.1          18.7 3       
##  2           39.5          17.4 3       
##  3           40.3          18   3       
##  4           36.7          19.3 3       
##  5           39.3          20.6 3       
##  6           38.9          17.8 3       
##  7           39.2          19.6 3       
##  8           34.1          18.1 3       
##  9           42            20.2 3       
## 10           37.8          17.1 3       
## # … with 332 more rows

Or we can select more variables from the original data and attach the results as a new data set.

penguins %>%
  select(species, island, year, bill_length_mm, bill_depth_mm) %>%
  drop_na() -> new_penguins
augment(pen2_kmeans, data = new_penguins) -> new_clusters
head(new_clusters)

## # A tibble: 6 x 6
##   species island     year bill_length_mm bill_depth_mm .cluster
##   <fct>   <fct>     <int>          <dbl>         <dbl> <fct>   
## 1 Adelie  Torgersen  2007           39.1          18.7 3       
## 2 Adelie  Torgersen  2007           39.5          17.4 3       
## 3 Adelie  Torgersen  2007           40.3          18   3       
## 4 Adelie  Torgersen  2007           36.7          19.3 3       
## 5 Adelie  Torgersen  2007           39.3          20.6 3       
## 6 Adelie  Torgersen  2007           38.9          17.8 3

Plot the clusters with your package of choice

Showing 3 clusters in a scatterplot

new_clusters %>%
  ggplot(aes(bill_length_mm, bill_depth_mm, color = .cluster)) +
  geom_point() +
  theme_bw()

Fix the x- and y-axes and enlarge the data points of cluster means.

new_clusters %>%
  ggplot(aes(bill_length_mm, bill_depth_mm, color = .cluster)) +
  geom_point() +
  geom_point(aes(color = cluster), size = 10, data = tidy(pen2_kmeans)) +
  coord_fixed()

Add a species variable to legend.

new_clusters %>% 
  ggplot(aes(bill_length_mm, bill_depth_mm, color = species)) +
  geom_point() +
  theme_dark()

Count species vs cluster sampling

new_clusters %>%
  count(.cluster, species) %>%
  ggplot(aes(.cluster, species, size = n)) +
  geom_point() +
  ggtitle("Species vs Cluster")

# 5 Clusters

Return The previous steps with more variables and different values of \(K\)

Perform k-means using 5 clusters: We now add two more variables flipper_length_mm and body_mass_g to the matrix, and save it as pen3.

pen3 <- penguins %>%
  select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g) %>%
  drop_na() %>%
  scale() %>% # will calculate the mean and standard deviation of the entire vector
  as.matrix()
head(pen3)

##      bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## [1,]     -0.8832047     0.7843001        -1.4162715  -0.5633167
## [2,]     -0.8099390     0.1260033        -1.0606961  -0.5009690
## [3,]     -0.6634077     0.4298326        -0.4206603  -1.1867934
## [4,]     -1.3227986     1.0881294        -0.5628905  -0.9374027
## [5,]     -0.8465718     1.7464261        -0.7762357  -0.6880121
## [6,]     -0.9198375     0.3285561        -1.4162715  -0.7191859

set.seed(1)
pen3_kmeans <- kmeans(pen3, centers = 5) # perform k-means clustering on a data matrix
pen3_kmeans

## K-means clustering with 5 clusters of sizes 65, 57, 61, 93, 66
## 
## Cluster means:
##   bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## 1      0.9698529     0.7024999        -0.2904650  -0.5278265
## 2      1.0753664    -0.7153018         1.4932089   1.6434628
## 3     -0.6519975     1.1113731        -0.5115943  -0.1069522
## 4     -1.1181668     0.2327247        -0.9918750  -1.0027672
## 5      0.2943188    -1.4292037         0.8669538   0.6123148
## 
## Clustering vector:
##   [1] 4 4 4 4 3 4 3 4 3 4 4 4 3 3 4 3 3 4 3 4 4 3 4 4 4 4 4 4 3 4 4 4 3 4 3 3 4
##  [38] 4 3 4 3 4 3 4 3 4 4 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 1 4
##  [75] 3 4 3 4 3 4 3 4 3 4 3 3 4 3 4 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 4 4 3 4 3 4 3
## [112] 4 3 3 3 4 3 4 4 4 3 4 3 4 3 4 3 4 1 4 3 4 3 4 4 4 3 4 3 4 4 4 4 4 4 3 4 4
## [149] 4 4 3 5 2 5 2 2 5 5 2 5 5 5 2 5 2 5 2 5 2 5 2 2 5 5 5 5 5 5 2 5 2 2 5 5 2
## [186] 2 2 5 2 5 2 5 2 5 5 2 5 5 2 5 2 5 2 5 2 5 5 5 5 5 2 5 2 5 2 5 2 5 2 5 2 5
## [223] 2 2 5 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 5 2 2 5 5 2 5 2 5 2 5 2 5 2
## [260] 5 2 2 2 5 2 5 2 5 2 5 5 2 5 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1
## [297] 4 1 1 1 1 1 1 1 4 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1
## [334] 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 66.25997 37.34371 43.15683 58.11216 27.04454
##  (between_SS / total_SS =  83.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Result of 5 Clusters with 4 variables.

tidy(pen3_kmeans)

## # A tibble: 5 x 7
##   bill_length_mm bill_depth_mm flipper_length_mm body_mass_g  size withinss
##            <dbl>         <dbl>             <dbl>       <dbl> <int>    <dbl>
## 1          0.970         0.702            -0.290      -0.528    65     66.3
## 2          1.08         -0.715             1.49        1.64     57     37.3
## 3         -0.652         1.11             -0.512      -0.107    61     43.2
## 4         -1.12          0.233            -0.992      -1.00     93     58.1
## 5          0.294        -1.43              0.867       0.612    66     27.0
## # … with 1 more variable: cluster <fct>

Data visualzation

penguins %>%
  select(species:body_mass_g) %>%
  drop_na() -> tmp
augment(pen3_kmeans, tmp) %>%
  ggplot(aes(bill_length_mm, bill_depth_mm, color = body_mass_g, size = flipper_length_mm)) +
  geom_point() +
  theme_bw() +
  facet_wrap(~.cluster)

Elbow Chart

Construct an Elbow Chart to find an appropriate number of clusters for the data set

We need to visualize the total within-cluster sum of squares with each k
According to the Elbow Chart, we can say the roughly appropriate number of clusters is k = 2.

tibble(k = 1:10) %>%
  mutate(models = map(k, ~kmeans(pen3, centers = .x))) %>% #.x = k %>%
  mutate(tot.withinss = map_dbl(models, "tot.withinss")) -> many_kmenas_res

Plotting

many_kmenas_res %>%
  ggplot(aes(k, tot.withinss)) +
  geom_point() +
  geom_line() + 
  theme_bw() +
  ggtitle("Total within-cluster sum of squares vs K") +
  geom_vline(xintercept = 2, color = "red") +
  geom_text(x = 4, y = 800, label = "Elbow Point, K = 2", 
            size = 6, colour = "red")

K-means Clustering

Yunting Chiu

2021-06-05

Palmer Station Penguin Data

3 Clusters

Elbow Chart

References