Kmeans Clusting: Himalayan Climbing Expeditions

library(tidyverse)

members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')

Prepare data

Goal: Is there any relationship between peaks and seasons in terms of deaths?

Preparing data Please understand why Julia prepared the employment data the way shed did. For her screencast, click this link.

She wanted to investigate the relationship between demographics and occupation in employment. To this end, she transformed the data so demographic variables became columns and occupations are row names. Also, the values inside the matrix should be scaled employment ratios.

Plan to make the similar transformation for your climbing data. You want to investigate the relationship between peaks and season in terms of deaths. To this end, you need to transform the data so seasons become columns and peaks are row names. Also, the values inside the matrix should be scaled seasonal death ratio in yearly total.

# Calculate total deaths by season and peaks
members_tidy <- members %>% 
    group_by(peak_name, season) %>%
    summarise(died = sum(died)) %>%
    ungroup()

# Get the yearly total deaths. There is no TOTAL in the season variable
members_tidy %>%
    group_by(peak_name) %>%
    summarise(total_deaths = sum(died)) %>%
    ungroup()

## # A tibble: 391 × 2
##    peak_name          total_deaths
##    <chr>                     <int>
##  1 Aichyn                        0
##  2 Ama Dablam                   32
##  3 Amotsang                      0
##  4 Amphu Gyabjen                 0
##  5 Amphu I                       0
##  6 Amphu Middle                  0
##  7 Anidesh Chuli                 0
##  8 Annapurna I                  72
##  9 Annapurna I East              1
## 10 Annapurna I Middle            3
## # ℹ 381 more rows

members_demo <- members_tidy %>%
  filter(season %in% c("Winter", "Spring", "Autumn")) %>%
  pivot_wider(names_from = season, values_from = died, values_fill = 0) %>%
  left_join(members_tidy %>%
                group_by(peak_name) %>%
                summarise(total_deaths = sum(died)) %>%
                ungroup()) %>%
    
    # Removes outliers, peaks with no deaths and more than 300
    filter(total_deaths > 0, total_deaths < 100) %>%
    
    # Calculate the seasonal percent of death per peak
    mutate(across(c(Autumn, Spring, Winter), ~ . / total_deaths),
           total_deaths = log(total_deaths),
           across(where(is.numeric), ~ as.numeric(scale(.)))) 

members_demo

## # A tibble: 85 × 5
##    peak_name           Autumn  Spring  Winter total_deaths
##    <chr>                <dbl>   <dbl>   <dbl>        <dbl>
##  1 Ama Dablam          0.167  -0.128  -0.0987       1.72  
##  2 Annapurna I        -0.262   0.219   0.149        2.35  
##  3 Annapurna I East    0.907  -0.821  -0.302       -0.994 
##  4 Annapurna I Middle -0.847  -0.821   4.03        -0.134 
##  5 Annapurna II       -0.408   0.566  -0.302        0.409 
##  6 Annapurna III      -0.262   0.412  -0.302        0.726 
##  7 Annapurna IV        0.381  -0.266  -0.302        0.266 
##  8 Annapurna South    -0.737   0.912  -0.302        0.634 
##  9 Api Main           -1.72    1.26    1.32         0.0913
## 10 Baruntse            0.0974  0.0325 -0.302        1.01  
## # ℹ 75 more rows

Implementing k-means clustering

members_clust <- kmeans(select(members_demo, - peak_name), centers = 3)
summary(members_clust)

##              Length Class  Mode   
## cluster      85     -none- numeric
## centers      12     -none- numeric
## totss         1     -none- numeric
## withinss      3     -none- numeric
## tot.withinss  1     -none- numeric
## betweenss     1     -none- numeric
## size          3     -none- numeric
## iter          1     -none- numeric
## ifault        1     -none- numeric

library(broom)
tidy(members_clust)

## # A tibble: 3 × 7
##   Autumn Spring  Winter total_deaths  size withinss cluster
##    <dbl>  <dbl>   <dbl>        <dbl> <int>    <dbl> <fct>  
## 1  0.835 -0.745 -0.302        -0.386    44     23.3 1      
## 2 -1.33   1.13   0.648        -0.454    22    104.  2      
## 3 -0.396  0.417 -0.0518        1.42     19     17.8 3

augment(members_clust, members_demo) %>%
  ggplot(aes(total_deaths, Autumn, color = .cluster)) +
  geom_point()

Choosing k

kclusts <-
  tibble(k = 1:9) %>%
  mutate(
    kclust = map(k, ~ kmeans(select(members_demo, - peak_name), .x)),
    tidied = map(kclust, tidy),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, members_demo)
  )

kclusts %>%
  unnest(glanced) %>%
  ggplot(aes(k, tot.withinss)) +
  geom_line(alpha = 0.8) +
  geom_point(size = 2)

library(plotly)

members_clust <- kmeans(select(members_demo, - peak_name), centers = 4)

p <- augment(members_clust, members_demo) %>%
  ggplot(aes(total_deaths, Spring, color = .cluster, name = peak_name)) +
  geom_point(alpha = 0.8)

ggplotly(p)