library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Download members.csv from the TidyTuesday 2020-09-22 data folder.
members_raw <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv"
)
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Prepare Data

# Count deaths for every peak/season combination. `died` is a logical column,
# so sum() yields the number of TRUE values in each group.
# `.groups = "drop"` returns an ungrouped tibble directly, which removes the
# need for a trailing ungroup() and for the informational grouping message.
members_tidy <- members_raw %>%
  group_by(peak_name, season) %>%
  summarise(died = sum(died), .groups = "drop")
# Print the total number of deaths per peak, summed over all seasons.
members_tidy %>%
  summarise(total_deaths = sum(died), .by = peak_name)
## # A tibble: 391 × 2
##    peak_name          total_deaths
##    <chr>                     <int>
##  1 Aichyn                        0
##  2 Ama Dablam                   32
##  3 Amotsang                      0
##  4 Amphu Gyabjen                 0
##  5 Amphu I                       0
##  6 Amphu Middle                  0
##  7 Anidesh Chuli                 0
##  8 Annapurna I                  72
##  9 Annapurna I East              1
## 10 Annapurna I Middle            3
## # ℹ 381 more rows
# Build one row per peak for clustering: the share of the peak's deaths that
# fell in Autumn, Spring and Winter, plus the log of its total deaths, with
# every numeric column standardised.
members_demo <- members_tidy %>%
  filter(season %in% c("Autumn", "Spring", "Winter")) %>%
  pivot_wider(names_from = season, values_from = died, values_fill = 0) %>%
  left_join(
    # Totals are computed from all rows of `members_tidy`, i.e. before the
    # season filter above, so the three shares may not sum to 1 for a peak
    # that has deaths recorded under any other season value.
    members_tidy %>%
      group_by(peak_name) %>%
      summarise(total_deaths = sum(died)) %>%
      ungroup(),
    # Name the key explicitly rather than relying on an implicit natural join.
    by = "peak_name"
  ) %>%
  # Keep peaks with at least one death and fewer than 100 deaths.
  filter(total_deaths > 0, total_deaths < 100) %>%
  mutate(
    # Divides by the raw totals, so this must run before the log transform.
    across(c(Winter, Autumn, Spring), ~ .x / total_deaths),
    total_deaths = log(total_deaths),
    # scale() returns a one-column matrix; as.numeric() flattens it again.
    across(where(is.numeric), ~ as.numeric(scale(.x)))
  )
members_demo
## # A tibble: 85 × 5
##    peak_name           Autumn  Spring  Winter total_deaths
##    <chr>                <dbl>   <dbl>   <dbl>        <dbl>
##  1 Ama Dablam          0.167  -0.128  -0.0987       1.72  
##  2 Annapurna I        -0.262   0.219   0.149        2.35  
##  3 Annapurna I East    0.907  -0.821  -0.302       -0.994 
##  4 Annapurna I Middle -0.847  -0.821   4.03        -0.134 
##  5 Annapurna II       -0.408   0.566  -0.302        0.409 
##  6 Annapurna III      -0.262   0.412  -0.302        0.726 
##  7 Annapurna IV        0.381  -0.266  -0.302        0.266 
##  8 Annapurna South    -0.737   0.912  -0.302        0.634 
##  9 Api Main           -1.72    1.26    1.32         0.0913
## 10 Baruntse            0.0974  0.0325 -0.302        1.01  
## # ℹ 75 more rows

Implementing k-means clustering

# kmeans() picks its starting centres at random when `centers` is a number,
# so fix the RNG seed to make the cluster assignments repeatable between runs.
set.seed(123)
# Cluster on the numeric columns only; `peak_name` is just a label.
members_clust <- kmeans(select(members_demo, -peak_name), centers = 3)
summary(members_clust)
##              Length Class  Mode   
## cluster      85     -none- numeric
## centers      12     -none- numeric
## totss         1     -none- numeric
## withinss      3     -none- numeric
## tot.withinss  1     -none- numeric
## betweenss     1     -none- numeric
## size          3     -none- numeric
## iter          1     -none- numeric
## ifault        1     -none- numeric
library(broom)
# One row per cluster: centre coordinates, size and within-cluster sum of squares.
broom::tidy(members_clust)
## # A tibble: 3 × 7
##   Autumn Spring  Winter total_deaths  size withinss cluster
##    <dbl>  <dbl>   <dbl>        <dbl> <int>    <dbl> <fct>  
## 1 -1.07   1.14  -0.0480        0.418    32    77.2  1      
## 2  0.769 -0.677 -0.298        -0.243    49    36.9  2      
## 3 -0.847 -0.821  4.03         -0.372     4     8.26 3
# Attach each peak's cluster label to the data, then plot the standardised
# Autumn share against the standardised log total deaths, coloured by cluster.
members_clust %>%
  augment(members_demo) %>%
  ggplot(mapping = aes(x = total_deaths, y = Autumn, colour = .cluster)) +
  geom_point()

Choosing K

# Fit k-means for k = 1..9 and keep the three broom summaries of every fit.
# Each kmeans() call draws random starting centres, so seed the RNG first to
# make the whole sweep (and the elbow plot built from it) reproducible.
set.seed(123)
kclusts <-
  tibble(k = 1:9) %>%
  mutate(
    # One fitted model per k, using the numeric columns only.
    kclust = map(k, ~ kmeans(select(members_demo, -peak_name), .x)),
    # Per-cluster, per-model and per-observation summaries respectively.
    tidied = map(kclust, tidy),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, members_demo)
  )

# Elbow plot: total within-cluster sum of squares for each value of k.
kclusts %>%
  unnest(cols = glanced) %>%
  ggplot(mapping = aes(x = k, y = tot.withinss)) +
  geom_line(alpha = 0.8) +
  geom_point(size = 2)

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Refit with five clusters. kmeans() starts from random centres, so seed the
# RNG to keep the cluster labels stable between runs.
set.seed(123)
members_clust <- kmeans(select(members_demo, -peak_name), centers = 5)

# `name = peak_name` is not an aesthetic that geom_point() draws;
# NOTE(review): presumably it is mapped so that ggplotly() can show the peak
# name in the hover tooltip - confirm in the rendered widget.
p <- augment(members_clust, members_demo) %>%
  ggplot(aes(total_deaths, Winter, color = .cluster, name = peak_name)) +
  geom_point(alpha = 0.8)

# Convert the static ggplot into an interactive plotly widget.
ggplotly(p)

#1 - The modeling goal was to use k-means clustering to explore climbing deaths in relation to peak name and season. - The data consists of the mountains climbed, the season and year of each climb, and the age and citizenship of each expedition member, among other variables. - The main variables used in the analysis are peak name, season, and whether the member died.

#2 - The original data had more variables and data points, while the transformed data keeps only what the analysis needs. For example, it keeps just the peak name, the season, and the number of deaths instead of all variables. We also removed peaks with no deaths and outlier peaks with 100 or more deaths. This was all done to make the data suitable for the k-means clustering models.

#3 - k-means clustering - To find the optimal value of K, you can plot the total within-cluster sum of squares against K and look for the point where the decrease levels off (the "elbow").

#4 - We were able to see which peaks had deaths in similar seasons and which peaks were similar to their neighbours in the cluster plots.