library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the expedition members table (TidyTuesday 2020-09-22 `members.csv`).
members <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv"
)
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Number of deaths for each peak/season pair. `died` is logical, so sum()
# counts the TRUE values.
members_tidy <- ungroup(
  summarise(
    group_by(members, peak_name, season),
    died = sum(died)
  )
)
## `summarise()` has grouped output by 'peak_name'. You can override using the
## `.groups` argument.
# Preview the death count per peak, summed over all of its seasons.
ungroup(
  summarise(
    group_by(members_tidy, peak_name),
    total_deaths = sum(died)
  )
)
## # A tibble: 391 × 2
##    peak_name          total_deaths
##    <chr>                     <int>
##  1 Aichyn                        0
##  2 Ama Dablam                   32
##  3 Amotsang                      0
##  4 Amphu Gyabjen                 0
##  5 Amphu I                       0
##  6 Amphu Middle                  0
##  7 Anidesh Chuli                 0
##  8 Annapurna I                  72
##  9 Annapurna I East              1
## 10 Annapurna I Middle            3
## # ℹ 381 more rows
# Build the per-peak feature table used for clustering: one row per peak with
# the share of deaths in each season plus the (logged) overall death count,
# all standardised.

# Overall deaths per peak, summed over every season present in `members_tidy`.
peak_totals <- members_tidy %>%
  group_by(peak_name) %>%
  summarise(total_deaths = sum(died)) %>%
  ungroup()

# NOTE(review): the seasonal columns only cover Winter/Spring/Autumn while
# `total_deaths` counts all seasons, so if the data holds other season values
# the three shares will not add up to 1 for those peaks -- confirm intended.
members_demo <- members_tidy %>%
  filter(season %in% c("Winter", "Spring", "Autumn")) %>%
  # One column per season; a peak with no row for a season gets 0 deaths.
  pivot_wider(names_from = season, values_from = died, values_fill = 0) %>%
  # The join key is spelled out instead of relying on dplyr's natural-join
  # guess, so no "Joining with ..." message is printed any more.
  left_join(peak_totals, by = join_by(peak_name)) %>%
  # Drop peaks without deaths (the shares below would divide by zero) and
  # peaks with 100 or more deaths.
  filter(total_deaths > 0, total_deaths < 100) %>%
  mutate(
    # Seasonal counts -> share of the peak's total deaths.
    across(c(Autumn, Spring, Winter), ~ .x / total_deaths),
    total_deaths = log(total_deaths),
    # Runs last so it standardises the shares and the logged total alike;
    # as.numeric() strips the matrix attributes that scale() returns.
    across(where(is.numeric), ~ as.numeric(scale(.x)))
  )

members_demo
## # A tibble: 85 × 5
##    peak_name           Autumn  Spring  Winter total_deaths
##    <chr>                <dbl>   <dbl>   <dbl>        <dbl>
##  1 Ama Dablam          0.167  -0.128  -0.0987       1.72  
##  2 Annapurna I        -0.262   0.219   0.149        2.35  
##  3 Annapurna I East    0.907  -0.821  -0.302       -0.994 
##  4 Annapurna I Middle -0.847  -0.821   4.03        -0.134 
##  5 Annapurna II       -0.408   0.566  -0.302        0.409 
##  6 Annapurna III      -0.262   0.412  -0.302        0.726 
##  7 Annapurna IV        0.381  -0.266  -0.302        0.266 
##  8 Annapurna South    -0.737   0.912  -0.302        0.634 
##  9 Api Main           -1.72    1.26    1.32         0.0913
## 10 Baruntse            0.0974  0.0325 -0.302        1.01  
## # ℹ 75 more rows

Implement k-means clustering

# Fix the RNG seed first: when `centers` is a number, kmeans() picks its
# starting centres at random, so without a seed the cluster labels, sizes and
# centres can differ from one run of this script to the next.
set.seed(123)

# Three-cluster k-means on the numeric feature columns (peak name excluded).
members_clust <- kmeans(select(members_demo, -peak_name), centers = 3)
summary(members_clust)
##              Length Class  Mode   
## cluster      85     -none- numeric
## centers      12     -none- numeric
## totss         1     -none- numeric
## withinss      3     -none- numeric
## tot.withinss  1     -none- numeric
## betweenss     1     -none- numeric
## size          3     -none- numeric
## iter          1     -none- numeric
## ifault        1     -none- numeric
library(broom)
# One row per cluster: centre coordinates, size and within-cluster SS.
broom::tidy(members_clust)
## # A tibble: 3 × 7
##   Autumn Spring  Winter total_deaths  size withinss cluster
##    <dbl>  <dbl>   <dbl>        <dbl> <int>    <dbl> <fct>  
## 1 -1.07   1.14  -0.0480        0.418    32    77.2  1      
## 2 -0.847 -0.821  4.03         -0.372     4     8.26 2      
## 3  0.769 -0.677 -0.298        -0.243    49    36.9  3
# Label every peak with its cluster, then plot the Autumn column against
# total_deaths, coloured by cluster.
ggplot(
  augment(members_clust, members_demo),
  aes(x = total_deaths, y = Autumn, color = .cluster)
) +
  geom_point()

Choosing k

# Fit k-means for k = 1..9 and keep broom's three summaries of every fit
# (per-cluster, per-model and per-observation) as list-columns.
kclusts <- tibble(k = 1:9) %>%
  mutate(
    kclust = map(
      k,
      function(n_centers) {
        kmeans(select(members_demo, -peak_name), centers = n_centers)
      }
    ),
    tidied = map(kclust, ~ tidy(.x)),
    glanced = map(kclust, ~ glance(.x)),
    augmented = map(kclust, ~ augment(.x, members_demo))
  )

# Elbow plot: total within-cluster sum of squares for each candidate k.
ggplot(
  unnest(kclusts, glanced),
  aes(x = k, y = tot.withinss)
) +
  geom_line(alpha = 0.8, color = "midnightblue") +
  geom_point(size = 2, color = "red")

# Five-cluster k-means fit on the numeric feature columns.
final_clust <- members_demo %>%
  select(-peak_name) %>%
  kmeans(centers = 5)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Refit with four clusters; this overwrites the earlier `members_clust`.
# NOTE(review): the plot below uses this 4-cluster fit, not the 5-cluster
# `final_clust` fitted earlier in the script -- confirm which k is intended.
members_clust <- members_demo %>%
  select(-peak_name) %>%
  kmeans(centers = 4)

# Spring column against total_deaths, coloured by cluster. `name` carries the
# peak name in the plot's aesthetic mapping that is handed to ggplotly().
p <- ggplot(
  augment(members_clust, members_demo),
  aes(total_deaths, Spring, color = .cluster, name = peak_name)
) +
  geom_point(alpha = 0.8)

# Convert the static ggplot into an interactive plotly widget.
ggplotly(p)
  1. Question and Data
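  1. What is the goal of the analysis? The goal of the analysis is to group mountain peaks into clusters based on how many expedition members died on each peak and in which season those deaths occurred. Because k-means is an unsupervised method, the analysis describes patterns in the deaths rather than predicting whether an individual member died.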
  2. Describe the data briefly. The data has 76,519 observations of 21 variables. The variables pertain to hiking expeditions and include season, sex, citizenship, age, oxygen_used, died, death_cause, peak_name, year, member_id, and more. Each variable is of numeric, character, or logical type.
  3. What are the characteristics of the key variables used in the analysis? The key variables in the analysis are peak_name, season, died, and total_deaths. Peak_name is the name of the mountain peak being climbed during the expedition. Season is the season in which the expedition took place; the analysis keeps winter, spring, and autumn. Died indicates whether a death occurred during the expedition. Total_deaths is the total number of deaths that have occurred on the peak; it is derived by summing died and is used as one of the clustering features rather than as a prediction target. Died is a logical variable, peak_name and season are character variables, and total_deaths is numeric.
  1. Data Exploration and Transformation
  1. Describe the differences between the original data and the data transformed for modeling. Why? The original data has 76,519 observations of 21 variables. The summarized data (members_tidy) has only 714 observations of 3 variables, and the final table used for clustering (members_demo) has 85 peaks and 5 columns. The original data set has many variables that I do not need for my modeling analysis. The data is transformed by summarizing deaths by peak and season. I used `members_tidy <- members %>% group_by(peak_name, season) %>% summarise(died = sum(died)) %>% ungroup()` and `members_tidy %>% group_by(peak_name) %>% summarise(total_deaths = sum(died)) %>% ungroup()` to do this. The seasonal counts were then converted to shares of each peak's total deaths, I used a log transform on the total number of deaths to reduce skewness, and all numeric columns were standardized with scale(). These transformations were necessary in order to improve the clustering process. K-means uses the distance between points in space, so differences in scale between variables can throw off the results, making the analysis less accurate and valuable.
  1. Data preparation and modeling
  1. What is the type of clustering used in the analysis? The clustering used is k-means clustering. K-means clustering is an algorithm that splits the data into k distinct, non-overlapping clusters.
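  2. How is the optimal value for k found? The optimal value for k is found by using the within-cluster sum of squares (elbow) method. The total within-cluster sum of squares (tot.withinss) is computed for k = 1 to 9 and plotted against k. Because this value always decreases as k grows, the goal is not to minimize it outright but to choose the k at the "elbow" of the curve, where adding another cluster no longer gives a substantial reduction.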
  1. Conclusion
  2. What are the findings from the analysis? The analysis uses k-means clustering to understand patterns in the climbing members data set. The transformed data and the k-means clustering method help to find structure in the data that is linked to hiking risks by season and peak. The analysis reveals which peaks have the most/least deaths, and which season may be the safest/most dangerous to hike in.