library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the expedition members table straight from the TidyTuesday repository
# (2020-09-22 data set).
members <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv"
)
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Count deaths for every peak/season combination. `died` is logical, so
# summing it counts the TRUE values.
members_tidy <- members %>%
  group_by(peak_name, season) %>%
  summarise(died = sum(died)) %>%
  ungroup()
## `summarise()` has grouped output by 'peak_name'. You can override using the
## `.groups` argument.
# Total deaths per peak, collapsed over seasons. The result is only printed,
# not stored.
members_tidy %>%
  group_by(peak_name) %>%
  summarise(total_deaths = sum(died)) %>%
  ungroup()
## # A tibble: 391 × 2
## peak_name total_deaths
## <chr> <int>
## 1 Aichyn 0
## 2 Ama Dablam 32
## 3 Amotsang 0
## 4 Amphu Gyabjen 0
## 5 Amphu I 0
## 6 Amphu Middle 0
## 7 Anidesh Chuli 0
## 8 Annapurna I 72
## 9 Annapurna I East 1
## 10 Annapurna I Middle 3
## # ℹ 381 more rows
# Build one row per peak for clustering:
#   1. keep the Winter/Spring/Autumn counts and spread them into columns,
#   2. attach each peak's total deaths (summed over every season present in
#      members_tidy, not only the three kept above),
#   3. keep peaks with between 1 and 99 deaths,
#   4. turn the seasonal counts into shares of the total, take the log of
#      the total, and standardise every numeric column.
members_demo <- members_tidy %>%
  filter(season %in% c("Winter", "Spring", "Autumn")) %>%
  pivot_wider(names_from = season, values_from = died, values_fill = 0) %>%
  left_join(
    members_tidy %>%
      group_by(peak_name) %>%
      summarise(total_deaths = sum(died)) %>%
      ungroup()
  ) %>%
  filter(total_deaths > 0, total_deaths < 100) %>%
  mutate(
    across(c(Autumn, Spring, Winter), ~ .x / total_deaths),
    total_deaths = log(total_deaths),
    # scale() returns a one-column matrix; as.numeric() flattens it back
    # into a plain vector.
    across(where(is.numeric), ~ as.numeric(scale(.x)))
  )
## Joining with `by = join_by(peak_name)`
# Inspect the standardised per-peak feature table.
members_demo
## # A tibble: 85 × 5
## peak_name Autumn Spring Winter total_deaths
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Ama Dablam 0.167 -0.128 -0.0987 1.72
## 2 Annapurna I -0.262 0.219 0.149 2.35
## 3 Annapurna I East 0.907 -0.821 -0.302 -0.994
## 4 Annapurna I Middle -0.847 -0.821 4.03 -0.134
## 5 Annapurna II -0.408 0.566 -0.302 0.409
## 6 Annapurna III -0.262 0.412 -0.302 0.726
## 7 Annapurna IV 0.381 -0.266 -0.302 0.266
## 8 Annapurna South -0.737 0.912 -0.302 0.634
## 9 Api Main -1.72 1.26 1.32 0.0913
## 10 Baruntse 0.0974 0.0325 -0.302 1.01
## # ℹ 75 more rows
Implement k-means clustering
# Fit k-means with three clusters on the numeric features; peak_name is an
# identifier rather than a feature, so it is dropped first.
# kmeans() picks its starting centres at random, so the seed is fixed to make
# the cluster assignments reproducible from run to run.
set.seed(2020)
members_clust <- kmeans(select(members_demo, -peak_name), centers = 3)
# summary() on a kmeans fit only lists its components and their lengths.
summary(members_clust)
## Length Class Mode
## cluster 85 -none- numeric
## centers 12 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
library(broom)
# One row per cluster: centre coordinates, size and within-cluster sum of
# squares.
tidy(members_clust)
## # A tibble: 3 × 7
## Autumn Spring Winter total_deaths size withinss cluster
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <fct>
## 1 -1.07 1.14 -0.0480 0.418 32 77.2 1
## 2 -0.847 -0.821 4.03 -0.372 4 8.26 2
## 3 0.769 -0.677 -0.298 -0.243 49 36.9 3
# Attach each peak's cluster label to the feature table, then plot the
# standardised Autumn share against the standardised log of total deaths,
# coloured by cluster.
augment(members_clust, members_demo) %>%
  ggplot(aes(x = total_deaths, y = Autumn, color = .cluster)) +
  geom_point()

Choosing k
# Fit k-means for k = 1..9 and keep broom's three views of every fit
# (per-cluster, per-model, per-observation) so the fits can be compared.
#
# kmeans() starts from randomly chosen centres. The seed is fixed so the
# comparison is reproducible, and several random starts are tried per k
# (nstart) so that tot.withinss reflects the best fit found for that k
# rather than a single, possibly poor, initialisation.
set.seed(2020)

# The feature matrix does not depend on k, so build it once.
members_features <- select(members_demo, -peak_name)

kclusts <-
  tibble(k = 1:9) %>%
  mutate(
    kclust = map(k, ~ kmeans(members_features, centers = .x, nstart = 25)),
    tidied = map(kclust, tidy),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, members_demo)
  )
# Elbow plot: total within-cluster sum of squares for each candidate k.
kclusts %>%
  unnest(cols = glanced) %>%
  ggplot(aes(x = k, y = tot.withinss)) +
  geom_line(alpha = 0.8, color = "midnightblue") +
  geom_point(size = 2, color = "red")

# Fit the five-cluster model on the numeric features (peak_name dropped).
# The seed is fixed because kmeans() chooses its starting centres at random.
# NOTE(review): final_clust is not referenced again in the visible script, and
# the interactive plot further down refits with centers = 4 -- confirm which
# k is the intended final choice.
set.seed(2020)
final_clust <- kmeans(select(members_demo, -peak_name), centers = 5)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Refit with four clusters (this overwrites the earlier three-cluster
# members_clust) and draw an interactive scatter plot of the result.
# The seed is fixed because kmeans() chooses its starting centres at random;
# without it the cluster colours change on every run.
set.seed(2020)
members_clust <- kmeans(select(members_demo, -peak_name), centers = 4)

# `name` is not a standard ggplot2 aesthetic; presumably it is mapped so that
# ggplotly() can show the peak name in the hover tooltip.
p <- augment(members_clust, members_demo) %>%
  ggplot(aes(total_deaths, Spring, color = .cluster, name = peak_name)) +
  geom_point(alpha = 0.8)

ggplotly(p)
- Question and Data
- What is the goal of the analysis? The goal of the analysis is to
predict if someone died during a hiking expedition or not.
- Describe the data briefly. The data has 76,519 observations of 21
variables. The data contains different variables that pertain to hiking
expeditions. Some variables are season, sex, citizenship, age,
oxygen_used, died, death_cause, peak_name, year, member_id, and more.
These variables are either numerical, character, or logical data
types.
- What are the characteristics of the key variables used in the
analysis? The key variables in the analysis are peak_name, season, died,
and total_deaths. Peak_name is the name of the mountain peak being
climbed during the expedition. Season is the season during the
expedition (winter, spring, or autumn). Died indicates if a death
occurred during the expedition. Total_deaths is the total number of
deaths that have occurred on the mountain. Total_deaths is what we are
trying to predict using these key variables. Died is logical data type,
peak_name and season are character data and total_deaths is numeric data
type.
- Data Exploration and Transformation
- Describe the differences between the original data and the data
transformed for modeling. Why? The original data has 76,519
observations of 21 variables. The data transformed for modeling has only
714 observations of 3 variables. The original data set has many
variables that I do not need for my modeling analysis. The data is
transformed by summarizing deaths by peak and season. I used
“members_tidy <- members %>% group_by(peak_name, season) %>%
summarise(died = sum(died)) %>% ungroup()” and “members_tidy %>%
group_by(peak_name) %>% summarise(total_deaths = sum(died)) %>%
ungroup()” to do this. I used log transform on the total number of
deaths to reduce skewness. These transformations were necessary in order
to improve the clustering process. K-means uses the distance between
points in space, so differences in scale and skewness between variables
can throw off the results, making the analysis less accurate and
valuable.
- Data preparation and modeling
- What is the type of clustering used in the analysis? The clustering
used is k-means clustering. K-means clustering is an algorithm used to
split data into k-distinct, non-overlapping subsets.
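- How is the optimal value for k found? The optimal value for k is
found by using the within-cluster sum of squares (elbow) method: k-means
is fit for k = 1 to 9 and the total within-cluster sum of squares is
plotted against k. Because this quantity generally keeps decreasing as k
increases, the goal is to choose the k at the "elbow" of the curve, where
adding more clusters no longer reduces the within-cluster sum of squares
by much.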
- Conclusion
- What are the findings from the analysis? The analysis uses k-means
clustering to understand patterns in the climbing members data set. The
transformed data and the k-means clustering method help to find the
structure in the data that is linked to hiking risks by season and
peaks. The data analysis reveals which peaks have the most/least deaths,
and which season may be the safest/most dangerous to hike in.