library(tidyverse)
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
Goal: Is there any relationship between peaks and seasons in terms of deaths?
Preparing data Please understand why Julia prepared the employment data the way shed did. For her screencast, click this link.
She wanted to investigate the relationship between demographics and occupation in employment. To this end, she transformed the data so demographic variables became columns and occupations are row names. Also, the values inside the matrix should be scaled employment ratios.
Plan to make the similar transformation for your climbing data. You want to investigate the relationship between peaks and season in terms of deaths. To this end, you need to transform the data so seasons become columns and peaks are row names. Also, the values inside the matrix should be scaled seasonal death ratio in yearly total.
# Calculate total deaths by season and peaks
members_tidy <- members %>%
group_by(peak_name, season) %>%
summarise(died = sum(died)) %>%
ungroup()
# Get the yearly total deaths. There is no TOTAL in the season variable
members_tidy %>%
group_by(peak_name) %>%
summarise(total_deaths = sum(died)) %>%
ungroup()
## # A tibble: 391 × 2
## peak_name total_deaths
## <chr> <int>
## 1 Aichyn 0
## 2 Ama Dablam 32
## 3 Amotsang 0
## 4 Amphu Gyabjen 0
## 5 Amphu I 0
## 6 Amphu Middle 0
## 7 Anidesh Chuli 0
## 8 Annapurna I 72
## 9 Annapurna I East 1
## 10 Annapurna I Middle 3
## # ℹ 381 more rows
members_demo <- members_tidy %>%
filter(season %in% c("Winter", "Spring", "Autumn")) %>%
pivot_wider(names_from = season, values_from = died, values_fill = 0) %>%
left_join(members_tidy %>%
group_by(peak_name) %>%
summarise(total_deaths = sum(died)) %>%
ungroup()) %>%
# Removes outliers, peaks with no deaths and more than 300
filter(total_deaths > 0, total_deaths < 100) %>%
# Calculate the seasonal percent of death per peak
mutate(across(c(Autumn, Spring, Winter), ~ . / total_deaths),
total_deaths = log(total_deaths),
across(where(is.numeric), ~ as.numeric(scale(.))))
members_demo
## # A tibble: 85 × 5
## peak_name Autumn Spring Winter total_deaths
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Ama Dablam 0.167 -0.128 -0.0987 1.72
## 2 Annapurna I -0.262 0.219 0.149 2.35
## 3 Annapurna I East 0.907 -0.821 -0.302 -0.994
## 4 Annapurna I Middle -0.847 -0.821 4.03 -0.134
## 5 Annapurna II -0.408 0.566 -0.302 0.409
## 6 Annapurna III -0.262 0.412 -0.302 0.726
## 7 Annapurna IV 0.381 -0.266 -0.302 0.266
## 8 Annapurna South -0.737 0.912 -0.302 0.634
## 9 Api Main -1.72 1.26 1.32 0.0913
## 10 Baruntse 0.0974 0.0325 -0.302 1.01
## # ℹ 75 more rows
members_clust <- kmeans(select(members_demo, - peak_name), centers = 3)
summary(members_clust)
## Length Class Mode
## cluster 85 -none- numeric
## centers 12 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
library(broom)
tidy(members_clust)
## # A tibble: 3 × 7
## Autumn Spring Winter total_deaths size withinss cluster
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <fct>
## 1 0.835 -0.745 -0.302 -0.386 44 23.3 1
## 2 -1.33 1.13 0.648 -0.454 22 104. 2
## 3 -0.396 0.417 -0.0518 1.42 19 17.8 3
augment(members_clust, members_demo) %>%
ggplot(aes(total_deaths, Autumn, color = .cluster)) +
geom_point()
kclusts <-
tibble(k = 1:9) %>%
mutate(
kclust = map(k, ~ kmeans(select(members_demo, - peak_name), .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, members_demo)
)
kclusts %>%
unnest(glanced) %>%
ggplot(aes(k, tot.withinss)) +
geom_line(alpha = 0.8) +
geom_point(size = 2)
library(plotly)
members_clust <- kmeans(select(members_demo, - peak_name), centers = 4)
p <- augment(members_clust, members_demo) %>%
ggplot(aes(total_deaths, Spring, color = .cluster, name = peak_name)) +
geom_point(alpha = 0.8)
ggplotly(p)