library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
members %>%
count(died)
## # A tibble: 2 × 2
## died n
## <lgl> <int>
## 1 FALSE 75413
## 2 TRUE 1106
data <- members %>%
select(-death_cause, -injury_type, -highpoint_metres, -death_height_metres, -injury_height_metres) %>%
drop_na() %>%
select(-peak_id, -expedition_id) %>%
distinct(member_id, .keep_all = TRUE) %>%
mutate(died = case_when(died == "TRUE" ~ "died", died == "FALSE" ~ "no")) %>%
mutate(across(where(is.character), factor)) %>%
mutate(across(where(is.logical), factor)) %>%
mutate(member_id = as.character(member_id))
skimr::skim(data)
Data summary
| Name |
data |
| Number of rows |
72984 |
| Number of columns |
14 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| factor |
11 |
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| member_id |
0 |
1 |
12 |
12 |
0 |
72984 |
0 |
Variable type: factor
| peak_name |
0 |
1 |
FALSE |
390 |
Eve: 20994, Cho: 8608, Ama: 8235, Man: 4510 |
| season |
0 |
1 |
FALSE |
4 |
Spr: 36150, Aut: 34186, Win: 2011, Sum: 637 |
| sex |
0 |
1 |
FALSE |
2 |
M: 66150, F: 6834 |
| citizenship |
0 |
1 |
FALSE |
207 |
Nep: 14367, USA: 6318, Jap: 6188, UK: 5071 |
| expedition_role |
0 |
1 |
FALSE |
483 |
Cli: 43315, H-A: 13033, Lea: 9884, Exp: 1411 |
| hired |
0 |
1 |
FALSE |
2 |
FAL: 59006, TRU: 13978 |
| success |
0 |
1 |
FALSE |
2 |
FAL: 44913, TRU: 28071 |
| solo |
0 |
1 |
FALSE |
2 |
FAL: 72868, TRU: 116 |
| oxygen_used |
0 |
1 |
FALSE |
2 |
FAL: 55215, TRU: 17769 |
| died |
0 |
1 |
FALSE |
2 |
no: 72055, die: 929 |
| injured |
0 |
1 |
FALSE |
2 |
FAL: 71333, TRU: 1651 |
Variable type: numeric
| year |
0 |
1 |
2001.00 |
14.12 |
1905 |
1992 |
2004 |
2012 |
2019 |
▁▁▁▃▇ |
| age |
0 |
1 |
37.34 |
10.39 |
7 |
29 |
36 |
44 |
85 |
▁▇▅▁▁ |
data %>%
select(hired:injured) %>%
pivot_longer(-died) %>%
filter(value == "TRUE") %>%
group_by(died, name) %>%
summarise(n_true = n()) %>%
ungroup() %>%
pivot_wider(names_from = died, values_from = n_true) %>%
mutate(pct_died = died/no * 100) %>%
ggplot(aes(pct_died, fct_reorder(name, pct_died))) +
geom_col() +
labs(y = NULL, x = "Percent Died",
caption = "Solo seem to be relevant - 'Dangerous'")
## `summarise()` has grouped output by 'died'. You can override using the
## `.groups` argument.

top10_peaks <- data %>% count(peak_name, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(peak_name)
data %>%
filter(peak_name %in% top10_peaks) %>%
count(died, peak_name) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (no + died)) * 100) %>%
ggplot(aes(pct_died, fct_reorder(peak_name, pct_died))) +
geom_col() +
labs(y = "Peak Names", x = "Percent Died")

data %>%
ggplot(aes(died, year)) +
geom_boxplot()

data %>%
mutate(year_decade = year %/% 10 * 10) %>%
count(died, year_decade) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (no + died)) * 100) %>%
ggplot(aes(year_decade, pct_died)) +
geom_col(fill = "midnightblue") +
labs(y = NULL, x = "Percent Died")

data %>%
count(died, season) %>%
pivot_wider(names_from = died, values_from = n) %>%
mutate(pct_died = (died / (died + no) * 100)) %>%
ggplot(aes(pct_died, fct_reorder(season, pct_died))) +
geom_col(fill = "midnightblue") +
labs(y = NULL, x = "Percent Died")

plot_pct_died <- function(data, cat_var) {
data %>%
count(died, {{cat_var}}) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (died + no) * 100)) %>%
ggplot(aes(pct_died, fct_reorder({{cat_var}}, pct_died))) +
geom_col(fill = "midnightblue") +
labs(y = NULL, x = "Percent Died")
}
data %>% plot_pct_died(cat_var = sex)

data %>%
ggplot(aes(died, age)) +
geom_boxplot()

data %>%
mutate(age_decade = age %/% 10 * 10) %>%
count(died, age_decade) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (no + died)) * 100) %>%
ggplot(aes(age_decade, pct_died)) +
geom_col() %>%
labs(y = "Percent Died", x = "Age by Decade")

data %>%
mutate(age_decade = age %/% 10 * 10) %>%
count(died, age_decade) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (no + died)) * 100) %>%
filter(age_decade != 80) %>%
ggplot(aes(age_decade, pct_died)) +
geom_col() %>%
labs(y = "Percent Died", x = "Age by Decade")

top10_citizen <- data %>% count(citizenship, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(citizenship)
data %>%
filter(citizenship %in% top10_citizen) %>%
plot_pct_died(cat_var = citizenship)

top10_role <- data %>% count(expedition_role, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(expedition_role)
data %>%
filter(expedition_role %in% top10_role) %>%
plot_pct_died(cat_var = expedition_role)

Build a model
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
data_split <- initial_split(data, strata = died)
data_train <- training(data_split)
data_test <- testing(data_split)
set.seed(234)
data_folds <- vfold_cv(data_train, strata = died)
data_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [49264/5474]> Fold01
## 2 <split [49264/5474]> Fold02
## 3 <split [49264/5474]> Fold03
## 4 <split [49264/5474]> Fold04
## 5 <split [49264/5474]> Fold05
## 6 <split [49264/5474]> Fold06
## 7 <split [49264/5474]> Fold07
## 8 <split [49264/5474]> Fold08
## 9 <split [49265/5473]> Fold09
## 10 <split [49265/5473]> Fold10
library(themis)
library(embed)
xgboost_recipe <-
recipe(formula = died ~ ., data = data_train) %>%
update_role(member_id, new_role = "id") %>%
step_other(peak_name, citizenship, expedition_role) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_zv(all_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_smote(died)
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 108,068
## Columns: 35
## $ member_id <fct> AMAD78301-02, AMAD78301-05, AMAD78301-06, A…
## $ year <dbl> -1.635974, -1.635974, -1.635974, -1.635974,…
## $ age <dbl> 0.35136828, -0.32114464, -1.18580412, 0.351…
## $ died <fct> no, no, no, no, no, no, no, no, no, no, no,…
## $ peak_name_Ama.Dablam <dbl> 2.812559, 2.812559, 2.812559, 2.812559, 2.8…
## $ peak_name_Cho.Oyu <dbl> -0.363673, -0.363673, -0.363673, -0.363673,…
## $ peak_name_Everest <dbl> -0.6366008, -0.6366008, -0.6366008, -0.6366…
## $ peak_name_Manaslu <dbl> -0.2574662, -0.2574662, -0.2574662, -0.2574…
## $ peak_name_other <dbl> -0.8516586, -0.8516586, -0.8516586, -0.8516…
## $ season_Autumn <dbl> 1.066582, 1.066582, 1.066582, 1.066582, -0.…
## $ season_Spring <dbl> -0.9927462, -0.9927462, -0.9927462, -0.9927…
## $ season_Summer <dbl> -0.09336126, -0.09336126, -0.09336126, -0.0…
## $ season_Winter <dbl> -0.1671621, -0.1671621, -0.1671621, -0.1671…
## $ sex_F <dbl> -0.3221255, -0.3221255, -0.3221255, -0.3221…
## $ sex_M <dbl> 0.3221255, 0.3221255, 0.3221255, 0.3221255,…
## $ citizenship_France <dbl> 3.9004595, 3.9004595, 3.9004595, 3.9004595,…
## $ citizenship_Japan <dbl> -0.3062252, -0.3062252, -0.3062252, -0.3062…
## $ citizenship_Nepal <dbl> -0.493404, -0.493404, -0.493404, -0.493404,…
## $ citizenship_UK <dbl> -0.2720068, -0.2720068, -0.2720068, -0.2720…
## $ citizenship_USA <dbl> -0.3077572, -0.3077572, -0.3077572, -0.3077…
## $ citizenship_other <dbl> -1.0027716, -1.0027716, -1.0027716, -1.0027…
## $ expedition_role_Climber <dbl> -1.2099521, 0.8264639, 0.8264639, 0.8264639…
## $ expedition_role_H.A.Worker <dbl> -0.4644574, -0.4644574, -0.4644574, -0.4644…
## $ expedition_role_Leader <dbl> -0.396114, -0.396114, -0.396114, -0.396114,…
## $ expedition_role_other <dbl> 3.127181, -0.319771, -0.319771, -0.319771, …
## $ hired_FALSE. <dbl> 0.4848862, 0.4848862, 0.4848862, 0.4848862,…
## $ hired_TRUE. <dbl> -0.4848862, -0.4848862, -0.4848862, -0.4848…
## $ success_FALSE. <dbl> 0.7921472, 0.7921472, 0.7921472, 0.7921472,…
## $ success_TRUE. <dbl> -0.7921472, -0.7921472, -0.7921472, -0.7921…
## $ solo_FALSE. <dbl> 0.0398985, 0.0398985, 0.0398985, 0.0398985,…
## $ solo_TRUE. <dbl> -0.0398985, -0.0398985, -0.0398985, -0.0398…
## $ oxygen_used_FALSE. <dbl> 0.5681049, 0.5681049, 0.5681049, 0.5681049,…
## $ oxygen_used_TRUE. <dbl> -0.5681049, -0.5681049, -0.5681049, -0.5681…
## $ injured_FALSE. <dbl> 0.1515509, 0.1515509, 0.1515509, 0.1515509,…
## $ injured_TRUE. <dbl> -0.1515509, -0.1515509, -0.1515509, -0.1515…
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
doParallel::registerDoParallel()
set.seed(27358)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_folds,
control = control_resamples(save_pred = TRUE))
collect_metrics(xgboost_tune)
## # A tibble: 20 × 10
## trees min_n tree_depth learn_rate .metric .estimator mean n std_err
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 6 2 5 0.00648 accuracy binary 0.613 10 0.0191
## 2 6 2 5 0.00648 roc_auc binary 0.666 10 0.0128
## 3 1765 8 1 0.252 accuracy binary 0.805 10 0.00260
## 4 1765 8 1 0.252 roc_auc binary 0.675 10 0.0140
## 5 257 10 7 0.0840 accuracy binary 0.985 10 0.000627
## 6 257 10 7 0.0840 roc_auc binary 0.735 10 0.0126
## 7 1867 15 14 0.00279 accuracy binary 0.963 10 0.00154
## 8 1867 15 14 0.00279 roc_auc binary 0.736 10 0.0138
## 9 1057 20 3 0.00427 accuracy binary 0.692 10 0.00690
## 10 1057 20 3 0.00427 roc_auc binary 0.703 10 0.0125
## 11 771 23 10 0.00102 accuracy binary 0.794 10 0.00474
## 12 771 23 10 0.00102 roc_auc binary 0.723 10 0.0140
## 13 1531 26 13 0.171 accuracy binary 0.986 10 0.000557
## 14 1531 26 13 0.171 roc_auc binary 0.730 10 0.00918
## 15 966 29 7 0.0419 accuracy binary 0.986 10 0.000545
## 16 966 29 7 0.0419 roc_auc binary 0.742 10 0.0127
## 17 1233 33 9 0.0140 accuracy binary 0.985 10 0.000531
## 18 1233 33 9 0.0140 roc_auc binary 0.738 10 0.0129
## 19 582 39 11 0.0186 accuracy binary 0.982 10 0.000572
## 20 582 39 11 0.0186 roc_auc binary 0.735 10 0.0128
## # ℹ 1 more variable: .config <chr>
collect_predictions(xgboost_tune) %>%
group_by(id) %>%
roc_curve(died, .pred_died) %>%
autoplot()

final_fitted <- xgboost_workflow %>%
finalize_workflow(select_best(xgboost_tune, "accuracy")) %>%
last_fit(data_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.986 Preprocessor1_Model1
## 2 roc_auc binary 0.733 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
conf_mat(died, .pred_class) %>%
autoplot()

collect_metrics(final_fitted)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.986 Preprocessor1_Model1
## 2 roc_auc binary 0.733 Preprocessor1_Model1
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
final_fitted %>%
extract_fit_engine() %>%
vip()

- Question and Data:
- What is the research question? Clearly state the research question
you aim to address using the new dataset. The research question is “Can
we predict deaths of hikers using peak_name?”
- Describe the data briefly: Provide an overview of the new dataset,
highlighting its key characteristics and dimensions. The data set has
76,519 observations of 21 variables. The key variables are peak_name,
which is the name of the mountain being climbed and died which is
whether or not the hiker passed away. These two variables are used in
the analysis, peak_name is what is being used to predict deaths.
- What are the characteristics of the key variables used in the
analysis? Describe the primary variables of interest in the dataset and
their characteristics. The primary variables of interest are peak_name
and died. Peak_name contains data with all of the names of the mountains
that were climbed. Peak_name is character data. Died contains data about
whether the hiker died during the hike or not. Died is character
data.
- Data Exploration and Transformation:
- Describe the differences between the original data and the data
transformed for modeling. Why? Explain any preprocessing or
transformations performed on the new dataset compared to the original
data. Discuss why these changes were necessary or beneficial. The
original data set has 76,519 observations of 21 variables. I
preprocessed the data by using “members %>% unnest_tokens(word,
peak_name) %>% count(died, word) %>% bind_log_odds(died, word, n)
%>% arrange(-log_odds_weighted)” This unnested and tokenized the
peak_names so that they could be used to predict the died variable. I
also split the data into training and testing data.
- Data Preparation and Modeling:
- What are the names of data preparation steps mentioned in the video?
List and describe any data preparation steps or techniques mentioned in
the CA video that you applied to the new dataset. I used unnest_tokens
to unnest the peak names. I also used members_split <- members %>%
select(peak_name, died) %>% initial_split(strata = died) to split the
data into training data and testing data to traing and test the machine
learning model.
- What is the name of the machine learning model(s) used in the
analysis? Specify the machine learning model(s) you employed for your
analysis and briefly explain their relevance to the research question.
The machine learning models used were the logistic regression model,
specifically glmnet. Logistic regression is well-suited for
classification problems when the goal is to assign observations to one
of two classes (alive or dead). Logistic regression is also easy to
interpret the impact of the predictor variables on the probability of
belonging to one of two classes. Logistic regression is also good with
handling text data, which I used with peak name as well as dead or
alive.
- Model Evaluation:
- What metrics are used in the model evaluation? Detail the evaluation
metrics you used to assess the performance of your machine learning
model(s) on the new dataset. Discuss the significance of these metrics
in the context of your research question. The metrics used was the
receiver operating characteristic area under the curve (roc_auc). I used
this to assess the performance of the machine learning model. This
metric is good at classification, and I want to classify whether the
hiker died or lived when hiking the mountain. Roc_auc measures the
models ability to differentiate between two classes and a higher roc_auc
indicates a better distinction between the two classes. This is
important as it shows us if the model is able to differentiate between
dead or alive based on the peak.
- Conclusion:
- What are the major findings? Summarize the key findings and insights
obtained from your analysis of the new dataset. Relate these findings
back to the research question and any similarities or differences
compared to the CA assignment. In summary, the analysis suggests that
there is a relationship between the words in peak names and the
likelihood of death during mountaineering expeditions. While the model
demonstrates some predictive capability, it is not a definitive
predictor of hiker deaths, as other factors such as season, age, and
more likely play a significant role. Further research and feature
engineering may be needed to improve the accuracy of predictions and
better understand the complex factors contributing to expedition
outcomes. The roc_mean was 0.685. In the code along, the roc_auc mean
was 0.847, which is a better prediction. I tried to use other variables
to make a better model but was getting means under 0.685.