library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
members %>%
  count(died)
## # A tibble: 2 × 2
##   died      n
##   <lgl> <int>
## 1 FALSE 75413
## 2 TRUE   1106
data <- members %>%
    select(-death_cause, -injury_type, -highpoint_metres, -death_height_metres, -injury_height_metres) %>%
    
    drop_na() %>%
    select(-peak_id, -expedition_id) %>%
    distinct(member_id, .keep_all = TRUE) %>%
    mutate(died = case_when(died == "TRUE" ~ "died", died == "FALSE" ~ "no")) %>%
    mutate(across(where(is.character), factor)) %>%
    mutate(across(where(is.logical), factor)) %>%
    mutate(member_id = as.character(member_id))


skimr::skim(data)    
Data summary
Name data
Number of rows 72984
Number of columns 14
_______________________
Column type frequency:
character 1
factor 11
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
member_id 0 1 12 12 0 72984 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
peak_name 0 1 FALSE 390 Eve: 20994, Cho: 8608, Ama: 8235, Man: 4510
season 0 1 FALSE 4 Spr: 36150, Aut: 34186, Win: 2011, Sum: 637
sex 0 1 FALSE 2 M: 66150, F: 6834
citizenship 0 1 FALSE 207 Nep: 14367, USA: 6318, Jap: 6188, UK: 5071
expedition_role 0 1 FALSE 483 Cli: 43315, H-A: 13033, Lea: 9884, Exp: 1411
hired 0 1 FALSE 2 FAL: 59006, TRU: 13978
success 0 1 FALSE 2 FAL: 44913, TRU: 28071
solo 0 1 FALSE 2 FAL: 72868, TRU: 116
oxygen_used 0 1 FALSE 2 FAL: 55215, TRU: 17769
died 0 1 FALSE 2 no: 72055, die: 929
injured 0 1 FALSE 2 FAL: 71333, TRU: 1651

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2001.00 14.12 1905 1992 2004 2012 2019 ▁▁▁▃▇
age 0 1 37.34 10.39 7 29 36 44 85 ▁▇▅▁▁
data %>%
    select(hired:injured) %>%
    pivot_longer(-died) %>%
    filter(value == "TRUE") %>%
    group_by(died, name) %>%
    summarise(n_true = n()) %>%
    ungroup() %>%
    pivot_wider(names_from = died, values_from = n_true) %>%
    mutate(pct_died = died/no * 100) %>%
    ggplot(aes(pct_died, fct_reorder(name, pct_died))) +
    geom_col() +
    
    labs(y = NULL, x = "Percent Died",
         caption = "Solo seem to be relevant - 'Dangerous'")
## `summarise()` has grouped output by 'died'. You can override using the
## `.groups` argument.

top10_peaks <- data %>% count(peak_name, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(peak_name)

data %>% 
    
    filter(peak_name %in% top10_peaks) %>%
    count(died, peak_name) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%

    ggplot(aes(pct_died, fct_reorder(peak_name, pct_died))) +
    geom_col() +
    
    labs(y = "Peak Names", x = "Percent Died")

data %>% 
    
    ggplot(aes(died, year)) +
    geom_boxplot()

data %>%
    
    mutate(year_decade = year %/% 10 * 10) %>%
    count(died, year_decade) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%

    ggplot(aes(year_decade, pct_died))  +
    geom_col(fill = "midnightblue") +
    
    labs(y = NULL, x = "Percent Died")

data %>%
    
    count(died, season) %>%
    pivot_wider(names_from = died, values_from = n) %>%
    mutate(pct_died = (died / (died + no) * 100)) %>%
    
    ggplot(aes(pct_died, fct_reorder(season, pct_died))) +
    geom_col(fill = "midnightblue") +
    
    labs(y = NULL, x = "Percent Died")

plot_pct_died <- function(data, cat_var) {
    
    data %>%

    count(died, {{cat_var}}) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (died + no) * 100)) %>%
    
    ggplot(aes(pct_died, fct_reorder({{cat_var}}, pct_died))) +
    geom_col(fill = "midnightblue") +
    
    labs(y = NULL, x = "Percent Died")
    
}  
data %>% plot_pct_died(cat_var = sex)

data %>% 
    
    ggplot(aes(died, age)) +
    geom_boxplot()

data %>%
    
    mutate(age_decade = age %/% 10 * 10) %>%
    count(died, age_decade) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%

    ggplot(aes(age_decade, pct_died)) +
    geom_col() %>%
    
    labs(y = "Percent Died", x = "Age by Decade")

data %>%
    
    mutate(age_decade = age %/% 10 * 10) %>%
    count(died, age_decade) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%
    filter(age_decade != 80) %>%
    
    ggplot(aes(age_decade, pct_died)) +
    geom_col() %>%
    
    labs(y = "Percent Died", x = "Age by Decade")

top10_citizen <- data %>% count(citizenship, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(citizenship)

data %>%
    filter(citizenship %in% top10_citizen)  %>%
    plot_pct_died(cat_var = citizenship)

top10_role <- data %>% count(expedition_role, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(expedition_role)

data %>%
    filter(expedition_role %in% top10_role)  %>%
    plot_pct_died(cat_var = expedition_role)

Build a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
data_split <- initial_split(data, strata = died)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(234)
data_folds <- vfold_cv(data_train, strata = died)
data_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [49264/5474]> Fold01
##  2 <split [49264/5474]> Fold02
##  3 <split [49264/5474]> Fold03
##  4 <split [49264/5474]> Fold04
##  5 <split [49264/5474]> Fold05
##  6 <split [49264/5474]> Fold06
##  7 <split [49264/5474]> Fold07
##  8 <split [49264/5474]> Fold08
##  9 <split [49265/5473]> Fold09
## 10 <split [49265/5473]> Fold10
library(themis)
library(embed)  

xgboost_recipe <-
  recipe(formula = died ~ ., data = data_train) %>%
    update_role(member_id, new_role = "id") %>%
    step_other(peak_name, citizenship, expedition_role) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_zv(all_predictors()) %>%
    step_normalize(all_numeric_predictors()) %>%
    step_smote(died)


xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 108,068
## Columns: 35
## $ member_id                  <fct> AMAD78301-02, AMAD78301-05, AMAD78301-06, A…
## $ year                       <dbl> -1.635974, -1.635974, -1.635974, -1.635974,…
## $ age                        <dbl> 0.35136828, -0.32114464, -1.18580412, 0.351…
## $ died                       <fct> no, no, no, no, no, no, no, no, no, no, no,…
## $ peak_name_Ama.Dablam       <dbl> 2.812559, 2.812559, 2.812559, 2.812559, 2.8…
## $ peak_name_Cho.Oyu          <dbl> -0.363673, -0.363673, -0.363673, -0.363673,…
## $ peak_name_Everest          <dbl> -0.6366008, -0.6366008, -0.6366008, -0.6366…
## $ peak_name_Manaslu          <dbl> -0.2574662, -0.2574662, -0.2574662, -0.2574…
## $ peak_name_other            <dbl> -0.8516586, -0.8516586, -0.8516586, -0.8516…
## $ season_Autumn              <dbl> 1.066582, 1.066582, 1.066582, 1.066582, -0.…
## $ season_Spring              <dbl> -0.9927462, -0.9927462, -0.9927462, -0.9927…
## $ season_Summer              <dbl> -0.09336126, -0.09336126, -0.09336126, -0.0…
## $ season_Winter              <dbl> -0.1671621, -0.1671621, -0.1671621, -0.1671…
## $ sex_F                      <dbl> -0.3221255, -0.3221255, -0.3221255, -0.3221…
## $ sex_M                      <dbl> 0.3221255, 0.3221255, 0.3221255, 0.3221255,…
## $ citizenship_France         <dbl> 3.9004595, 3.9004595, 3.9004595, 3.9004595,…
## $ citizenship_Japan          <dbl> -0.3062252, -0.3062252, -0.3062252, -0.3062…
## $ citizenship_Nepal          <dbl> -0.493404, -0.493404, -0.493404, -0.493404,…
## $ citizenship_UK             <dbl> -0.2720068, -0.2720068, -0.2720068, -0.2720…
## $ citizenship_USA            <dbl> -0.3077572, -0.3077572, -0.3077572, -0.3077…
## $ citizenship_other          <dbl> -1.0027716, -1.0027716, -1.0027716, -1.0027…
## $ expedition_role_Climber    <dbl> -1.2099521, 0.8264639, 0.8264639, 0.8264639…
## $ expedition_role_H.A.Worker <dbl> -0.4644574, -0.4644574, -0.4644574, -0.4644…
## $ expedition_role_Leader     <dbl> -0.396114, -0.396114, -0.396114, -0.396114,…
## $ expedition_role_other      <dbl> 3.127181, -0.319771, -0.319771, -0.319771, …
## $ hired_FALSE.               <dbl> 0.4848862, 0.4848862, 0.4848862, 0.4848862,…
## $ hired_TRUE.                <dbl> -0.4848862, -0.4848862, -0.4848862, -0.4848…
## $ success_FALSE.             <dbl> 0.7921472, 0.7921472, 0.7921472, 0.7921472,…
## $ success_TRUE.              <dbl> -0.7921472, -0.7921472, -0.7921472, -0.7921…
## $ solo_FALSE.                <dbl> 0.0398985, 0.0398985, 0.0398985, 0.0398985,…
## $ solo_TRUE.                 <dbl> -0.0398985, -0.0398985, -0.0398985, -0.0398…
## $ oxygen_used_FALSE.         <dbl> 0.5681049, 0.5681049, 0.5681049, 0.5681049,…
## $ oxygen_used_TRUE.          <dbl> -0.5681049, -0.5681049, -0.5681049, -0.5681…
## $ injured_FALSE.             <dbl> 0.1515509, 0.1515509, 0.1515509, 0.1515509,…
## $ injured_TRUE.              <dbl> -0.1515509, -0.1515509, -0.1515509, -0.1515…
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

doParallel::registerDoParallel()
set.seed(27358)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_folds,
            control = control_resamples(save_pred = TRUE))
collect_metrics(xgboost_tune)
## # A tibble: 20 × 10
##    trees min_n tree_depth learn_rate .metric  .estimator  mean     n  std_err
##    <int> <int>      <int>      <dbl> <chr>    <chr>      <dbl> <int>    <dbl>
##  1     6     2          5    0.00648 accuracy binary     0.613    10 0.0191  
##  2     6     2          5    0.00648 roc_auc  binary     0.666    10 0.0128  
##  3  1765     8          1    0.252   accuracy binary     0.805    10 0.00260 
##  4  1765     8          1    0.252   roc_auc  binary     0.675    10 0.0140  
##  5   257    10          7    0.0840  accuracy binary     0.985    10 0.000627
##  6   257    10          7    0.0840  roc_auc  binary     0.735    10 0.0126  
##  7  1867    15         14    0.00279 accuracy binary     0.963    10 0.00154 
##  8  1867    15         14    0.00279 roc_auc  binary     0.736    10 0.0138  
##  9  1057    20          3    0.00427 accuracy binary     0.692    10 0.00690 
## 10  1057    20          3    0.00427 roc_auc  binary     0.703    10 0.0125  
## 11   771    23         10    0.00102 accuracy binary     0.794    10 0.00474 
## 12   771    23         10    0.00102 roc_auc  binary     0.723    10 0.0140  
## 13  1531    26         13    0.171   accuracy binary     0.986    10 0.000557
## 14  1531    26         13    0.171   roc_auc  binary     0.730    10 0.00918 
## 15   966    29          7    0.0419  accuracy binary     0.986    10 0.000545
## 16   966    29          7    0.0419  roc_auc  binary     0.742    10 0.0127  
## 17  1233    33          9    0.0140  accuracy binary     0.985    10 0.000531
## 18  1233    33          9    0.0140  roc_auc  binary     0.738    10 0.0129  
## 19   582    39         11    0.0186  accuracy binary     0.982    10 0.000572
## 20   582    39         11    0.0186  roc_auc  binary     0.735    10 0.0128  
## # ℹ 1 more variable: .config <chr>
collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(died, .pred_died) %>%
    autoplot()

final_fitted <- xgboost_workflow %>%
    
    finalize_workflow(select_best(xgboost_tune, "accuracy")) %>%
    last_fit(data_split)

collect_metrics(final_fitted)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.986 Preprocessor1_Model1
## 2 roc_auc  binary         0.733 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
    conf_mat(died, .pred_class) %>%
    autoplot()

collect_metrics(final_fitted)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.986 Preprocessor1_Model1
## 2 roc_auc  binary         0.733 Preprocessor1_Model1
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
final_fitted %>%
  extract_fit_engine() %>%
  vip()

  1. Question and Data:
    • What is the research question? Clearly state the research question you aim to address using the new dataset. The research question is “Can we predict deaths of hikers using peak_name?”
    • Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The data set has 76,519 observations of 21 variables. The key variables are peak_name, which is the name of the mountain being climbed and died which is whether or not the hiker passed away. These two variables are used in the analysis, peak_name is what is being used to predict deaths.
    • What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. The primary variables of interest are peak_name and died. Peak_name contains data with all of the names of the mountains that were climbed. Peak_name is character data. Died contains data about whether the hiker died during the hike or not. Died is character data.
  2. Data Exploration and Transformation:
    • Describe the differences between the original data and the data transformed for modeling. Why? Explain any preprocessing or transformations performed on the new dataset compared to the original data. Discuss why these changes were necessary or beneficial. The original data set has 76,519 observations of 21 variables. I preprocessed the data by using “members %>% unnest_tokens(word, peak_name) %>% count(died, word) %>% bind_log_odds(died, word, n) %>% arrange(-log_odds_weighted)” This unnested and tokenized the peak_names so that they could be used to predict the died variable. I also split the data into training and testing data.
  3. Data Preparation and Modeling:
    • What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. I used unnest_tokens to unnest the peak names. I also used members_split <- members %>% select(peak_name, died) %>% initial_split(strata = died) to split the data into training data and testing data to traing and test the machine learning model.
    • What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning models used were the logistic regression model, specifically glmnet. Logistic regression is well-suited for classification problems when the goal is to assign observations to one of two classes (alive or dead). Logistic regression is also easy to interpret the impact of the predictor variables on the probability of belonging to one of two classes. Logistic regression is also good with handling text data, which I used with peak name as well as dead or alive.
  4. Model Evaluation:
    • What metrics are used in the model evaluation? Detail the evaluation metrics you used to assess the performance of your machine learning model(s) on the new dataset. Discuss the significance of these metrics in the context of your research question. The metrics used was the receiver operating characteristic area under the curve (roc_auc). I used this to assess the performance of the machine learning model. This metric is good at classification, and I want to classify whether the hiker died or lived when hiking the mountain. Roc_auc measures the models ability to differentiate between two classes and a higher roc_auc indicates a better distinction between the two classes. This is important as it shows us if the model is able to differentiate between dead or alive based on the peak.
  5. Conclusion:
    • What are the major findings? Summarize the key findings and insights obtained from your analysis of the new dataset. Relate these findings back to the research question and any similarities or differences compared to the CA assignment. In summary, the analysis suggests that there is a relationship between the words in peak names and the likelihood of death during mountaineering expeditions. While the model demonstrates some predictive capability, it is not a definitive predictor of hiker deaths, as other factors such as season, age, and more likely play a significant role. Further research and feature engineering may be needed to improve the accuracy of predictions and better understand the complex factors contributing to expedition outcomes. The roc_mean was 0.685. In the code along, the roc_auc mean was 0.847, which is a better prediction. I tried to use other variables to make a better model but was getting means under 0.685.