library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
members %>%
  count(died)
## # A tibble: 2 × 2
##   died      n
##   <lgl> <int>
## 1 FALSE 75413
## 2 TRUE   1106
members <-
  members %>%
  mutate(died = case_when(
    died == "FALSE" ~ "alive",
    died == "TRUE" ~ " dead"))
library(tidytext)
library(tidylo)

members %>%
  unnest_tokens(word, peak_name) %>%
  count(died, word) %>%
  bind_log_odds(died, word, n) %>%
  arrange(-log_odds_weighted)
## # A tibble: 469 × 4
##    died    word          n log_odds_weighted
##    <chr>   <chr>     <int>             <dbl>
##  1 " dead" everest     306              7.87
##  2 "alive" dome        327              7.04
##  3 "alive" glacier     220              5.77
##  4 "alive" bhrikuti    218              5.75
##  5 "alive" gyajikang   174              5.13
##  6 "alive" central     166              5.01
##  7 "alive" khang       137              4.55
##  8 "alive" changtse    136              4.54
##  9 "alive" kabru       135              4.52
## 10 "alive" nemjung     133              4.49
## # ℹ 459 more rows

Build a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
members_split <- 
  members %>%
  select(peak_name, died) %>%
  initial_split(strata = died)

members_train <- training(members_split)
members_test <- testing(members_split)

set.seed(234)
members_folds <- vfold_cv(members_train, v = 5, strata = died)
members_folds
## #  5-fold cross-validation using stratification 
## # A tibble: 5 × 2
##   splits                id   
##   <list>                <chr>
## 1 <split [45911/11478]> Fold1
## 2 <split [45911/11478]> Fold2
## 3 <split [45911/11478]> Fold3
## 4 <split [45911/11478]> Fold4
## 5 <split [45912/11477]> Fold5
library(textrecipes)

members_rec <-
  recipe(died ~ peak_name, data = members_train) %>%
  step_tokenize(peak_name) %>%
  step_tokenfilter(peak_name) %>%
  step_tfidf(peak_name)

members_rec
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 1
## 
## ── Operations
## • Tokenization for: peak_name
## • Text filtering for: peak_name
## • Term frequency-inverse document frequency with: peak_name
glmnet_spec <- 
  logistic_reg(mixture = 1, penalty = tune()) %>%
  set_engine("glmnet")

members_wf <- workflow(members_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)

members_res <- 
  tune_grid(
    members_wf, 
    members_folds, 
    grid = 10)

autoplot(members_res, color = "group")

show_best(members_res)
## Warning: No value of `metric` was given; metric 'roc_auc' will be used.
## # A tibble: 5 × 7
##    penalty .metric .estimator  mean     n std_err .config              
##      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1 1.27e- 5 roc_auc binary     0.685     5 0.00587 Preprocessor1_Model06
## 2 4.09e- 6 roc_auc binary     0.684     5 0.00603 Preprocessor1_Model05
## 3 6.57e-10 roc_auc binary     0.684     5 0.00603 Preprocessor1_Model01
## 4 5.55e- 9 roc_auc binary     0.684     5 0.00603 Preprocessor1_Model02
## 5 1.05e- 8 roc_auc binary     0.684     5 0.00603 Preprocessor1_Model03
select_by_pct_loss(members_res, desc(penalty), metric = "roc_auc")
## # A tibble: 1 × 9
##     penalty .metric .estimator  mean     n std_err .config           .best .loss
##       <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>             <dbl> <dbl>
## 1 0.0000127 roc_auc binary     0.685     5 0.00587 Preprocessor1_Mo… 0.685     0
members_final <-
  members_wf %>%
  finalize_workflow(
    select_by_pct_loss(members_res, desc(penalty), metric = "roc_auc")
  ) %>%
  last_fit(members_split)

members_final
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits                id             .metrics .notes   .predictions .workflow 
##   <list>                <chr>          <list>   <list>   <list>       <list>    
## 1 <split [57389/19130]> train/test sp… <tibble> <tibble> <tibble>     <workflow>
collect_metrics(members_final)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.986 Preprocessor1_Model1
## 2 roc_auc  binary         0.693 Preprocessor1_Model1
collect_predictions(members_final) %>%
  conf_mat(died, .pred_class)
##           Truth
## Prediction  dead alive
##       dead     0     0
##      alive   271 18859
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
members_final %>%
  extract_fit_engine() %>%
  vi()
## # A tibble: 100 × 3
##    Variable                  Importance Sign 
##    <chr>                          <dbl> <chr>
##  1 tfidf_peak_name_central         2.31 POS  
##  2 tfidf_peak_name_vi              1.52 POS  
##  3 tfidf_peak_name_guru            1.36 NEG  
##  4 tfidf_peak_name_kabru           1.29 POS  
##  5 tfidf_peak_name_noir            1.27 POS  
##  6 tfidf_peak_name_lamjung         1.20 POS  
##  7 tfidf_peak_name_29              1.18 NEG  
##  8 tfidf_peak_name_mukut           1.17 POS  
##  9 tfidf_peak_name_lobuje          1.14 POS  
## 10 tfidf_peak_name_langshisa       1.10 NEG  
## # ℹ 90 more rows
  1. Question and Data:
    • What is the research question? Clearly state the research question you aim to address using the new dataset. The research question is “Can we predict deaths of hikers using peak_name?”
    • Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The data set has 76,519 observations of 21 variables. The key variables are peak_name, which is the name of the mountain being climbed and died which is whether or not the hiker passed away. These two variables are used in the analysis, peak_name is what is being used to predict deaths.
    • What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. The primary variables of interest are peak_name and died. Peak_name contains data with all of the names of the mountains that were climbed. Peak_name is character data. Died contains data about whether the hiker died during the hike or not. Died is character data.
  2. Data Exploration and Transformation:
    • Describe the differences between the original data and the data transformed for modeling. Why? Explain any preprocessing or transformations performed on the new dataset compared to the original data. Discuss why these changes were necessary or beneficial. The original data set has 76,519 observations of 21 variables. I preprocessed the data by using “members %>% unnest_tokens(word, peak_name) %>% count(died, word) %>% bind_log_odds(died, word, n) %>% arrange(-log_odds_weighted)” This unnested and tokenized the peak_names so that they could be used to predict the died variable. I also split the data into training and testing data.
  3. Data Preparation and Modeling:
    • What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. I used unnest_tokens to unnest the peak names. I also used members_split <- members %>% select(peak_name, died) %>% initial_split(strata = died) to split the data into training data and testing data to traing and test the machine learning model.
    • What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning models used were the logistic regression model, specifically glmnet. Logistic regression is well-suited for classification problems when the goal is to assign observations to one of two classes (alive or dead). Logistic regression is also easy to interpret the impact of the predictor variables on the probability of belonging to one of two classes. Logistic regression is also good with handling text data, which I used with peak name as well as dead or alive.
  4. Model Evaluation:
    • What metrics are used in the model evaluation? Detail the evaluation metrics you used to assess the performance of your machine learning model(s) on the new dataset. Discuss the significance of these metrics in the context of your research question. The metrics used was the receiver operating characteristic area under the curve (roc_auc). I used this to assess the performance of the machine learning model. This metric is good at classification, and I want to classify whether the hiker died or lived when hiking the mountain. Roc_auc measures the models ability to differentiate between two classes and a higher roc_auc indicates a better distinction between the two classes. This is important as it shows us if the model is able to differentiate between dead or alive based on the peak.
  5. Conclusion:
    • What are the major findings? Summarize the key findings and insights obtained from your analysis of the new dataset. Relate these findings back to the research question and any similarities or differences compared to the CA assignment. In summary, the analysis suggests that there is a relationship between the words in peak names and the likelihood of death during mountaineering expeditions. While the model demonstrates some predictive capability, it is not a definitive predictor of hiker deaths, as other factors such as season, age, and more likely play a significant role. Further research and feature engineering may be needed to improve the accuracy of predictions and better understand the complex factors contributing to expedition outcomes. The roc_mean was 0.685. In the code along, the roc_auc mean was 0.847, which is a better prediction. I tried to use other variables to make a better model but was getting means under 0.685.