library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
members %>%
count(died)
## # A tibble: 2 × 2
## died n
## <lgl> <int>
## 1 FALSE 75413
## 2 TRUE 1106
members <-
members %>%
mutate(died = case_when(
died == "FALSE" ~ "alive",
died == "TRUE" ~ " dead"))
library(tidytext)
library(tidylo)
members %>%
unnest_tokens(word, peak_name) %>%
count(died, word) %>%
bind_log_odds(died, word, n) %>%
arrange(-log_odds_weighted)
## # A tibble: 469 × 4
## died word n log_odds_weighted
## <chr> <chr> <int> <dbl>
## 1 " dead" everest 306 7.87
## 2 "alive" dome 327 7.04
## 3 "alive" glacier 220 5.77
## 4 "alive" bhrikuti 218 5.75
## 5 "alive" gyajikang 174 5.13
## 6 "alive" central 166 5.01
## 7 "alive" khang 137 4.55
## 8 "alive" changtse 136 4.54
## 9 "alive" kabru 135 4.52
## 10 "alive" nemjung 133 4.49
## # ℹ 459 more rows
Build a model
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
members_split <-
members %>%
select(peak_name, died) %>%
initial_split(strata = died)
members_train <- training(members_split)
members_test <- testing(members_split)
set.seed(234)
members_folds <- vfold_cv(members_train, v = 5, strata = died)
members_folds
## # 5-fold cross-validation using stratification
## # A tibble: 5 × 2
## splits id
## <list> <chr>
## 1 <split [45911/11478]> Fold1
## 2 <split [45911/11478]> Fold2
## 3 <split [45911/11478]> Fold3
## 4 <split [45911/11478]> Fold4
## 5 <split [45912/11477]> Fold5
library(textrecipes)
members_rec <-
recipe(died ~ peak_name, data = members_train) %>%
step_tokenize(peak_name) %>%
step_tokenfilter(peak_name) %>%
step_tfidf(peak_name)
members_rec
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 1
##
## ── Operations
## • Tokenization for: peak_name
## • Text filtering for: peak_name
## • Term frequency-inverse document frequency with: peak_name
glmnet_spec <-
logistic_reg(mixture = 1, penalty = tune()) %>%
set_engine("glmnet")
members_wf <- workflow(members_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)
members_res <-
tune_grid(
members_wf,
members_folds,
grid = 10)
autoplot(members_res, color = "group")

show_best(members_res)
## Warning: No value of `metric` was given; metric 'roc_auc' will be used.
## # A tibble: 5 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1.27e- 5 roc_auc binary 0.685 5 0.00587 Preprocessor1_Model06
## 2 4.09e- 6 roc_auc binary 0.684 5 0.00603 Preprocessor1_Model05
## 3 6.57e-10 roc_auc binary 0.684 5 0.00603 Preprocessor1_Model01
## 4 5.55e- 9 roc_auc binary 0.684 5 0.00603 Preprocessor1_Model02
## 5 1.05e- 8 roc_auc binary 0.684 5 0.00603 Preprocessor1_Model03
select_by_pct_loss(members_res, desc(penalty), metric = "roc_auc")
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .loss
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.0000127 roc_auc binary 0.685 5 0.00587 Preprocessor1_Mo… 0.685 0
members_final <-
members_wf %>%
finalize_workflow(
select_by_pct_loss(members_res, desc(penalty), metric = "roc_auc")
) %>%
last_fit(members_split)
members_final
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [57389/19130]> train/test sp… <tibble> <tibble> <tibble> <workflow>
collect_metrics(members_final)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.986 Preprocessor1_Model1
## 2 roc_auc binary 0.693 Preprocessor1_Model1
collect_predictions(members_final) %>%
conf_mat(died, .pred_class)
## Truth
## Prediction dead alive
## dead 0 0
## alive 271 18859
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
members_final %>%
extract_fit_engine() %>%
vi()
## # A tibble: 100 × 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 tfidf_peak_name_central 2.31 POS
## 2 tfidf_peak_name_vi 1.52 POS
## 3 tfidf_peak_name_guru 1.36 NEG
## 4 tfidf_peak_name_kabru 1.29 POS
## 5 tfidf_peak_name_noir 1.27 POS
## 6 tfidf_peak_name_lamjung 1.20 POS
## 7 tfidf_peak_name_29 1.18 NEG
## 8 tfidf_peak_name_mukut 1.17 POS
## 9 tfidf_peak_name_lobuje 1.14 POS
## 10 tfidf_peak_name_langshisa 1.10 NEG
## # ℹ 90 more rows
- Question and Data:
- What is the research question? Clearly state the research question
you aim to address using the new dataset. The research question is “Can
we predict deaths of hikers using peak_name?”
- Describe the data briefly: Provide an overview of the new dataset,
highlighting its key characteristics and dimensions. The data set has
76,519 observations of 21 variables. The key variables are peak_name,
which is the name of the mountain being climbed and died which is
whether or not the hiker passed away. These two variables are used in
the analysis, peak_name is what is being used to predict deaths.
- What are the characteristics of the key variables used in the
analysis? Describe the primary variables of interest in the dataset and
their characteristics. The primary variables of interest are peak_name
and died. Peak_name contains data with all of the names of the mountains
that were climbed. Peak_name is character data. Died contains data about
whether the hiker died during the hike or not. Died is character
data.
- Data Exploration and Transformation:
- Describe the differences between the original data and the data
transformed for modeling. Why? Explain any preprocessing or
transformations performed on the new dataset compared to the original
data. Discuss why these changes were necessary or beneficial. The
original data set has 76,519 observations of 21 variables. I
preprocessed the data by using “members %>% unnest_tokens(word,
peak_name) %>% count(died, word) %>% bind_log_odds(died, word, n)
%>% arrange(-log_odds_weighted)” This unnested and tokenized the
peak_names so that they could be used to predict the died variable. I
also split the data into training and testing data.
- Data Preparation and Modeling:
- What are the names of data preparation steps mentioned in the video?
List and describe any data preparation steps or techniques mentioned in
the CA video that you applied to the new dataset. I used unnest_tokens
to unnest the peak names. I also used members_split <- members %>%
select(peak_name, died) %>% initial_split(strata = died) to split the
data into training data and testing data to traing and test the machine
learning model.
- What is the name of the machine learning model(s) used in the
analysis? Specify the machine learning model(s) you employed for your
analysis and briefly explain their relevance to the research question.
The machine learning models used were the logistic regression model,
specifically glmnet. Logistic regression is well-suited for
classification problems when the goal is to assign observations to one
of two classes (alive or dead). Logistic regression is also easy to
interpret the impact of the predictor variables on the probability of
belonging to one of two classes. Logistic regression is also good with
handling text data, which I used with peak name as well as dead or
alive.
- Model Evaluation:
- What metrics are used in the model evaluation? Detail the evaluation
metrics you used to assess the performance of your machine learning
model(s) on the new dataset. Discuss the significance of these metrics
in the context of your research question. The metrics used was the
receiver operating characteristic area under the curve (roc_auc). I used
this to assess the performance of the machine learning model. This
metric is good at classification, and I want to classify whether the
hiker died or lived when hiking the mountain. Roc_auc measures the
models ability to differentiate between two classes and a higher roc_auc
indicates a better distinction between the two classes. This is
important as it shows us if the model is able to differentiate between
dead or alive based on the peak.
- Conclusion:
- What are the major findings? Summarize the key findings and insights
obtained from your analysis of the new dataset. Relate these findings
back to the research question and any similarities or differences
compared to the CA assignment. In summary, the analysis suggests that
there is a relationship between the words in peak names and the
likelihood of death during mountaineering expeditions. While the model
demonstrates some predictive capability, it is not a definitive
predictor of hiker deaths, as other factors such as season, age, and
more likely play a significant role. Further research and feature
engineering may be needed to improve the accuracy of predictions and
better understand the complex factors contributing to expedition
outcomes. The roc_mean was 0.685. In the code along, the roc_auc mean
was 0.847, which is a better prediction. I tried to use other variables
to make a better model but was getting means under 0.685.