Explore data

library(tidyverse)
museums <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')

museums %>%
  count(Accreditation)
## # A tibble: 2 × 2
##   Accreditation     n
##   <chr>         <int>
## 1 Accredited     1720
## 2 Unaccredited   2471
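
Only about 41% of these museums (1,720 of 4,191) are accredited, so the classes are moderately imbalanced. A quick sketch to compute that proportion directly:

museums %>%
  count(Accreditation) %>%
  mutate(prop = n / sum(n))
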
top_subjects <- 
  museums %>% 
  count(Subject_Matter) %>% 
  slice_max(n, n = 6) %>% 
  pull(Subject_Matter)

museums %>%
  filter(Subject_Matter %in% top_subjects) %>%
  count(Subject_Matter, Accreditation) %>%
  ggplot(aes(Accreditation, n, fill = Accreditation)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(Subject_Matter), scales = "free_y") +
  labs(x = NULL, y = "Number of museums")

museums %>%
  count(Accreditation, Size)
## # A tibble: 10 × 3
##    Accreditation Size        n
##    <chr>         <chr>   <int>
##  1 Accredited    huge       11
##  2 Accredited    large     402
##  3 Accredited    medium    644
##  4 Accredited    small     650
##  5 Accredited    unknown    13
##  6 Unaccredited  huge        1
##  7 Unaccredited  large     142
##  8 Unaccredited  medium    381
##  9 Unaccredited  small    1751
## 10 Unaccredited  unknown   196
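
The raw counts suggest that larger museums are far more likely to be accredited, while small museums are mostly unaccredited. A quick sketch of within-size proportions makes that easier to read:

museums %>%
  count(Size, Accreditation) %>%
  group_by(Size) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup()
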
top_gov <- 
  museums %>% 
  count(Governance) %>% 
  slice_max(n, n = 4) %>% 
  pull(Governance)

museums %>%
  filter(Governance %in% top_gov) %>%
  count(Governance, Accreditation) %>%
  ggplot(aes(Accreditation, n, fill = Accreditation)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(Governance), scales = "free_y") +
  labs(x = NULL, y = "Number of museums")

museum_parsed <-
  museums %>%
  select(museum_id, Accreditation, Governance, Size,
         Subject_Matter, Year_opened, Year_closed, Area_Deprivation_index) %>%
  mutate(Year_opened = parse_number(Year_opened),
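         # "9999:9999" in Year_closed marks museums that have not closed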
         IsClosed = if_else(Year_closed == "9999:9999", "Open", "Closed")) %>%
  select(-Year_closed) %>%
  na.omit() %>%
  mutate(across(where(is.character), as.factor)) %>%
  mutate(museum_id = as.character(museum_id))

glimpse(museum_parsed)
## Rows: 4,142
## Columns: 8
## $ museum_id              <chr> "mm.New.1", "mm.aim.1230", "mm.domus.WM019", "m…
## $ Accreditation          <fct> Unaccredited, Unaccredited, Unaccredited, Accre…
## $ Governance             <fct> Independent-Not_for_profit, Independent-Unknown…
## $ Size                   <fct> large, small, medium, medium, small, small, sma…
## $ Subject_Matter         <fct> Sea_and_seafaring-Boats_and_ships, Natural_worl…
## $ Year_opened            <dbl> 2012, 1971, 1984, 2013, 1996, 1980, 1993, 1854,…
## $ Area_Deprivation_index <dbl> 2, 9, 8, 8, 2, 6, 6, 5, 6, 3, 7, 5, 8, 6, 9, 1,…
## $ IsClosed               <fct> Open, Closed, Open, Open, Closed, Closed, Open,…
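
The na.omit() call silently drops museums with missing values in any of the selected columns, taking us from the 4,191 museums in the raw data to the 4,142 rows shown above. A quick check of how many rows that removes:

nrow(museums) - nrow(museum_parsed)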

Feature engineering

library(tidymodels)

set.seed(123)
museum_split <- initial_split(museum_parsed, strata = Accreditation)

museum_train <- training(museum_split)
museum_test <- testing(museum_split)

set.seed(234)
museum_folds <- vfold_cv(museum_train, strata = Accreditation)

museum_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [2795/311]> Fold01
##  2 <split [2795/311]> Fold02
##  3 <split [2795/311]> Fold03
##  4 <split [2795/311]> Fold04
##  5 <split [2795/311]> Fold05
##  6 <split [2795/311]> Fold06
##  7 <split [2795/311]> Fold07
##  8 <split [2796/310]> Fold08
##  9 <split [2796/310]> Fold09
## 10 <split [2797/309]> Fold10
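
Stratifying by Accreditation keeps the accredited/unaccredited balance roughly the same in the training set, the test set, and every fold. A quick sketch to confirm this on the split:

museum_train %>% count(Accreditation)
museum_test %>% count(Accreditation)
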
library(embed)

museum_rec <- 
  recipe(Accreditation ~ ., data = museum_train) %>%
  update_role(museum_id, new_role = "id") %>%
  step_lencode_glm(Subject_Matter, outcome = vars(Accreditation)) %>%
  step_dummy(all_nominal_predictors())

museum_rec
prep(museum_rec) %>%
  tidy(number = 1) %>%
  filter(level == "..new")
## # A tibble: 1 × 4
##   level  value terms          id               
##   <chr>  <dbl> <chr>          <chr>            
## 1 ..new -0.909 Subject_Matter lencode_glm_kFEsv
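
The ..new row is the encoding that would be applied to any subject matter not seen in the training data. The same tidy output holds the per-level estimates (on the log-odds scale for this binary outcome), so a quick sketch to see which subject areas get the most extreme encodings:

prep(museum_rec) %>%
  tidy(number = 1) %>%
  slice_max(abs(value), n = 10)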

Build a model

xgb_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    mtry = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgb_wf <- workflow(museum_rec, xgb_spec)
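
Three hyperparameters are marked with tune(), while the learning rate stays fixed at a small value. A quick sketch to inspect the tuning parameters; mtry depends on the number of predictors, so it is left unknown here and tune finalizes it automatically when we hand over an integer grid:

extract_parameter_set_dials(xgb_wf)
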
library(finetune)
doParallel::registerDoParallel()

set.seed(345)
xgb_rs <- tune_race_anova(
  xgb_wf,
  resamples = museum_folds,
  grid = 15,
  control = control_race(verbose_elim = TRUE)
)

xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits             id     .order .metrics          .notes          
##    <list>             <chr>   <int> <list>            <list>          
##  1 <split [2795/311]> Fold01      2 <tibble [30 × 7]> <tibble [0 × 3]>
##  2 <split [2795/311]> Fold02      3 <tibble [30 × 7]> <tibble [0 × 3]>
##  3 <split [2797/309]> Fold10      1 <tibble [30 × 7]> <tibble [0 × 3]>
##  4 <split [2795/311]> Fold07      4 <tibble [10 × 7]> <tibble [0 × 3]>
##  5 <split [2795/311]> Fold03      5 <tibble [6 × 7]>  <tibble [0 × 3]>
##  6 <split [2795/311]> Fold05      6 <tibble [6 × 7]>  <tibble [0 × 3]>
##  7 <split [2796/310]> Fold09      7 <tibble [6 × 7]>  <tibble [0 × 3]>
##  8 <split [2795/311]> Fold04      8 <tibble [6 × 7]>  <tibble [0 × 3]>
##  9 <split [2795/311]> Fold06      9 <tibble [6 × 7]>  <tibble [0 × 3]>
## 10 <split [2796/310]> Fold08     10 <tibble [4 × 7]>  <tibble [0 × 3]>
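
The racing approach compares candidates with a repeated-measures ANOVA after an initial set of resamples and drops parameter combinations that are clearly worse, which is why the later folds above only carry metrics for the few surviving candidates. A quick sketch to rank whatever survived:

show_best(xgb_rs, metric = "roc_auc")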

Evaluate and finalize model

plot_race(xgb_rs)

collect_metrics(xgb_rs)
## # A tibble: 4 × 9
##    mtry trees min_n .metric  .estimator  mean     n std_err .config             
##   <int> <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1     2   599     8 accuracy binary     0.796    10 0.00729 Preprocessor1_Model…
## 2     2   599     8 roc_auc  binary     0.885    10 0.00528 Preprocessor1_Model…
## 3     4   136     3 accuracy binary     0.797    10 0.00786 Preprocessor1_Model…
## 4     4   136     3 roc_auc  binary     0.883    10 0.00593 Preprocessor1_Model…

xgb_last <-
  xgb_wf %>% 
  finalize_workflow(select_best(xgb_rs, metric = "accuracy")) %>%
  last_fit(museum_split)
collect_metrics(xgb_last)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.817 Preprocessor1_Model1
## 2 roc_auc  binary         0.891 Preprocessor1_Model1

collect_predictions(xgb_last) %>%
  conf_mat(Accreditation, .pred_class)
##               Truth
## Prediction     Accredited Unaccredited
##   Accredited          350          110
##   Unaccredited         80          496
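
We could also look at how the model trades off sensitivity and specificity on the test set with an ROC curve; a quick sketch, relying on the default event level being the first factor level (Accredited):

collect_predictions(xgb_last) %>%
  roc_curve(Accreditation, .pred_Accredited) %>%
  autoplot()
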
library(vip)

xgb_last %>%
  extract_fit_engine() %>%
  vip()
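
To use this model on new museums, we could pull the fitted workflow out of the last_fit() result and predict with it. A minimal sketch (final_fitted is just a name chosen here):

final_fitted <- extract_workflow(xgb_last)
predict(final_fitted, new_data = museum_test[1:5, ])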