Explore Data
library(tidyverse)
museums <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
museums %>%
count(Accreditation)
## # A tibble: 2 × 2
## Accreditation n
## <chr> <int>
## 1 Accredited 1720
## 2 Unaccredited 2471
top_subjects <-
museums %>%
count(Subject_Matter) %>%
slice_max(n, n = 6) %>%
pull(Subject_Matter)
museums %>%
filter(Subject_Matter %in% top_subjects) %>%
count(Subject_Matter, Accreditation) %>%
ggplot(aes(Accreditation, n, fill = Accreditation)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(Subject_Matter), scales = "free_y") +
labs(x = NULL, y = "Number of museums")

museums %>%
count(Accreditation, Size)
## # A tibble: 10 × 3
## Accreditation Size n
## <chr> <chr> <int>
## 1 Accredited huge 11
## 2 Accredited large 402
## 3 Accredited medium 644
## 4 Accredited small 650
## 5 Accredited unknown 13
## 6 Unaccredited huge 1
## 7 Unaccredited large 142
## 8 Unaccredited medium 381
## 9 Unaccredited small 1751
## 10 Unaccredited unknown 196
top_gov <-
museums %>%
count(Governance) %>%
slice_max(n, n = 4) %>%
pull(Governance)
museums %>%
filter(Governance %in% top_gov) %>%
count(Governance, Accreditation) %>%
ggplot(aes(Accreditation, n, fill = Accreditation)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(Governance), scales = "free_y") +
labs(x = NULL, y = "Number of museums")

museum_parsed <-
museums %>%
select(museum_id, Accreditation, Governance, Size,
Subject_Matter, Year_opened, Year_closed, Area_Deprivation_index) %>%
mutate(Year_opened = parse_number(Year_opened),
IsClosed = if_else(Year_closed == "9999:9999", "Open", "Closed")) %>%
select(-Year_closed) %>%
na.omit() %>%
mutate(across(where(is.character), as.factor)) %>%
mutate(museum_id = as.character(museum_id))
glimpse(museum_parsed)
## Rows: 4,142
## Columns: 8
## $ museum_id <chr> "mm.New.1", "mm.aim.1230", "mm.domus.WM019", "m…
## $ Accreditation <fct> Unaccredited, Unaccredited, Unaccredited, Accre…
## $ Governance <fct> Independent-Not_for_profit, Independent-Unknown…
## $ Size <fct> large, small, medium, medium, small, small, sma…
## $ Subject_Matter <fct> Sea_and_seafaring-Boats_and_ships, Natural_worl…
## $ Year_opened <dbl> 2012, 1971, 1984, 2013, 1996, 1980, 1993, 1854,…
## $ Area_Deprivation_index <dbl> 2, 9, 8, 8, 2, 6, 6, 5, 6, 3, 7, 5, 8, 6, 9, 1,…
## $ IsClosed <fct> Open, Closed, Open, Open, Closed, Closed, Open,…
Feature engineering
library(tidymodels)
set.seed(123)
museum_split <- initial_split(museum_parsed, strata = Accreditation)
museum_train <- training(museum_split)
museum_test <- testing(museum_split)
set.seed(234)
museum_folds <- vfold_cv(museum_train, strata = Accreditation)
museum_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [2795/311]> Fold01
## 2 <split [2795/311]> Fold02
## 3 <split [2795/311]> Fold03
## 4 <split [2795/311]> Fold04
## 5 <split [2795/311]> Fold05
## 6 <split [2795/311]> Fold06
## 7 <split [2795/311]> Fold07
## 8 <split [2796/310]> Fold08
## 9 <split [2796/310]> Fold09
## 10 <split [2797/309]> Fold10
library(embed)
museum_rec <-
recipe(Accreditation ~ ., data = museum_train) %>%
update_role(museum_id, new_role = "id") %>%
step_lencode_glm(Subject_Matter, outcome = vars(Accreditation)) %>%
step_dummy(all_nominal_predictors())
museum_rec
prep(museum_rec) %>%
tidy(number = 1) %>%
filter(level == "..new")
## # A tibble: 1 × 4
## level value terms id
## <chr> <dbl> <chr> <chr>
## 1 ..new -0.909 Subject_Matter lencode_glm_kFEsv
Build a model
xgb_spec <-
boost_tree(
trees = tune(),
min_n = tune(),
mtry = tune(),
learn_rate = 0.01
) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_wf <- workflow(museum_rec, xgb_spec)
library(finetune)
doParallel::registerDoParallel()
set.seed(345)
xgb_rs <- tune_race_anova(
xgb_wf,
resamples = museum_folds,
grid = 15,
control = control_race(verbose_elim = TRUE)
)
xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 5
## splits id .order .metrics .notes
## <list> <chr> <int> <list> <list>
## 1 <split [2795/311]> Fold01 2 <tibble [30 × 7]> <tibble [0 × 3]>
## 2 <split [2795/311]> Fold02 3 <tibble [30 × 7]> <tibble [0 × 3]>
## 3 <split [2797/309]> Fold10 1 <tibble [30 × 7]> <tibble [0 × 3]>
## 4 <split [2795/311]> Fold07 4 <tibble [10 × 7]> <tibble [0 × 3]>
## 5 <split [2795/311]> Fold03 5 <tibble [6 × 7]> <tibble [0 × 3]>
## 6 <split [2795/311]> Fold05 6 <tibble [6 × 7]> <tibble [0 × 3]>
## 7 <split [2796/310]> Fold09 7 <tibble [6 × 7]> <tibble [0 × 3]>
## 8 <split [2795/311]> Fold04 8 <tibble [6 × 7]> <tibble [0 × 3]>
## 9 <split [2795/311]> Fold06 9 <tibble [6 × 7]> <tibble [0 × 3]>
## 10 <split [2796/310]> Fold08 10 <tibble [4 × 7]> <tibble [0 × 3]>
Evaluate and finalize model
plot_race(xgb_rs)

collect_metrics(xgb_rs)
## # A tibble: 4 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2 599 8 accuracy binary 0.796 10 0.00729 Preprocessor1_Model…
## 2 2 599 8 roc_auc binary 0.885 10 0.00528 Preprocessor1_Model…
## 3 4 136 3 accuracy binary 0.797 10 0.00786 Preprocessor1_Model…
## 4 4 136 3 roc_auc binary 0.883 10 0.00593 Preprocessor1_Model…
xgb_last <-
xgb_wf %>%
finalize_workflow(select_best(xgb_rs, "accuracy")) %>%
last_fit(museum_split)
collect_metrics(xgb_last)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.817 Preprocessor1_Model1
## 2 roc_auc binary 0.891 Preprocessor1_Model1
collect_predictions(xgb_last) %>%
conf_mat(Accreditation, .pred_class)
## Truth
## Prediction Accredited Unaccredited
## Accredited 350 110
## Unaccredited 80 496
library(vip)
xgb_last %>%
extract_fit_engine() %>%
vip()
