library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
museums <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
museums %>%
count(Accreditation)
## # A tibble: 2 × 2
## Accreditation n
## <chr> <int>
## 1 Accredited 1720
## 2 Unaccredited 2471
top_subjects <- museums %>% count(Subject_Matter) %>% slice_max(n, n = 6) %>% pull(Subject_Matter)
museums %>%
filter(Subject_Matter %in% top_subjects) %>%
count(Subject_Matter, Accreditation) %>%
ggplot(aes(Accreditation, n, fill = Accreditation)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(Subject_Matter), scales = "free_y") +
labs(x = NULL, y = "Number of museums")
museums %>%
count(Accreditation, Size)
## # A tibble: 10 × 3
## Accreditation Size n
## <chr> <chr> <int>
## 1 Accredited huge 11
## 2 Accredited large 402
## 3 Accredited medium 644
## 4 Accredited small 650
## 5 Accredited unknown 13
## 6 Unaccredited huge 1
## 7 Unaccredited large 142
## 8 Unaccredited medium 381
## 9 Unaccredited small 1751
## 10 Unaccredited unknown 196
top_gov <- museums %>% count(Governance) %>% slice_max(n, n = 4) %>% pull(Governance)
museums %>%
filter(Governance %in% top_gov) %>%
count(Governance, Accreditation) %>%
ggplot(aes(Accreditation, n, fill = Accreditation)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(Governance), scales = "free_y") +
labs(x = NULL, y = "Number of museums")
museum_parsed <-
museums %>%
select(museum_id, Accreditation, Governance, Size,
Subject_Matter, Year_opened, Year_closed, Area_Deprivation_index) %>%
mutate(Year_opened = parse_number(Year_opened),
IsClosed = if_else(Year_closed == "9999:9999", "Open", "Closed")) %>%
select(-Year_closed) %>%
na.omit() %>%
mutate(across(where(is.character), as.factor)) %>%
mutate(museum_id = as.character(museum_id))
glimpse(museum_parsed)
## Rows: 4,142
## Columns: 8
## $ museum_id <chr> "mm.New.1", "mm.aim.1230", "mm.domus.WM019", "m…
## $ Accreditation <fct> Unaccredited, Unaccredited, Unaccredited, Accre…
## $ Governance <fct> Independent-Not_for_profit, Independent-Unknown…
## $ Size <fct> large, small, medium, medium, small, small, sma…
## $ Subject_Matter <fct> Sea_and_seafaring-Boats_and_ships, Natural_worl…
## $ Year_opened <dbl> 2012, 1971, 1984, 2013, 1996, 1980, 1993, 1854,…
## $ Area_Deprivation_index <dbl> 2, 9, 8, 8, 2, 6, 6, 5, 6, 3, 7, 5, 8, 6, 9, 1,…
## $ IsClosed <fct> Open, Closed, Open, Open, Closed, Closed, Open,…
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
set.seed(123)
museum_split <- initial_split(museum_parsed, strata = Accreditation)
museum_train <- training(museum_split)
museum_test <- testing(museum_split)
set.seed(234)
museum_folds <- vfold_cv(museum_train, strata = Accreditation)
museum_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [2795/311]> Fold01
## 2 <split [2795/311]> Fold02
## 3 <split [2795/311]> Fold03
## 4 <split [2795/311]> Fold04
## 5 <split [2795/311]> Fold05
## 6 <split [2795/311]> Fold06
## 7 <split [2795/311]> Fold07
## 8 <split [2796/310]> Fold08
## 9 <split [2796/310]> Fold09
## 10 <split [2797/309]> Fold10
library(embed)
museum_rec <-
recipe(Accreditation ~ ., data = museum_train) %>%
update_role(museum_id, new_role = "id") %>%
step_lencode_glm(Subject_Matter, outcome = vars(Accreditation)) %>%
step_dummy(all_nominal_predictors())
museum_rec
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 6
## id: 1
##
## ── Operations
## • Linear embedding for factors via GLM for: Subject_Matter
## • Dummy variables from: all_nominal_predictors()
prep(museum_rec) %>%
tidy(number = 1)
## # A tibble: 114 × 4
## level value terms id
## <chr> <dbl> <chr> <chr>
## 1 Archaeology -16.6 Subject_Matter lencode_glm_kFEsv
## 2 Archaeology-Greek_and_Egyptian 1.39 Subject_Matter lencode_glm_kFEsv
## 3 Archaeology-Medieval -1.10 Subject_Matter lencode_glm_kFEsv
## 4 Archaeology-Mixed -0.981 Subject_Matter lencode_glm_kFEsv
## 5 Archaeology-Other 0.693 Subject_Matter lencode_glm_kFEsv
## 6 Archaeology-Prehistory -0.811 Subject_Matter lencode_glm_kFEsv
## 7 Archaeology-Roman 0.167 Subject_Matter lencode_glm_kFEsv
## 8 Arts-Ceramics -0.405 Subject_Matter lencode_glm_kFEsv
## 9 Arts-Costume_and_textiles 0.118 Subject_Matter lencode_glm_kFEsv
## 10 Arts-Crafts -1.39 Subject_Matter lencode_glm_kFEsv
## # ℹ 104 more rows
prep(museum_rec) %>%
tidy(number = 1) %>%
filter(level == "..new")
## # A tibble: 1 × 4
## level value terms id
## <chr> <dbl> <chr> <chr>
## 1 ..new -0.909 Subject_Matter lencode_glm_kFEsv
xgb_spec <-
boost_tree(
trees = tune(),
min_n = tune(),
mtry = tune(),
learn_rate = 0.01
) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_wf <- workflow(museum_rec, xgb_spec)
library(finetune)
doParallel::registerDoParallel()
set.seed(345)
xgb_rs <- tune_race_anova(
xgb_wf,
resamples = museum_folds,
grid = 15,
control = control_race(verbose_elim = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will maximize the roc_auc metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 11 eliminated; 4 candidates remain.
##
## ℹ Fold07: 2 eliminated; 2 candidates remain.
##
## ℹ Fold03: All but one parameter combination were eliminated.
xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 5
## splits id .order .metrics .notes
## <list> <chr> <int> <list> <list>
## 1 <split [2795/311]> Fold01 2 <tibble [30 × 7]> <tibble [0 × 3]>
## 2 <split [2795/311]> Fold02 3 <tibble [30 × 7]> <tibble [0 × 3]>
## 3 <split [2797/309]> Fold10 1 <tibble [30 × 7]> <tibble [0 × 3]>
## 4 <split [2795/311]> Fold07 4 <tibble [8 × 7]> <tibble [0 × 3]>
## 5 <split [2795/311]> Fold03 5 <tibble [4 × 7]> <tibble [0 × 3]>
## 6 <split [2795/311]> Fold04 8 <tibble [2 × 7]> <tibble [0 × 3]>
## 7 <split [2795/311]> Fold05 6 <tibble [2 × 7]> <tibble [0 × 3]>
## 8 <split [2795/311]> Fold06 9 <tibble [2 × 7]> <tibble [0 × 3]>
## 9 <split [2796/310]> Fold08 10 <tibble [2 × 7]> <tibble [0 × 3]>
## 10 <split [2796/310]> Fold09 7 <tibble [2 × 7]> <tibble [0 × 3]>
collect_metrics(xgb_rs)
## # A tibble: 2 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2 599 8 accuracy binary 0.798 10 0.00701 Preprocessor1_Model…
## 2 2 599 8 roc_auc binary 0.885 10 0.00549 Preprocessor1_Model…
plot_race(xgb_rs)
xgb_last <- xgb_wf %>%
finalize_workflow(select_best(xgb_rs, "accuracy")) %>%
last_fit(museum_split)
xgb_last
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [3106/1036]> train/test split <tibble> <tibble> <tibble> <workflow>
collect_metrics(xgb_last)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.812 Preprocessor1_Model1
## 2 roc_auc binary 0.890 Preprocessor1_Model1
collect_predictions(xgb_last) %>%
conf_mat(Accreditation, .pred_class)
## Truth
## Prediction Accredited Unaccredited
## Accredited 355 120
## Unaccredited 75 486
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
xgb_last %>%
extract_fit_engine() %>%
vip()
What is the research question? Clearly state the research question you aim to address using the new dataset. The research question is “Can the accreditation status of museums in the UK be predicted based on various attributes such as Governance, Size, Subject Matter, Year Opened, and Area Deprivation Index?”
Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The data set contains 4,191 observations of 35 variables. Variables of interest are accreditation, governance, size, subject_matter, year_opened, year_closed, and museum_id. Museum_id, size, year_closed, governance, year_opened, area deprivation index and accreditation are all character data. The variables describe various things about the museums.
What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the data set and their characteristics. Accreditation is character data that describes the accreditation status of the museums. Governance is character data that specifies the type of governance structure under which a museum operates. Size is character data which represents the size of the museum. Subject_matter is character data that explains the primary focus of the museum. Year_opened is character data that explains the year the museum was opened. Area deprivation index is numerical data.