Data Exploration

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
museums <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
museums %>%
  count(Accreditation)
## # A tibble: 2 × 2
##   Accreditation     n
##   <chr>         <int>
## 1 Accredited     1720
## 2 Unaccredited   2471
top_gov <-
  museums %>%
  count(Governance, sort = TRUE) %>%
  slice_max(n, n = 4) %>%
  pull(Governance)

museums %>%
  filter(Governance %in% top_gov) %>%
  count(Governance, Accreditation) %>%
  ggplot(aes(Accreditation, n, fill = Accreditation)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(Governance), scales = "free_y")

museums %>%
  count(Accreditation, Size)
## # A tibble: 10 × 3
##    Accreditation Size        n
##    <chr>         <chr>   <int>
##  1 Accredited    huge       11
##  2 Accredited    large     402
##  3 Accredited    medium    644
##  4 Accredited    small     650
##  5 Accredited    unknown    13
##  6 Unaccredited  huge        1
##  7 Unaccredited  large     142
##  8 Unaccredited  medium    381
##  9 Unaccredited  small    1751
## 10 Unaccredited  unknown   196
museums_parsed <-
  museums %>%
  select(museum_id, 
         Accreditation, 
         Governance, 
         Size, 
         Subject_Matter, 
         Year_opened, 
         Year_closed, 
         Area_Deprivation_index) %>%
  mutate(Year_opened = parse_number(Year_opened),
            Closed = if_else(Year_closed == "9999:9999", "Open", "Closed")) %>%
  select(-Year_closed) %>%
  na.omit() %>%
  mutate_if(is.character, as.factor) %>%
  mutate(museum_id = as.character(museum_id))

Feature Engineering

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
set.seed(123)
museum_split <- initial_split(museums_parsed, strata = Accreditation)
museum_train <- training(museum_split)
museum_test <- testing(museum_split)

set.seed(234)
museum_folds <- vfold_cv(museum_train, strata = Accreditation)
museum_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [2795/311]> Fold01
##  2 <split [2795/311]> Fold02
##  3 <split [2795/311]> Fold03
##  4 <split [2795/311]> Fold04
##  5 <split [2795/311]> Fold05
##  6 <split [2795/311]> Fold06
##  7 <split [2795/311]> Fold07
##  8 <split [2796/310]> Fold08
##  9 <split [2796/310]> Fold09
## 10 <split [2797/309]> Fold10
library(embed)

museum_rec <-
  recipe(Accreditation ~ ., data = museum_train) %>%
  update_role(museum_id, new_role = "id") %>%
  ## How does subject matter affect Accreditation (affect encoding)
  step_lencode_glm(Subject_Matter, outcome = vars(Accreditation)) %>%
  step_dummy(all_nominal_predictors())

museum_rec
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 6
## id:        1
## 
## ── Operations
## • Linear embedding for factors via GLM for: Subject_Matter
## • Dummy variables from: all_nominal_predictors()
prep(museum_rec) %>%
  tidy(number = 1) %>%
  filter(level == "..new")
## # A tibble: 1 × 4
##   level  value terms          id               
##   <chr>  <dbl> <chr>          <chr>            
## 1 ..new -0.909 Subject_Matter lencode_glm_kFEsv

Build a model

xgb_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    mtry = tune(),
    learn_rate = tune()
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgb_wf <- workflow(museum_rec, xgb_spec)
library(finetune)
doParallel::registerDoParallel()

set.seed(345)
xgb_rs <- 
  tune_race_anova(
  xgb_wf,
  resamples = museum_folds,
  grid = 15,
  control = control_race(verbose_elim = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will maximize the roc_auc metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 11 eliminated; 4 candidates remain.
## 
## ℹ Fold07: All but one parameter combination were eliminated.
xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits             id     .order .metrics          .notes          
##    <list>             <chr>   <int> <list>            <list>          
##  1 <split [2795/311]> Fold01      2 <tibble [30 × 8]> <tibble [0 × 3]>
##  2 <split [2795/311]> Fold02      3 <tibble [30 × 8]> <tibble [0 × 3]>
##  3 <split [2797/309]> Fold10      1 <tibble [30 × 8]> <tibble [0 × 3]>
##  4 <split [2795/311]> Fold07      4 <tibble [8 × 8]>  <tibble [0 × 3]>
##  5 <split [2795/311]> Fold03      5 <tibble [2 × 8]>  <tibble [0 × 3]>
##  6 <split [2795/311]> Fold04      8 <tibble [2 × 8]>  <tibble [0 × 3]>
##  7 <split [2795/311]> Fold05      6 <tibble [2 × 8]>  <tibble [0 × 3]>
##  8 <split [2795/311]> Fold06      9 <tibble [2 × 8]>  <tibble [0 × 3]>
##  9 <split [2796/310]> Fold08     10 <tibble [2 × 8]>  <tibble [0 × 3]>
## 10 <split [2796/310]> Fold09      7 <tibble [2 × 8]>  <tibble [0 × 3]>

Evaluate and finalize the model

plot_race(xgb_rs)

collect_metrics(xgb_rs)
## # A tibble: 2 × 10
##    mtry trees min_n learn_rate .metric  .estimator  mean     n std_err .config  
##   <int> <int> <int>      <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>    
## 1    12   935     8    0.00826 accuracy binary     0.802    10 0.00615 Preproce…
## 2    12   935     8    0.00826 roc_auc  binary     0.884    10 0.00520 Preproce…
xgb_last <-
  xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, "accuracy")) %>%
  last_fit(museum_split)
collect_metrics(xgb_last)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.816 Preprocessor1_Model1
## 2 roc_auc  binary         0.889 Preprocessor1_Model1
collect_predictions(xgb_last) %>%
  conf_mat(Accreditation, .pred_class)
##               Truth
## Prediction     Accredited Unaccredited
##   Accredited          349          110
##   Unaccredited         81          496
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
xgb_last %>%
  extract_fit_engine() %>%
  vip()

  1. The research question we are trying to answer is whether we can predict if a museum in the UK is accredited or not using other data from the dataset.

The data is made up of 4191 observations and 35 variables.

The key variables that carry the most importanc when predicting the accreditation of a museum was if it is goverend by private, if it had closed down or not, what subject matter the museum was about and the size of the museum also carried a good amount of weight when making the predictions.

  1. The original museums dataset provides comprehensive details on UK museums with 4191 rows and 35 columns. For modeling, specific columns like Accreditation, Governance, and a few others were chosen to improve efficiency and relevance. The Year_opened was parsed to numeric, and a new Closed column was introduced to easily identify if a museum is operational. Essential preprocessing involved converting the Subject_Matter column using a generalized linear model to transform its categorical values to numeric, making it more model-friendly. Furthermore, dummy variables were created for all nominal predictors, converting them to a numeric format, which is often necessary for machine learning models to function optimally. These transformations not only make the dataset compatible for modeling but can also enhance model performance and interpretability.

  2. Feature Selection: Choosing specific columns from the dataset. Parsing: Converting the Year_opened column to numeric values. Feature Creation: Introducing a new Closed column based on the Year_closed values. Data Cleaning: Removing rows with NA values. Linear Embedding: Encoding the Subject_Matter using a generalized linear model. Dummy Variable Creation: Transforming all nominal predictors into dummy variables.

The machine learning model used in the analysis is the XGBoost model.

  1. The analysis uses two metrics for model evaluation:

Accuracy: Represents the proportion of correctly predicted classifications. It gives a direct indication of the model’s overall performance. In the museum accreditation context, high accuracy shows the model’s proficiency at classifying museums as accredited or unaccredited.

ROC_AUC: Measures the model’s ability to distinguish between positive and negative classes across different thresholds. A high ROC_AUC implies the model can effectively differentiate between accredited and unaccredited museums, even as the threshold changes.

  1. Governance and Accreditation: When considering the top governance types, there are varying distributions of accredited versus unaccredited museums within each governance type.

Size and Accreditation: Among huge museums, there are 11 accredited and only 1 unaccredited. Small museums have the highest number of unaccredited institutions at 1,751, compared to 650 that are accredited.

Model Performance: The XGBoost model, after tuning, achieves an accuracy of approximately 81.6% and an ROC_AUC of approximately 88.9% on the test data. This indicates a strong performance in predicting museum accreditation based on the given features.

Feature Importance: The governance of the museum appears to be an influential predictor in determining accreditation, as inferred from the variable importance plot.