library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
museums <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
museums %>%
  count(Accreditation)
## # A tibble: 2 × 2
##   Accreditation     n
##   <chr>         <int>
## 1 Accredited     1720
## 2 Unaccredited   2471
top_subjects <- museums %>% count(Subject_Matter) %>% slice_max(n, n = 6) %>% pull(Subject_Matter)

museums %>%
  filter(Subject_Matter %in% top_subjects) %>%
  count(Subject_Matter, Accreditation) %>%
  ggplot(aes(Accreditation, n, fill = Accreditation)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(Subject_Matter), scales = "free_y") +
  labs(x = NULL, y = "Number of museums")

museums %>%
  count(Accreditation, Size)
## # A tibble: 10 × 3
##    Accreditation Size        n
##    <chr>         <chr>   <int>
##  1 Accredited    huge       11
##  2 Accredited    large     402
##  3 Accredited    medium    644
##  4 Accredited    small     650
##  5 Accredited    unknown    13
##  6 Unaccredited  huge        1
##  7 Unaccredited  large     142
##  8 Unaccredited  medium    381
##  9 Unaccredited  small    1751
## 10 Unaccredited  unknown   196
top_gov <- museums %>% count(Governance) %>% slice_max(n, n = 4) %>% pull(Governance)

museums %>%
  filter(Governance %in% top_gov) %>%
  count(Governance, Accreditation) %>%
  ggplot(aes(Accreditation, n, fill = Accreditation)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(Governance), scales = "free_y") +
  labs(x = NULL, y = "Number of museums")

museum_parsed <-
  museums %>%
  select(museum_id, Accreditation, Governance, Size,
         Subject_Matter, Year_opened, Year_closed, Area_Deprivation_index) %>%
  mutate(Year_opened = parse_number(Year_opened),
         IsClosed = if_else(Year_closed == "9999:9999", "Open", "Closed")) %>%
  select(-Year_closed) %>%
  na.omit() %>%
  mutate(across(where(is.character), as.factor)) %>%
  mutate(museum_id = as.character(museum_id))

glimpse(museum_parsed)
## Rows: 4,142
## Columns: 8
## $ museum_id              <chr> "mm.New.1", "mm.aim.1230", "mm.domus.WM019", "m…
## $ Accreditation          <fct> Unaccredited, Unaccredited, Unaccredited, Accre…
## $ Governance             <fct> Independent-Not_for_profit, Independent-Unknown…
## $ Size                   <fct> large, small, medium, medium, small, small, sma…
## $ Subject_Matter         <fct> Sea_and_seafaring-Boats_and_ships, Natural_worl…
## $ Year_opened            <dbl> 2012, 1971, 1984, 2013, 1996, 1980, 1993, 1854,…
## $ Area_Deprivation_index <dbl> 2, 9, 8, 8, 2, 6, 6, 5, 6, 3, 7, 5, 8, 6, 9, 1,…
## $ IsClosed               <fct> Open, Closed, Open, Open, Closed, Closed, Open,…

Feature engineering

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
set.seed(123)
museum_split <- initial_split(museum_parsed, strata = Accreditation)

museum_train <- training(museum_split)
museum_test <- testing(museum_split)

set.seed(234)
museum_folds <- vfold_cv(museum_train, strata = Accreditation)
museum_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [2795/311]> Fold01
##  2 <split [2795/311]> Fold02
##  3 <split [2795/311]> Fold03
##  4 <split [2795/311]> Fold04
##  5 <split [2795/311]> Fold05
##  6 <split [2795/311]> Fold06
##  7 <split [2795/311]> Fold07
##  8 <split [2796/310]> Fold08
##  9 <split [2796/310]> Fold09
## 10 <split [2797/309]> Fold10
library(embed)

museum_rec <- 
  recipe(Accreditation ~ ., data = museum_train) %>%
  update_role(museum_id, new_role = "id") %>%
  step_lencode_glm(Subject_Matter, outcome = vars(Accreditation)) %>%
  step_dummy(all_nominal_predictors())

museum_rec
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 6
## id:        1
## 
## ── Operations
## • Linear embedding for factors via GLM for: Subject_Matter
## • Dummy variables from: all_nominal_predictors()
prep(museum_rec) %>%
  tidy(number = 1)
## # A tibble: 114 × 4
##    level                            value terms          id               
##    <chr>                            <dbl> <chr>          <chr>            
##  1 Archaeology                    -16.6   Subject_Matter lencode_glm_kFEsv
##  2 Archaeology-Greek_and_Egyptian   1.39  Subject_Matter lencode_glm_kFEsv
##  3 Archaeology-Medieval            -1.10  Subject_Matter lencode_glm_kFEsv
##  4 Archaeology-Mixed               -0.981 Subject_Matter lencode_glm_kFEsv
##  5 Archaeology-Other                0.693 Subject_Matter lencode_glm_kFEsv
##  6 Archaeology-Prehistory          -0.811 Subject_Matter lencode_glm_kFEsv
##  7 Archaeology-Roman                0.167 Subject_Matter lencode_glm_kFEsv
##  8 Arts-Ceramics                   -0.405 Subject_Matter lencode_glm_kFEsv
##  9 Arts-Costume_and_textiles        0.118 Subject_Matter lencode_glm_kFEsv
## 10 Arts-Crafts                     -1.39  Subject_Matter lencode_glm_kFEsv
## # ℹ 104 more rows
prep(museum_rec) %>%
  tidy(number = 1) %>%
  filter(level == "..new")
## # A tibble: 1 × 4
##   level  value terms          id               
##   <chr>  <dbl> <chr>          <chr>            
## 1 ..new -0.909 Subject_Matter lencode_glm_kFEsv

Build a model

xgb_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    mtry = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgb_wf <- workflow(museum_rec, xgb_spec)
library(finetune)
doParallel::registerDoParallel()

set.seed(345)
xgb_rs <- tune_race_anova(
  xgb_wf,
  resamples = museum_folds,
  grid = 15,
  control = control_race(verbose_elim = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will maximize the roc_auc metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 11 eliminated; 4 candidates remain.
## 
## ℹ Fold07: 2 eliminated; 2 candidates remain.
## 
## ℹ Fold03: All but one parameter combination were eliminated.
xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits             id     .order .metrics          .notes          
##    <list>             <chr>   <int> <list>            <list>          
##  1 <split [2795/311]> Fold01      2 <tibble [30 × 7]> <tibble [0 × 3]>
##  2 <split [2795/311]> Fold02      3 <tibble [30 × 7]> <tibble [0 × 3]>
##  3 <split [2797/309]> Fold10      1 <tibble [30 × 7]> <tibble [0 × 3]>
##  4 <split [2795/311]> Fold07      4 <tibble [8 × 7]>  <tibble [0 × 3]>
##  5 <split [2795/311]> Fold03      5 <tibble [4 × 7]>  <tibble [0 × 3]>
##  6 <split [2795/311]> Fold04      8 <tibble [2 × 7]>  <tibble [0 × 3]>
##  7 <split [2795/311]> Fold05      6 <tibble [2 × 7]>  <tibble [0 × 3]>
##  8 <split [2795/311]> Fold06      9 <tibble [2 × 7]>  <tibble [0 × 3]>
##  9 <split [2796/310]> Fold08     10 <tibble [2 × 7]>  <tibble [0 × 3]>
## 10 <split [2796/310]> Fold09      7 <tibble [2 × 7]>  <tibble [0 × 3]>

Evaluate and finalize mode:

collect_metrics(xgb_rs)
## # A tibble: 2 × 9
##    mtry trees min_n .metric  .estimator  mean     n std_err .config             
##   <int> <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1     2   599     8 accuracy binary     0.798    10 0.00701 Preprocessor1_Model…
## 2     2   599     8 roc_auc  binary     0.885    10 0.00549 Preprocessor1_Model…
plot_race(xgb_rs)

xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, "accuracy")) %>%
  last_fit(museum_split)

xgb_last
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits              id               .metrics .notes   .predictions .workflow 
##   <list>              <chr>            <list>   <list>   <list>       <list>    
## 1 <split [3106/1036]> train/test split <tibble> <tibble> <tibble>     <workflow>
collect_metrics(xgb_last)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.812 Preprocessor1_Model1
## 2 roc_auc  binary         0.890 Preprocessor1_Model1
collect_predictions(xgb_last) %>%
    conf_mat(Accreditation, .pred_class)
##               Truth
## Prediction     Accredited Unaccredited
##   Accredited          355          120
##   Unaccredited         75          486
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
xgb_last %>%
  extract_fit_engine() %>%
  vip()

Questions

  1. Question and Data:
    • What is the research question? Clearly state the research question you aim to address using the new dataset. The research question is “Can the accreditation status of museums in the UK be predicted based on various attributes such as Governance, Size, Subject Matter, Year Opened, and Area Deprivation Index?”

    • Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The data set contains 4,191 observations of 35 variables. Variables of interest are accreditation, governance, size, subject_matter, year_opened, year_closed, and museum_id. Museum_id, size, year_closed, governance, year_opened, area deprivation index and accreditation are all character data. The variables describe various things about the museums.

    • What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the data set and their characteristics. Accreditation is character data that describes the accreditation status of the museums. Governance is character data that specifies the type of governance structure under which a museum operates. Size is character data which represents the size of the museum. Subject_matter is character data that explains the primary focus of the museum. Year_opened is character data that explains the year the museum was opened. Area deprivation index is numerical data.

  2. Data Exploration and Transformation:
    • Describe the differences between the original data and the data transformed for modeling. Why? Explain any preprocessing or transformations performed on the new data set compared to the original data. Discuss why these changes were necessary or beneficial. The original data has 4,191 observations of 35 variables. There are many different preprocessing and transformation steps that were performed on the new data set. Many NA values were omitted from the data set such as from year_closed. Year_closed was alos changed to IsClosed to show whether or not the museum is still operational. This creates a more meaningful variable that is more helpful at indicating which museums are still open. Museum_id was converted to a factor variable so that it was more beneficial for modeling. We also split the data into training and test data, whereas the original data set was all one data set. This splitting is necessary to make a well trained model.
  3. Data Preparation and Modeling:
    • What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new data set. The data preparation steps we used were filter, omitting NA values, creating a new variable from year_closed to IsClosed. We also changed variable types from character to factor data. We also split the data into training and testing data.
    • What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning model we used was XGBoost. XGBoost was used to predict the accreditation status of the museums based on different variables such as governance, size, subject matter, year opened, and area deprivation index. XGBoost is great at handling classification tasks.
  4. Model Evaluation:
    • What metrics are used in the model evaluation? Detail the evaluation metrics you used to assess the performance of your machine learning model(s) on the new dataset. Discuss the significance of these metrics in the context of your research question. We used “collect_metrics(xgb_rs)”, “collect_metrics(xgb_last)”, and “(collect_predictions(xgb_last) %>% conf_mat(Accreditation, .pred_class)” for the metrics used in this evaluation. Confusion matrix describes the performance of a classification algorithm on a set of test data. These metrics provide a rate of predictions which is necessary for validating the reliability of the models predictive capabilities. Confusion matrix shows the where the model may go wrong which is helpful in understanding the limitations of the model as well as how it could be improved.
  5. Conclusion:
    • What are the major findings? Summarize the key findings and insights obtained from your analysis of the new data set. The top governance types have varying distributions of accredited versus unaccredited museums within each governance type. There 11 accredited and only 1 unaccredited larger museums. Small museums have the highest number of unaccredited institutions at 1,751, compared to 650 that are accredited. The XGBoost model, after tuning, achieves an accuracy of approximately 81.6% and an ROC_AUC of approximately 88.9% on the test data. This indicates a strong performance in predicting museum accreditation based on the given features. The governance of the museum appears to be a strong predictor in determining accreditation.