library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
museums <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
museums %>%
count(Accreditation)
## # A tibble: 2 × 2
## Accreditation n
## <chr> <int>
## 1 Accredited 1720
## 2 Unaccredited 2471
top_gov <-
museums %>%
count(Governance, sort = TRUE) %>%
slice_max(n, n = 4) %>%
pull(Governance)
museums %>%
filter(Governance %in% top_gov) %>%
count(Governance, Accreditation) %>%
ggplot(aes(Accreditation, n, fill = Accreditation)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(Governance), scales = "free_y")
museums %>%
count(Accreditation, Size)
## # A tibble: 10 × 3
## Accreditation Size n
## <chr> <chr> <int>
## 1 Accredited huge 11
## 2 Accredited large 402
## 3 Accredited medium 644
## 4 Accredited small 650
## 5 Accredited unknown 13
## 6 Unaccredited huge 1
## 7 Unaccredited large 142
## 8 Unaccredited medium 381
## 9 Unaccredited small 1751
## 10 Unaccredited unknown 196
museums_parsed <-
museums %>%
select(museum_id,
Accreditation,
Governance,
Size,
Subject_Matter,
Year_opened,
Year_closed,
Area_Deprivation_index) %>%
mutate(Year_opened = parse_number(Year_opened),
Closed = if_else(Year_closed == "9999:9999", "Open", "Closed")) %>%
select(-Year_closed) %>%
na.omit() %>%
mutate_if(is.character, as.factor) %>%
mutate(museum_id = as.character(museum_id))
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
set.seed(123)
museum_split <- initial_split(museums_parsed, strata = Accreditation)
museum_train <- training(museum_split)
museum_test <- testing(museum_split)
set.seed(234)
museum_folds <- vfold_cv(museum_train, strata = Accreditation)
museum_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [2795/311]> Fold01
## 2 <split [2795/311]> Fold02
## 3 <split [2795/311]> Fold03
## 4 <split [2795/311]> Fold04
## 5 <split [2795/311]> Fold05
## 6 <split [2795/311]> Fold06
## 7 <split [2795/311]> Fold07
## 8 <split [2796/310]> Fold08
## 9 <split [2796/310]> Fold09
## 10 <split [2797/309]> Fold10
library(embed)
museum_rec <-
recipe(Accreditation ~ ., data = museum_train) %>%
update_role(museum_id, new_role = "id") %>%
## How does subject matter affect Accreditation (affect encoding)
step_lencode_glm(Subject_Matter, outcome = vars(Accreditation)) %>%
step_dummy(all_nominal_predictors())
museum_rec
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 6
## id: 1
##
## ── Operations
## • Linear embedding for factors via GLM for: Subject_Matter
## • Dummy variables from: all_nominal_predictors()
prep(museum_rec) %>%
tidy(number = 1) %>%
filter(level == "..new")
## # A tibble: 1 × 4
## level value terms id
## <chr> <dbl> <chr> <chr>
## 1 ..new -0.909 Subject_Matter lencode_glm_kFEsv
xgb_spec <-
boost_tree(
trees = tune(),
min_n = tune(),
mtry = tune(),
learn_rate = tune()
) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_wf <- workflow(museum_rec, xgb_spec)
library(finetune)
doParallel::registerDoParallel()
set.seed(345)
xgb_rs <-
tune_race_anova(
xgb_wf,
resamples = museum_folds,
grid = 15,
control = control_race(verbose_elim = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will maximize the roc_auc metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 11 eliminated; 4 candidates remain.
##
## ℹ Fold07: All but one parameter combination were eliminated.
xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 5
## splits id .order .metrics .notes
## <list> <chr> <int> <list> <list>
## 1 <split [2795/311]> Fold01 2 <tibble [30 × 8]> <tibble [0 × 3]>
## 2 <split [2795/311]> Fold02 3 <tibble [30 × 8]> <tibble [0 × 3]>
## 3 <split [2797/309]> Fold10 1 <tibble [30 × 8]> <tibble [0 × 3]>
## 4 <split [2795/311]> Fold07 4 <tibble [8 × 8]> <tibble [0 × 3]>
## 5 <split [2795/311]> Fold03 5 <tibble [2 × 8]> <tibble [0 × 3]>
## 6 <split [2795/311]> Fold04 8 <tibble [2 × 8]> <tibble [0 × 3]>
## 7 <split [2795/311]> Fold05 6 <tibble [2 × 8]> <tibble [0 × 3]>
## 8 <split [2795/311]> Fold06 9 <tibble [2 × 8]> <tibble [0 × 3]>
## 9 <split [2796/310]> Fold08 10 <tibble [2 × 8]> <tibble [0 × 3]>
## 10 <split [2796/310]> Fold09 7 <tibble [2 × 8]> <tibble [0 × 3]>
plot_race(xgb_rs)
collect_metrics(xgb_rs)
## # A tibble: 2 × 10
## mtry trees min_n learn_rate .metric .estimator mean n std_err .config
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 12 935 8 0.00826 accuracy binary 0.802 10 0.00615 Preproce…
## 2 12 935 8 0.00826 roc_auc binary 0.884 10 0.00520 Preproce…
xgb_last <-
xgb_wf %>%
finalize_workflow(select_best(xgb_rs, "accuracy")) %>%
last_fit(museum_split)
collect_metrics(xgb_last)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.816 Preprocessor1_Model1
## 2 roc_auc binary 0.889 Preprocessor1_Model1
collect_predictions(xgb_last) %>%
conf_mat(Accreditation, .pred_class)
## Truth
## Prediction Accredited Unaccredited
## Accredited 349 110
## Unaccredited 81 496
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
xgb_last %>%
extract_fit_engine() %>%
vip()
The data is made up of 4191 observations and 35 variables.
The key variables that carry the most importanc when predicting the accreditation of a museum was if it is goverend by private, if it had closed down or not, what subject matter the museum was about and the size of the museum also carried a good amount of weight when making the predictions.
The original museums dataset provides comprehensive details on UK museums with 4191 rows and 35 columns. For modeling, specific columns like Accreditation, Governance, and a few others were chosen to improve efficiency and relevance. The Year_opened was parsed to numeric, and a new Closed column was introduced to easily identify if a museum is operational. Essential preprocessing involved converting the Subject_Matter column using a generalized linear model to transform its categorical values to numeric, making it more model-friendly. Furthermore, dummy variables were created for all nominal predictors, converting them to a numeric format, which is often necessary for machine learning models to function optimally. These transformations not only make the dataset compatible for modeling but can also enhance model performance and interpretability.
Feature Selection: Choosing specific columns from the dataset. Parsing: Converting the Year_opened column to numeric values. Feature Creation: Introducing a new Closed column based on the Year_closed values. Data Cleaning: Removing rows with NA values. Linear Embedding: Encoding the Subject_Matter using a generalized linear model. Dummy Variable Creation: Transforming all nominal predictors into dummy variables.
The machine learning model used in the analysis is the XGBoost model.
Accuracy: Represents the proportion of correctly predicted classifications. It gives a direct indication of the model’s overall performance. In the museum accreditation context, high accuracy shows the model’s proficiency at classifying museums as accredited or unaccredited.
ROC_AUC: Measures the model’s ability to distinguish between positive and negative classes across different thresholds. A high ROC_AUC implies the model can effectively differentiate between accredited and unaccredited museums, even as the threshold changes.
Size and Accreditation: Among huge museums, there are 11 accredited and only 1 unaccredited. Small museums have the highest number of unaccredited institutions at 1,751, compared to 650 that are accredited.
Model Performance: The XGBoost model, after tuning, achieves an accuracy of approximately 81.6% and an ROC_AUC of approximately 88.9% on the test data. This indicates a strong performance in predicting museum accreditation based on the given features.
Feature Importance: The governance of the museum appears to be an influential predictor in determining accreditation, as inferred from the variable importance plot.