Import Data

museums <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')

museums %>% skimr::skim()

Data summary
Name	Piped data
Number of rows	4191
Number of columns	35
_______________________
Column type frequency:
character	24
numeric	11
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
museum_id	0	1.00	8	15	4191
Name_of_museum	0	1.00	3	76	4190
Address_line_1	441	0.89	3	61	3212
Address_line_2	2816	0.33	3	39	1167
Village,_Town_or_City	4	1.00	3	24	1696
Postcode	0	1.00	6	9	3918
Admin_area	0	1.00	12	137	393
Accreditation	0	1.00	10	12	2
Governance	0	1.00	7	41	13
Size	0	1.00	4	7	5
Size_provenance	179	0.96	2	29	16
Subject_Matter	0	1.00	5	45	114
Year_opened	0	1.00	9	9	351
Year_closed	0	1.00	9	9	170
DOMUS_Subject_Matter	2788	0.33	5	27	21
Primary_provenance_of_data	0	1.00	3	8	18
Identifier_used_in_primary_data_source	2056	0.51	2	8	2134
Area_Geodemographic_group	49	0.99	11	40	17
Area_Geodemographic_group_code	49	0.99	3	3	16
Area_Geodemographic_subgroup	49	0.99	12	39	25
Area_Geodemographic_subgroup_code	49	0.99	4	4	24
Area_Geodemographic_supergroup	49	0.99	16	39	8
Area_Geodemographic_supergroup_code	49	0.99	2	2	8
Notes	2980	0.29	12	751	956

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Latitude	0	1.00	52.93	2.09	49.18	51.48	52.47	53.96	100.00	▇▁▁▁▁
Longitude	0	1.00	-1.96	1.84	-8.09	-3.10	-1.87	-0.48	1.76	▁▂▇▇▅
DOMUS_identifier	2347	0.44	1303.45	1597.19	1.00	486.50	991.50	1470.25	7746.00	▇▂▁▁▁
Area_Deprivation_index	49	0.99	5.44	2.48	1.00	4.00	5.00	7.00	10.00	▃▆▇▆▃
Area_Deprivation_index_crime	49	0.99	5.43	3.07	1.00	3.00	6.00	8.00	10.00	▇▆▅▇▇
Area_Deprivation_index_education	49	0.99	6.04	2.61	1.00	4.00	6.00	8.00	10.00	▃▅▇▇▆
Area_Deprivation_index_employment	49	0.99	6.08	2.76	1.00	4.00	6.00	8.00	10.00	▅▆▇▇▇
Area_Deprivation_index_health	49	0.99	6.02	2.82	1.00	4.00	6.00	8.00	10.00	▅▆▆▇▇
Area_Deprivation_index_housing	49	0.99	3.97	2.75	1.00	1.00	3.00	6.00	10.00	▇▅▃▂▂
Area_Deprivation_index_income	49	0.99	5.99	2.62	1.00	4.00	6.00	8.00	10.00	▃▆▇▇▆
Area_Deprivation_index_services	49	0.99	4.78	3.01	1.00	2.00	4.00	7.00	10.00	▇▅▅▅▅

Notes about the dataset

**Village,_Town_or_City** (, can’t be used for variable name): clean variable names using janitor::clean_names()
Supergroup vs Group vs subgroup (e.g., Geodemographic_group and Geodemographic_subgroup): Keep only one, as they contain overlapping information. Retain the group variable if the subgroup does not provide additional valuable insights, and vice versa.
Variables with too many missing values: Remove the following:
- notes
- Address_line_2
- DOMUS_identifier
- DOMUS_subject_matter
- Identifier_used_in_primary_data_source
Redundant Variables (e.g., Area_Geodemographic_group & Area_Geodemographic_group_code): Keep only one from each pair, as they contain the same information.
Location Variables (Address_line_1 through Admin_area): Select only one or two essential location variables, as these are redundant.
Year_opened, Year_closed: These variables contain two years separated by a colon, which requires clarification. Additionally, the value “9999” in Year_closed likely indicates “not closed.” Decide whether to keep these variables. If you keep them, consider data cleaning. For instance, one approach would be to retain only the first year from Year_opened.
Inconsistent white spaces between words (e.g., area_geodemographic_group has “Ethnically Diverse Metropolitan Living” and “Ethnically Diverse Metropolitan Living”): Revise the character variables so that they have one white space between words, using stringr::str_squish()
Character variables (These are not strings, like notes): Convert them to factors.
ID variables (museum_id and name_of_museum): Keep museum_id as name_of_museum has duplicates

Clean Data

library(janitor)

data_clean <- museums %>%
    
    # Village,_Town_or_City (, can't be used for variable name): clean variable names
    janitor::clean_names() %>%
    
    # supergroup vs groups vs subgroups: run the model with either groups or subgroups and choose one based on the model performance
    select(-area_geodemographic_group, -area_geodemographic_subgroup) %>%
    
    # Variables with too many missing values: Remove
    select(-notes, -address_line_2, -domus_identifier, -domus_subject_matter, -identifier_used_in_primary_data_source) %>%
    
    # Redundant Variables (e.g., Area_Geodemographic_group & Area_Geodemographic_group_code):** Keep only one from each pair, as they contain the same information.
    select(-ends_with("_code")) %>%
    
    # Location Variables (Address_line_1 through Admin_area): Select only one
    select(-address_line_1, -village_town_or_city, -postcode, -latitude, -longitude) %>%
    
    # Extract provinces from admin_area
    mutate(admin_area = admin_area %>% str_remove("\\/")) %>% 
    mutate(admin_area = admin_area %>% str_remove("\\/.*")) %>%
    
    # Year_opened, Year_closed: Drop Year_closed and consider adding later
    select(-year_closed) %>%
    mutate(year_opened = year_opened %>% str_remove("\\:.*") %>% as.numeric()) %>%
    
    # inconsistent white spaces between words (e.g., area_geodemographic_group has "Ethnically Diverse Metropolitan Living" and "Ethnically Diverse Metropolitan  Living"): Revise the character variables so that they have one white space between words
    mutate(across(where(is.character), str_squish)) %>%
    
    # Character variables: Convert them to factor
    mutate(across(where(is.character), factor)) %>%
    
    # id variables (museum_id and name_of_museum): Keep museum_id
    select(-name_of_museum) %>%
    
    # Remove all rows with missing values
    na.omit()

Explore Data

# Check whether the target variable is unbalanced
data_clean %>% count(accreditation)

## # A tibble: 2 × 2
##   accreditation     n
##   <fct>         <int>
## 1 Accredited     1703
## 2 Unaccredited   2267

# correlation funnel analysis

# Step 1: Prepare Data
data_binarized_tbl <- data_clean %>%
    
    # Throw in all predictors except for id and strings
    select(-museum_id) %>%
    binarize()

data_binarized_tbl %>% glimpse()

## Rows: 3,970
## Columns: 109
## $ admin_area__England                                                     <dbl> …
## $ admin_area__Northern_Ireland                                            <dbl> …
## $ admin_area__Scotland                                                    <dbl> …
## $ admin_area__Wales                                                       <dbl> …
## $ `admin_area__-OTHER`                                                    <dbl> …
## $ accreditation__Accredited                                               <dbl> …
## $ accreditation__Unaccredited                                             <dbl> …
## $ `governance__Government-Local_Authority`                                <dbl> …
## $ `governance__Government-National`                                       <dbl> …
## $ `governance__Independent-English_Heritage`                              <dbl> …
## $ `governance__Independent-National_Trust`                                <dbl> …
## $ `governance__Independent-Not_for_profit`                                <dbl> …
## $ `governance__Independent-Private`                                       <dbl> …
## $ `governance__Independent-Unknown`                                       <dbl> …
## $ governance__University                                                  <dbl> …
## $ governance__Unknown                                                     <dbl> …
## $ `governance__-OTHER`                                                    <dbl> …
## $ size__large                                                             <dbl> …
## $ size__medium                                                            <dbl> …
## $ size__small                                                             <dbl> …
## $ size__unknown                                                           <dbl> …
## $ `size__-OTHER`                                                          <dbl> …
## $ size_provenance__ace_size_designation                                   <dbl> …
## $ size_provenance__aim_size_designation                                   <dbl> …
## $ size_provenance__domus                                                  <dbl> …
## $ `size_provenance__ma(fam)`                                              <dbl> …
## $ size_provenance__mm_manual_estimate_2018                                <dbl> …
## $ size_provenance__mm_prediction_random_forest                            <dbl> …
## $ size_provenance__scottish_national_audit                                <dbl> …
## $ size_provenance__unknown                                                <dbl> …
## $ size_provenance__visitbritain                                           <dbl> …
## $ `size_provenance__-OTHER`                                               <dbl> …
## $ `subject_matter__Archaeology-Roman`                                     <dbl> …
## $ `subject_matter__Arts-Fine_and_decorative_arts`                         <dbl> …
## $ `subject_matter__Buildings-Houses-Large_houses`                         <dbl> …
## $ `subject_matter__Buildings-Houses-Medium_houses`                        <dbl> …
## $ `subject_matter__Industry_and_manufacture-Mining_and_quarrying`         <dbl> …
## $ `subject_matter__Leisure_and_sport-Toys_and_models`                     <dbl> …
## $ subject_matter__Local_Histories                                         <dbl> …
## $ `subject_matter__Mixed-Encyclopaedic`                                   <dbl> …
## $ `subject_matter__Mixed-Other`                                           <dbl> …
## $ subject_matter__Other                                                   <dbl> …
## $ `subject_matter__Personality-Literary`                                  <dbl> …
## $ `subject_matter__Rural_Industry-Farming`                                <dbl> …
## $ `subject_matter__Sea_and_seafaring-Boats_and_ships`                     <dbl> …
## $ `subject_matter__Sea_and_seafaring-Mixed`                               <dbl> …
## $ `subject_matter__Transport-Cars_and_motorbikes`                         <dbl> …
## $ `subject_matter__Transport-Trains_and_railways`                         <dbl> …
## $ `subject_matter__War_and_conflict-Airforce`                             <dbl> …
## $ `subject_matter__War_and_conflict-Castles_and_forts`                    <dbl> …
## $ `subject_matter__War_and_conflict-Military`                             <dbl> …
## $ `subject_matter__War_and_conflict-Regiment`                             <dbl> …
## $ `subject_matter__-OTHER`                                                <dbl> …
## $ `year_opened__-Inf_1957`                                                <dbl> …
## $ year_opened__1957_1977                                                  <dbl> …
## $ year_opened__1977_1992                                                  <dbl> …
## $ year_opened__1992_Inf                                                   <dbl> …
## $ primary_provenance_of_data__ace                                         <dbl> …
## $ primary_provenance_of_data__aim                                         <dbl> …
## $ primary_provenance_of_data__aim82M                                      <dbl> …
## $ primary_provenance_of_data__aim82NM                                     <dbl> …
## $ primary_provenance_of_data__domus                                       <dbl> …
## $ primary_provenance_of_data__fcm                                         <dbl> …
## $ primary_provenance_of_data__hha                                         <dbl> …
## $ primary_provenance_of_data__mald                                        <dbl> …
## $ primary_provenance_of_data__mgs                                         <dbl> …
## $ primary_provenance_of_data__misc                                        <dbl> …
## $ primary_provenance_of_data__musassoc                                    <dbl> …
## $ primary_provenance_of_data__wiki                                        <dbl> …
## $ `primary_provenance_of_data__-OTHER`                                    <dbl> …
## $ `area_deprivation_index__-Inf_4`                                        <dbl> …
## $ area_deprivation_index__4_5                                             <dbl> …
## $ area_deprivation_index__5_7                                             <dbl> …
## $ area_deprivation_index__7_Inf                                           <dbl> …
## $ `area_deprivation_index_crime__-Inf_3`                                  <dbl> …
## $ area_deprivation_index_crime__3_6                                       <dbl> …
## $ area_deprivation_index_crime__6_8                                       <dbl> …
## $ area_deprivation_index_crime__8_Inf                                     <dbl> …
## $ `area_deprivation_index_education__-Inf_4`                              <dbl> …
## $ area_deprivation_index_education__4_6                                   <dbl> …
## $ area_deprivation_index_education__6_8                                   <dbl> …
## $ area_deprivation_index_education__8_Inf                                 <dbl> …
## $ `area_deprivation_index_employment__-Inf_4`                             <dbl> …
## $ area_deprivation_index_employment__4_6                                  <dbl> …
## $ area_deprivation_index_employment__6_8                                  <dbl> …
## $ area_deprivation_index_employment__8_Inf                                <dbl> …
## $ `area_deprivation_index_health__-Inf_4`                                 <dbl> …
## $ area_deprivation_index_health__4_6                                      <dbl> …
## $ area_deprivation_index_health__6_8                                      <dbl> …
## $ area_deprivation_index_health__8_Inf                                    <dbl> …
## $ `area_deprivation_index_housing__-Inf_3`                                <dbl> …
## $ area_deprivation_index_housing__3_6                                     <dbl> …
## $ area_deprivation_index_housing__6_Inf                                   <dbl> …
## $ `area_deprivation_index_income__-Inf_4`                                 <dbl> …
## $ area_deprivation_index_income__4_6                                      <dbl> …
## $ area_deprivation_index_income__6_8                                      <dbl> …
## $ area_deprivation_index_income__8_Inf                                    <dbl> …
## $ `area_deprivation_index_services__-Inf_2`                               <dbl> …
## $ area_deprivation_index_services__2_4                                    <dbl> …
## $ area_deprivation_index_services__4_8                                    <dbl> …
## $ area_deprivation_index_services__8_Inf                                  <dbl> …
## $ area_geodemographic_supergroup__Affluent_England                        <dbl> …
## $ area_geodemographic_supergroup__Business_Education_and_Heritage_Centres <dbl> …
## $ area_geodemographic_supergroup__Countryside_Living                      <dbl> …
## $ area_geodemographic_supergroup__Ethnically_Diverse_Metropolitan_Living  <dbl> …
## $ area_geodemographic_supergroup__London_Cosmopolitan                     <dbl> …
## $ area_geodemographic_supergroup__Services_and_Industrial_Legacy          <dbl> …
## $ area_geodemographic_supergroup__Town_and_Country_Living                 <dbl> …
## $ area_geodemographic_supergroup__Urban_Settlements                       <dbl> …

# Step 2: Correlate to the Target
data_corr_tbl <- data_binarized_tbl %>%
    correlate(accreditation__Accredited)

data_corr_tbl

## # A tibble: 109 × 3
##    feature                    bin                         correlation
##    <fct>                      <chr>                             <dbl>
##  1 accreditation              Accredited                        1    
##  2 accreditation              Unaccredited                     -1    
##  3 primary_provenance_of_data domus                             0.571
##  4 size_provenance            mm_prediction_random_forest      -0.516
##  5 size_provenance            domus                             0.458
##  6 governance                 Independent-Private              -0.387
##  7 size                       small                            -0.377
##  8 primary_provenance_of_data wiki                             -0.295
##  9 primary_provenance_of_data ace                               0.261
## 10 size                       large                             0.256
## # ℹ 99 more rows

# Step 3: Plot
data_corr_tbl %>% 
    plot_correlation_funnel()

Model Building

Split Data

# Start with small data until you are sure your code has no errors
data_clean <- data_clean %>% sample_n(100)

library(tidymodels)

set.seed(123)
data_split <- initial_split(data_clean, strata = accreditation)

data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(234)
data_folds <- bootstraps(data_train, strata = accreditation)
data_folds

## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits          id         
##    <list>          <chr>      
##  1 <split [75/28]> Bootstrap01
##  2 <split [75/30]> Bootstrap02
##  3 <split [75/24]> Bootstrap03
##  4 <split [75/27]> Bootstrap04
##  5 <split [75/30]> Bootstrap05
##  6 <split [75/27]> Bootstrap06
##  7 <split [75/22]> Bootstrap07
##  8 <split [75/31]> Bootstrap08
##  9 <split [75/29]> Bootstrap09
## 10 <split [75/29]> Bootstrap10
## # ℹ 15 more rows

Preprocess Data Using Recipes Package

# Convert all nominal predictors (characters and factors) to numbers.
data_rec <- 
    recipe(accreditation ~ ., data = data_train) %>%
    update_role(museum_id, new_role = "id") %>%
    step_other(subject_matter) %>%
    step_dummy(all_nominal_predictors()) %>%
    step_normalize(all_numeric_predictors()) %>%
    step_zv()

data_rec %>% prep() %>% juice() %>% glimpse()

## Rows: 75
## Columns: 76
## $ museum_id                                                              <fct> …
## $ year_opened                                                            <dbl> …
## $ area_deprivation_index                                                 <dbl> …
## $ area_deprivation_index_crime                                           <dbl> …
## $ area_deprivation_index_education                                       <dbl> …
## $ area_deprivation_index_employment                                      <dbl> …
## $ area_deprivation_index_health                                          <dbl> …
## $ area_deprivation_index_housing                                         <dbl> …
## $ area_deprivation_index_income                                          <dbl> …
## $ area_deprivation_index_services                                        <dbl> …
## $ accreditation                                                          <fct> …
## $ admin_area_England                                                     <dbl> …
## $ admin_area_Isle.of.Man                                                 <dbl> …
## $ admin_area_Northern.Ireland                                            <dbl> …
## $ admin_area_Scotland                                                    <dbl> …
## $ admin_area_Wales                                                       <dbl> …
## $ governance_Government.Local_Authority                                  <dbl> …
## $ governance_Government.National                                         <dbl> …
## $ governance_Government.Other                                            <dbl> …
## $ governance_Independent.English_Heritage                                <dbl> …
## $ governance_Independent.Historic_Environment_Scotland                   <dbl> …
## $ governance_Independent.National_Trust                                  <dbl> …
## $ governance_Independent.National_Trust_for_Scotland                     <dbl> …
## $ governance_Independent.Not_for_profit                                  <dbl> …
## $ governance_Independent.Private                                         <dbl> …
## $ governance_Independent.Unknown                                         <dbl> …
## $ governance_University                                                  <dbl> …
## $ governance_Unknown                                                     <dbl> …
## $ size_large                                                             <dbl> …
## $ size_medium                                                            <dbl> …
## $ size_small                                                             <dbl> …
## $ size_unknown                                                           <dbl> …
## $ size_provenance_aim_size_designation                                   <dbl> …
## $ size_provenance_babbidge_ewles_and_smith_2006                          <dbl> …
## $ size_provenance_domus                                                  <dbl> …
## $ size_provenance_ma.fam.                                                <dbl> …
## $ size_provenance_ma.fam._year_stated                                    <dbl> …
## $ size_provenance_ma.fam2.                                               <dbl> …
## $ size_provenance_mm                                                     <dbl> …
## $ size_provenance_mm.int                                                 <dbl> …
## $ size_provenance_mm_manual_estimate_2018                                <dbl> …
## $ size_provenance_mm_prediction_random_forest                            <dbl> …
## $ size_provenance_national_trust_annual_report_                          <dbl> …
## $ size_provenance_nilm.vn.                                               <dbl> …
## $ size_provenance_scottish_national_audit                                <dbl> …
## $ size_provenance_unknown                                                <dbl> …
## $ size_provenance_visitbritain                                           <dbl> …
## $ subject_matter_Buildings.Houses.Large_houses                           <dbl> …
## $ subject_matter_Local_Histories                                         <dbl> …
## $ subject_matter_Personality.Literary                                    <dbl> …
## $ subject_matter_War_and_conflict.Regiment                               <dbl> …
## $ subject_matter_other                                                   <dbl> …
## $ primary_provenance_of_data_aim                                         <dbl> …
## $ primary_provenance_of_data_aim82M                                      <dbl> …
## $ primary_provenance_of_data_aim82NM                                     <dbl> …
## $ primary_provenance_of_data_domus                                       <dbl> …
## $ primary_provenance_of_data_fcm                                         <dbl> …
## $ primary_provenance_of_data_hha                                         <dbl> …
## $ primary_provenance_of_data_hud                                         <dbl> …
## $ primary_provenance_of_data_mald                                        <dbl> …
## $ primary_provenance_of_data_MDN                                         <dbl> …
## $ primary_provenance_of_data_mgs                                         <dbl> …
## $ primary_provenance_of_data_misc                                        <dbl> …
## $ primary_provenance_of_data_Misc                                        <dbl> …
## $ primary_provenance_of_data_musassoc                                    <dbl> …
## $ primary_provenance_of_data_MusCal                                      <dbl> …
## $ primary_provenance_of_data_New                                         <dbl> …
## $ primary_provenance_of_data_nimc                                        <dbl> …
## $ primary_provenance_of_data_wiki                                        <dbl> …
## $ area_geodemographic_supergroup_Business.Education.and.Heritage.Centres <dbl> …
## $ area_geodemographic_supergroup_Countryside.Living                      <dbl> …
## $ area_geodemographic_supergroup_Ethnically.Diverse.Metropolitan.Living  <dbl> …
## $ area_geodemographic_supergroup_London.Cosmopolitan                     <dbl> …
## $ area_geodemographic_supergroup_Services.and.Industrial.Legacy          <dbl> …
## $ area_geodemographic_supergroup_Town.and.Country.Living                 <dbl> …
## $ area_geodemographic_supergroup_Urban.Settlements                       <dbl> …

Specify Model

library(usemodels)

model_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

model_workflow <- 
  workflow() %>% 
  add_recipe(data_rec) %>% 
  add_model(model_spec) 

doParallel::registerDoParallel()
set.seed(27358)
model_tune <-
  tune_grid(model_workflow, 
            resamples = data_folds,
            control = control_resamples(save_pred = TRUE))

Model evaluation

Show performance

collect_metrics(model_tune)

## # A tibble: 30 × 10
##    trees min_n tree_depth learn_rate .metric     .estimator  mean     n  std_err
##    <int> <int>      <int>      <dbl> <chr>       <chr>      <dbl> <int>    <dbl>
##  1     6     2          5    0.00648 accuracy    binary     0.694    25 0.0178  
##  2     6     2          5    0.00648 brier_class binary     0.244    25 0.000320
##  3     6     2          5    0.00648 roc_auc     binary     0.794    25 0.0162  
##  4  1765     8          1    0.252   accuracy    binary     0.555    25 0.0207  
##  5  1765     8          1    0.252   brier_class binary     0.260    25 0.00622 
##  6  1765     8          1    0.252   roc_auc     binary     0.556    25 0.0224  
##  7   257    10          7    0.0840  accuracy    binary     0.555    25 0.00934 
##  8   257    10          7    0.0840  brier_class binary     0.247    25 0.00112 
##  9   257    10          7    0.0840  roc_auc     binary     0.5      25 0       
## 10  1867    15         14    0.00279 accuracy    binary     0.555    25 0.00934 
## # ℹ 20 more rows
## # ℹ 1 more variable: .config <chr>

collect_predictions(model_tune) %>%
    group_by(id) %>%
    roc_curve(accreditation, .pred_Accredited) %>%
    autoplot()

Finalize workflow (last_fit)

final_fitted <- model_workflow %>%
    
    # Fit the best model to the train data
    finalize_workflow(select_best(model_tune, metric = "accuracy")) %>%
    
    # Evaluate on the test data
    last_fit(data_split)

collect_metrics(final_fitted)

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.52  Preprocessor1_Model1
## 2 roc_auc     binary         0.675 Preprocessor1_Model1
## 3 brier_class binary         0.247 Preprocessor1_Model1

collect_predictions(final_fitted) %>%
    conf_mat(accreditation, .pred_class) %>%
    autoplot()

Variable importance

library(vip)

final_fitted %>%
  extract_fit_engine() %>%
  vip()

Apply 7: Museum Accreditation Classification Model

Sara Donahue

2024-12-08

Import Data

Clean Data

Explore Data

Model Building

Model evaluation

Show performance

Finalize workflow (last_fit)

Variable importance