Import Data

museums <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')

museums %>% skimr::skim()
Data summary
Name Piped data
Number of rows 4191
Number of columns 35
_______________________
Column type frequency:
character 24
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
museum_id 0 1.00 8 15 0 4191 0
Name_of_museum 0 1.00 3 76 0 4190 0
Address_line_1 441 0.89 3 61 0 3212 0
Address_line_2 2816 0.33 3 39 0 1167 0
Village,_Town_or_City 4 1.00 3 24 0 1696 0
Postcode 0 1.00 6 9 0 3918 0
Admin_area 0 1.00 12 137 0 393 0
Accreditation 0 1.00 10 12 0 2 0
Governance 0 1.00 7 41 0 13 0
Size 0 1.00 4 7 0 5 0
Size_provenance 179 0.96 2 29 0 16 0
Subject_Matter 0 1.00 5 45 0 114 0
Year_opened 0 1.00 9 9 0 351 0
Year_closed 0 1.00 9 9 0 170 0
DOMUS_Subject_Matter 2788 0.33 5 27 0 21 0
Primary_provenance_of_data 0 1.00 3 8 0 18 0
Identifier_used_in_primary_data_source 2056 0.51 2 8 0 2134 0
Area_Geodemographic_group 49 0.99 11 40 0 17 0
Area_Geodemographic_group_code 49 0.99 3 3 0 16 0
Area_Geodemographic_subgroup 49 0.99 12 39 0 25 0
Area_Geodemographic_subgroup_code 49 0.99 4 4 0 24 0
Area_Geodemographic_supergroup 49 0.99 16 39 0 8 0
Area_Geodemographic_supergroup_code 49 0.99 2 2 0 8 0
Notes 2980 0.29 12 751 0 956 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Latitude 0 1.00 52.93 2.09 49.18 51.48 52.47 53.96 100.00 ▇▁▁▁▁
Longitude 0 1.00 -1.96 1.84 -8.09 -3.10 -1.87 -0.48 1.76 ▁▂▇▇▅
DOMUS_identifier 2347 0.44 1303.45 1597.19 1.00 486.50 991.50 1470.25 7746.00 ▇▂▁▁▁
Area_Deprivation_index 49 0.99 5.44 2.48 1.00 4.00 5.00 7.00 10.00 ▃▆▇▆▃
Area_Deprivation_index_crime 49 0.99 5.43 3.07 1.00 3.00 6.00 8.00 10.00 ▇▆▅▇▇
Area_Deprivation_index_education 49 0.99 6.04 2.61 1.00 4.00 6.00 8.00 10.00 ▃▅▇▇▆
Area_Deprivation_index_employment 49 0.99 6.08 2.76 1.00 4.00 6.00 8.00 10.00 ▅▆▇▇▇
Area_Deprivation_index_health 49 0.99 6.02 2.82 1.00 4.00 6.00 8.00 10.00 ▅▆▆▇▇
Area_Deprivation_index_housing 49 0.99 3.97 2.75 1.00 1.00 3.00 6.00 10.00 ▇▅▃▂▂
Area_Deprivation_index_income 49 0.99 5.99 2.62 1.00 4.00 6.00 8.00 10.00 ▃▆▇▇▆
Area_Deprivation_index_services 49 0.99 4.78 3.01 1.00 2.00 4.00 7.00 10.00 ▇▅▅▅▅

Notes about the dataset

Clean Data

library(janitor)

data_clean <- museums %>%
    
    # Village,_Town_or_City (, can't be used for variable name): clean variable names
    janitor::clean_names() %>%
    
    # supergroup vs groups vs subgroups: run the model with either groups or subgroups and choose one based on the model performance
    select(-area_geodemographic_group, -area_geodemographic_subgroup) %>%
    
    # Variables with too many missing values: Remove
    select(-notes, -address_line_2, -domus_identifier, -domus_subject_matter, -identifier_used_in_primary_data_source) %>%
    
    # Redundant Variables (e.g., Area_Geodemographic_group & Area_Geodemographic_group_code):** Keep only one from each pair, as they contain the same information.
    select(-ends_with("_code")) %>%
    
    # Location Variables (Address_line_1 through Admin_area): Select only one
    select(-address_line_1, -village_town_or_city, -postcode, -latitude, -longitude) %>%
    
    # Extract provinces from admin_area
    mutate(admin_area = admin_area %>% str_remove("\\/")) %>% 
    mutate(admin_area = admin_area %>% str_remove("\\/.*")) %>%
    
    # Year_opened, Year_closed: Drop Year_closed and consider adding later
    select(-year_closed) %>%
    mutate(year_opened = year_opened %>% str_remove("\\:.*") %>% as.numeric()) %>%
    
    # inconsistent white spaces between words (e.g., area_geodemographic_group has "Ethnically Diverse Metropolitan Living" and "Ethnically Diverse Metropolitan  Living"): Revise the character variables so that they have one white space between words
    mutate(across(where(is.character), str_squish)) %>%
    
    # Character variables: Convert them to factor
    mutate(across(where(is.character), factor)) %>%
    
    # id variables (museum_id and name_of_museum): Keep museum_id
    select(-name_of_museum) %>%
    
    # Remove all rows with missing values
    na.omit()

Explore Data

# Check whether the target variable is unbalanced
data_clean %>% count(accreditation)
## # A tibble: 2 × 2
##   accreditation     n
##   <fct>         <int>
## 1 Accredited     1703
## 2 Unaccredited   2267
# correlation funnel analysis

# Step 1: Prepare Data
data_binarized_tbl <- data_clean %>%
    
    # Throw in all predictors except for id and strings
    select(-museum_id) %>%
    binarize()

data_binarized_tbl %>% glimpse()
## Rows: 3,970
## Columns: 109
## $ admin_area__England                                                     <dbl> …
## $ admin_area__Northern_Ireland                                            <dbl> …
## $ admin_area__Scotland                                                    <dbl> …
## $ admin_area__Wales                                                       <dbl> …
## $ `admin_area__-OTHER`                                                    <dbl> …
## $ accreditation__Accredited                                               <dbl> …
## $ accreditation__Unaccredited                                             <dbl> …
## $ `governance__Government-Local_Authority`                                <dbl> …
## $ `governance__Government-National`                                       <dbl> …
## $ `governance__Independent-English_Heritage`                              <dbl> …
## $ `governance__Independent-National_Trust`                                <dbl> …
## $ `governance__Independent-Not_for_profit`                                <dbl> …
## $ `governance__Independent-Private`                                       <dbl> …
## $ `governance__Independent-Unknown`                                       <dbl> …
## $ governance__University                                                  <dbl> …
## $ governance__Unknown                                                     <dbl> …
## $ `governance__-OTHER`                                                    <dbl> …
## $ size__large                                                             <dbl> …
## $ size__medium                                                            <dbl> …
## $ size__small                                                             <dbl> …
## $ size__unknown                                                           <dbl> …
## $ `size__-OTHER`                                                          <dbl> …
## $ size_provenance__ace_size_designation                                   <dbl> …
## $ size_provenance__aim_size_designation                                   <dbl> …
## $ size_provenance__domus                                                  <dbl> …
## $ `size_provenance__ma(fam)`                                              <dbl> …
## $ size_provenance__mm_manual_estimate_2018                                <dbl> …
## $ size_provenance__mm_prediction_random_forest                            <dbl> …
## $ size_provenance__scottish_national_audit                                <dbl> …
## $ size_provenance__unknown                                                <dbl> …
## $ size_provenance__visitbritain                                           <dbl> …
## $ `size_provenance__-OTHER`                                               <dbl> …
## $ `subject_matter__Archaeology-Roman`                                     <dbl> …
## $ `subject_matter__Arts-Fine_and_decorative_arts`                         <dbl> …
## $ `subject_matter__Buildings-Houses-Large_houses`                         <dbl> …
## $ `subject_matter__Buildings-Houses-Medium_houses`                        <dbl> …
## $ `subject_matter__Industry_and_manufacture-Mining_and_quarrying`         <dbl> …
## $ `subject_matter__Leisure_and_sport-Toys_and_models`                     <dbl> …
## $ subject_matter__Local_Histories                                         <dbl> …
## $ `subject_matter__Mixed-Encyclopaedic`                                   <dbl> …
## $ `subject_matter__Mixed-Other`                                           <dbl> …
## $ subject_matter__Other                                                   <dbl> …
## $ `subject_matter__Personality-Literary`                                  <dbl> …
## $ `subject_matter__Rural_Industry-Farming`                                <dbl> …
## $ `subject_matter__Sea_and_seafaring-Boats_and_ships`                     <dbl> …
## $ `subject_matter__Sea_and_seafaring-Mixed`                               <dbl> …
## $ `subject_matter__Transport-Cars_and_motorbikes`                         <dbl> …
## $ `subject_matter__Transport-Trains_and_railways`                         <dbl> …
## $ `subject_matter__War_and_conflict-Airforce`                             <dbl> …
## $ `subject_matter__War_and_conflict-Castles_and_forts`                    <dbl> …
## $ `subject_matter__War_and_conflict-Military`                             <dbl> …
## $ `subject_matter__War_and_conflict-Regiment`                             <dbl> …
## $ `subject_matter__-OTHER`                                                <dbl> …
## $ `year_opened__-Inf_1957`                                                <dbl> …
## $ year_opened__1957_1977                                                  <dbl> …
## $ year_opened__1977_1992                                                  <dbl> …
## $ year_opened__1992_Inf                                                   <dbl> …
## $ primary_provenance_of_data__ace                                         <dbl> …
## $ primary_provenance_of_data__aim                                         <dbl> …
## $ primary_provenance_of_data__aim82M                                      <dbl> …
## $ primary_provenance_of_data__aim82NM                                     <dbl> …
## $ primary_provenance_of_data__domus                                       <dbl> …
## $ primary_provenance_of_data__fcm                                         <dbl> …
## $ primary_provenance_of_data__hha                                         <dbl> …
## $ primary_provenance_of_data__mald                                        <dbl> …
## $ primary_provenance_of_data__mgs                                         <dbl> …
## $ primary_provenance_of_data__misc                                        <dbl> …
## $ primary_provenance_of_data__musassoc                                    <dbl> …
## $ primary_provenance_of_data__wiki                                        <dbl> …
## $ `primary_provenance_of_data__-OTHER`                                    <dbl> …
## $ `area_deprivation_index__-Inf_4`                                        <dbl> …
## $ area_deprivation_index__4_5                                             <dbl> …
## $ area_deprivation_index__5_7                                             <dbl> …
## $ area_deprivation_index__7_Inf                                           <dbl> …
## $ `area_deprivation_index_crime__-Inf_3`                                  <dbl> …
## $ area_deprivation_index_crime__3_6                                       <dbl> …
## $ area_deprivation_index_crime__6_8                                       <dbl> …
## $ area_deprivation_index_crime__8_Inf                                     <dbl> …
## $ `area_deprivation_index_education__-Inf_4`                              <dbl> …
## $ area_deprivation_index_education__4_6                                   <dbl> …
## $ area_deprivation_index_education__6_8                                   <dbl> …
## $ area_deprivation_index_education__8_Inf                                 <dbl> …
## $ `area_deprivation_index_employment__-Inf_4`                             <dbl> …
## $ area_deprivation_index_employment__4_6                                  <dbl> …
## $ area_deprivation_index_employment__6_8                                  <dbl> …
## $ area_deprivation_index_employment__8_Inf                                <dbl> …
## $ `area_deprivation_index_health__-Inf_4`                                 <dbl> …
## $ area_deprivation_index_health__4_6                                      <dbl> …
## $ area_deprivation_index_health__6_8                                      <dbl> …
## $ area_deprivation_index_health__8_Inf                                    <dbl> …
## $ `area_deprivation_index_housing__-Inf_3`                                <dbl> …
## $ area_deprivation_index_housing__3_6                                     <dbl> …
## $ area_deprivation_index_housing__6_Inf                                   <dbl> …
## $ `area_deprivation_index_income__-Inf_4`                                 <dbl> …
## $ area_deprivation_index_income__4_6                                      <dbl> …
## $ area_deprivation_index_income__6_8                                      <dbl> …
## $ area_deprivation_index_income__8_Inf                                    <dbl> …
## $ `area_deprivation_index_services__-Inf_2`                               <dbl> …
## $ area_deprivation_index_services__2_4                                    <dbl> …
## $ area_deprivation_index_services__4_8                                    <dbl> …
## $ area_deprivation_index_services__8_Inf                                  <dbl> …
## $ area_geodemographic_supergroup__Affluent_England                        <dbl> …
## $ area_geodemographic_supergroup__Business_Education_and_Heritage_Centres <dbl> …
## $ area_geodemographic_supergroup__Countryside_Living                      <dbl> …
## $ area_geodemographic_supergroup__Ethnically_Diverse_Metropolitan_Living  <dbl> …
## $ area_geodemographic_supergroup__London_Cosmopolitan                     <dbl> …
## $ area_geodemographic_supergroup__Services_and_Industrial_Legacy          <dbl> …
## $ area_geodemographic_supergroup__Town_and_Country_Living                 <dbl> …
## $ area_geodemographic_supergroup__Urban_Settlements                       <dbl> …
# Step 2: Correlate to the Target
data_corr_tbl <- data_binarized_tbl %>%
    correlate(accreditation__Accredited)

data_corr_tbl
## # A tibble: 109 × 3
##    feature                    bin                         correlation
##    <fct>                      <chr>                             <dbl>
##  1 accreditation              Accredited                        1    
##  2 accreditation              Unaccredited                     -1    
##  3 primary_provenance_of_data domus                             0.571
##  4 size_provenance            mm_prediction_random_forest      -0.516
##  5 size_provenance            domus                             0.458
##  6 governance                 Independent-Private              -0.387
##  7 size                       small                            -0.377
##  8 primary_provenance_of_data wiki                             -0.295
##  9 primary_provenance_of_data ace                               0.261
## 10 size                       large                             0.256
## # ℹ 99 more rows
# Step 3: Plot
data_corr_tbl %>% 
    plot_correlation_funnel()

Model Building

Split Data

# Start with small data until you are sure your code has no errors
data_clean <- data_clean %>% sample_n(100)

library(tidymodels)

set.seed(123)
data_split <- initial_split(data_clean, strata = accreditation)

data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(234)
data_folds <- bootstraps(data_train, strata = accreditation)
data_folds
## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits          id         
##    <list>          <chr>      
##  1 <split [75/28]> Bootstrap01
##  2 <split [75/30]> Bootstrap02
##  3 <split [75/25]> Bootstrap03
##  4 <split [75/27]> Bootstrap04
##  5 <split [75/29]> Bootstrap05
##  6 <split [75/28]> Bootstrap06
##  7 <split [75/24]> Bootstrap07
##  8 <split [75/29]> Bootstrap08
##  9 <split [75/28]> Bootstrap09
## 10 <split [75/29]> Bootstrap10
## # ℹ 15 more rows

Preprocess Data Using Recipes Package

# Convert all nominal predictors (characters and factors) to numbers.
data_rec <- 
    recipe(accreditation ~ ., data = data_train) %>%
    update_role(museum_id, new_role = "id") %>%
    step_other(subject_matter) %>%
    step_dummy(all_nominal_predictors()) %>%
    step_normalize(all_numeric_predictors()) %>%
    step_zv()

data_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 75
## Columns: 75
## $ museum_id                                                              <fct> …
## $ year_opened                                                            <dbl> …
## $ area_deprivation_index                                                 <dbl> …
## $ area_deprivation_index_crime                                           <dbl> …
## $ area_deprivation_index_education                                       <dbl> …
## $ area_deprivation_index_employment                                      <dbl> …
## $ area_deprivation_index_health                                          <dbl> …
## $ area_deprivation_index_housing                                         <dbl> …
## $ area_deprivation_index_income                                          <dbl> …
## $ area_deprivation_index_services                                        <dbl> …
## $ accreditation                                                          <fct> …
## $ admin_area_England                                                     <dbl> …
## $ admin_area_Isle.of.Man                                                 <dbl> …
## $ admin_area_Northern.Ireland                                            <dbl> …
## $ admin_area_Scotland                                                    <dbl> …
## $ admin_area_Wales                                                       <dbl> …
## $ governance_Government.Local_Authority                                  <dbl> …
## $ governance_Government.National                                         <dbl> …
## $ governance_Government.Other                                            <dbl> …
## $ governance_Independent.English_Heritage                                <dbl> …
## $ governance_Independent.Historic_Environment_Scotland                   <dbl> …
## $ governance_Independent.National_Trust                                  <dbl> …
## $ governance_Independent.National_Trust_for_Scotland                     <dbl> …
## $ governance_Independent.Not_for_profit                                  <dbl> …
## $ governance_Independent.Private                                         <dbl> …
## $ governance_Independent.Unknown                                         <dbl> …
## $ governance_University                                                  <dbl> …
## $ governance_Unknown                                                     <dbl> …
## $ size_large                                                             <dbl> …
## $ size_medium                                                            <dbl> …
## $ size_small                                                             <dbl> …
## $ size_unknown                                                           <dbl> …
## $ size_provenance_aim_size_designation                                   <dbl> …
## $ size_provenance_babbidge_ewles_and_smith_2006                          <dbl> …
## $ size_provenance_domus                                                  <dbl> …
## $ size_provenance_ma.fam.                                                <dbl> …
## $ size_provenance_ma.fam._year_stated                                    <dbl> …
## $ size_provenance_ma.fam2.                                               <dbl> …
## $ size_provenance_mm                                                     <dbl> …
## $ size_provenance_mm.int                                                 <dbl> …
## $ size_provenance_mm_manual_estimate_2018                                <dbl> …
## $ size_provenance_mm_prediction_random_forest                            <dbl> …
## $ size_provenance_national_trust_annual_report_                          <dbl> …
## $ size_provenance_nilm.vn.                                               <dbl> …
## $ size_provenance_scottish_national_audit                                <dbl> …
## $ size_provenance_unknown                                                <dbl> …
## $ size_provenance_visitbritain                                           <dbl> …
## $ subject_matter_Local_Histories                                         <dbl> …
## $ subject_matter_Mixed.Encyclopaedic                                     <dbl> …
## $ subject_matter_Transport.Trains_and_railways                           <dbl> …
## $ subject_matter_other                                                   <dbl> …
## $ primary_provenance_of_data_aim                                         <dbl> …
## $ primary_provenance_of_data_aim82M                                      <dbl> …
## $ primary_provenance_of_data_aim82NM                                     <dbl> …
## $ primary_provenance_of_data_domus                                       <dbl> …
## $ primary_provenance_of_data_fcm                                         <dbl> …
## $ primary_provenance_of_data_hha                                         <dbl> …
## $ primary_provenance_of_data_hud                                         <dbl> …
## $ primary_provenance_of_data_mald                                        <dbl> …
## $ primary_provenance_of_data_MDN                                         <dbl> …
## $ primary_provenance_of_data_mgs                                         <dbl> …
## $ primary_provenance_of_data_misc                                        <dbl> …
## $ primary_provenance_of_data_Misc                                        <dbl> …
## $ primary_provenance_of_data_musassoc                                    <dbl> …
## $ primary_provenance_of_data_MusCal                                      <dbl> …
## $ primary_provenance_of_data_New                                         <dbl> …
## $ primary_provenance_of_data_nimc                                        <dbl> …
## $ primary_provenance_of_data_wiki                                        <dbl> …
## $ area_geodemographic_supergroup_Business.Education.and.Heritage.Centres <dbl> …
## $ area_geodemographic_supergroup_Countryside.Living                      <dbl> …
## $ area_geodemographic_supergroup_Ethnically.Diverse.Metropolitan.Living  <dbl> …
## $ area_geodemographic_supergroup_London.Cosmopolitan                     <dbl> …
## $ area_geodemographic_supergroup_Services.and.Industrial.Legacy          <dbl> …
## $ area_geodemographic_supergroup_Town.and.Country.Living                 <dbl> …
## $ area_geodemographic_supergroup_Urban.Settlements                       <dbl> …

Specify Model

library(usemodels)
model_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

model_workflow <- 
  workflow() %>% 
  add_recipe(data_rec) %>% 
  add_model(model_spec) 

doParallel::registerDoParallel()
set.seed(27358)
model_tune <-
  tune_grid(model_workflow, 
            resamples = data_folds,
            control = control_resamples(save_pred = TRUE))