museums <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
museums %>% skimr::skim()
Name | Piped data |
Number of rows | 4191 |
Number of columns | 35 |
_______________________ | |
Column type frequency: | |
character | 24 |
numeric | 11 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
museum_id | 0 | 1.00 | 8 | 15 | 0 | 4191 | 0 |
Name_of_museum | 0 | 1.00 | 3 | 76 | 0 | 4190 | 0 |
Address_line_1 | 441 | 0.89 | 3 | 61 | 0 | 3212 | 0 |
Address_line_2 | 2816 | 0.33 | 3 | 39 | 0 | 1167 | 0 |
Village,_Town_or_City | 4 | 1.00 | 3 | 24 | 0 | 1696 | 0 |
Postcode | 0 | 1.00 | 6 | 9 | 0 | 3918 | 0 |
Admin_area | 0 | 1.00 | 12 | 137 | 0 | 393 | 0 |
Accreditation | 0 | 1.00 | 10 | 12 | 0 | 2 | 0 |
Governance | 0 | 1.00 | 7 | 41 | 0 | 13 | 0 |
Size | 0 | 1.00 | 4 | 7 | 0 | 5 | 0 |
Size_provenance | 179 | 0.96 | 2 | 29 | 0 | 16 | 0 |
Subject_Matter | 0 | 1.00 | 5 | 45 | 0 | 114 | 0 |
Year_opened | 0 | 1.00 | 9 | 9 | 0 | 351 | 0 |
Year_closed | 0 | 1.00 | 9 | 9 | 0 | 170 | 0 |
DOMUS_Subject_Matter | 2788 | 0.33 | 5 | 27 | 0 | 21 | 0 |
Primary_provenance_of_data | 0 | 1.00 | 3 | 8 | 0 | 18 | 0 |
Identifier_used_in_primary_data_source | 2056 | 0.51 | 2 | 8 | 0 | 2134 | 0 |
Area_Geodemographic_group | 49 | 0.99 | 11 | 40 | 0 | 17 | 0 |
Area_Geodemographic_group_code | 49 | 0.99 | 3 | 3 | 0 | 16 | 0 |
Area_Geodemographic_subgroup | 49 | 0.99 | 12 | 39 | 0 | 25 | 0 |
Area_Geodemographic_subgroup_code | 49 | 0.99 | 4 | 4 | 0 | 24 | 0 |
Area_Geodemographic_supergroup | 49 | 0.99 | 16 | 39 | 0 | 8 | 0 |
Area_Geodemographic_supergroup_code | 49 | 0.99 | 2 | 2 | 0 | 8 | 0 |
Notes | 2980 | 0.29 | 12 | 751 | 0 | 956 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Latitude | 0 | 1.00 | 52.93 | 2.09 | 49.18 | 51.48 | 52.47 | 53.96 | 100.00 | ▇▁▁▁▁ |
Longitude | 0 | 1.00 | -1.96 | 1.84 | -8.09 | -3.10 | -1.87 | -0.48 | 1.76 | ▁▂▇▇▅ |
DOMUS_identifier | 2347 | 0.44 | 1303.45 | 1597.19 | 1.00 | 486.50 | 991.50 | 1470.25 | 7746.00 | ▇▂▁▁▁ |
Area_Deprivation_index | 49 | 0.99 | 5.44 | 2.48 | 1.00 | 4.00 | 5.00 | 7.00 | 10.00 | ▃▆▇▆▃ |
Area_Deprivation_index_crime | 49 | 0.99 | 5.43 | 3.07 | 1.00 | 3.00 | 6.00 | 8.00 | 10.00 | ▇▆▅▇▇ |
Area_Deprivation_index_education | 49 | 0.99 | 6.04 | 2.61 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▃▅▇▇▆ |
Area_Deprivation_index_employment | 49 | 0.99 | 6.08 | 2.76 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▅▆▇▇▇ |
Area_Deprivation_index_health | 49 | 0.99 | 6.02 | 2.82 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▅▆▆▇▇ |
Area_Deprivation_index_housing | 49 | 0.99 | 3.97 | 2.75 | 1.00 | 1.00 | 3.00 | 6.00 | 10.00 | ▇▅▃▂▂ |
Area_Deprivation_index_income | 49 | 0.99 | 5.99 | 2.62 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▃▆▇▇▆ |
Area_Deprivation_index_services | 49 | 0.99 | 4.78 | 3.01 | 1.00 | 2.00 | 4.00 | 7.00 | 10.00 | ▇▅▅▅▅ |
Notes about the dataset
**Village,_Town_or_City** (, can’t be used for variable name): clean variable names using janitor::clean_names()
Supergroup vs Group vs subgroup (e.g., Geodemographic_group and Geodemographic_subgroup): Keep only one, as they contain overlapping information. Retain the group variable if the subgroup does not provide additional valuable insights, and vice versa.
Variables with too many missing values: Remove the following:
notes
Address_line_2
DOMUS_identifier
DOMUS_subject_matter
Identifier_used_in_primary_data_source
Redundant Variables (e.g., Area_Geodemographic_group & Area_Geodemographic_group_code): Keep only one from each pair, as they contain the same information.
Location Variables (Address_line_1 through Admin_area): Select only one or two essential location variables, as these are redundant.
Year_opened, Year_closed: These variables contain two years separated by a colon, which requires clarification. Additionally, the value “9999” in Year_closed likely indicates “not closed.” Decide whether to keep these variables. If you keep them, consider data cleaning. For instance, one approach would be to retain only the first year from Year_opened.
Inconsistent white spaces between words (e.g., area_geodemographic_group has “Ethnically Diverse Metropolitan Living” and “Ethnically Diverse Metropolitan Living”): Revise the character variables so that they have one white space between words, using stringr::str_squish()
Character variables (These are not strings, like notes): Convert them to factors.
ID variables (museum_id and name_of_museum): Keep museum_id as name_of_museum has duplicates
library(janitor)
data_clean <- museums %>%
# Village,_Town_or_City (, can't be used for variable name): clean variable names
janitor::clean_names() %>%
# supergroup vs groups vs subgroups: run the model with either groups or subgroups and choose one based on the model performance
select(-area_geodemographic_group, -area_geodemographic_subgroup) %>%
# Variables with too many missing values: Remove
select(-notes, -address_line_2, -domus_identifier, -domus_subject_matter, -identifier_used_in_primary_data_source) %>%
# Redundant Variables (e.g., Area_Geodemographic_group & Area_Geodemographic_group_code):** Keep only one from each pair, as they contain the same information.
select(-ends_with("_code")) %>%
# Location Variables (Address_line_1 through Admin_area): Select only one
select(-address_line_1, -village_town_or_city, -postcode, -latitude, -longitude) %>%
# Extract provinces from admin_area
mutate(admin_area = admin_area %>% str_remove("\\/")) %>%
mutate(admin_area = admin_area %>% str_remove("\\/.*")) %>%
# Year_opened, Year_closed: Drop Year_closed and consider adding later
select(-year_closed) %>%
mutate(year_opened = year_opened %>% str_remove("\\:.*") %>% as.numeric()) %>%
# inconsistent white spaces between words (e.g., area_geodemographic_group has "Ethnically Diverse Metropolitan Living" and "Ethnically Diverse Metropolitan Living"): Revise the character variables so that they have one white space between words
mutate(across(where(is.character), str_squish)) %>%
# Character variables: Convert them to factor
mutate(across(where(is.character), factor)) %>%
# id variables (museum_id and name_of_museum): Keep museum_id
select(-name_of_museum) %>%
# Remove all rows with missing values
na.omit()
# Check whether the target variable is unbalanced
data_clean %>% count(accreditation)
## # A tibble: 2 × 2
## accreditation n
## <fct> <int>
## 1 Accredited 1703
## 2 Unaccredited 2267
# correlation funnel analysis
# Step 1: Prepare Data
data_binarized_tbl <- data_clean %>%
# Throw in all predictors except for id and strings
select(-museum_id) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 3,970
## Columns: 109
## $ admin_area__England <dbl> …
## $ admin_area__Northern_Ireland <dbl> …
## $ admin_area__Scotland <dbl> …
## $ admin_area__Wales <dbl> …
## $ `admin_area__-OTHER` <dbl> …
## $ accreditation__Accredited <dbl> …
## $ accreditation__Unaccredited <dbl> …
## $ `governance__Government-Local_Authority` <dbl> …
## $ `governance__Government-National` <dbl> …
## $ `governance__Independent-English_Heritage` <dbl> …
## $ `governance__Independent-National_Trust` <dbl> …
## $ `governance__Independent-Not_for_profit` <dbl> …
## $ `governance__Independent-Private` <dbl> …
## $ `governance__Independent-Unknown` <dbl> …
## $ governance__University <dbl> …
## $ governance__Unknown <dbl> …
## $ `governance__-OTHER` <dbl> …
## $ size__large <dbl> …
## $ size__medium <dbl> …
## $ size__small <dbl> …
## $ size__unknown <dbl> …
## $ `size__-OTHER` <dbl> …
## $ size_provenance__ace_size_designation <dbl> …
## $ size_provenance__aim_size_designation <dbl> …
## $ size_provenance__domus <dbl> …
## $ `size_provenance__ma(fam)` <dbl> …
## $ size_provenance__mm_manual_estimate_2018 <dbl> …
## $ size_provenance__mm_prediction_random_forest <dbl> …
## $ size_provenance__scottish_national_audit <dbl> …
## $ size_provenance__unknown <dbl> …
## $ size_provenance__visitbritain <dbl> …
## $ `size_provenance__-OTHER` <dbl> …
## $ `subject_matter__Archaeology-Roman` <dbl> …
## $ `subject_matter__Arts-Fine_and_decorative_arts` <dbl> …
## $ `subject_matter__Buildings-Houses-Large_houses` <dbl> …
## $ `subject_matter__Buildings-Houses-Medium_houses` <dbl> …
## $ `subject_matter__Industry_and_manufacture-Mining_and_quarrying` <dbl> …
## $ `subject_matter__Leisure_and_sport-Toys_and_models` <dbl> …
## $ subject_matter__Local_Histories <dbl> …
## $ `subject_matter__Mixed-Encyclopaedic` <dbl> …
## $ `subject_matter__Mixed-Other` <dbl> …
## $ subject_matter__Other <dbl> …
## $ `subject_matter__Personality-Literary` <dbl> …
## $ `subject_matter__Rural_Industry-Farming` <dbl> …
## $ `subject_matter__Sea_and_seafaring-Boats_and_ships` <dbl> …
## $ `subject_matter__Sea_and_seafaring-Mixed` <dbl> …
## $ `subject_matter__Transport-Cars_and_motorbikes` <dbl> …
## $ `subject_matter__Transport-Trains_and_railways` <dbl> …
## $ `subject_matter__War_and_conflict-Airforce` <dbl> …
## $ `subject_matter__War_and_conflict-Castles_and_forts` <dbl> …
## $ `subject_matter__War_and_conflict-Military` <dbl> …
## $ `subject_matter__War_and_conflict-Regiment` <dbl> …
## $ `subject_matter__-OTHER` <dbl> …
## $ `year_opened__-Inf_1957` <dbl> …
## $ year_opened__1957_1977 <dbl> …
## $ year_opened__1977_1992 <dbl> …
## $ year_opened__1992_Inf <dbl> …
## $ primary_provenance_of_data__ace <dbl> …
## $ primary_provenance_of_data__aim <dbl> …
## $ primary_provenance_of_data__aim82M <dbl> …
## $ primary_provenance_of_data__aim82NM <dbl> …
## $ primary_provenance_of_data__domus <dbl> …
## $ primary_provenance_of_data__fcm <dbl> …
## $ primary_provenance_of_data__hha <dbl> …
## $ primary_provenance_of_data__mald <dbl> …
## $ primary_provenance_of_data__mgs <dbl> …
## $ primary_provenance_of_data__misc <dbl> …
## $ primary_provenance_of_data__musassoc <dbl> …
## $ primary_provenance_of_data__wiki <dbl> …
## $ `primary_provenance_of_data__-OTHER` <dbl> …
## $ `area_deprivation_index__-Inf_4` <dbl> …
## $ area_deprivation_index__4_5 <dbl> …
## $ area_deprivation_index__5_7 <dbl> …
## $ area_deprivation_index__7_Inf <dbl> …
## $ `area_deprivation_index_crime__-Inf_3` <dbl> …
## $ area_deprivation_index_crime__3_6 <dbl> …
## $ area_deprivation_index_crime__6_8 <dbl> …
## $ area_deprivation_index_crime__8_Inf <dbl> …
## $ `area_deprivation_index_education__-Inf_4` <dbl> …
## $ area_deprivation_index_education__4_6 <dbl> …
## $ area_deprivation_index_education__6_8 <dbl> …
## $ area_deprivation_index_education__8_Inf <dbl> …
## $ `area_deprivation_index_employment__-Inf_4` <dbl> …
## $ area_deprivation_index_employment__4_6 <dbl> …
## $ area_deprivation_index_employment__6_8 <dbl> …
## $ area_deprivation_index_employment__8_Inf <dbl> …
## $ `area_deprivation_index_health__-Inf_4` <dbl> …
## $ area_deprivation_index_health__4_6 <dbl> …
## $ area_deprivation_index_health__6_8 <dbl> …
## $ area_deprivation_index_health__8_Inf <dbl> …
## $ `area_deprivation_index_housing__-Inf_3` <dbl> …
## $ area_deprivation_index_housing__3_6 <dbl> …
## $ area_deprivation_index_housing__6_Inf <dbl> …
## $ `area_deprivation_index_income__-Inf_4` <dbl> …
## $ area_deprivation_index_income__4_6 <dbl> …
## $ area_deprivation_index_income__6_8 <dbl> …
## $ area_deprivation_index_income__8_Inf <dbl> …
## $ `area_deprivation_index_services__-Inf_2` <dbl> …
## $ area_deprivation_index_services__2_4 <dbl> …
## $ area_deprivation_index_services__4_8 <dbl> …
## $ area_deprivation_index_services__8_Inf <dbl> …
## $ area_geodemographic_supergroup__Affluent_England <dbl> …
## $ area_geodemographic_supergroup__Business_Education_and_Heritage_Centres <dbl> …
## $ area_geodemographic_supergroup__Countryside_Living <dbl> …
## $ area_geodemographic_supergroup__Ethnically_Diverse_Metropolitan_Living <dbl> …
## $ area_geodemographic_supergroup__London_Cosmopolitan <dbl> …
## $ area_geodemographic_supergroup__Services_and_Industrial_Legacy <dbl> …
## $ area_geodemographic_supergroup__Town_and_Country_Living <dbl> …
## $ area_geodemographic_supergroup__Urban_Settlements <dbl> …
# Step 2: Correlate to the Target
data_corr_tbl <- data_binarized_tbl %>%
correlate(accreditation__Accredited)
data_corr_tbl
## # A tibble: 109 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 accreditation Accredited 1
## 2 accreditation Unaccredited -1
## 3 primary_provenance_of_data domus 0.571
## 4 size_provenance mm_prediction_random_forest -0.516
## 5 size_provenance domus 0.458
## 6 governance Independent-Private -0.387
## 7 size small -0.377
## 8 primary_provenance_of_data wiki -0.295
## 9 primary_provenance_of_data ace 0.261
## 10 size large 0.256
## # ℹ 99 more rows
# Step 3: Plot
data_corr_tbl %>%
plot_correlation_funnel()
Split Data
# Start with small data until you are sure your code has no errors
data_clean <- data_clean %>% sample_n(100)
library(tidymodels)
set.seed(123)
data_split <- initial_split(data_clean, strata = accreditation)
data_train <- training(data_split)
data_test <- testing(data_split)
set.seed(234)
data_folds <- bootstraps(data_train, strata = accreditation)
data_folds
## # Bootstrap sampling using stratification
## # A tibble: 25 × 2
## splits id
## <list> <chr>
## 1 <split [75/28]> Bootstrap01
## 2 <split [75/30]> Bootstrap02
## 3 <split [75/24]> Bootstrap03
## 4 <split [75/27]> Bootstrap04
## 5 <split [75/30]> Bootstrap05
## 6 <split [75/27]> Bootstrap06
## 7 <split [75/22]> Bootstrap07
## 8 <split [75/31]> Bootstrap08
## 9 <split [75/29]> Bootstrap09
## 10 <split [75/29]> Bootstrap10
## # ℹ 15 more rows
Preprocess Data Using Recipes Package
# Convert all nominal predictors (characters and factors) to numbers.
data_rec <-
recipe(accreditation ~ ., data = data_train) %>%
update_role(museum_id, new_role = "id") %>%
step_other(subject_matter) %>%
step_dummy(all_nominal_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_zv()
data_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 75
## Columns: 76
## $ museum_id <fct> …
## $ year_opened <dbl> …
## $ area_deprivation_index <dbl> …
## $ area_deprivation_index_crime <dbl> …
## $ area_deprivation_index_education <dbl> …
## $ area_deprivation_index_employment <dbl> …
## $ area_deprivation_index_health <dbl> …
## $ area_deprivation_index_housing <dbl> …
## $ area_deprivation_index_income <dbl> …
## $ area_deprivation_index_services <dbl> …
## $ accreditation <fct> …
## $ admin_area_England <dbl> …
## $ admin_area_Isle.of.Man <dbl> …
## $ admin_area_Northern.Ireland <dbl> …
## $ admin_area_Scotland <dbl> …
## $ admin_area_Wales <dbl> …
## $ governance_Government.Local_Authority <dbl> …
## $ governance_Government.National <dbl> …
## $ governance_Government.Other <dbl> …
## $ governance_Independent.English_Heritage <dbl> …
## $ governance_Independent.Historic_Environment_Scotland <dbl> …
## $ governance_Independent.National_Trust <dbl> …
## $ governance_Independent.National_Trust_for_Scotland <dbl> …
## $ governance_Independent.Not_for_profit <dbl> …
## $ governance_Independent.Private <dbl> …
## $ governance_Independent.Unknown <dbl> …
## $ governance_University <dbl> …
## $ governance_Unknown <dbl> …
## $ size_large <dbl> …
## $ size_medium <dbl> …
## $ size_small <dbl> …
## $ size_unknown <dbl> …
## $ size_provenance_aim_size_designation <dbl> …
## $ size_provenance_babbidge_ewles_and_smith_2006 <dbl> …
## $ size_provenance_domus <dbl> …
## $ size_provenance_ma.fam. <dbl> …
## $ size_provenance_ma.fam._year_stated <dbl> …
## $ size_provenance_ma.fam2. <dbl> …
## $ size_provenance_mm <dbl> …
## $ size_provenance_mm.int <dbl> …
## $ size_provenance_mm_manual_estimate_2018 <dbl> …
## $ size_provenance_mm_prediction_random_forest <dbl> …
## $ size_provenance_national_trust_annual_report_ <dbl> …
## $ size_provenance_nilm.vn. <dbl> …
## $ size_provenance_scottish_national_audit <dbl> …
## $ size_provenance_unknown <dbl> …
## $ size_provenance_visitbritain <dbl> …
## $ subject_matter_Buildings.Houses.Large_houses <dbl> …
## $ subject_matter_Local_Histories <dbl> …
## $ subject_matter_Personality.Literary <dbl> …
## $ subject_matter_War_and_conflict.Regiment <dbl> …
## $ subject_matter_other <dbl> …
## $ primary_provenance_of_data_aim <dbl> …
## $ primary_provenance_of_data_aim82M <dbl> …
## $ primary_provenance_of_data_aim82NM <dbl> …
## $ primary_provenance_of_data_domus <dbl> …
## $ primary_provenance_of_data_fcm <dbl> …
## $ primary_provenance_of_data_hha <dbl> …
## $ primary_provenance_of_data_hud <dbl> …
## $ primary_provenance_of_data_mald <dbl> …
## $ primary_provenance_of_data_MDN <dbl> …
## $ primary_provenance_of_data_mgs <dbl> …
## $ primary_provenance_of_data_misc <dbl> …
## $ primary_provenance_of_data_Misc <dbl> …
## $ primary_provenance_of_data_musassoc <dbl> …
## $ primary_provenance_of_data_MusCal <dbl> …
## $ primary_provenance_of_data_New <dbl> …
## $ primary_provenance_of_data_nimc <dbl> …
## $ primary_provenance_of_data_wiki <dbl> …
## $ area_geodemographic_supergroup_Business.Education.and.Heritage.Centres <dbl> …
## $ area_geodemographic_supergroup_Countryside.Living <dbl> …
## $ area_geodemographic_supergroup_Ethnically.Diverse.Metropolitan.Living <dbl> …
## $ area_geodemographic_supergroup_London.Cosmopolitan <dbl> …
## $ area_geodemographic_supergroup_Services.and.Industrial.Legacy <dbl> …
## $ area_geodemographic_supergroup_Town.and.Country.Living <dbl> …
## $ area_geodemographic_supergroup_Urban.Settlements <dbl> …
Specify Model
library(usemodels)
model_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")
model_workflow <-
workflow() %>%
add_recipe(data_rec) %>%
add_model(model_spec)
doParallel::registerDoParallel()
set.seed(27358)
model_tune <-
tune_grid(model_workflow,
resamples = data_folds,
control = control_resamples(save_pred = TRUE))
collect_metrics(model_tune)
## # A tibble: 30 × 10
## trees min_n tree_depth learn_rate .metric .estimator mean n std_err
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 6 2 5 0.00648 accuracy binary 0.694 25 0.0178
## 2 6 2 5 0.00648 brier_class binary 0.244 25 0.000320
## 3 6 2 5 0.00648 roc_auc binary 0.794 25 0.0162
## 4 1765 8 1 0.252 accuracy binary 0.555 25 0.0207
## 5 1765 8 1 0.252 brier_class binary 0.260 25 0.00622
## 6 1765 8 1 0.252 roc_auc binary 0.556 25 0.0224
## 7 257 10 7 0.0840 accuracy binary 0.555 25 0.00934
## 8 257 10 7 0.0840 brier_class binary 0.247 25 0.00112
## 9 257 10 7 0.0840 roc_auc binary 0.5 25 0
## 10 1867 15 14 0.00279 accuracy binary 0.555 25 0.00934
## # ℹ 20 more rows
## # ℹ 1 more variable: .config <chr>
collect_predictions(model_tune) %>%
group_by(id) %>%
roc_curve(accreditation, .pred_Accredited) %>%
autoplot()
final_fitted <- model_workflow %>%
# Fit the best model to the train data
finalize_workflow(select_best(model_tune, metric = "accuracy")) %>%
# Evaluate on the test data
last_fit(data_split)
collect_metrics(final_fitted)
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.52 Preprocessor1_Model1
## 2 roc_auc binary 0.675 Preprocessor1_Model1
## 3 brier_class binary 0.247 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
conf_mat(accreditation, .pred_class) %>%
autoplot()
library(vip)
final_fitted %>%
extract_fit_engine() %>%
vip()