library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ correlationfunnel Tip #1 ════════════════════════════════════════════════════
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)
# Import Data
# Read the TidyTuesday (2022-11-22) museums CSV directly from GitHub.
museums <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv"
)
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Summarise every column of the raw data (type, missingness, distribution)
# to spot columns that need cleaning before modelling.
skimr::skim(museums)
Data summary
Name museums
Number of rows 4191
Number of columns 35
_______________________
Column type frequency:
character 24
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
museum_id 0 1.00 8 15 0 4191 0
Name_of_museum 0 1.00 3 76 0 4190 0
Address_line_1 441 0.89 3 61 0 3212 0
Address_line_2 2816 0.33 3 39 0 1167 0
Village,_Town_or_City 4 1.00 3 24 0 1696 0
Postcode 0 1.00 6 9 0 3918 0
Admin_area 0 1.00 12 137 0 393 0
Accreditation 0 1.00 10 12 0 2 0
Governance 0 1.00 7 41 0 13 0
Size 0 1.00 4 7 0 5 0
Size_provenance 179 0.96 2 29 0 16 0
Subject_Matter 0 1.00 5 45 0 114 0
Year_opened 0 1.00 9 9 0 351 0
Year_closed 0 1.00 9 9 0 170 0
DOMUS_Subject_Matter 2788 0.33 5 27 0 21 0
Primary_provenance_of_data 0 1.00 3 8 0 18 0
Identifier_used_in_primary_data_source 2056 0.51 2 8 0 2134 0
Area_Geodemographic_group 49 0.99 11 40 0 17 0
Area_Geodemographic_group_code 49 0.99 3 3 0 16 0
Area_Geodemographic_subgroup 49 0.99 12 39 0 25 0
Area_Geodemographic_subgroup_code 49 0.99 4 4 0 24 0
Area_Geodemographic_supergroup 49 0.99 16 39 0 8 0
Area_Geodemographic_supergroup_code 49 0.99 2 2 0 8 0
Notes 2980 0.29 12 751 0 956 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Latitude 0 1.00 52.93 2.09 49.18 51.48 52.47 53.96 100.00 ▇▁▁▁▁
Longitude 0 1.00 -1.96 1.84 -8.09 -3.10 -1.87 -0.48 1.76 ▁▂▇▇▅
DOMUS_identifier 2347 0.44 1303.45 1597.19 1.00 486.50 991.50 1470.25 7746.00 ▇▂▁▁▁
Area_Deprivation_index 49 0.99 5.44 2.48 1.00 4.00 5.00 7.00 10.00 ▃▆▇▆▃
Area_Deprivation_index_crime 49 0.99 5.43 3.07 1.00 3.00 6.00 8.00 10.00 ▇▆▅▇▇
Area_Deprivation_index_education 49 0.99 6.04 2.61 1.00 4.00 6.00 8.00 10.00 ▃▅▇▇▆
Area_Deprivation_index_employment 49 0.99 6.08 2.76 1.00 4.00 6.00 8.00 10.00 ▅▆▇▇▇
Area_Deprivation_index_health 49 0.99 6.02 2.82 1.00 4.00 6.00 8.00 10.00 ▅▆▆▇▇
Area_Deprivation_index_housing 49 0.99 3.97 2.75 1.00 1.00 3.00 6.00 10.00 ▇▅▃▂▂
Area_Deprivation_index_income 49 0.99 5.99 2.62 1.00 4.00 6.00 8.00 10.00 ▃▆▇▇▆
Area_Deprivation_index_services 49 0.99 4.78 3.01 1.00 2.00 4.00 7.00 10.00 ▇▅▅▅▅

Issues with the data:

- Missing values: Address_line_2, Address_line_1, DOMUS_Subject_Matter, DOMUS_identifier, Notes
- Factors or numeric variables
- Zero-variance variables
- Character variables
- Unbalanced target variable
- ID variable: museum_id

# Class balance of the target variable.
count(museums, Accreditation)
## # A tibble: 2 × 2
##   Accreditation     n
##   <chr>         <int>
## 1 Accredited     1720
## 2 Unaccredited   2471
# Bar chart of the number of museums in each accreditation class.
ggplot(museums, aes(x = Accreditation)) +
  geom_bar()

# Build the modelling data set:
#   1. drop the address lines, DOMUS fields, notes, source identifier,
#      geodemographic code columns and the museum_id row identifier;
#   2. keep only rows with no missing values in the remaining columns;
#   3. convert column names to snake_case.
data <- museums %>%
  select(
    -c(
      Address_line_1,
      Address_line_2,
      DOMUS_Subject_Matter,
      DOMUS_identifier,
      Notes,
      Identifier_used_in_primary_data_source,
      Area_Geodemographic_supergroup_code,
      Area_Geodemographic_group_code,
      Area_Geodemographic_subgroup_code,
      museum_id
    )
  ) %>%
  na.omit() %>%
  janitor::clean_names()

# Re-check the cleaned data: confirm no missing values remain and review
# the cardinality of the character columns.
skimr::skim(data)
Data summary
Name data
Number of rows 3966
Number of columns 25
_______________________
Column type frequency:
character 15
numeric 10
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name_of_museum 0 1 3 76 0 3965 0
village_town_or_city 0 1 3 24 0 1639 0
postcode 0 1 6 9 0 3725 0
admin_area 0 1 16 137 0 392 0
accreditation 0 1 10 12 0 2 0
governance 0 1 7 41 0 13 0
size 0 1 4 7 0 5 0
size_provenance 0 1 2 29 0 16 0
subject_matter 0 1 5 45 0 112 0
year_opened 0 1 9 9 0 334 0
year_closed 0 1 9 9 0 159 0
primary_provenance_of_data 0 1 3 8 0 17 0
area_geodemographic_group 0 1 11 40 0 17 0
area_geodemographic_subgroup 0 1 12 39 0 25 0
area_geodemographic_supergroup 0 1 16 39 0 8 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
latitude 0 1 52.93 1.95 49.20 51.48 52.46 53.94 60.79 ▅▇▃▁▁
longitude 0 1 -1.94 1.83 -8.09 -3.09 -1.86 -0.47 1.76 ▁▂▇▇▅
area_deprivation_index 0 1 5.46 2.48 1.00 4.00 5.00 7.00 10.00 ▃▆▇▆▃
area_deprivation_index_crime 0 1 5.43 3.07 1.00 3.00 6.00 8.00 10.00 ▇▆▅▆▇
area_deprivation_index_education 0 1 6.05 2.61 1.00 4.00 6.00 8.00 10.00 ▃▅▇▇▆
area_deprivation_index_employment 0 1 6.08 2.77 1.00 4.00 6.00 8.00 10.00 ▅▆▇▇▇
area_deprivation_index_health 0 1 6.02 2.82 1.00 4.00 6.00 8.00 10.00 ▅▆▆▇▇
area_deprivation_index_housing 0 1 3.99 2.76 1.00 1.00 3.00 6.00 10.00 ▇▅▃▃▂
area_deprivation_index_income 0 1 6.00 2.63 1.00 4.00 6.00 8.00 10.00 ▃▆▇▇▆
area_deprivation_index_services 0 1 4.79 3.01 1.00 2.00 4.00 8.00 10.00 ▇▅▅▅▅
# Compare deprivation-index distributions between accreditation classes.

# Employment deprivation index by accreditation status.
ggplot(data, aes(x = accreditation, y = area_deprivation_index_employment)) +
  geom_boxplot()

# Crime deprivation index by accreditation status.
ggplot(data, aes(x = accreditation, y = area_deprivation_index_crime)) +
  geom_boxplot()

# Health deprivation index by accreditation status.
ggplot(data, aes(x = accreditation, y = area_deprivation_index_health)) +
  geom_boxplot()

# Convert every feature except the museum name into binary indicator
# columns, the input format expected by correlate().
data_binarized <- binarize(select(data, -name_of_museum))

# List the generated indicator columns.
glimpse(data_binarized)
## Rows: 3,966
## Columns: 196
## $ village_town_or_city__Edinburgh                                             <dbl> …
## $ village_town_or_city__London                                                <dbl> …
## $ `village_town_or_city__-OTHER`                                              <dbl> …
## $ postcode__SO23_8TS                                                          <dbl> …
## $ `postcode__-OTHER`                                                          <dbl> …
## $ `latitude__-Inf_51.48120725`                                                <dbl> …
## $ latitude__51.48120725_52.4554475                                            <dbl> …
## $ latitude__52.4554475_53.9431025                                             <dbl> …
## $ latitude__53.9431025_Inf                                                    <dbl> …
## $ `longitude__-Inf_-3.0854455`                                                <dbl> …
## $ `longitude__-3.0854455_-1.8564615`                                          <dbl> …
## $ `longitude__-1.8564615_-0.469075`                                           <dbl> …
## $ `longitude__-0.469075_Inf`                                                  <dbl> …
## $ `admin_area__/England/London_(English_Region)/Westminster_(London_Borough)` <dbl> …
## $ `admin_area__/England/South_West_(English_Region)/Cornwall_(English_UA)`    <dbl> …
## $ `admin_area__/England/South_West_(English_Region)/Wiltshire_(English_UA)`   <dbl> …
## $ `admin_area__/Scotland/City_of_Edinburgh_(Scottish_Council_Area)`           <dbl> …
## $ `admin_area__/Scotland/Dumfries_and_Galloway_(Scottish_Council_Area)`       <dbl> …
## $ `admin_area__/Scotland/Highland_(Scottish_Council_Area)`                    <dbl> …
## $ `admin_area__-OTHER`                                                        <dbl> …
## $ accreditation__Accredited                                                   <dbl> …
## $ accreditation__Unaccredited                                                 <dbl> …
## $ `governance__Government-Local_Authority`                                    <dbl> …
## $ `governance__Government-National`                                           <dbl> …
## $ `governance__Independent-English_Heritage`                                  <dbl> …
## $ `governance__Independent-National_Trust`                                    <dbl> …
## $ `governance__Independent-Not_for_profit`                                    <dbl> …
## $ `governance__Independent-Private`                                           <dbl> …
## $ `governance__Independent-Unknown`                                           <dbl> …
## $ governance__University                                                      <dbl> …
## $ governance__Unknown                                                         <dbl> …
## $ `governance__-OTHER`                                                        <dbl> …
## $ size__large                                                                 <dbl> …
## $ size__medium                                                                <dbl> …
## $ size__small                                                                 <dbl> …
## $ size__unknown                                                               <dbl> …
## $ `size__-OTHER`                                                              <dbl> …
## $ size_provenance__ace_size_designation                                       <dbl> …
## $ size_provenance__aim_size_designation                                       <dbl> …
## $ size_provenance__domus                                                      <dbl> …
## $ `size_provenance__ma(fam)`                                                  <dbl> …
## $ size_provenance__mm_manual_estimate_2018                                    <dbl> …
## $ size_provenance__mm_prediction_random_forest                                <dbl> …
## $ size_provenance__scottish_national_audit                                    <dbl> …
## $ size_provenance__unknown                                                    <dbl> …
## $ size_provenance__visitbritain                                               <dbl> …
## $ `size_provenance__-OTHER`                                                   <dbl> …
## $ `subject_matter__Archaeology-Roman`                                         <dbl> …
## $ `subject_matter__Arts-Fine_and_decorative_arts`                             <dbl> …
## $ `subject_matter__Buildings-Houses-Large_houses`                             <dbl> …
## $ `subject_matter__Buildings-Houses-Medium_houses`                            <dbl> …
## $ `subject_matter__Industry_and_manufacture-Mining_and_quarrying`             <dbl> …
## $ `subject_matter__Leisure_and_sport-Toys_and_models`                         <dbl> …
## $ subject_matter__Local_Histories                                             <dbl> …
## $ `subject_matter__Mixed-Encyclopaedic`                                       <dbl> …
## $ `subject_matter__Mixed-Other`                                               <dbl> …
## $ subject_matter__Other                                                       <dbl> …
## $ `subject_matter__Personality-Literary`                                      <dbl> …
## $ `subject_matter__Rural_Industry-Farming`                                    <dbl> …
## $ `subject_matter__Sea_and_seafaring-Boats_and_ships`                         <dbl> …
## $ `subject_matter__Sea_and_seafaring-Mixed`                                   <dbl> …
## $ `subject_matter__Transport-Cars_and_motorbikes`                             <dbl> …
## $ `subject_matter__Transport-Trains_and_railways`                             <dbl> …
## $ `subject_matter__War_and_conflict-Airforce`                                 <dbl> …
## $ `subject_matter__War_and_conflict-Castles_and_forts`                        <dbl> …
## $ `subject_matter__War_and_conflict-Military`                                 <dbl> …
## $ `subject_matter__War_and_conflict-Regiment`                                 <dbl> …
## $ `subject_matter__-OTHER`                                                    <dbl> …
## $ `year_opened__1945:1960`                                                    <dbl> …
## $ `year_opened__1960:2017`                                                    <dbl> …
## $ `year_opened__1972:1972`                                                    <dbl> …
## $ `year_opened__1973:1973`                                                    <dbl> …
## $ `year_opened__1974:1974`                                                    <dbl> …
## $ `year_opened__1975:1975`                                                    <dbl> …
## $ `year_opened__1976:1976`                                                    <dbl> …
## $ `year_opened__1977:1977`                                                    <dbl> …
## $ `year_opened__1978:1978`                                                    <dbl> …
## $ `year_opened__1979:1979`                                                    <dbl> …
## $ `year_opened__1980:1980`                                                    <dbl> …
## $ `year_opened__1981:1981`                                                    <dbl> …
## $ `year_opened__1982:1982`                                                    <dbl> …
## $ `year_opened__1983:1983`                                                    <dbl> …
## $ `year_opened__1984:1984`                                                    <dbl> …
## $ `year_opened__1985:1985`                                                    <dbl> …
## $ `year_opened__1986:1986`                                                    <dbl> …
## $ `year_opened__1987:1987`                                                    <dbl> …
## $ `year_opened__1988:1988`                                                    <dbl> …
## $ `year_opened__1989:1989`                                                    <dbl> …
## $ `year_opened__1990:1990`                                                    <dbl> …
## $ `year_opened__1991:1991`                                                    <dbl> …
## $ `year_opened__1992:1992`                                                    <dbl> …
## $ `year_opened__1993:1993`                                                    <dbl> …
## $ `year_opened__1994:1994`                                                    <dbl> …
## $ `year_opened__1995:1995`                                                    <dbl> …
## $ `year_opened__1996:1996`                                                    <dbl> …
## $ `year_opened__1997:1997`                                                    <dbl> …
## $ `year_opened__1999:1999`                                                    <dbl> …
## $ `year_opened__2000:2000`                                                    <dbl> …
## $ `year_opened__2001:2001`                                                    <dbl> …
## $ `year_opened__2005:2005`                                                    <dbl> …
## $ `year_opened__-OTHER`                                                       <dbl> …
## $ `year_closed__9999:9999`                                                    <dbl> …
## $ `year_closed__-OTHER`                                                       <dbl> …
## $ primary_provenance_of_data__ace                                             <dbl> …
## $ primary_provenance_of_data__aim                                             <dbl> …
## $ primary_provenance_of_data__aim82M                                          <dbl> …
## $ primary_provenance_of_data__aim82NM                                         <dbl> …
## $ primary_provenance_of_data__domus                                           <dbl> …
## $ primary_provenance_of_data__fcm                                             <dbl> …
## $ primary_provenance_of_data__hha                                             <dbl> …
## $ primary_provenance_of_data__mald                                            <dbl> …
## $ primary_provenance_of_data__mgs                                             <dbl> …
## $ primary_provenance_of_data__misc                                            <dbl> …
## $ primary_provenance_of_data__musassoc                                        <dbl> …
## $ primary_provenance_of_data__wiki                                            <dbl> …
## $ `primary_provenance_of_data__-OTHER`                                        <dbl> …
## $ `area_deprivation_index__-Inf_4`                                            <dbl> …
## $ area_deprivation_index__4_5                                                 <dbl> …
## $ area_deprivation_index__5_7                                                 <dbl> …
## $ area_deprivation_index__7_Inf                                               <dbl> …
## $ `area_deprivation_index_crime__-Inf_3`                                      <dbl> …
## $ area_deprivation_index_crime__3_6                                           <dbl> …
## $ area_deprivation_index_crime__6_8                                           <dbl> …
## $ area_deprivation_index_crime__8_Inf                                         <dbl> …
## $ `area_deprivation_index_education__-Inf_4`                                  <dbl> …
## $ area_deprivation_index_education__4_6                                       <dbl> …
## $ area_deprivation_index_education__6_8                                       <dbl> …
## $ area_deprivation_index_education__8_Inf                                     <dbl> …
## $ `area_deprivation_index_employment__-Inf_4`                                 <dbl> …
## $ area_deprivation_index_employment__4_6                                      <dbl> …
## $ area_deprivation_index_employment__6_8                                      <dbl> …
## $ area_deprivation_index_employment__8_Inf                                    <dbl> …
## $ `area_deprivation_index_health__-Inf_4`                                     <dbl> …
## $ area_deprivation_index_health__4_6                                          <dbl> …
## $ area_deprivation_index_health__6_8                                          <dbl> …
## $ area_deprivation_index_health__8_Inf                                        <dbl> …
## $ `area_deprivation_index_housing__-Inf_3`                                    <dbl> …
## $ area_deprivation_index_housing__3_6                                         <dbl> …
## $ area_deprivation_index_housing__6_Inf                                       <dbl> …
## $ `area_deprivation_index_income__-Inf_4`                                     <dbl> …
## $ area_deprivation_index_income__4_6                                          <dbl> …
## $ area_deprivation_index_income__6_8                                          <dbl> …
## $ area_deprivation_index_income__8_Inf                                        <dbl> …
## $ `area_deprivation_index_services__-Inf_2`                                   <dbl> …
## $ area_deprivation_index_services__2_4                                        <dbl> …
## $ area_deprivation_index_services__4_8                                        <dbl> …
## $ area_deprivation_index_services__8_Inf                                      <dbl> …
## $ area_geodemographic_group__Country_Living                                   <dbl> …
## $ area_geodemographic_group__English_and_Welsh_Countryside                    <dbl> …
## $ area_geodemographic_group__Ethnically_Diverse_Metropolitan_Living           <dbl> …
## $ area_geodemographic_group__Larger_Towns_and_Cities                          <dbl> …
## $ area_geodemographic_group__London_Cosmopolitan                              <dbl> …
## $ area_geodemographic_group__Manufacturing_Traits                             <dbl> …
## $ area_geodemographic_group__Northern_Ireland_Countryside                     <dbl> …
## $ area_geodemographic_group__Remoter_Coastal_Living                           <dbl> …
## $ `area_geodemographic_group__Rural-Urban_Fringe`                             <dbl> …
## $ area_geodemographic_group__Scottish_Countryside                             <dbl> …
## $ area_geodemographic_group__Scottish_Industrial_Heritage                     <dbl> …
## $ area_geodemographic_group__Services_Manufacturing_and_Mining_Legacy         <dbl> …
## $ area_geodemographic_group__Suburban_Traits                                  <dbl> …
## $ area_geodemographic_group__Thriving_Rural                                   <dbl> …
## $ area_geodemographic_group__Town_Living                                      <dbl> …
## $ area_geodemographic_group__University_Towns_and_Cities                      <dbl> …
## $ `area_geodemographic_group__-OTHER`                                         <dbl> …
## $ area_geodemographic_subgroup__Affluent_rural                                <dbl> …
## $ area_geodemographic_subgroup__Ageing_Coastal_Living                         <dbl> …
## $ area_geodemographic_subgroup__City_Periphery                                <dbl> …
## $ area_geodemographic_subgroup__Country_Living                                <dbl> …
## $ area_geodemographic_subgroup__Ethnically_Diverse_Metropolitan_Living        <dbl> …
## $ area_geodemographic_subgroup__Expanding_Areas                               <dbl> …
## $ `area_geodemographic_subgroup__Industrial_and_Multi-ethnic`                 <dbl> …
## $ area_geodemographic_subgroup__Larger_Towns_and_Cities                       <dbl> …
## $ area_geodemographic_subgroup__London_Cosmopolitan                           <dbl> …
## $ area_geodemographic_subgroup__Manufacturing_Legacy                          <dbl> …
## $ area_geodemographic_subgroup__Mining_Legacy                                 <dbl> …
## $ area_geodemographic_subgroup__Northern_Ireland_Countryside                  <dbl> …
## $ area_geodemographic_subgroup__Older_Farming_Communities                     <dbl> …
## $ area_geodemographic_subgroup__Prosperous_Towns                              <dbl> …
## $ area_geodemographic_subgroup__Rural_Growth_Areas                            <dbl> …
## $ `area_geodemographic_subgroup__Rural-Urban_Fringe`                          <dbl> …
## $ area_geodemographic_subgroup__Scottish_Countryside                          <dbl> …
## $ area_geodemographic_subgroup__Scottish_Industrial_Legacy                    <dbl> …
## $ area_geodemographic_subgroup__Seaside_Living                                <dbl> …
## $ area_geodemographic_subgroup__Service_Economy                               <dbl> …
## $ area_geodemographic_subgroup__Sparse_English_and_Welsh_Countryside          <dbl> …
## $ area_geodemographic_subgroup__University_Towns_and_Cities                   <dbl> …
## $ area_geodemographic_subgroup__Urban_Living                                  <dbl> …
## $ `area_geodemographic_subgroup__-OTHER`                                      <dbl> …
## $ area_geodemographic_supergroup__Affluent_England                            <dbl> …
## $ area_geodemographic_supergroup__Business_Education_and_Heritage_Centres     <dbl> …
## $ area_geodemographic_supergroup__Countryside_Living                          <dbl> …
## $ area_geodemographic_supergroup__Ethnically_Diverse_Metropolitan_Living      <dbl> …
## $ area_geodemographic_supergroup__London_Cosmopolitan                         <dbl> …
## $ area_geodemographic_supergroup__Services_and_Industrial_Legacy              <dbl> …
## $ area_geodemographic_supergroup__Town_and_Country_Living                     <dbl> …
## $ area_geodemographic_supergroup__Urban_Settlements                           <dbl> …
# Correlate every binary feature with the "Accredited" indicator.
data_correlate <- correlate(
  data_binarized,
  target = accreditation__Accredited
)

# Visualise the correlations as a funnel plot.
correlationfunnel::plot_correlation_funnel(data_correlate)
## Warning: ggrepel: 188 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Model

Split Data

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.1
## ✔ dials        1.2.1     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'recipes' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(usemodels)

# Fix the RNG state so the subsample, split and folds are reproducible.
set.seed(1123)

# Work on a random subsample of 100 museums.
data_clean <- sample_n(data, 100)

# Hold out a test set using initial_split()'s default proportion.
data_split <- initial_split(data_clean)
data_test <- testing(data_split)
data_train <- training(data_split)

# Cross-validation folds on the training set (vfold_cv() defaults).
data_cv <- rsample::vfold_cv(data_train)

data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [67/8]> Fold01
##  2 <split [67/8]> Fold02
##  3 <split [67/8]> Fold03
##  4 <split [67/8]> Fold04
##  5 <split [67/8]> Fold05
##  6 <split [68/7]> Fold06
##  7 <split [68/7]> Fold07
##  8 <split [68/7]> Fold08
##  9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10

Preprocess data

library(themis)
library(textrecipes)

# Review column types and cardinality again to decide on preprocessing steps.
# NOTE(review): this skims the full cleaned `data`, not `data_train`, which
# is what the recipe below is fitted on; confirm that is intended.
skimr::skim(data)
Data summary
Name data
Number of rows 3966
Number of columns 25
_______________________
Column type frequency:
character 15
numeric 10
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name_of_museum 0 1 3 76 0 3965 0
village_town_or_city 0 1 3 24 0 1639 0
postcode 0 1 6 9 0 3725 0
admin_area 0 1 16 137 0 392 0
accreditation 0 1 10 12 0 2 0
governance 0 1 7 41 0 13 0
size 0 1 4 7 0 5 0
size_provenance 0 1 2 29 0 16 0
subject_matter 0 1 5 45 0 112 0
year_opened 0 1 9 9 0 334 0
year_closed 0 1 9 9 0 159 0
primary_provenance_of_data 0 1 3 8 0 17 0
area_geodemographic_group 0 1 11 40 0 17 0
area_geodemographic_subgroup 0 1 12 39 0 25 0
area_geodemographic_supergroup 0 1 16 39 0 8 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
latitude 0 1 52.93 1.95 49.20 51.48 52.46 53.94 60.79 ▅▇▃▁▁
longitude 0 1 -1.94 1.83 -8.09 -3.09 -1.86 -0.47 1.76 ▁▂▇▇▅
area_deprivation_index 0 1 5.46 2.48 1.00 4.00 5.00 7.00 10.00 ▃▆▇▆▃
area_deprivation_index_crime 0 1 5.43 3.07 1.00 3.00 6.00 8.00 10.00 ▇▆▅▆▇
area_deprivation_index_education 0 1 6.05 2.61 1.00 4.00 6.00 8.00 10.00 ▃▅▇▇▆
area_deprivation_index_employment 0 1 6.08 2.77 1.00 4.00 6.00 8.00 10.00 ▅▆▇▇▇
area_deprivation_index_health 0 1 6.02 2.82 1.00 4.00 6.00 8.00 10.00 ▅▆▆▇▇
area_deprivation_index_housing 0 1 3.99 2.76 1.00 1.00 3.00 6.00 10.00 ▇▅▃▃▂
area_deprivation_index_income 0 1 6.00 2.63 1.00 4.00 6.00 8.00 10.00 ▃▆▇▇▆
area_deprivation_index_services 0 1 4.79 3.01 1.00 2.00 4.00 8.00 10.00 ▇▅▅▅▅
# Preprocessing recipe: predict accreditation from all other columns of the
# training set.
xgboost_rec <- recipes::recipe(accreditation ~ ., data = data_train) %>%
  # Keep the museum name as an identifier rather than a predictor.
  update_role(name_of_museum, new_role = "ID") %>%
  # Tokenise admin_area, keep at most 75 tokens, and turn them into
  # term-frequency columns (tf_admin_area_*).
  step_tokenize(admin_area) %>%
  step_tokenfilter(admin_area, max_tokens = 75) %>%
  step_tf(admin_area) %>%
  # Collapse infrequent levels of the high-cardinality columns.
  step_other(
    village_town_or_city,
    postcode,
    subject_matter,
    year_opened,
    year_closed
  ) %>%
  # Reserve a level for values not seen when the recipe was prepped.
  step_novel(all_nominal_predictors()) %>%
  # Dummy-encode the remaining categorical predictors.
  step_dummy(
    village_town_or_city,
    postcode,
    subject_matter,
    year_opened,
    year_closed,
    governance,
    size,
    size_provenance,
    primary_provenance_of_data,
    area_geodemographic_group,
    area_geodemographic_subgroup,
    area_geodemographic_supergroup
  )

# Prep the recipe on the training data and inspect the processed columns.
glimpse(juice(prep(xgboost_rec)))
## Rows: 75
## Columns: 174
## $ name_of_museum                                                         <fct> …
## $ latitude                                                               <dbl> …
## $ longitude                                                              <dbl> …
## $ area_deprivation_index                                                 <dbl> …
## $ area_deprivation_index_crime                                           <dbl> …
## $ area_deprivation_index_education                                       <dbl> …
## $ area_deprivation_index_employment                                      <dbl> …
## $ area_deprivation_index_health                                          <dbl> …
## $ area_deprivation_index_housing                                         <dbl> …
## $ area_deprivation_index_income                                          <dbl> …
## $ area_deprivation_index_services                                        <dbl> …
## $ accreditation                                                          <fct> …
## $ tf_admin_area_aberdeenshire                                            <int> …
## $ tf_admin_area_and                                                      <int> …
## $ tf_admin_area_antrim                                                   <int> …
## $ tf_admin_area_area                                                     <int> …
## $ tf_admin_area_argyll                                                   <int> …
## $ tf_admin_area_ayrshire                                                 <int> …
## $ tf_admin_area_barnsley                                                 <int> …
## $ tf_admin_area_bath                                                     <int> …
## $ tf_admin_area_borders                                                  <int> …
## $ tf_admin_area_borough                                                  <int> …
## $ tf_admin_area_burnley                                                  <int> …
## $ tf_admin_area_bute                                                     <int> …
## $ tf_admin_area_ca                                                       <int> …
## $ tf_admin_area_cambridgeshire                                           <int> …
## $ tf_admin_area_carmarthenshire                                          <int> …
## $ tf_admin_area_ceredigion                                               <int> …
## $ tf_admin_area_charnwood                                                <int> …
## $ tf_admin_area_city                                                     <int> …
## $ tf_admin_area_coastal                                                  <int> …
## $ tf_admin_area_cornwall                                                 <int> …
## $ tf_admin_area_cotswold                                                 <int> …
## $ tf_admin_area_council                                                  <int> …
## $ tf_admin_area_county                                                   <int> …
## $ tf_admin_area_coventry                                                 <int> …
## $ tf_admin_area_derbyshire                                               <int> …
## $ tf_admin_area_district                                                 <int> …
## $ tf_admin_area_dorset                                                   <int> …
## $ tf_admin_area_dunbartonshire                                           <int> …
## $ tf_admin_area_durham                                                   <int> …
## $ tf_admin_area_east                                                     <int> …
## $ tf_admin_area_edinburgh                                                <int> …
## $ tf_admin_area_england                                                  <int> …
## $ tf_admin_area_english                                                  <int> …
## $ tf_admin_area_essex                                                    <int> …
## $ tf_admin_area_forest                                                   <int> …
## $ tf_admin_area_gloucestershire                                          <int> …
## $ tf_admin_area_gov                                                      <int> …
## $ tf_admin_area_great                                                    <int> …
## $ tf_admin_area_guildford                                                <int> …
## $ tf_admin_area_gwynedd                                                  <int> …
## $ tf_admin_area_harrogate                                                <int> …
## $ tf_admin_area_heath                                                    <int> …
## $ tf_admin_area_herefordshire                                            <int> …
## $ tf_admin_area_hertfordshire                                            <int> …
## $ tf_admin_area_hertsmere                                                <int> …
## $ tf_admin_area_highland                                                 <int> …
## $ tf_admin_area_holland                                                  <int> …
## $ tf_admin_area_humber                                                   <int> …
## $ tf_admin_area_kent                                                     <int> …
## $ tf_admin_area_lancashire                                               <int> …
## $ tf_admin_area_lincolnshire                                             <int> …
## $ tf_admin_area_london                                                   <int> …
## $ tf_admin_area_midlands                                                 <int> …
## $ tf_admin_area_norfolk                                                  <int> …
## $ tf_admin_area_north                                                    <int> …
## $ tf_admin_area_of                                                       <int> …
## $ tf_admin_area_or                                                       <int> …
## $ tf_admin_area_region                                                   <int> …
## $ tf_admin_area_scotland                                                 <int> …
## $ tf_admin_area_scottish                                                 <int> …
## $ tf_admin_area_sevenoaks                                                <int> …
## $ tf_admin_area_sheffield                                                <int> …
## $ tf_admin_area_somerset                                                 <int> …
## $ tf_admin_area_south                                                    <int> …
## $ tf_admin_area_suffolk                                                  <int> …
## $ tf_admin_area_surrey                                                   <int> …
## $ tf_admin_area_sussex                                                   <int> …
## $ tf_admin_area_the                                                      <int> …
## $ tf_admin_area_ua                                                       <int> …
## $ tf_admin_area_wales                                                    <int> …
## $ tf_admin_area_welsh                                                    <int> …
## $ tf_admin_area_west                                                     <int> …
## $ tf_admin_area_westminster                                              <int> …
## $ tf_admin_area_wiltshire                                                <int> …
## $ tf_admin_area_yorkshire                                                <int> …
## $ village_town_or_city_other                                             <dbl> …
## $ village_town_or_city_new                                               <dbl> …
## $ postcode_other                                                         <dbl> …
## $ postcode_new                                                           <dbl> …
## $ subject_matter_Buildings.Houses.Large_houses                           <dbl> …
## $ subject_matter_Local_Histories                                         <dbl> …
## $ subject_matter_other                                                   <dbl> …
## $ subject_matter_new                                                     <dbl> …
## $ year_opened_other                                                      <dbl> …
## $ year_opened_new                                                        <dbl> …
## $ year_closed_other                                                      <dbl> …
## $ year_closed_new                                                        <dbl> …
## $ governance_Government.National                                         <dbl> …
## $ governance_Independent.English_Heritage                                <dbl> …
## $ governance_Independent.Historic_Environment_Scotland                   <dbl> …
## $ governance_Independent.National_Trust                                  <dbl> …
## $ governance_Independent.National_Trust_for_Scotland                     <dbl> …
## $ governance_Independent.Not_for_profit                                  <dbl> …
## $ governance_Independent.Private                                         <dbl> …
## $ governance_Unknown                                                     <dbl> …
## $ governance_new                                                         <dbl> …
## $ size_medium                                                            <dbl> …
## $ size_small                                                             <dbl> …
## $ size_new                                                               <dbl> …
## $ size_provenance_aim_size_designation                                   <dbl> …
## $ size_provenance_domus                                                  <dbl> …
## $ size_provenance_ma.fam.                                                <dbl> …
## $ size_provenance_mm                                                     <dbl> …
## $ size_provenance_mm_prediction_random_forest                            <dbl> …
## $ size_provenance_unknown                                                <dbl> …
## $ size_provenance_visitbritain                                           <dbl> …
## $ size_provenance_new                                                    <dbl> …
## $ primary_provenance_of_data_aim                                         <dbl> …
## $ primary_provenance_of_data_aim82M                                      <dbl> …
## $ primary_provenance_of_data_aim82NM                                     <dbl> …
## $ primary_provenance_of_data_domus                                       <dbl> …
## $ primary_provenance_of_data_fcm                                         <dbl> …
## $ primary_provenance_of_data_hha                                         <dbl> …
## $ primary_provenance_of_data_mald                                        <dbl> …
## $ primary_provenance_of_data_mgs                                         <dbl> …
## $ primary_provenance_of_data_misc                                        <dbl> …
## $ primary_provenance_of_data_Misc                                        <dbl> …
## $ primary_provenance_of_data_musassoc                                    <dbl> …
## $ primary_provenance_of_data_MusCal                                      <dbl> …
## $ primary_provenance_of_data_wiki                                        <dbl> …
## $ primary_provenance_of_data_new                                         <dbl> …
## $ area_geodemographic_group_English.and.Welsh.Countryside                <dbl> …
## $ area_geodemographic_group_Larger.Towns.and.Cities                      <dbl> …
## $ area_geodemographic_group_London.Cosmopolitan                          <dbl> …
## $ area_geodemographic_group_Manufacturing.Traits                         <dbl> …
## $ area_geodemographic_group_Northern.Ireland.Countryside                 <dbl> …
## $ area_geodemographic_group_Remoter.Coastal.Living                       <dbl> …
## $ area_geodemographic_group_Rural.Urban.Fringe                           <dbl> …
## $ area_geodemographic_group_Scottish.Countryside                         <dbl> …
## $ area_geodemographic_group_Scottish.Industrial.Heritage                 <dbl> …
## $ area_geodemographic_group_Services.Manufacturing.and.Mining.Legacy     <dbl> …
## $ area_geodemographic_group_Thriving.Rural                               <dbl> …
## $ area_geodemographic_group_Town.Living                                  <dbl> …
## $ area_geodemographic_group_University.Towns.and.Cities                  <dbl> …
## $ area_geodemographic_group_new                                          <dbl> …
## $ area_geodemographic_subgroup_Ageing.Coastal.Living                     <dbl> …
## $ area_geodemographic_subgroup_Country.Living                            <dbl> …
## $ area_geodemographic_subgroup_Industrial.and.Multi.ethnic               <dbl> …
## $ area_geodemographic_subgroup_Larger.Towns.and.Cities                   <dbl> …
## $ area_geodemographic_subgroup_London.Cosmopolitan                       <dbl> …
## $ area_geodemographic_subgroup_Manufacturing.Legacy                      <dbl> …
## $ area_geodemographic_subgroup_Mining.Legacy                             <dbl> …
## $ area_geodemographic_subgroup_Northern.Ireland.Countryside              <dbl> …
## $ area_geodemographic_subgroup_Older.Farming.Communities                 <dbl> …
## $ area_geodemographic_subgroup_Prosperous.Semi.rural                     <dbl> …
## $ area_geodemographic_subgroup_Prosperous.Towns                          <dbl> …
## $ area_geodemographic_subgroup_Rural.Growth.Areas                        <dbl> …
## $ area_geodemographic_subgroup_Rural.Urban.Fringe                        <dbl> …
## $ area_geodemographic_subgroup_Scottish.Countryside                      <dbl> …
## $ area_geodemographic_subgroup_Scottish.Industrial.Legacy                <dbl> …
## $ area_geodemographic_subgroup_Seaside.Living                            <dbl> …
## $ area_geodemographic_subgroup_Sparse.English.and.Welsh.Countryside      <dbl> …
## $ area_geodemographic_subgroup_University.Towns.and.Cities               <dbl> …
## $ area_geodemographic_subgroup_Urban.Living                              <dbl> …
## $ area_geodemographic_subgroup_new                                       <dbl> …
## $ area_geodemographic_supergroup_Business.Education.and.Heritage.Centres <dbl> …
## $ area_geodemographic_supergroup_Countryside.Living                      <dbl> …
## $ area_geodemographic_supergroup_London.Cosmopolitan                     <dbl> …
## $ area_geodemographic_supergroup_Services.and.Industrial.Legacy          <dbl> …
## $ area_geodemographic_supergroup_Town.and.Country.Living                 <dbl> …
## $ area_geodemographic_supergroup_Urban.Settlements                       <dbl> …
## $ area_geodemographic_supergroup_new                                     <dbl> …

Specify model

# Boosted-tree classifier fitted with the xgboost engine. The number of
# trees is marked with tune() so it is chosen during hyperparameter tuning.
xgboost_spec <-
  boost_tree(trees = tune()) |>
  set_mode("classification") |>
  set_engine("xgboost")

# Bundle the preprocessing recipe and the model specification into a single
# workflow object so both are handled together downstream.
xgboost_workflow <-
  workflow() |>
  add_recipe(xgboost_rec) |>
  add_model(xgboost_spec)

Tune hyperparameters

# Register a parallel backend before tuning.
# The call parentheses are required: a bare `doParallel::registerDoParallel`
# only auto-prints the function's source and registers nothing.
doParallel::registerDoParallel()
# Fix the RNG seed so the tuning run below is reproducible.
set.seed(48291)

# Evaluate the workflow over the resamples in data_cv, using a grid of
# 5 candidate values for the parameters marked with tune().
xgboost_tune <- tune_grid(xgboost_workflow, resamples = data_cv, grid = 5)
## Warning: package 'xgboost' was built under R version 4.3.3