library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
# Import Data
# Read the TidyTuesday (2022-11-22) museums CSV directly from GitHub.
museums <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv"
)
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Summarise every raw column: type, missingness, and basic distribution stats.
skimr::skim(museums)
Data summary
Name museums
Number of rows 4191
Number of columns 35
_______________________
Column type frequency:
character 24
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
museum_id 0 1.00 8 15 0 4191 0
Name_of_museum 0 1.00 3 76 0 4190 0
Address_line_1 441 0.89 3 61 0 3212 0
Address_line_2 2816 0.33 3 39 0 1167 0
Village,_Town_or_City 4 1.00 3 24 0 1696 0
Postcode 0 1.00 6 9 0 3918 0
Admin_area 0 1.00 12 137 0 393 0
Accreditation 0 1.00 10 12 0 2 0
Governance 0 1.00 7 41 0 13 0
Size 0 1.00 4 7 0 5 0
Size_provenance 179 0.96 2 29 0 16 0
Subject_Matter 0 1.00 5 45 0 114 0
Year_opened 0 1.00 9 9 0 351 0
Year_closed 0 1.00 9 9 0 170 0
DOMUS_Subject_Matter 2788 0.33 5 27 0 21 0
Primary_provenance_of_data 0 1.00 3 8 0 18 0
Identifier_used_in_primary_data_source 2056 0.51 2 8 0 2134 0
Area_Geodemographic_group 49 0.99 11 40 0 17 0
Area_Geodemographic_group_code 49 0.99 3 3 0 16 0
Area_Geodemographic_subgroup 49 0.99 12 39 0 25 0
Area_Geodemographic_subgroup_code 49 0.99 4 4 0 24 0
Area_Geodemographic_supergroup 49 0.99 16 39 0 8 0
Area_Geodemographic_supergroup_code 49 0.99 2 2 0 8 0
Notes 2980 0.29 12 751 0 956 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Latitude 0 1.00 52.93 2.09 49.18 51.48 52.47 53.96 100.00 ▇▁▁▁▁
Longitude 0 1.00 -1.96 1.84 -8.09 -3.10 -1.87 -0.48 1.76 ▁▂▇▇▅
DOMUS_identifier 2347 0.44 1303.45 1597.19 1.00 486.50 991.50 1470.25 7746.00 ▇▂▁▁▁
Area_Deprivation_index 49 0.99 5.44 2.48 1.00 4.00 5.00 7.00 10.00 ▃▆▇▆▃
Area_Deprivation_index_crime 49 0.99 5.43 3.07 1.00 3.00 6.00 8.00 10.00 ▇▆▅▇▇
Area_Deprivation_index_education 49 0.99 6.04 2.61 1.00 4.00 6.00 8.00 10.00 ▃▅▇▇▆
Area_Deprivation_index_employment 49 0.99 6.08 2.76 1.00 4.00 6.00 8.00 10.00 ▅▆▇▇▇
Area_Deprivation_index_health 49 0.99 6.02 2.82 1.00 4.00 6.00 8.00 10.00 ▅▆▆▇▇
Area_Deprivation_index_housing 49 0.99 3.97 2.75 1.00 1.00 3.00 6.00 10.00 ▇▅▃▂▂
Area_Deprivation_index_income 49 0.99 5.99 2.62 1.00 4.00 6.00 8.00 10.00 ▃▆▇▇▆
Area_Deprivation_index_services 49 0.99 4.78 3.01 1.00 2.00 4.00 7.00 10.00 ▇▅▅▅▅

Issues with the data:

- Missing values: Address_line_2, Address_line_1, DOMUS_Subject_Matter, DOMUS_identifier, Notes
- Factors or numeric variables
- Zero-variance variables
- Character variables
- Unbalanced target variable
- ID variable: museum_id

# Number of museums in each accreditation class.
count(museums, Accreditation)
## # A tibble: 2 × 2
##   Accreditation     n
##   <chr>         <int>
## 1 Accredited     1720
## 2 Unaccredited   2471
# Bar chart of the number of museums per accreditation class.
ggplot(data = museums, mapping = aes(x = Accreditation)) +
  geom_bar()

# Build the modelling table from the raw museums data:
#   * combine geodemographic group and subgroup into one "group/subgroup" label;
#   * keep the first four-digit year found in Year_opened, stored as an integer
#     so the opening year is treated as a number instead of a categorical
#     variable with one level per distinct year;
#   * drop identifiers, free text, address/location fields, and the columns
#     superseded by the two derived ones;
#   * drop rows that still contain a missing value, then snake_case the names.
data <- museums %>%
  unite(
    "group_subgroup",
    Area_Geodemographic_group, Area_Geodemographic_subgroup,
    sep = "/",
    remove = FALSE
  ) %>%
  mutate(Year_opened_rev = as.integer(str_extract(Year_opened, "\\d{4}"))) %>%
  select(
    -c(
      Address_line_1, Address_line_2,
      DOMUS_Subject_Matter, DOMUS_identifier,
      Notes, Identifier_used_in_primary_data_source,
      Area_Geodemographic_supergroup_code,
      Area_Geodemographic_group_code,
      Area_Geodemographic_subgroup_code,
      museum_id, Year_closed,
      Area_Geodemographic_group,
      Area_Geodemographic_subgroup,
      Area_Geodemographic_supergroup,
      Longitude, Latitude, Postcode,
      Year_opened, Admin_area
    )
  ) %>%
  drop_na() %>%
  janitor::clean_names()

# Summarise the cleaned table to confirm column types and that no missing
# values remain.
skimr::skim(data)
Data summary
Name data
Number of rows 3966
Number of columns 18
_______________________
Column type frequency:
character 10
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name_of_museum 0 1 3 76 0 3965 0
village_town_or_city 0 1 3 24 0 1639 0
accreditation 0 1 10 12 0 2 0
governance 0 1 7 41 0 13 0
size 0 1 4 7 0 5 0
size_provenance 0 1 2 29 0 16 0
subject_matter 0 1 5 45 0 112 0
primary_provenance_of_data 0 1 3 8 0 17 0
group_subgroup 0 1 28 79 0 32 0
year_opened_rev 0 1 4 4 0 211 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
area_deprivation_index 0 1 5.46 2.48 1 4 5 7 10 ▃▆▇▆▃
area_deprivation_index_crime 0 1 5.43 3.07 1 3 6 8 10 ▇▆▅▆▇
area_deprivation_index_education 0 1 6.05 2.61 1 4 6 8 10 ▃▅▇▇▆
area_deprivation_index_employment 0 1 6.08 2.77 1 4 6 8 10 ▅▆▇▇▇
area_deprivation_index_health 0 1 6.02 2.82 1 4 6 8 10 ▅▆▆▇▇
area_deprivation_index_housing 0 1 3.99 2.76 1 1 3 6 10 ▇▅▃▃▂
area_deprivation_index_income 0 1 6.00 2.63 1 4 6 8 10 ▃▆▇▇▆
area_deprivation_index_services 0 1 4.79 3.01 1 2 4 8 10 ▇▅▅▅▅
# Compare area deprivation sub-indices between accreditation classes,
# one box plot per index (employment, crime, health).
ggplot(data, aes(x = accreditation, y = area_deprivation_index_employment)) +
  geom_boxplot()

ggplot(data, aes(x = accreditation, y = area_deprivation_index_crime)) +
  geom_boxplot()

ggplot(data, aes(x = accreditation, y = area_deprivation_index_health)) +
  geom_boxplot()

# Convert every remaining column into binary indicator features; the museum
# name is left out before binarizing.
data_binarized <- binarize(select(data, -name_of_museum))

# List the generated indicator columns.
glimpse(data_binarized)
## Rows: 3,966
## Columns: 154
## $ village_town_or_city__Edinburgh                                                                 <dbl> …
## $ village_town_or_city__London                                                                    <dbl> …
## $ `village_town_or_city__-OTHER`                                                                  <dbl> …
## $ accreditation__Accredited                                                                       <dbl> …
## $ accreditation__Unaccredited                                                                     <dbl> …
## $ `governance__Government-Local_Authority`                                                        <dbl> …
## $ `governance__Government-National`                                                               <dbl> …
## $ `governance__Independent-English_Heritage`                                                      <dbl> …
## $ `governance__Independent-National_Trust`                                                        <dbl> …
## $ `governance__Independent-Not_for_profit`                                                        <dbl> …
## $ `governance__Independent-Private`                                                               <dbl> …
## $ `governance__Independent-Unknown`                                                               <dbl> …
## $ governance__University                                                                          <dbl> …
## $ governance__Unknown                                                                             <dbl> …
## $ `governance__-OTHER`                                                                            <dbl> …
## $ size__large                                                                                     <dbl> …
## $ size__medium                                                                                    <dbl> …
## $ size__small                                                                                     <dbl> …
## $ size__unknown                                                                                   <dbl> …
## $ `size__-OTHER`                                                                                  <dbl> …
## $ size_provenance__ace_size_designation                                                           <dbl> …
## $ size_provenance__aim_size_designation                                                           <dbl> …
## $ size_provenance__domus                                                                          <dbl> …
## $ `size_provenance__ma(fam)`                                                                      <dbl> …
## $ size_provenance__mm_manual_estimate_2018                                                        <dbl> …
## $ size_provenance__mm_prediction_random_forest                                                    <dbl> …
## $ size_provenance__scottish_national_audit                                                        <dbl> …
## $ size_provenance__unknown                                                                        <dbl> …
## $ size_provenance__visitbritain                                                                   <dbl> …
## $ `size_provenance__-OTHER`                                                                       <dbl> …
## $ `subject_matter__Archaeology-Roman`                                                             <dbl> …
## $ `subject_matter__Arts-Fine_and_decorative_arts`                                                 <dbl> …
## $ `subject_matter__Buildings-Houses-Large_houses`                                                 <dbl> …
## $ `subject_matter__Buildings-Houses-Medium_houses`                                                <dbl> …
## $ `subject_matter__Industry_and_manufacture-Mining_and_quarrying`                                 <dbl> …
## $ `subject_matter__Leisure_and_sport-Toys_and_models`                                             <dbl> …
## $ subject_matter__Local_Histories                                                                 <dbl> …
## $ `subject_matter__Mixed-Encyclopaedic`                                                           <dbl> …
## $ `subject_matter__Mixed-Other`                                                                   <dbl> …
## $ subject_matter__Other                                                                           <dbl> …
## $ `subject_matter__Personality-Literary`                                                          <dbl> …
## $ `subject_matter__Rural_Industry-Farming`                                                        <dbl> …
## $ `subject_matter__Sea_and_seafaring-Boats_and_ships`                                             <dbl> …
## $ `subject_matter__Sea_and_seafaring-Mixed`                                                       <dbl> …
## $ `subject_matter__Transport-Cars_and_motorbikes`                                                 <dbl> …
## $ `subject_matter__Transport-Trains_and_railways`                                                 <dbl> …
## $ `subject_matter__War_and_conflict-Airforce`                                                     <dbl> …
## $ `subject_matter__War_and_conflict-Castles_and_forts`                                            <dbl> …
## $ `subject_matter__War_and_conflict-Military`                                                     <dbl> …
## $ `subject_matter__War_and_conflict-Regiment`                                                     <dbl> …
## $ `subject_matter__-OTHER`                                                                        <dbl> …
## $ primary_provenance_of_data__ace                                                                 <dbl> …
## $ primary_provenance_of_data__aim                                                                 <dbl> …
## $ primary_provenance_of_data__aim82M                                                              <dbl> …
## $ primary_provenance_of_data__aim82NM                                                             <dbl> …
## $ primary_provenance_of_data__domus                                                               <dbl> …
## $ primary_provenance_of_data__fcm                                                                 <dbl> …
## $ primary_provenance_of_data__hha                                                                 <dbl> …
## $ primary_provenance_of_data__mald                                                                <dbl> …
## $ primary_provenance_of_data__mgs                                                                 <dbl> …
## $ primary_provenance_of_data__misc                                                                <dbl> …
## $ primary_provenance_of_data__musassoc                                                            <dbl> …
## $ primary_provenance_of_data__wiki                                                                <dbl> …
## $ `primary_provenance_of_data__-OTHER`                                                            <dbl> …
## $ `area_deprivation_index__-Inf_4`                                                                <dbl> …
## $ area_deprivation_index__4_5                                                                     <dbl> …
## $ area_deprivation_index__5_7                                                                     <dbl> …
## $ area_deprivation_index__7_Inf                                                                   <dbl> …
## $ `area_deprivation_index_crime__-Inf_3`                                                          <dbl> …
## $ area_deprivation_index_crime__3_6                                                               <dbl> …
## $ area_deprivation_index_crime__6_8                                                               <dbl> …
## $ area_deprivation_index_crime__8_Inf                                                             <dbl> …
## $ `area_deprivation_index_education__-Inf_4`                                                      <dbl> …
## $ area_deprivation_index_education__4_6                                                           <dbl> …
## $ area_deprivation_index_education__6_8                                                           <dbl> …
## $ area_deprivation_index_education__8_Inf                                                         <dbl> …
## $ `area_deprivation_index_employment__-Inf_4`                                                     <dbl> …
## $ area_deprivation_index_employment__4_6                                                          <dbl> …
## $ area_deprivation_index_employment__6_8                                                          <dbl> …
## $ area_deprivation_index_employment__8_Inf                                                        <dbl> …
## $ `area_deprivation_index_health__-Inf_4`                                                         <dbl> …
## $ area_deprivation_index_health__4_6                                                              <dbl> …
## $ area_deprivation_index_health__6_8                                                              <dbl> …
## $ area_deprivation_index_health__8_Inf                                                            <dbl> …
## $ `area_deprivation_index_housing__-Inf_3`                                                        <dbl> …
## $ area_deprivation_index_housing__3_6                                                             <dbl> …
## $ area_deprivation_index_housing__6_Inf                                                           <dbl> …
## $ `area_deprivation_index_income__-Inf_4`                                                         <dbl> …
## $ area_deprivation_index_income__4_6                                                              <dbl> …
## $ area_deprivation_index_income__6_8                                                              <dbl> …
## $ area_deprivation_index_income__8_Inf                                                            <dbl> …
## $ `area_deprivation_index_services__-Inf_2`                                                       <dbl> …
## $ area_deprivation_index_services__2_4                                                            <dbl> …
## $ area_deprivation_index_services__4_8                                                            <dbl> …
## $ area_deprivation_index_services__8_Inf                                                          <dbl> …
## $ `group_subgroup__Country_Living/Country_Living`                                                 <dbl> …
## $ `group_subgroup__English_and_Welsh_Countryside/Older_Farming_Communities`                       <dbl> …
## $ `group_subgroup__English_and_Welsh_Countryside/Sparse_English_and_Welsh_Countryside`            <dbl> …
## $ `group_subgroup__Ethnically_Diverse_Metropolitan_Living/Ethnically_Diverse_Metropolitan_Living` <dbl> …
## $ `group_subgroup__Larger_Towns_and_Cities/Larger_Towns_and_Cities`                               <dbl> …
## $ `group_subgroup__London_Cosmopolitan/London_Cosmopolitan`                                       <dbl> …
## $ `group_subgroup__Manufacturing_Traits/Industrial_and_Multi-ethnic`                              <dbl> …
## $ `group_subgroup__Manufacturing_Traits/Urban_Living`                                             <dbl> …
## $ `group_subgroup__Northern_Ireland_Countryside/Northern_Ireland_Countryside`                     <dbl> …
## $ `group_subgroup__Remoter_Coastal_Living/Ageing_Coastal_Living`                                  <dbl> …
## $ `group_subgroup__Remoter_Coastal_Living/Seaside_Living`                                         <dbl> …
## $ `group_subgroup__Rural-Urban_Fringe/Rural-Urban_Fringe`                                         <dbl> …
## $ `group_subgroup__Scottish_Countryside/Scottish_Countryside`                                     <dbl> …
## $ `group_subgroup__Scottish_Industrial_Heritage/Scottish_Industrial_Legacy`                       <dbl> …
## $ `group_subgroup__Services_Manufacturing_and_Mining_Legacy/Manufacturing_Legacy`                 <dbl> …
## $ `group_subgroup__Services_Manufacturing_and_Mining_Legacy/Mining_Legacy`                        <dbl> …
## $ `group_subgroup__Services_Manufacturing_and_Mining_Legacy/Service_Economy`                      <dbl> …
## $ `group_subgroup__Suburban_Traits/City_Periphery`                                                <dbl> …
## $ `group_subgroup__Suburban_Traits/Expanding_Areas`                                               <dbl> …
## $ `group_subgroup__Thriving_Rural/Affluent_rural`                                                 <dbl> …
## $ `group_subgroup__Thriving_Rural/Rural_Growth_Areas`                                             <dbl> …
## $ `group_subgroup__Town_Living/Prosperous_Towns`                                                  <dbl> …
## $ `group_subgroup__University_Towns_and_Cities/University_Towns_and_Cities`                       <dbl> …
## $ `group_subgroup__-OTHER`                                                                        <dbl> …
## $ year_opened_rev__1945                                                                           <dbl> …
## $ year_opened_rev__1960                                                                           <dbl> …
## $ year_opened_rev__1972                                                                           <dbl> …
## $ year_opened_rev__1973                                                                           <dbl> …
## $ year_opened_rev__1974                                                                           <dbl> …
## $ year_opened_rev__1975                                                                           <dbl> …
## $ year_opened_rev__1976                                                                           <dbl> …
## $ year_opened_rev__1977                                                                           <dbl> …
## $ year_opened_rev__1978                                                                           <dbl> …
## $ year_opened_rev__1979                                                                           <dbl> …
## $ year_opened_rev__1980                                                                           <dbl> …
## $ year_opened_rev__1981                                                                           <dbl> …
## $ year_opened_rev__1982                                                                           <dbl> …
## $ year_opened_rev__1983                                                                           <dbl> …
## $ year_opened_rev__1984                                                                           <dbl> …
## $ year_opened_rev__1985                                                                           <dbl> …
## $ year_opened_rev__1986                                                                           <dbl> …
## $ year_opened_rev__1987                                                                           <dbl> …
## $ year_opened_rev__1988                                                                           <dbl> …
## $ year_opened_rev__1989                                                                           <dbl> …
## $ year_opened_rev__1990                                                                           <dbl> …
## $ year_opened_rev__1991                                                                           <dbl> …
## $ year_opened_rev__1992                                                                           <dbl> …
## $ year_opened_rev__1993                                                                           <dbl> …
## $ year_opened_rev__1994                                                                           <dbl> …
## $ year_opened_rev__1995                                                                           <dbl> …
## $ year_opened_rev__1996                                                                           <dbl> …
## $ year_opened_rev__1997                                                                           <dbl> …
## $ year_opened_rev__1998                                                                           <dbl> …
## $ year_opened_rev__1999                                                                           <dbl> …
## $ year_opened_rev__2000                                                                           <dbl> …
## $ year_opened_rev__2001                                                                           <dbl> …
## $ year_opened_rev__2002                                                                           <dbl> …
## $ year_opened_rev__2005                                                                           <dbl> …
## $ `year_opened_rev__-OTHER`                                                                       <dbl> …
# Correlate each binary feature with the "Accredited" indicator, then draw
# the correlation funnel.
data_correlate <- correlationfunnel::correlate(
  data_binarized,
  target = accreditation__Accredited
)

correlationfunnel::plot_correlation_funnel(data_correlate)
## Warning: ggrepel: 107 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Model

Split Data

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.1
## ✔ dials        1.2.1     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'recipes' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(usemodels)

# Fix the RNG seed so the train/test split and the folds are reproducible.
set.seed(1123)
# NOTE(review): presumably a small subsample for quick experiments - confirm.
# data_clean <- data %>% sample_n(100)

# Hold out a test set, then build cross-validation folds from the training
# portion only (both with rsample's default settings).
data_split <- rsample::initial_split(data)
data_train <- rsample::training(data_split)
data_test <- rsample::testing(data_split)

data_cv <- rsample::vfold_cv(data_train)

data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [2676/298]> Fold01
##  2 <split [2676/298]> Fold02
##  3 <split [2676/298]> Fold03
##  4 <split [2676/298]> Fold04
##  5 <split [2677/297]> Fold05
##  6 <split [2677/297]> Fold06
##  7 <split [2677/297]> Fold07
##  8 <split [2677/297]> Fold08
##  9 <split [2677/297]> Fold09
## 10 <split [2677/297]> Fold10

Preprocess data

library(themis)
library(textrecipes)

# Summarise the cleaned data again as a reference while writing the recipe.
skimr::skim(data)
Data summary
Name data
Number of rows 3966
Number of columns 18
_______________________
Column type frequency:
character 10
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name_of_museum 0 1 3 76 0 3965 0
village_town_or_city 0 1 3 24 0 1639 0
accreditation 0 1 10 12 0 2 0
governance 0 1 7 41 0 13 0
size 0 1 4 7 0 5 0
size_provenance 0 1 2 29 0 16 0
subject_matter 0 1 5 45 0 112 0
primary_provenance_of_data 0 1 3 8 0 17 0
group_subgroup 0 1 28 79 0 32 0
year_opened_rev 0 1 4 4 0 211 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
area_deprivation_index 0 1 5.46 2.48 1 4 5 7 10 ▃▆▇▆▃
area_deprivation_index_crime 0 1 5.43 3.07 1 3 6 8 10 ▇▆▅▆▇
area_deprivation_index_education 0 1 6.05 2.61 1 4 6 8 10 ▃▅▇▇▆
area_deprivation_index_employment 0 1 6.08 2.77 1 4 6 8 10 ▅▆▇▇▇
area_deprivation_index_health 0 1 6.02 2.82 1 4 6 8 10 ▅▆▆▇▇
area_deprivation_index_housing 0 1 3.99 2.76 1 1 3 6 10 ▇▅▃▃▂
area_deprivation_index_income 0 1 6.00 2.63 1 4 6 8 10 ▃▆▇▇▆
area_deprivation_index_services 0 1 4.79 3.01 1 2 4 8 10 ▇▅▅▅▅
# Preprocessing recipe for predicting accreditation from all other columns.
#   * name_of_museum is given the "ID" role, so it is not used as a predictor.
#   * subject_matter is tokenized, limited to at most 50 tokens, and encoded
#     as term-frequency columns.
#   * Infrequent village_town_or_city levels are pooled into an "other" level.
#   * Nominal predictors get a placeholder level for previously unseen values
#     and are then dummy-encoded.
#   * Zero-variance columns are removed before scaling: the placeholder
#     "*_new" dummies are constant in the training data, and normalizing them
#     triggers a "zero variance so scaling cannot be used" warning.
#   * Remaining numeric predictors are normalized, and SMOTE oversamples the
#     accreditation classes.
xgboost_rec <- recipes::recipe(accreditation ~ ., data = data_train) %>%
  update_role(name_of_museum, new_role = "ID") %>%
  step_tokenize(subject_matter) %>%
  step_tokenfilter(subject_matter, max_tokens = 50) %>%
  step_tf(subject_matter) %>%
  step_other(village_town_or_city) %>%
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_smote(accreditation)
    

# Estimate the recipe on its training data and inspect the processed columns.
xgboost_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Warning: !  The following columns have zero variance so scaling cannot be used:
##   village_town_or_city_new, governance_new, size_new, size_provenance_new,
##   primary_provenance_of_data_new, group_subgroup_new, and year_opened_rev_new.
## ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
##   before normalizing.
## Rows: 3,368
## Columns: 336
## $ name_of_museum                                                                                 <fct> …
## $ area_deprivation_index                                                                         <dbl> …
## $ area_deprivation_index_crime                                                                   <dbl> …
## $ area_deprivation_index_education                                                               <dbl> …
## $ area_deprivation_index_employment                                                              <dbl> …
## $ area_deprivation_index_health                                                                  <dbl> …
## $ area_deprivation_index_housing                                                                 <dbl> …
## $ area_deprivation_index_income                                                                  <dbl> …
## $ area_deprivation_index_services                                                                <dbl> …
## $ accreditation                                                                                  <fct> …
## $ tf_subject_matter_airforce                                                                     <dbl> …
## $ tf_subject_matter_archaeology                                                                  <dbl> …
## $ tf_subject_matter_arts                                                                         <dbl> …
## $ tf_subject_matter_aviation                                                                     <dbl> …
## $ tf_subject_matter_belief_and_identity                                                          <dbl> …
## $ tf_subject_matter_boats_and_ships                                                              <dbl> …
## $ tf_subject_matter_buildings                                                                    <dbl> …
## $ tf_subject_matter_cars_and_motorbikes                                                          <dbl> …
## $ tf_subject_matter_castles_and_forts                                                            <dbl> …
## $ tf_subject_matter_costume_and_textiles                                                         <dbl> …
## $ tf_subject_matter_encyclopaedic                                                                <dbl> …
## $ tf_subject_matter_ethnic_group                                                                 <dbl> …
## $ tf_subject_matter_event_or_site                                                                <dbl> …
## $ tf_subject_matter_farming                                                                      <dbl> …
## $ tf_subject_matter_film_cinema_and_tv                                                           <dbl> …
## $ tf_subject_matter_fine_and_decorative_arts                                                     <dbl> …
## $ tf_subject_matter_food_and_drink                                                               <dbl> …
## $ tf_subject_matter_houses                                                                       <dbl> …
## $ tf_subject_matter_industry_and_manufacture                                                     <dbl> …
## $ tf_subject_matter_large_houses                                                                 <dbl> …
## $ tf_subject_matter_leisure_and_sport                                                            <dbl> …
## $ tf_subject_matter_literary                                                                     <dbl> …
## $ tf_subject_matter_local_histories                                                              <dbl> …
## $ tf_subject_matter_medicine_and_health                                                          <dbl> …
## $ tf_subject_matter_medium_houses                                                                <dbl> …
## $ tf_subject_matter_military                                                                     <dbl> …
## $ tf_subject_matter_mining_and_quarrying                                                         <dbl> …
## $ tf_subject_matter_mixed                                                                        <dbl> …
## $ tf_subject_matter_music                                                                        <dbl> …
## $ tf_subject_matter_natural_world                                                                <dbl> …
## $ tf_subject_matter_other                                                                        <dbl> …
## $ tf_subject_matter_personality                                                                  <dbl> …
## $ tf_subject_matter_police                                                                       <dbl> …
## $ tf_subject_matter_prehistory                                                                   <dbl> …
## $ tf_subject_matter_regiment                                                                     <dbl> …
## $ tf_subject_matter_religious_buildings                                                          <dbl> …
## $ tf_subject_matter_roman                                                                        <dbl> …
## $ tf_subject_matter_rural_industry                                                               <dbl> …
## $ tf_subject_matter_rural_life                                                                   <dbl> …
## $ tf_subject_matter_sea_and_seafaring                                                            <dbl> …
## $ tf_subject_matter_services                                                                     <dbl> …
## $ tf_subject_matter_small_houses                                                                 <dbl> …
## $ tf_subject_matter_textiles                                                                     <dbl> …
## $ tf_subject_matter_toys_and_models                                                              <dbl> …
## $ tf_subject_matter_trains_and_railways                                                          <dbl> …
## $ tf_subject_matter_transport                                                                    <dbl> …
## $ tf_subject_matter_utilities                                                                    <dbl> …
## $ tf_subject_matter_war_and_conflict                                                             <dbl> …
## $ tf_subject_matter_water_and_waste                                                              <dbl> …
## $ tf_subject_matter_watermills                                                                   <dbl> …
## $ village_town_or_city_other                                                                     <dbl> …
## $ village_town_or_city_new                                                                       <dbl> …
## $ governance_Government.Local_Authority                                                          <dbl> …
## $ governance_Government.National                                                                 <dbl> …
## $ governance_Government.Other                                                                    <dbl> …
## $ governance_Independent.English_Heritage                                                        <dbl> …
## $ governance_Independent.Historic_Environment_Scotland                                           <dbl> …
## $ governance_Independent.National_Trust                                                          <dbl> …
## $ governance_Independent.National_Trust_for_Scotland                                             <dbl> …
## $ governance_Independent.Not_for_profit                                                          <dbl> …
## $ governance_Independent.Private                                                                 <dbl> …
## $ governance_Independent.Unknown                                                                 <dbl> …
## $ governance_University                                                                          <dbl> …
## $ governance_Unknown                                                                             <dbl> …
## $ governance_new                                                                                 <dbl> …
## $ size_large                                                                                     <dbl> …
## $ size_medium                                                                                    <dbl> …
## $ size_small                                                                                     <dbl> …
## $ size_unknown                                                                                   <dbl> …
## $ size_new                                                                                       <dbl> …
## $ size_provenance_aim_size_designation                                                           <dbl> …
## $ size_provenance_babbidge_ewles_and_smith_2006                                                  <dbl> …
## $ size_provenance_domus                                                                          <dbl> …
## $ size_provenance_ma.fam.                                                                        <dbl> …
## $ size_provenance_ma.fam._year_stated                                                            <dbl> …
## $ size_provenance_ma.fam2.                                                                       <dbl> …
## $ size_provenance_mm                                                                             <dbl> …
## $ size_provenance_mm_manual_estimate_2018                                                        <dbl> …
## $ size_provenance_mm_prediction_random_forest                                                    <dbl> …
## $ size_provenance_national_trust_annual_report_                                                  <dbl> …
## $ size_provenance_nilm.vn.                                                                       <dbl> …
## $ size_provenance_scottish_national_audit                                                        <dbl> …
## $ size_provenance_unknown                                                                        <dbl> …
## $ size_provenance_visitbritain                                                                   <dbl> …
## $ size_provenance_new                                                                            <dbl> …
## $ primary_provenance_of_data_aim                                                                 <dbl> …
## $ primary_provenance_of_data_aim82M                                                              <dbl> …
## $ primary_provenance_of_data_aim82NM                                                             <dbl> …
## $ primary_provenance_of_data_domus                                                               <dbl> …
## $ primary_provenance_of_data_fcm                                                                 <dbl> …
## $ primary_provenance_of_data_hha                                                                 <dbl> …
## $ primary_provenance_of_data_hud                                                                 <dbl> …
## $ primary_provenance_of_data_mald                                                                <dbl> …
## $ primary_provenance_of_data_MDN                                                                 <dbl> …
## $ primary_provenance_of_data_mgs                                                                 <dbl> …
## $ primary_provenance_of_data_misc                                                                <dbl> …
## $ primary_provenance_of_data_Misc                                                                <dbl> …
## $ primary_provenance_of_data_musassoc                                                            <dbl> …
## $ primary_provenance_of_data_MusCal                                                              <dbl> …
## $ primary_provenance_of_data_nimc                                                                <dbl> …
## $ primary_provenance_of_data_wiki                                                                <dbl> …
## $ primary_provenance_of_data_new                                                                 <dbl> …
## $ group_subgroup_English.and.Welsh.Countryside.Ethnically.Diverse.Metropolitan.Living            <dbl> …
## $ group_subgroup_English.and.Welsh.Countryside.London.Cosmopolitan                               <dbl> …
## $ group_subgroup_English.and.Welsh.Countryside.Older.Farming.Communities                         <dbl> …
## $ group_subgroup_English.and.Welsh.Countryside.Sparse.English.and.Welsh.Countryside              <dbl> …
## $ group_subgroup_Ethnically.Diverse.Metropolitan..Living.Ethnically.Diverse.Metropolitan..Living <dbl> …
## $ group_subgroup_Ethnically.Diverse.Metropolitan.Living.Ethnically.Diverse.Metropolitan.Living   <dbl> …
## $ group_subgroup_Larger.Towns.and.Cities.Larger.Towns.and.Cities                                 <dbl> …
## $ group_subgroup_London.Cosmopolitan.London.Cosmopolitan                                         <dbl> …
## $ group_subgroup_Manufacturing.Traits.Industrial.and.Multi.ethnic                                <dbl> …
## $ group_subgroup_Manufacturing.Traits.Urban.Living                                               <dbl> …
## $ group_subgroup_Northern.Ireland.Countryside.Northern.Ireland.Countryside                       <dbl> …
## $ group_subgroup_Remoter.Coastal.Living.Ageing.Coastal.Living                                    <dbl> …
## $ group_subgroup_Remoter.Coastal.Living.Seaside.Living                                           <dbl> …
## $ group_subgroup_Rural.Urban.Fringe.Ethnically.Diverse.Metropolitan.Living                       <dbl> …
## $ group_subgroup_Rural.Urban.Fringe.Rural.Urban.Fringe                                           <dbl> …
## $ group_subgroup_Scottish.Countryside.Scottish.Countryside                                       <dbl> …
## $ group_subgroup_Scottish.Industrial.Heritage.Scottish.Industrial.Legacy                         <dbl> …
## $ group_subgroup_Services.Manufacturing.and.Mining.Legacy.Affluent.rural                         <dbl> …
## $ group_subgroup_Services.Manufacturing.and.Mining.Legacy.Manufacturing.Legacy                   <dbl> …
## $ group_subgroup_Services.Manufacturing.and.Mining.Legacy.Mining.Legacy                          <dbl> …
## $ group_subgroup_Services.Manufacturing.and.Mining.Legacy.Service.Economy                        <dbl> …
## $ group_subgroup_Suburban.Traits.City.Periphery                                                  <dbl> …
## $ group_subgroup_Suburban.Traits.Expanding.Areas                                                 <dbl> …
## $ group_subgroup_Thriving.Rural.Affluent.rural                                                   <dbl> …
## $ group_subgroup_Thriving.Rural.Rural.Growth.Areas                                               <dbl> …
## $ group_subgroup_Town.Living.Prosperous.Semi.rural                                               <dbl> …
## $ group_subgroup_Town.Living.Prosperous.Towns                                                    <dbl> …
## $ group_subgroup_University.Towns.and.Cities.University.Towns.and.Cities                         <dbl> …
## $ group_subgroup_new                                                                             <dbl> …
## $ year_opened_rev_X1653                                                                          <dbl> …
## $ year_opened_rev_X1676                                                                          <dbl> …
## $ year_opened_rev_X1683                                                                          <dbl> …
## $ year_opened_rev_X1728                                                                          <dbl> …
## $ year_opened_rev_X1739                                                                          <dbl> …
## $ year_opened_rev_X1750                                                                          <dbl> …
## $ year_opened_rev_X1761                                                                          <dbl> …
## $ year_opened_rev_X1771                                                                          <dbl> …
## $ year_opened_rev_X1796                                                                          <dbl> …
## $ year_opened_rev_X1800                                                                          <dbl> …
## $ year_opened_rev_X1807                                                                          <dbl> …
## $ year_opened_rev_X1812                                                                          <dbl> …
## $ year_opened_rev_X1814                                                                          <dbl> …
## $ year_opened_rev_X1815                                                                          <dbl> …
## $ year_opened_rev_X1816                                                                          <dbl> …
## $ year_opened_rev_X1817                                                                          <dbl> …
## $ year_opened_rev_X1818                                                                          <dbl> …
## $ year_opened_rev_X1819                                                                          <dbl> …
## $ year_opened_rev_X1820                                                                          <dbl> …
## $ year_opened_rev_X1821                                                                          <dbl> …
## $ year_opened_rev_X1823                                                                          <dbl> …
## $ year_opened_rev_X1824                                                                          <dbl> …
## $ year_opened_rev_X1826                                                                          <dbl> …
## $ year_opened_rev_X1830                                                                          <dbl> …
## $ year_opened_rev_X1832                                                                          <dbl> …
## $ year_opened_rev_X1833                                                                          <dbl> …
## $ year_opened_rev_X1835                                                                          <dbl> …
## $ year_opened_rev_X1836                                                                          <dbl> …
## $ year_opened_rev_X1837                                                                          <dbl> …
## $ year_opened_rev_X1839                                                                          <dbl> …
## $ year_opened_rev_X1842                                                                          <dbl> …
## $ year_opened_rev_X1843                                                                          <dbl> …
## $ year_opened_rev_X1844                                                                          <dbl> …
## $ year_opened_rev_X1845                                                                          <dbl> …
## $ year_opened_rev_X1846                                                                          <dbl> …
## $ year_opened_rev_X1847                                                                          <dbl> …
## $ year_opened_rev_X1848                                                                          <dbl> …
## $ year_opened_rev_X1849                                                                          <dbl> …
## $ year_opened_rev_X1850                                                                          <dbl> …
## $ year_opened_rev_X1852                                                                          <dbl> …
## $ year_opened_rev_X1856                                                                          <dbl> …
## $ year_opened_rev_X1857                                                                          <dbl> …
## $ year_opened_rev_X1858                                                                          <dbl> …
## $ year_opened_rev_X1859                                                                          <dbl> …
## $ year_opened_rev_X1860                                                                          <dbl> …
## $ year_opened_rev_X1862                                                                          <dbl> …
## $ year_opened_rev_X1864                                                                          <dbl> …
## $ year_opened_rev_X1865                                                                          <dbl> …
## $ year_opened_rev_X1867                                                                          <dbl> …
## $ year_opened_rev_X1868                                                                          <dbl> …
## $ year_opened_rev_X1869                                                                          <dbl> …
## $ year_opened_rev_X1870                                                                          <dbl> …
## $ year_opened_rev_X1871                                                                          <dbl> …
## $ year_opened_rev_X1872                                                                          <dbl> …
## $ year_opened_rev_X1874                                                                          <dbl> …
## $ year_opened_rev_X1875                                                                          <dbl> …
## $ year_opened_rev_X1876                                                                          <dbl> …
## $ year_opened_rev_X1878                                                                          <dbl> …
## $ year_opened_rev_X1880                                                                          <dbl> …
## $ year_opened_rev_X1881                                                                          <dbl> …
## $ year_opened_rev_X1882                                                                          <dbl> …
## $ year_opened_rev_X1883                                                                          <dbl> …
## $ year_opened_rev_X1884                                                                          <dbl> …
## $ year_opened_rev_X1885                                                                          <dbl> …
## $ year_opened_rev_X1886                                                                          <dbl> …
## $ year_opened_rev_X1887                                                                          <dbl> …
## $ year_opened_rev_X1888                                                                          <dbl> …
## $ year_opened_rev_X1890                                                                          <dbl> …
## $ year_opened_rev_X1891                                                                          <dbl> …
## $ year_opened_rev_X1892                                                                          <dbl> …
## $ year_opened_rev_X1893                                                                          <dbl> …
## $ year_opened_rev_X1894                                                                          <dbl> …
## $ year_opened_rev_X1895                                                                          <dbl> …
## $ year_opened_rev_X1896                                                                          <dbl> …
## $ year_opened_rev_X1897                                                                          <dbl> …
## $ year_opened_rev_X1898                                                                          <dbl> …
## $ year_opened_rev_X1899                                                                          <dbl> …
## $ year_opened_rev_X1900                                                                          <dbl> …
## $ year_opened_rev_X1901                                                                          <dbl> …
## $ year_opened_rev_X1902                                                                          <dbl> …
## $ year_opened_rev_X1903                                                                          <dbl> …
## $ year_opened_rev_X1904                                                                          <dbl> …
## $ year_opened_rev_X1905                                                                          <dbl> …
## $ year_opened_rev_X1906                                                                          <dbl> …
## $ year_opened_rev_X1907                                                                          <dbl> …
## $ year_opened_rev_X1908                                                                          <dbl> …
## $ year_opened_rev_X1909                                                                          <dbl> …
## $ year_opened_rev_X1910                                                                          <dbl> …
## $ year_opened_rev_X1911                                                                          <dbl> …
## $ year_opened_rev_X1912                                                                          <dbl> …
## $ year_opened_rev_X1913                                                                          <dbl> …
## $ year_opened_rev_X1914                                                                          <dbl> …
## $ year_opened_rev_X1915                                                                          <dbl> …
## $ year_opened_rev_X1917                                                                          <dbl> …
## $ year_opened_rev_X1919                                                                          <dbl> …
## $ year_opened_rev_X1920                                                                          <dbl> …
## $ year_opened_rev_X1921                                                                          <dbl> …
## $ year_opened_rev_X1922                                                                          <dbl> …
## $ year_opened_rev_X1923                                                                          <dbl> …
## $ year_opened_rev_X1924                                                                          <dbl> …
## $ year_opened_rev_X1925                                                                          <dbl> …
## $ year_opened_rev_X1926                                                                          <dbl> …
## $ year_opened_rev_X1927                                                                          <dbl> …
## $ year_opened_rev_X1928                                                                          <dbl> …
## $ year_opened_rev_X1929                                                                          <dbl> …
## $ year_opened_rev_X1930                                                                          <dbl> …
## $ year_opened_rev_X1931                                                                          <dbl> …
## $ year_opened_rev_X1932                                                                          <dbl> …
## $ year_opened_rev_X1933                                                                          <dbl> …
## $ year_opened_rev_X1934                                                                          <dbl> …
## $ year_opened_rev_X1935                                                                          <dbl> …
## $ year_opened_rev_X1936                                                                          <dbl> …
## $ year_opened_rev_X1937                                                                          <dbl> …
## $ year_opened_rev_X1938                                                                          <dbl> …
## $ year_opened_rev_X1939                                                                          <dbl> …
## $ year_opened_rev_X1940                                                                          <dbl> …
## $ year_opened_rev_X1941                                                                          <dbl> …
## $ year_opened_rev_X1942                                                                          <dbl> …
## $ year_opened_rev_X1943                                                                          <dbl> …
## $ year_opened_rev_X1944                                                                          <dbl> …
## $ year_opened_rev_X1945                                                                          <dbl> …
## $ year_opened_rev_X1946                                                                          <dbl> …
## $ year_opened_rev_X1947                                                                          <dbl> …
## $ year_opened_rev_X1948                                                                          <dbl> …
## $ year_opened_rev_X1949                                                                          <dbl> …
## $ year_opened_rev_X1950                                                                          <dbl> …
## $ year_opened_rev_X1951                                                                          <dbl> …
## $ year_opened_rev_X1952                                                                          <dbl> …
## $ year_opened_rev_X1953                                                                          <dbl> …
## $ year_opened_rev_X1954                                                                          <dbl> …
## $ year_opened_rev_X1955                                                                          <dbl> …
## $ year_opened_rev_X1956                                                                          <dbl> …
## $ year_opened_rev_X1957                                                                          <dbl> …
## $ year_opened_rev_X1958                                                                          <dbl> …
## $ year_opened_rev_X1959                                                                          <dbl> …
## $ year_opened_rev_X1960                                                                          <dbl> …
## $ year_opened_rev_X1961                                                                          <dbl> …
## $ year_opened_rev_X1962                                                                          <dbl> …
## $ year_opened_rev_X1963                                                                          <dbl> …
## $ year_opened_rev_X1964                                                                          <dbl> …
## $ year_opened_rev_X1965                                                                          <dbl> …
## $ year_opened_rev_X1966                                                                          <dbl> …
## $ year_opened_rev_X1967                                                                          <dbl> …
## $ year_opened_rev_X1968                                                                          <dbl> …
## $ year_opened_rev_X1969                                                                          <dbl> …
## $ year_opened_rev_X1970                                                                          <dbl> …
## $ year_opened_rev_X1971                                                                          <dbl> …
## $ year_opened_rev_X1972                                                                          <dbl> …
## $ year_opened_rev_X1973                                                                          <dbl> …
## $ year_opened_rev_X1974                                                                          <dbl> …
## $ year_opened_rev_X1975                                                                          <dbl> …
## $ year_opened_rev_X1976                                                                          <dbl> …
## $ year_opened_rev_X1977                                                                          <dbl> …
## $ year_opened_rev_X1978                                                                          <dbl> …
## $ year_opened_rev_X1979                                                                          <dbl> …
## $ year_opened_rev_X1980                                                                          <dbl> …
## $ year_opened_rev_X1981                                                                          <dbl> …
## $ year_opened_rev_X1982                                                                          <dbl> …
## $ year_opened_rev_X1983                                                                          <dbl> …
## $ year_opened_rev_X1984                                                                          <dbl> …
## $ year_opened_rev_X1985                                                                          <dbl> …
## $ year_opened_rev_X1986                                                                          <dbl> …
## $ year_opened_rev_X1987                                                                          <dbl> …
## $ year_opened_rev_X1988                                                                          <dbl> …
## $ year_opened_rev_X1989                                                                          <dbl> …
## $ year_opened_rev_X1990                                                                          <dbl> …
## $ year_opened_rev_X1991                                                                          <dbl> …
## $ year_opened_rev_X1992                                                                          <dbl> …
## $ year_opened_rev_X1993                                                                          <dbl> …
## $ year_opened_rev_X1994                                                                          <dbl> …
## $ year_opened_rev_X1995                                                                          <dbl> …
## $ year_opened_rev_X1996                                                                          <dbl> …
## $ year_opened_rev_X1997                                                                          <dbl> …
## $ year_opened_rev_X1998                                                                          <dbl> …
## $ year_opened_rev_X1999                                                                          <dbl> …
## $ year_opened_rev_X2000                                                                          <dbl> …
## $ year_opened_rev_X2001                                                                          <dbl> …
## $ year_opened_rev_X2002                                                                          <dbl> …
## $ year_opened_rev_X2003                                                                          <dbl> …
## $ year_opened_rev_X2004                                                                          <dbl> …
## $ year_opened_rev_X2005                                                                          <dbl> …
## $ year_opened_rev_X2006                                                                          <dbl> …
## $ year_opened_rev_X2007                                                                          <dbl> …
## $ year_opened_rev_X2008                                                                          <dbl> …
## $ year_opened_rev_X2009                                                                          <dbl> …
## $ year_opened_rev_X2010                                                                          <dbl> …
## $ year_opened_rev_X2011                                                                          <dbl> …
## $ year_opened_rev_X2012                                                                          <dbl> …
## $ year_opened_rev_X2013                                                                          <dbl> …
## $ year_opened_rev_X2014                                                                          <dbl> …
## $ year_opened_rev_X2015                                                                          <dbl> …
## $ year_opened_rev_X2016                                                                          <dbl> …
## $ year_opened_rev_X2017                                                                          <dbl> …
## $ year_opened_rev_X2018                                                                          <dbl> …
## $ year_opened_rev_new                                                                            <dbl> …

Specify model

# Boosted-tree classifier on the xgboost engine. The number of trees, the
# number of predictors sampled at each split (mtry) and the learning rate are
# left as tune() placeholders to be filled in during grid tuning.
xgboost_spec <-
  boost_tree(
    mtry = tune(),
    trees = tune(),
    learn_rate = tune()
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

# Bundle the preprocessing recipe and the model specification into a single
# workflow so both are tuned and fitted together.
xgboost_workflow <- workflow(
  preprocessor = xgboost_rec,
  spec = xgboost_spec
)

Tune hyperparameters

# Register a parallel backend before tuning.
doParallel::registerDoParallel()

# Fix the RNG seed right before tuning, then evaluate 5 candidate parameter
# combinations on the cross-validation resamples. Out-of-sample predictions
# are kept (save_pred = TRUE) so they can be inspected after tuning.
set.seed(48291)
xgboost_tune <- xgboost_workflow %>%
  tune_grid(
    resamples = data_cv,
    grid = 5,
    control = control_grid(save_pred = TRUE)
  )
## i Creating pre-processing data to finalize unknown parameter: mtry
## Warning: !  The following columns have zero variance so scaling cannot be used:
##   village_town_or_city_new, governance_new, size_new, size_provenance_new,
##   primary_provenance_of_data_new, group_subgroup_new, and year_opened_rev_new.
## ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
##   before normalizing.
## Warning: package 'xgboost' was built under R version 4.3.3

Evaluate model

Tuning results

# Resampled performance of every tuning candidate.
xgboost_tune %>% collect_metrics()
## # A tibble: 15 × 9
##     mtry trees learn_rate .metric     .estimator  mean     n std_err .config    
##    <int> <int>      <dbl> <chr>       <chr>      <dbl> <int>   <dbl> <chr>      
##  1     9  1186    0.00128 accuracy    binary     0.830    10 0.00894 Preprocess…
##  2     9  1186    0.00128 brier_class binary     0.158    10 0.00229 Preprocess…
##  3     9  1186    0.00128 roc_auc     binary     0.894    10 0.00688 Preprocess…
##  4    69   388    0.133   accuracy    binary     0.846    10 0.00782 Preprocess…
##  5    69   388    0.133   brier_class binary     0.114    10 0.00491 Preprocess…
##  6    69   388    0.133   roc_auc     binary     0.913    10 0.00589 Preprocess…
##  7   174  1960    0.00657 accuracy    binary     0.855    10 0.00554 Preprocess…
##  8   174  1960    0.00657 brier_class binary     0.108    10 0.00403 Preprocess…
##  9   174  1960    0.00657 roc_auc     binary     0.918    10 0.00542 Preprocess…
## 10   265   777    0.0651  accuracy    binary     0.842    10 0.00826 Preprocess…
## 11   265   777    0.0651  brier_class binary     0.118    10 0.00511 Preprocess…
## 12   265   777    0.0651  roc_auc     binary     0.911    10 0.00555 Preprocess…
## 13   326  1334    0.0237  accuracy    binary     0.843    10 0.00818 Preprocess…
## 14   326  1334    0.0237  brier_class binary     0.113    10 0.00468 Preprocess…
## 15   326  1334    0.0237  roc_auc     binary     0.915    10 0.00521 Preprocess…
# Per-fold ROC curves for the selected candidate only.
# A tuning result holds predictions for every candidate configuration, so
# grouping by `id` alone would pool all candidates into each fold's curve.
# Filter to the candidate chosen by accuracy (the same criterion used for the
# final fit) before computing the curves.
xgboost_tune %>%
  collect_predictions(
    parameters = select_best(xgboost_tune, metric = "accuracy")
  ) %>%
  group_by(id) %>%
  roc_curve(accreditation, .pred_Accredited) %>%
  autoplot()

Fit the model for the last time

# Plug the candidate with the best accuracy into the workflow, then run the
# final fit on the initial split.
xgboost_last <- last_fit(
  finalize_workflow(
    xgboost_workflow,
    select_best(xgboost_tune, metric = "accuracy")
  ),
  split = data_split
)
## → A | warning: !  The following columns have zero variance so scaling cannot be used:
##                  village_town_or_city_new, governance_new, size_new, size_provenance_new,
##                  primary_provenance_of_data_new, group_subgroup_new, and year_opened_rev_new.
##                ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
##                  before normalizing.
## 
There were issues with some computations   A: x1

There were issues with some computations   A: x1
# Metrics of the final fit.
xgboost_last %>% collect_metrics()
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.837 Preprocessor1_Model1
## 2 roc_auc     binary         0.912 Preprocessor1_Model1
## 3 brier_class binary         0.118 Preprocessor1_Model1
# Confusion matrix of the final fit's class predictions against the true
# accreditation labels.
xgboost_last %>%
  collect_predictions() %>%
  yardstick::conf_mat(truth = accreditation, estimate = .pred_class) %>%
  autoplot()

Variable importance

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
# Pull the underlying engine fit out of the final result and plot its
# variable-importance scores.
vip(workflows::extract_fit_engine(xgboost_last))

Conclusion

The previous model, in Apply 7, had an accuracy of 0.815 and an AUC of 0.890. After cleaning more of the data, the model had an accuracy of 0.812 and an AUC of 0.896.