library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
# Import Data
museums <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(museums)
Data summary
Name |
museums |
Number of rows |
4191 |
Number of columns |
35 |
_______________________ |
|
Column type frequency: |
|
character |
24 |
numeric |
11 |
________________________ |
|
Group variables |
None |
Variable type: character
museum_id |
0 |
1.00 |
8 |
15 |
0 |
4191 |
0 |
Name_of_museum |
0 |
1.00 |
3 |
76 |
0 |
4190 |
0 |
Address_line_1 |
441 |
0.89 |
3 |
61 |
0 |
3212 |
0 |
Address_line_2 |
2816 |
0.33 |
3 |
39 |
0 |
1167 |
0 |
Village,_Town_or_City |
4 |
1.00 |
3 |
24 |
0 |
1696 |
0 |
Postcode |
0 |
1.00 |
6 |
9 |
0 |
3918 |
0 |
Admin_area |
0 |
1.00 |
12 |
137 |
0 |
393 |
0 |
Accreditation |
0 |
1.00 |
10 |
12 |
0 |
2 |
0 |
Governance |
0 |
1.00 |
7 |
41 |
0 |
13 |
0 |
Size |
0 |
1.00 |
4 |
7 |
0 |
5 |
0 |
Size_provenance |
179 |
0.96 |
2 |
29 |
0 |
16 |
0 |
Subject_Matter |
0 |
1.00 |
5 |
45 |
0 |
114 |
0 |
Year_opened |
0 |
1.00 |
9 |
9 |
0 |
351 |
0 |
Year_closed |
0 |
1.00 |
9 |
9 |
0 |
170 |
0 |
DOMUS_Subject_Matter |
2788 |
0.33 |
5 |
27 |
0 |
21 |
0 |
Primary_provenance_of_data |
0 |
1.00 |
3 |
8 |
0 |
18 |
0 |
Identifier_used_in_primary_data_source |
2056 |
0.51 |
2 |
8 |
0 |
2134 |
0 |
Area_Geodemographic_group |
49 |
0.99 |
11 |
40 |
0 |
17 |
0 |
Area_Geodemographic_group_code |
49 |
0.99 |
3 |
3 |
0 |
16 |
0 |
Area_Geodemographic_subgroup |
49 |
0.99 |
12 |
39 |
0 |
25 |
0 |
Area_Geodemographic_subgroup_code |
49 |
0.99 |
4 |
4 |
0 |
24 |
0 |
Area_Geodemographic_supergroup |
49 |
0.99 |
16 |
39 |
0 |
8 |
0 |
Area_Geodemographic_supergroup_code |
49 |
0.99 |
2 |
2 |
0 |
8 |
0 |
Notes |
2980 |
0.29 |
12 |
751 |
0 |
956 |
0 |
Variable type: numeric
Latitude |
0 |
1.00 |
52.93 |
2.09 |
49.18 |
51.48 |
52.47 |
53.96 |
100.00 |
▇▁▁▁▁ |
Longitude |
0 |
1.00 |
-1.96 |
1.84 |
-8.09 |
-3.10 |
-1.87 |
-0.48 |
1.76 |
▁▂▇▇▅ |
DOMUS_identifier |
2347 |
0.44 |
1303.45 |
1597.19 |
1.00 |
486.50 |
991.50 |
1470.25 |
7746.00 |
▇▂▁▁▁ |
Area_Deprivation_index |
49 |
0.99 |
5.44 |
2.48 |
1.00 |
4.00 |
5.00 |
7.00 |
10.00 |
▃▆▇▆▃ |
Area_Deprivation_index_crime |
49 |
0.99 |
5.43 |
3.07 |
1.00 |
3.00 |
6.00 |
8.00 |
10.00 |
▇▆▅▇▇ |
Area_Deprivation_index_education |
49 |
0.99 |
6.04 |
2.61 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▃▅▇▇▆ |
Area_Deprivation_index_employment |
49 |
0.99 |
6.08 |
2.76 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▅▆▇▇▇ |
Area_Deprivation_index_health |
49 |
0.99 |
6.02 |
2.82 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▅▆▆▇▇ |
Area_Deprivation_index_housing |
49 |
0.99 |
3.97 |
2.75 |
1.00 |
1.00 |
3.00 |
6.00 |
10.00 |
▇▅▃▂▂ |
Area_Deprivation_index_income |
49 |
0.99 |
5.99 |
2.62 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▃▆▇▇▆ |
Area_Deprivation_index_services |
49 |
0.99 |
4.78 |
3.01 |
1.00 |
2.00 |
4.00 |
7.00 |
10.00 |
▇▅▅▅▅ |
missing values Addressline_2, Addressline_1, DOMUS_Subject_Matter,
DOMUS_Identifier, Notes factors or numeric variables Zero Variance
variables Character variables Unbalanced target variables id variable
museum_id
museums %>% count(Accreditation)
## # A tibble: 2 × 2
## Accreditation n
## <chr> <int>
## 1 Accredited 1720
## 2 Unaccredited 2471
museums %>%
ggplot(aes(Accreditation)) +
geom_bar()

data <- museums %>%
select(-Address_line_1, -Address_line_2, -DOMUS_Subject_Matter,-DOMUS_identifier, -Notes, -Identifier_used_in_primary_data_source, -Area_Geodemographic_supergroup_code, -Area_Geodemographic_group_code, -Area_Geodemographic_subgroup_code, -museum_id, -Area_Geodemographic_group, -Year_closed, -Year_opened) %>%
na.omit() %>%
janitor::clean_names()
skimr::skim(data)
Data summary
Name |
data |
Number of rows |
3966 |
Number of columns |
22 |
_______________________ |
|
Column type frequency: |
|
character |
12 |
numeric |
10 |
________________________ |
|
Group variables |
None |
Variable type: character
name_of_museum |
0 |
1 |
3 |
76 |
0 |
3965 |
0 |
village_town_or_city |
0 |
1 |
3 |
24 |
0 |
1639 |
0 |
postcode |
0 |
1 |
6 |
9 |
0 |
3725 |
0 |
admin_area |
0 |
1 |
16 |
137 |
0 |
392 |
0 |
accreditation |
0 |
1 |
10 |
12 |
0 |
2 |
0 |
governance |
0 |
1 |
7 |
41 |
0 |
13 |
0 |
size |
0 |
1 |
4 |
7 |
0 |
5 |
0 |
size_provenance |
0 |
1 |
2 |
29 |
0 |
16 |
0 |
subject_matter |
0 |
1 |
5 |
45 |
0 |
112 |
0 |
primary_provenance_of_data |
0 |
1 |
3 |
8 |
0 |
17 |
0 |
area_geodemographic_subgroup |
0 |
1 |
12 |
39 |
0 |
25 |
0 |
area_geodemographic_supergroup |
0 |
1 |
16 |
39 |
0 |
8 |
0 |
Variable type: numeric
latitude |
0 |
1 |
52.93 |
1.95 |
49.20 |
51.48 |
52.46 |
53.94 |
60.79 |
▅▇▃▁▁ |
longitude |
0 |
1 |
-1.94 |
1.83 |
-8.09 |
-3.09 |
-1.86 |
-0.47 |
1.76 |
▁▂▇▇▅ |
area_deprivation_index |
0 |
1 |
5.46 |
2.48 |
1.00 |
4.00 |
5.00 |
7.00 |
10.00 |
▃▆▇▆▃ |
area_deprivation_index_crime |
0 |
1 |
5.43 |
3.07 |
1.00 |
3.00 |
6.00 |
8.00 |
10.00 |
▇▆▅▆▇ |
area_deprivation_index_education |
0 |
1 |
6.05 |
2.61 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▃▅▇▇▆ |
area_deprivation_index_employment |
0 |
1 |
6.08 |
2.77 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▅▆▇▇▇ |
area_deprivation_index_health |
0 |
1 |
6.02 |
2.82 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▅▆▆▇▇ |
area_deprivation_index_housing |
0 |
1 |
3.99 |
2.76 |
1.00 |
1.00 |
3.00 |
6.00 |
10.00 |
▇▅▃▃▂ |
area_deprivation_index_income |
0 |
1 |
6.00 |
2.63 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▃▆▇▇▆ |
area_deprivation_index_services |
0 |
1 |
4.79 |
3.01 |
1.00 |
2.00 |
4.00 |
8.00 |
10.00 |
▇▅▅▅▅ |
data %>%
ggplot(aes(accreditation, area_deprivation_index_employment)) +
geom_boxplot()

data %>%
ggplot(aes(accreditation, area_deprivation_index_crime)) +
geom_boxplot()

data %>%
ggplot(aes(accreditation, area_deprivation_index_health)) +
geom_boxplot()

data_binarized <- data %>%
select(-name_of_museum) %>%
binarize()
data_binarized %>% glimpse()
## Rows: 3,966
## Columns: 144
## $ village_town_or_city__Edinburgh <dbl> …
## $ village_town_or_city__London <dbl> …
## $ `village_town_or_city__-OTHER` <dbl> …
## $ postcode__SO23_8TS <dbl> …
## $ `postcode__-OTHER` <dbl> …
## $ `latitude__-Inf_51.48120725` <dbl> …
## $ latitude__51.48120725_52.4554475 <dbl> …
## $ latitude__52.4554475_53.9431025 <dbl> …
## $ latitude__53.9431025_Inf <dbl> …
## $ `longitude__-Inf_-3.0854455` <dbl> …
## $ `longitude__-3.0854455_-1.8564615` <dbl> …
## $ `longitude__-1.8564615_-0.469075` <dbl> …
## $ `longitude__-0.469075_Inf` <dbl> …
## $ `admin_area__/England/London_(English_Region)/Westminster_(London_Borough)` <dbl> …
## $ `admin_area__/England/South_West_(English_Region)/Cornwall_(English_UA)` <dbl> …
## $ `admin_area__/England/South_West_(English_Region)/Wiltshire_(English_UA)` <dbl> …
## $ `admin_area__/Scotland/City_of_Edinburgh_(Scottish_Council_Area)` <dbl> …
## $ `admin_area__/Scotland/Dumfries_and_Galloway_(Scottish_Council_Area)` <dbl> …
## $ `admin_area__/Scotland/Highland_(Scottish_Council_Area)` <dbl> …
## $ `admin_area__-OTHER` <dbl> …
## $ accreditation__Accredited <dbl> …
## $ accreditation__Unaccredited <dbl> …
## $ `governance__Government-Local_Authority` <dbl> …
## $ `governance__Government-National` <dbl> …
## $ `governance__Independent-English_Heritage` <dbl> …
## $ `governance__Independent-National_Trust` <dbl> …
## $ `governance__Independent-Not_for_profit` <dbl> …
## $ `governance__Independent-Private` <dbl> …
## $ `governance__Independent-Unknown` <dbl> …
## $ governance__University <dbl> …
## $ governance__Unknown <dbl> …
## $ `governance__-OTHER` <dbl> …
## $ size__large <dbl> …
## $ size__medium <dbl> …
## $ size__small <dbl> …
## $ size__unknown <dbl> …
## $ `size__-OTHER` <dbl> …
## $ size_provenance__ace_size_designation <dbl> …
## $ size_provenance__aim_size_designation <dbl> …
## $ size_provenance__domus <dbl> …
## $ `size_provenance__ma(fam)` <dbl> …
## $ size_provenance__mm_manual_estimate_2018 <dbl> …
## $ size_provenance__mm_prediction_random_forest <dbl> …
## $ size_provenance__scottish_national_audit <dbl> …
## $ size_provenance__unknown <dbl> …
## $ size_provenance__visitbritain <dbl> …
## $ `size_provenance__-OTHER` <dbl> …
## $ `subject_matter__Archaeology-Roman` <dbl> …
## $ `subject_matter__Arts-Fine_and_decorative_arts` <dbl> …
## $ `subject_matter__Buildings-Houses-Large_houses` <dbl> …
## $ `subject_matter__Buildings-Houses-Medium_houses` <dbl> …
## $ `subject_matter__Industry_and_manufacture-Mining_and_quarrying` <dbl> …
## $ `subject_matter__Leisure_and_sport-Toys_and_models` <dbl> …
## $ subject_matter__Local_Histories <dbl> …
## $ `subject_matter__Mixed-Encyclopaedic` <dbl> …
## $ `subject_matter__Mixed-Other` <dbl> …
## $ subject_matter__Other <dbl> …
## $ `subject_matter__Personality-Literary` <dbl> …
## $ `subject_matter__Rural_Industry-Farming` <dbl> …
## $ `subject_matter__Sea_and_seafaring-Boats_and_ships` <dbl> …
## $ `subject_matter__Sea_and_seafaring-Mixed` <dbl> …
## $ `subject_matter__Transport-Cars_and_motorbikes` <dbl> …
## $ `subject_matter__Transport-Trains_and_railways` <dbl> …
## $ `subject_matter__War_and_conflict-Airforce` <dbl> …
## $ `subject_matter__War_and_conflict-Castles_and_forts` <dbl> …
## $ `subject_matter__War_and_conflict-Military` <dbl> …
## $ `subject_matter__War_and_conflict-Regiment` <dbl> …
## $ `subject_matter__-OTHER` <dbl> …
## $ primary_provenance_of_data__ace <dbl> …
## $ primary_provenance_of_data__aim <dbl> …
## $ primary_provenance_of_data__aim82M <dbl> …
## $ primary_provenance_of_data__aim82NM <dbl> …
## $ primary_provenance_of_data__domus <dbl> …
## $ primary_provenance_of_data__fcm <dbl> …
## $ primary_provenance_of_data__hha <dbl> …
## $ primary_provenance_of_data__mald <dbl> …
## $ primary_provenance_of_data__mgs <dbl> …
## $ primary_provenance_of_data__misc <dbl> …
## $ primary_provenance_of_data__musassoc <dbl> …
## $ primary_provenance_of_data__wiki <dbl> …
## $ `primary_provenance_of_data__-OTHER` <dbl> …
## $ `area_deprivation_index__-Inf_4` <dbl> …
## $ area_deprivation_index__4_5 <dbl> …
## $ area_deprivation_index__5_7 <dbl> …
## $ area_deprivation_index__7_Inf <dbl> …
## $ `area_deprivation_index_crime__-Inf_3` <dbl> …
## $ area_deprivation_index_crime__3_6 <dbl> …
## $ area_deprivation_index_crime__6_8 <dbl> …
## $ area_deprivation_index_crime__8_Inf <dbl> …
## $ `area_deprivation_index_education__-Inf_4` <dbl> …
## $ area_deprivation_index_education__4_6 <dbl> …
## $ area_deprivation_index_education__6_8 <dbl> …
## $ area_deprivation_index_education__8_Inf <dbl> …
## $ `area_deprivation_index_employment__-Inf_4` <dbl> …
## $ area_deprivation_index_employment__4_6 <dbl> …
## $ area_deprivation_index_employment__6_8 <dbl> …
## $ area_deprivation_index_employment__8_Inf <dbl> …
## $ `area_deprivation_index_health__-Inf_4` <dbl> …
## $ area_deprivation_index_health__4_6 <dbl> …
## $ area_deprivation_index_health__6_8 <dbl> …
## $ area_deprivation_index_health__8_Inf <dbl> …
## $ `area_deprivation_index_housing__-Inf_3` <dbl> …
## $ area_deprivation_index_housing__3_6 <dbl> …
## $ area_deprivation_index_housing__6_Inf <dbl> …
## $ `area_deprivation_index_income__-Inf_4` <dbl> …
## $ area_deprivation_index_income__4_6 <dbl> …
## $ area_deprivation_index_income__6_8 <dbl> …
## $ area_deprivation_index_income__8_Inf <dbl> …
## $ `area_deprivation_index_services__-Inf_2` <dbl> …
## $ area_deprivation_index_services__2_4 <dbl> …
## $ area_deprivation_index_services__4_8 <dbl> …
## $ area_deprivation_index_services__8_Inf <dbl> …
## $ area_geodemographic_subgroup__Affluent_rural <dbl> …
## $ area_geodemographic_subgroup__Ageing_Coastal_Living <dbl> …
## $ area_geodemographic_subgroup__City_Periphery <dbl> …
## $ area_geodemographic_subgroup__Country_Living <dbl> …
## $ area_geodemographic_subgroup__Ethnically_Diverse_Metropolitan_Living <dbl> …
## $ area_geodemographic_subgroup__Expanding_Areas <dbl> …
## $ `area_geodemographic_subgroup__Industrial_and_Multi-ethnic` <dbl> …
## $ area_geodemographic_subgroup__Larger_Towns_and_Cities <dbl> …
## $ area_geodemographic_subgroup__London_Cosmopolitan <dbl> …
## $ area_geodemographic_subgroup__Manufacturing_Legacy <dbl> …
## $ area_geodemographic_subgroup__Mining_Legacy <dbl> …
## $ area_geodemographic_subgroup__Northern_Ireland_Countryside <dbl> …
## $ area_geodemographic_subgroup__Older_Farming_Communities <dbl> …
## $ area_geodemographic_subgroup__Prosperous_Towns <dbl> …
## $ area_geodemographic_subgroup__Rural_Growth_Areas <dbl> …
## $ `area_geodemographic_subgroup__Rural-Urban_Fringe` <dbl> …
## $ area_geodemographic_subgroup__Scottish_Countryside <dbl> …
## $ area_geodemographic_subgroup__Scottish_Industrial_Legacy <dbl> …
## $ area_geodemographic_subgroup__Seaside_Living <dbl> …
## $ area_geodemographic_subgroup__Service_Economy <dbl> …
## $ area_geodemographic_subgroup__Sparse_English_and_Welsh_Countryside <dbl> …
## $ area_geodemographic_subgroup__University_Towns_and_Cities <dbl> …
## $ area_geodemographic_subgroup__Urban_Living <dbl> …
## $ `area_geodemographic_subgroup__-OTHER` <dbl> …
## $ area_geodemographic_supergroup__Affluent_England <dbl> …
## $ area_geodemographic_supergroup__Business_Education_and_Heritage_Centres <dbl> …
## $ area_geodemographic_supergroup__Countryside_Living <dbl> …
## $ area_geodemographic_supergroup__Ethnically_Diverse_Metropolitan_Living <dbl> …
## $ area_geodemographic_supergroup__London_Cosmopolitan <dbl> …
## $ area_geodemographic_supergroup__Services_and_Industrial_Legacy <dbl> …
## $ area_geodemographic_supergroup__Town_and_Country_Living <dbl> …
## $ area_geodemographic_supergroup__Urban_Settlements <dbl> …
data_correlate <- data_binarized %>%
correlate(accreditation__Accredited)
data_correlate %>%
correlationfunnel::plot_correlation_funnel()
## Warning: ggrepel: 87 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Model
Split Data
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.1.0
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'recipes' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(usemodels)
set.seed(1123)
# data_clean <- data %>% sample_n(100)
data_split <- initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [2676/298]> Fold01
## 2 <split [2676/298]> Fold02
## 3 <split [2676/298]> Fold03
## 4 <split [2676/298]> Fold04
## 5 <split [2677/297]> Fold05
## 6 <split [2677/297]> Fold06
## 7 <split [2677/297]> Fold07
## 8 <split [2677/297]> Fold08
## 9 <split [2677/297]> Fold09
## 10 <split [2677/297]> Fold10
Preprocess data
library(themis)
library(textrecipes)
skimr::skim(data)
Data summary
Name |
data |
Number of rows |
3966 |
Number of columns |
22 |
_______________________ |
|
Column type frequency: |
|
character |
12 |
numeric |
10 |
________________________ |
|
Group variables |
None |
Variable type: character
name_of_museum |
0 |
1 |
3 |
76 |
0 |
3965 |
0 |
village_town_or_city |
0 |
1 |
3 |
24 |
0 |
1639 |
0 |
postcode |
0 |
1 |
6 |
9 |
0 |
3725 |
0 |
admin_area |
0 |
1 |
16 |
137 |
0 |
392 |
0 |
accreditation |
0 |
1 |
10 |
12 |
0 |
2 |
0 |
governance |
0 |
1 |
7 |
41 |
0 |
13 |
0 |
size |
0 |
1 |
4 |
7 |
0 |
5 |
0 |
size_provenance |
0 |
1 |
2 |
29 |
0 |
16 |
0 |
subject_matter |
0 |
1 |
5 |
45 |
0 |
112 |
0 |
primary_provenance_of_data |
0 |
1 |
3 |
8 |
0 |
17 |
0 |
area_geodemographic_subgroup |
0 |
1 |
12 |
39 |
0 |
25 |
0 |
area_geodemographic_supergroup |
0 |
1 |
16 |
39 |
0 |
8 |
0 |
Variable type: numeric
latitude |
0 |
1 |
52.93 |
1.95 |
49.20 |
51.48 |
52.46 |
53.94 |
60.79 |
▅▇▃▁▁ |
longitude |
0 |
1 |
-1.94 |
1.83 |
-8.09 |
-3.09 |
-1.86 |
-0.47 |
1.76 |
▁▂▇▇▅ |
area_deprivation_index |
0 |
1 |
5.46 |
2.48 |
1.00 |
4.00 |
5.00 |
7.00 |
10.00 |
▃▆▇▆▃ |
area_deprivation_index_crime |
0 |
1 |
5.43 |
3.07 |
1.00 |
3.00 |
6.00 |
8.00 |
10.00 |
▇▆▅▆▇ |
area_deprivation_index_education |
0 |
1 |
6.05 |
2.61 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▃▅▇▇▆ |
area_deprivation_index_employment |
0 |
1 |
6.08 |
2.77 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▅▆▇▇▇ |
area_deprivation_index_health |
0 |
1 |
6.02 |
2.82 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▅▆▆▇▇ |
area_deprivation_index_housing |
0 |
1 |
3.99 |
2.76 |
1.00 |
1.00 |
3.00 |
6.00 |
10.00 |
▇▅▃▃▂ |
area_deprivation_index_income |
0 |
1 |
6.00 |
2.63 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▃▆▇▇▆ |
area_deprivation_index_services |
0 |
1 |
4.79 |
3.01 |
1.00 |
2.00 |
4.00 |
8.00 |
10.00 |
▇▅▅▅▅ |
xgboost_rec <- recipes::recipe(accreditation ~ ., data = data_train) %>%
update_role(name_of_museum, new_role = "ID") %>%
step_tokenize(admin_area) %>%
step_tokenfilter(admin_area, max_tokens = 75) %>%
step_tf(admin_area) %>%
step_other(village_town_or_city, postcode, subject_matter) %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_smote(accreditation)
xgboost_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 3,368
## Columns: 177
## $ name_of_museum <fct> …
## $ latitude <dbl> …
## $ longitude <dbl> …
## $ area_deprivation_index <dbl> …
## $ area_deprivation_index_crime <dbl> …
## $ area_deprivation_index_education <dbl> …
## $ area_deprivation_index_employment <dbl> …
## $ area_deprivation_index_health <dbl> …
## $ area_deprivation_index_housing <dbl> …
## $ area_deprivation_index_income <dbl> …
## $ area_deprivation_index_services <dbl> …
## $ accreditation <fct> …
## $ tf_admin_area_aberdeenshire <dbl> …
## $ tf_admin_area_and <dbl> …
## $ tf_admin_area_area <dbl> …
## $ tf_admin_area_ayrshire <dbl> …
## $ tf_admin_area_borough <dbl> …
## $ tf_admin_area_ca <dbl> …
## $ tf_admin_area_cambridgeshire <dbl> …
## $ tf_admin_area_cheshire <dbl> …
## $ tf_admin_area_city <dbl> …
## $ tf_admin_area_cornwall <dbl> …
## $ tf_admin_area_council <dbl> …
## $ tf_admin_area_county <dbl> …
## $ tf_admin_area_cumbria <dbl> …
## $ tf_admin_area_derbyshire <dbl> …
## $ tf_admin_area_devon <dbl> …
## $ tf_admin_area_district <dbl> …
## $ tf_admin_area_dorset <dbl> …
## $ tf_admin_area_dumfries <dbl> …
## $ tf_admin_area_east <dbl> …
## $ tf_admin_area_edinburgh <dbl> …
## $ tf_admin_area_england <dbl> …
## $ tf_admin_area_english <dbl> …
## $ tf_admin_area_essex <dbl> …
## $ tf_admin_area_forest <dbl> …
## $ tf_admin_area_galloway <dbl> …
## $ tf_admin_area_gloucestershire <dbl> …
## $ tf_admin_area_gov <dbl> …
## $ tf_admin_area_greater <dbl> …
## $ tf_admin_area_hampshire <dbl> …
## $ tf_admin_area_hertfordshire <dbl> …
## $ tf_admin_area_highland <dbl> …
## $ tf_admin_area_humber <dbl> …
## $ tf_admin_area_ireland <dbl> …
## $ tf_admin_area_isle <dbl> …
## $ tf_admin_area_kent <dbl> …
## $ tf_admin_area_lancashire <dbl> …
## $ tf_admin_area_leicestershire <dbl> …
## $ tf_admin_area_lincolnshire <dbl> …
## $ tf_admin_area_liverpool <dbl> …
## $ tf_admin_area_loc <dbl> …
## $ tf_admin_area_london <dbl> …
## $ tf_admin_area_manchester <dbl> …
## $ tf_admin_area_mid <dbl> …
## $ tf_admin_area_midlands <dbl> …
## $ tf_admin_area_ni <dbl> …
## $ tf_admin_area_norfolk <dbl> …
## $ tf_admin_area_north <dbl> …
## $ tf_admin_area_northamptonshire <dbl> …
## $ tf_admin_area_northern <dbl> …
## $ tf_admin_area_of <dbl> …
## $ tf_admin_area_on <dbl> …
## $ tf_admin_area_or <dbl> …
## $ tf_admin_area_oxfordshire <dbl> …
## $ tf_admin_area_peterborough <dbl> …
## $ tf_admin_area_region <dbl> …
## $ tf_admin_area_scotland <dbl> …
## $ tf_admin_area_scottish <dbl> …
## $ tf_admin_area_sheffield <dbl> …
## $ tf_admin_area_somerset <dbl> …
## $ tf_admin_area_south <dbl> …
## $ tf_admin_area_staffordshire <dbl> …
## $ tf_admin_area_suffolk <dbl> …
## $ tf_admin_area_surrey <dbl> …
## $ tf_admin_area_sussex <dbl> …
## $ tf_admin_area_the <dbl> …
## $ tf_admin_area_ua <dbl> …
## $ tf_admin_area_valley <dbl> …
## $ tf_admin_area_wales <dbl> …
## $ tf_admin_area_warwickshire <dbl> …
## $ tf_admin_area_welsh <dbl> …
## $ tf_admin_area_west <dbl> …
## $ tf_admin_area_westminster <dbl> …
## $ tf_admin_area_wight <dbl> …
## $ tf_admin_area_wiltshire <dbl> …
## $ tf_admin_area_yorkshire <dbl> …
## $ village_town_or_city_other <dbl> …
## $ village_town_or_city_new <dbl> …
## $ postcode_other <dbl> …
## $ postcode_new <dbl> …
## $ governance_Government.Local_Authority <dbl> …
## $ governance_Government.National <dbl> …
## $ governance_Government.Other <dbl> …
## $ governance_Independent.English_Heritage <dbl> …
## $ governance_Independent.Historic_Environment_Scotland <dbl> …
## $ governance_Independent.National_Trust <dbl> …
## $ governance_Independent.National_Trust_for_Scotland <dbl> …
## $ governance_Independent.Not_for_profit <dbl> …
## $ governance_Independent.Private <dbl> …
## $ governance_Independent.Unknown <dbl> …
## $ governance_University <dbl> …
## $ governance_Unknown <dbl> …
## $ governance_new <dbl> …
## $ size_large <dbl> …
## $ size_medium <dbl> …
## $ size_small <dbl> …
## $ size_unknown <dbl> …
## $ size_new <dbl> …
## $ size_provenance_aim_size_designation <dbl> …
## $ size_provenance_babbidge_ewles_and_smith_2006 <dbl> …
## $ size_provenance_domus <dbl> …
## $ size_provenance_ma.fam. <dbl> …
## $ size_provenance_ma.fam._year_stated <dbl> …
## $ size_provenance_ma.fam2. <dbl> …
## $ size_provenance_mm <dbl> …
## $ size_provenance_mm_manual_estimate_2018 <dbl> …
## $ size_provenance_mm_prediction_random_forest <dbl> …
## $ size_provenance_national_trust_annual_report_ <dbl> …
## $ size_provenance_nilm.vn. <dbl> …
## $ size_provenance_scottish_national_audit <dbl> …
## $ size_provenance_unknown <dbl> …
## $ size_provenance_visitbritain <dbl> …
## $ size_provenance_new <dbl> …
## $ subject_matter_Local_Histories <dbl> …
## $ subject_matter_other <dbl> …
## $ subject_matter_new <dbl> …
## $ primary_provenance_of_data_aim <dbl> …
## $ primary_provenance_of_data_aim82M <dbl> …
## $ primary_provenance_of_data_aim82NM <dbl> …
## $ primary_provenance_of_data_domus <dbl> …
## $ primary_provenance_of_data_fcm <dbl> …
## $ primary_provenance_of_data_hha <dbl> …
## $ primary_provenance_of_data_hud <dbl> …
## $ primary_provenance_of_data_mald <dbl> …
## $ primary_provenance_of_data_MDN <dbl> …
## $ primary_provenance_of_data_mgs <dbl> …
## $ primary_provenance_of_data_misc <dbl> …
## $ primary_provenance_of_data_Misc <dbl> …
## $ primary_provenance_of_data_musassoc <dbl> …
## $ primary_provenance_of_data_MusCal <dbl> …
## $ primary_provenance_of_data_nimc <dbl> …
## $ primary_provenance_of_data_wiki <dbl> …
## $ primary_provenance_of_data_new <dbl> …
## $ area_geodemographic_subgroup_Ageing.Coastal.Living <dbl> …
## $ area_geodemographic_subgroup_City.Periphery <dbl> …
## $ area_geodemographic_subgroup_Country.Living <dbl> …
## $ area_geodemographic_subgroup_Ethnically.Diverse.Metropolitan..Living <dbl> …
## $ area_geodemographic_subgroup_Ethnically.Diverse.Metropolitan.Living <dbl> …
## $ area_geodemographic_subgroup_Expanding.Areas <dbl> …
## $ area_geodemographic_subgroup_Industrial.and.Multi.ethnic <dbl> …
## $ area_geodemographic_subgroup_Larger.Towns.and.Cities <dbl> …
## $ area_geodemographic_subgroup_London.Cosmopolitan <dbl> …
## $ area_geodemographic_subgroup_Manufacturing.Legacy <dbl> …
## $ area_geodemographic_subgroup_Mining.Legacy <dbl> …
## $ area_geodemographic_subgroup_Northern.Ireland.Countryside <dbl> …
## $ area_geodemographic_subgroup_Older.Farming.Communities <dbl> …
## $ area_geodemographic_subgroup_Prosperous.Semi.rural <dbl> …
## $ area_geodemographic_subgroup_Prosperous.Towns <dbl> …
## $ area_geodemographic_subgroup_Rural.Growth.Areas <dbl> …
## $ area_geodemographic_subgroup_Rural.Urban.Fringe <dbl> …
## $ area_geodemographic_subgroup_Scottish.Countryside <dbl> …
## $ area_geodemographic_subgroup_Scottish.Industrial.Legacy <dbl> …
## $ area_geodemographic_subgroup_Seaside.Living <dbl> …
## $ area_geodemographic_subgroup_Service.Economy <dbl> …
## $ area_geodemographic_subgroup_Sparse.English.and.Welsh.Countryside <dbl> …
## $ area_geodemographic_subgroup_University.Towns.and.Cities <dbl> …
## $ area_geodemographic_subgroup_Urban.Living <dbl> …
## $ area_geodemographic_subgroup_new <dbl> …
## $ area_geodemographic_supergroup_Business.Education.and.Heritage.Centres <dbl> …
## $ area_geodemographic_supergroup_Countryside.Living <dbl> …
## $ area_geodemographic_supergroup_Ethnically.Diverse.Metropolitan.Living <dbl> …
## $ area_geodemographic_supergroup_London.Cosmopolitan <dbl> …
## $ area_geodemographic_supergroup_Services.and.Industrial.Legacy <dbl> …
## $ area_geodemographic_supergroup_Town.and.Country.Living <dbl> …
## $ area_geodemographic_supergroup_Urban.Settlements <dbl> …
## $ area_geodemographic_supergroup_new <dbl> …
Specify model
xgboost_spec <-
boost_tree(trees = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_rec) %>%
add_model(xgboost_spec)
Tune hyperparameters
doParallel::registerDoParallel
## function (cl, cores = NULL, ...)
## {
## opts <- list(...)
## optnames <- names(opts)
## if (is.null(optnames))
## optnames <- rep("", length(opts))
## unnamed <- !nzchar(optnames)
## if (any(unnamed)) {
## warning("ignoring doParallel package option(s) specified with unnamed argument")
## opts <- opts[!unnamed]
## optnames <- optnames[!unnamed]
## }
## recog <- optnames %in% c("nocompile")
## if (any(!recog)) {
## warning(sprintf("ignoring unrecognized doParallel package option(s): %s",
## paste(optnames[!recog], collapse = ", ")), call. = FALSE)
## opts <- opts[recog]
## optnames <- optnames[recog]
## }
## old.optnames <- ls(.options, all.names = TRUE)
## rm(list = old.optnames, pos = .options)
## for (i in seq_along(opts)) {
## assign(optnames[i], opts[[i]], pos = .options)
## }
## if (missing(cl) || is.numeric(cl)) {
## if (.Platform$OS.type == "windows") {
## if (!missing(cl) && is.numeric(cl)) {
## cl <- makeCluster(cl)
## }
## else {
## if (!missing(cores) && is.numeric(cores)) {
## cl <- makeCluster(cores)
## }
## else {
## cl <- makeCluster(3)
## }
## }
## assign(".revoDoParCluster", cl, pos = .options)
## reg.finalizer(.options, function(e) {
## stopImplicitCluster()
## }, onexit = TRUE)
## setDoPar(doParallelSNOW, cl, snowinfo)
## }
## else {
## if (!missing(cl) && is.numeric(cl)) {
## cores <- cl
## }
## setDoPar(doParallelMC, cores, mcinfo)
## }
## }
## else {
## setDoPar(doParallelSNOW, cl, snowinfo)
## }
## }
## <bytecode: 0x13a301de0>
## <environment: namespace:doParallel>
set.seed(48291)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5,
control = control_grid(save_pred = TRUE))
## Warning: package 'xgboost' was built under R version 4.3.3
Evaluate model
Tune Hyperparameters
collect_metrics(xgboost_tune)
## # A tibble: 15 × 7
## trees .metric .estimator mean n std_err .config
## <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 241 accuracy binary 0.825 10 0.00754 Preprocessor1_Model1
## 2 241 brier_class binary 0.134 10 0.00626 Preprocessor1_Model1
## 3 241 roc_auc binary 0.900 10 0.00702 Preprocessor1_Model1
## 4 784 accuracy binary 0.822 10 0.00796 Preprocessor1_Model2
## 5 784 brier_class binary 0.149 10 0.00712 Preprocessor1_Model2
## 6 784 roc_auc binary 0.890 10 0.00750 Preprocessor1_Model2
## 7 851 accuracy binary 0.820 10 0.00812 Preprocessor1_Model3
## 8 851 brier_class binary 0.149 10 0.00708 Preprocessor1_Model3
## 9 851 roc_auc binary 0.890 10 0.00754 Preprocessor1_Model3
## 10 1554 accuracy binary 0.819 10 0.00893 Preprocessor1_Model4
## 11 1554 brier_class binary 0.154 10 0.00722 Preprocessor1_Model4
## 12 1554 roc_auc binary 0.887 10 0.00754 Preprocessor1_Model4
## 13 1608 accuracy binary 0.818 10 0.00876 Preprocessor1_Model5
## 14 1608 brier_class binary 0.154 10 0.00725 Preprocessor1_Model5
## 15 1608 roc_auc binary 0.887 10 0.00758 Preprocessor1_Model5
collect_predictions(xgboost_tune) %>%
group_by(id) %>%
roc_curve(accreditation, .pred_Accredited) %>%
autoplot()

Fit the model for the last time
xgboost_last <- xgboost_workflow %>%
finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
last_fit(data_split)
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.815 Preprocessor1_Model1
## 2 roc_auc binary 0.890 Preprocessor1_Model1
## 3 brier_class binary 0.143 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
yardstick::conf_mat(accreditation, .pred_class) %>%
autoplot()

Variable importance
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
xgboost_last %>%
workflows::extract_fit_engine() %>%
vip()
