library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ correlationfunnel Tip #1 ════════════════════════════════════════════════════
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)
# Import Data
# Museums dataset from the TidyTuesday 2022-11-22 folder on GitHub.
museums_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv"
museums <- readr::read_csv(file = museums_url)
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a per-column summary of the raw data (types, missing counts,
# distribution statistics) to decide which columns to keep.
skimr::skim(museums)
Data summary
Name |
museums |
Number of rows |
4191 |
Number of columns |
35 |
_______________________ |
|
Column type frequency: |
|
character |
24 |
numeric |
11 |
________________________ |
|
Group variables |
None |
Variable type: character
museum_id |
0 |
1.00 |
8 |
15 |
0 |
4191 |
0 |
Name_of_museum |
0 |
1.00 |
3 |
76 |
0 |
4190 |
0 |
Address_line_1 |
441 |
0.89 |
3 |
61 |
0 |
3212 |
0 |
Address_line_2 |
2816 |
0.33 |
3 |
39 |
0 |
1167 |
0 |
Village,_Town_or_City |
4 |
1.00 |
3 |
24 |
0 |
1696 |
0 |
Postcode |
0 |
1.00 |
6 |
9 |
0 |
3918 |
0 |
Admin_area |
0 |
1.00 |
12 |
137 |
0 |
393 |
0 |
Accreditation |
0 |
1.00 |
10 |
12 |
0 |
2 |
0 |
Governance |
0 |
1.00 |
7 |
41 |
0 |
13 |
0 |
Size |
0 |
1.00 |
4 |
7 |
0 |
5 |
0 |
Size_provenance |
179 |
0.96 |
2 |
29 |
0 |
16 |
0 |
Subject_Matter |
0 |
1.00 |
5 |
45 |
0 |
114 |
0 |
Year_opened |
0 |
1.00 |
9 |
9 |
0 |
351 |
0 |
Year_closed |
0 |
1.00 |
9 |
9 |
0 |
170 |
0 |
DOMUS_Subject_Matter |
2788 |
0.33 |
5 |
27 |
0 |
21 |
0 |
Primary_provenance_of_data |
0 |
1.00 |
3 |
8 |
0 |
18 |
0 |
Identifier_used_in_primary_data_source |
2056 |
0.51 |
2 |
8 |
0 |
2134 |
0 |
Area_Geodemographic_group |
49 |
0.99 |
11 |
40 |
0 |
17 |
0 |
Area_Geodemographic_group_code |
49 |
0.99 |
3 |
3 |
0 |
16 |
0 |
Area_Geodemographic_subgroup |
49 |
0.99 |
12 |
39 |
0 |
25 |
0 |
Area_Geodemographic_subgroup_code |
49 |
0.99 |
4 |
4 |
0 |
24 |
0 |
Area_Geodemographic_supergroup |
49 |
0.99 |
16 |
39 |
0 |
8 |
0 |
Area_Geodemographic_supergroup_code |
49 |
0.99 |
2 |
2 |
0 |
8 |
0 |
Notes |
2980 |
0.29 |
12 |
751 |
0 |
956 |
0 |
Variable type: numeric
Latitude |
0 |
1.00 |
52.93 |
2.09 |
49.18 |
51.48 |
52.47 |
53.96 |
100.00 |
▇▁▁▁▁ |
Longitude |
0 |
1.00 |
-1.96 |
1.84 |
-8.09 |
-3.10 |
-1.87 |
-0.48 |
1.76 |
▁▂▇▇▅ |
DOMUS_identifier |
2347 |
0.44 |
1303.45 |
1597.19 |
1.00 |
486.50 |
991.50 |
1470.25 |
7746.00 |
▇▂▁▁▁ |
Area_Deprivation_index |
49 |
0.99 |
5.44 |
2.48 |
1.00 |
4.00 |
5.00 |
7.00 |
10.00 |
▃▆▇▆▃ |
Area_Deprivation_index_crime |
49 |
0.99 |
5.43 |
3.07 |
1.00 |
3.00 |
6.00 |
8.00 |
10.00 |
▇▆▅▇▇ |
Area_Deprivation_index_education |
49 |
0.99 |
6.04 |
2.61 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▃▅▇▇▆ |
Area_Deprivation_index_employment |
49 |
0.99 |
6.08 |
2.76 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▅▆▇▇▇ |
Area_Deprivation_index_health |
49 |
0.99 |
6.02 |
2.82 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▅▆▆▇▇ |
Area_Deprivation_index_housing |
49 |
0.99 |
3.97 |
2.75 |
1.00 |
1.00 |
3.00 |
6.00 |
10.00 |
▇▅▃▂▂ |
Area_Deprivation_index_income |
49 |
0.99 |
5.99 |
2.62 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▃▆▇▇▆ |
Area_Deprivation_index_services |
49 |
0.99 |
4.78 |
3.01 |
1.00 |
2.00 |
4.00 |
7.00 |
10.00 |
▇▅▅▅▅ |
- Missing values: Address_line_2, Address_line_1, DOMUS_Subject_Matter, DOMUS_identifier, Notes
- Factors or numeric variables
- Zero-variance variables
- Character variables
- Unbalanced target variable
- ID variable: museum_id
# Number of museums in each Accreditation class.
count(museums, Accreditation)
## # A tibble: 2 × 2
## Accreditation n
## <chr> <int>
## 1 Accredited 1720
## 2 Unaccredited 2471
# Bar chart of museum counts per Accreditation class.
ggplot(museums, aes(x = Accreditation)) +
  geom_bar()

# Build the analysis table: drop the address lines, DOMUS fields, notes,
# source identifier, geodemographic code columns and museum_id; keep only
# complete rows; then standardise the column names.
data <- museums %>%
  select(
    -c(
      Address_line_1,
      Address_line_2,
      DOMUS_Subject_Matter,
      DOMUS_identifier,
      Notes,
      Identifier_used_in_primary_data_source,
      Area_Geodemographic_supergroup_code,
      Area_Geodemographic_group_code,
      Area_Geodemographic_subgroup_code,
      museum_id
    )
  ) %>%
  na.omit() %>%
  janitor::clean_names()
# Re-run the summary on the cleaned table to confirm no missing values remain.
skimr::skim(data)
Data summary
Name |
data |
Number of rows |
3966 |
Number of columns |
25 |
_______________________ |
|
Column type frequency: |
|
character |
15 |
numeric |
10 |
________________________ |
|
Group variables |
None |
Variable type: character
name_of_museum |
0 |
1 |
3 |
76 |
0 |
3965 |
0 |
village_town_or_city |
0 |
1 |
3 |
24 |
0 |
1639 |
0 |
postcode |
0 |
1 |
6 |
9 |
0 |
3725 |
0 |
admin_area |
0 |
1 |
16 |
137 |
0 |
392 |
0 |
accreditation |
0 |
1 |
10 |
12 |
0 |
2 |
0 |
governance |
0 |
1 |
7 |
41 |
0 |
13 |
0 |
size |
0 |
1 |
4 |
7 |
0 |
5 |
0 |
size_provenance |
0 |
1 |
2 |
29 |
0 |
16 |
0 |
subject_matter |
0 |
1 |
5 |
45 |
0 |
112 |
0 |
year_opened |
0 |
1 |
9 |
9 |
0 |
334 |
0 |
year_closed |
0 |
1 |
9 |
9 |
0 |
159 |
0 |
primary_provenance_of_data |
0 |
1 |
3 |
8 |
0 |
17 |
0 |
area_geodemographic_group |
0 |
1 |
11 |
40 |
0 |
17 |
0 |
area_geodemographic_subgroup |
0 |
1 |
12 |
39 |
0 |
25 |
0 |
area_geodemographic_supergroup |
0 |
1 |
16 |
39 |
0 |
8 |
0 |
Variable type: numeric
latitude |
0 |
1 |
52.93 |
1.95 |
49.20 |
51.48 |
52.46 |
53.94 |
60.79 |
▅▇▃▁▁ |
longitude |
0 |
1 |
-1.94 |
1.83 |
-8.09 |
-3.09 |
-1.86 |
-0.47 |
1.76 |
▁▂▇▇▅ |
area_deprivation_index |
0 |
1 |
5.46 |
2.48 |
1.00 |
4.00 |
5.00 |
7.00 |
10.00 |
▃▆▇▆▃ |
area_deprivation_index_crime |
0 |
1 |
5.43 |
3.07 |
1.00 |
3.00 |
6.00 |
8.00 |
10.00 |
▇▆▅▆▇ |
area_deprivation_index_education |
0 |
1 |
6.05 |
2.61 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▃▅▇▇▆ |
area_deprivation_index_employment |
0 |
1 |
6.08 |
2.77 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▅▆▇▇▇ |
area_deprivation_index_health |
0 |
1 |
6.02 |
2.82 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▅▆▆▇▇ |
area_deprivation_index_housing |
0 |
1 |
3.99 |
2.76 |
1.00 |
1.00 |
3.00 |
6.00 |
10.00 |
▇▅▃▃▂ |
area_deprivation_index_income |
0 |
1 |
6.00 |
2.63 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▃▆▇▇▆ |
area_deprivation_index_services |
0 |
1 |
4.79 |
3.01 |
1.00 |
2.00 |
4.00 |
8.00 |
10.00 |
▇▅▅▅▅ |
# Boxplots of three deprivation indices (employment, crime, health) split by
# accreditation status; each call draws its own plot.
ggplot(data, aes(x = accreditation, y = area_deprivation_index_employment)) +
  geom_boxplot()

ggplot(data, aes(x = accreditation, y = area_deprivation_index_crime)) +
  geom_boxplot()

ggplot(data, aes(x = accreditation, y = area_deprivation_index_health)) +
  geom_boxplot()

# Binarize every column except the museum name (numeric columns become bins,
# categorical columns become indicator columns), then inspect the result.
data_binarized <- binarize(select(data, -name_of_museum))
glimpse(data_binarized)
## Rows: 3,966
## Columns: 196
## $ village_town_or_city__Edinburgh <dbl> …
## $ village_town_or_city__London <dbl> …
## $ `village_town_or_city__-OTHER` <dbl> …
## $ postcode__SO23_8TS <dbl> …
## $ `postcode__-OTHER` <dbl> …
## $ `latitude__-Inf_51.48120725` <dbl> …
## $ latitude__51.48120725_52.4554475 <dbl> …
## $ latitude__52.4554475_53.9431025 <dbl> …
## $ latitude__53.9431025_Inf <dbl> …
## $ `longitude__-Inf_-3.0854455` <dbl> …
## $ `longitude__-3.0854455_-1.8564615` <dbl> …
## $ `longitude__-1.8564615_-0.469075` <dbl> …
## $ `longitude__-0.469075_Inf` <dbl> …
## $ `admin_area__/England/London_(English_Region)/Westminster_(London_Borough)` <dbl> …
## $ `admin_area__/England/South_West_(English_Region)/Cornwall_(English_UA)` <dbl> …
## $ `admin_area__/England/South_West_(English_Region)/Wiltshire_(English_UA)` <dbl> …
## $ `admin_area__/Scotland/City_of_Edinburgh_(Scottish_Council_Area)` <dbl> …
## $ `admin_area__/Scotland/Dumfries_and_Galloway_(Scottish_Council_Area)` <dbl> …
## $ `admin_area__/Scotland/Highland_(Scottish_Council_Area)` <dbl> …
## $ `admin_area__-OTHER` <dbl> …
## $ accreditation__Accredited <dbl> …
## $ accreditation__Unaccredited <dbl> …
## $ `governance__Government-Local_Authority` <dbl> …
## $ `governance__Government-National` <dbl> …
## $ `governance__Independent-English_Heritage` <dbl> …
## $ `governance__Independent-National_Trust` <dbl> …
## $ `governance__Independent-Not_for_profit` <dbl> …
## $ `governance__Independent-Private` <dbl> …
## $ `governance__Independent-Unknown` <dbl> …
## $ governance__University <dbl> …
## $ governance__Unknown <dbl> …
## $ `governance__-OTHER` <dbl> …
## $ size__large <dbl> …
## $ size__medium <dbl> …
## $ size__small <dbl> …
## $ size__unknown <dbl> …
## $ `size__-OTHER` <dbl> …
## $ size_provenance__ace_size_designation <dbl> …
## $ size_provenance__aim_size_designation <dbl> …
## $ size_provenance__domus <dbl> …
## $ `size_provenance__ma(fam)` <dbl> …
## $ size_provenance__mm_manual_estimate_2018 <dbl> …
## $ size_provenance__mm_prediction_random_forest <dbl> …
## $ size_provenance__scottish_national_audit <dbl> …
## $ size_provenance__unknown <dbl> …
## $ size_provenance__visitbritain <dbl> …
## $ `size_provenance__-OTHER` <dbl> …
## $ `subject_matter__Archaeology-Roman` <dbl> …
## $ `subject_matter__Arts-Fine_and_decorative_arts` <dbl> …
## $ `subject_matter__Buildings-Houses-Large_houses` <dbl> …
## $ `subject_matter__Buildings-Houses-Medium_houses` <dbl> …
## $ `subject_matter__Industry_and_manufacture-Mining_and_quarrying` <dbl> …
## $ `subject_matter__Leisure_and_sport-Toys_and_models` <dbl> …
## $ subject_matter__Local_Histories <dbl> …
## $ `subject_matter__Mixed-Encyclopaedic` <dbl> …
## $ `subject_matter__Mixed-Other` <dbl> …
## $ subject_matter__Other <dbl> …
## $ `subject_matter__Personality-Literary` <dbl> …
## $ `subject_matter__Rural_Industry-Farming` <dbl> …
## $ `subject_matter__Sea_and_seafaring-Boats_and_ships` <dbl> …
## $ `subject_matter__Sea_and_seafaring-Mixed` <dbl> …
## $ `subject_matter__Transport-Cars_and_motorbikes` <dbl> …
## $ `subject_matter__Transport-Trains_and_railways` <dbl> …
## $ `subject_matter__War_and_conflict-Airforce` <dbl> …
## $ `subject_matter__War_and_conflict-Castles_and_forts` <dbl> …
## $ `subject_matter__War_and_conflict-Military` <dbl> …
## $ `subject_matter__War_and_conflict-Regiment` <dbl> …
## $ `subject_matter__-OTHER` <dbl> …
## $ `year_opened__1945:1960` <dbl> …
## $ `year_opened__1960:2017` <dbl> …
## $ `year_opened__1972:1972` <dbl> …
## $ `year_opened__1973:1973` <dbl> …
## $ `year_opened__1974:1974` <dbl> …
## $ `year_opened__1975:1975` <dbl> …
## $ `year_opened__1976:1976` <dbl> …
## $ `year_opened__1977:1977` <dbl> …
## $ `year_opened__1978:1978` <dbl> …
## $ `year_opened__1979:1979` <dbl> …
## $ `year_opened__1980:1980` <dbl> …
## $ `year_opened__1981:1981` <dbl> …
## $ `year_opened__1982:1982` <dbl> …
## $ `year_opened__1983:1983` <dbl> …
## $ `year_opened__1984:1984` <dbl> …
## $ `year_opened__1985:1985` <dbl> …
## $ `year_opened__1986:1986` <dbl> …
## $ `year_opened__1987:1987` <dbl> …
## $ `year_opened__1988:1988` <dbl> …
## $ `year_opened__1989:1989` <dbl> …
## $ `year_opened__1990:1990` <dbl> …
## $ `year_opened__1991:1991` <dbl> …
## $ `year_opened__1992:1992` <dbl> …
## $ `year_opened__1993:1993` <dbl> …
## $ `year_opened__1994:1994` <dbl> …
## $ `year_opened__1995:1995` <dbl> …
## $ `year_opened__1996:1996` <dbl> …
## $ `year_opened__1997:1997` <dbl> …
## $ `year_opened__1999:1999` <dbl> …
## $ `year_opened__2000:2000` <dbl> …
## $ `year_opened__2001:2001` <dbl> …
## $ `year_opened__2005:2005` <dbl> …
## $ `year_opened__-OTHER` <dbl> …
## $ `year_closed__9999:9999` <dbl> …
## $ `year_closed__-OTHER` <dbl> …
## $ primary_provenance_of_data__ace <dbl> …
## $ primary_provenance_of_data__aim <dbl> …
## $ primary_provenance_of_data__aim82M <dbl> …
## $ primary_provenance_of_data__aim82NM <dbl> …
## $ primary_provenance_of_data__domus <dbl> …
## $ primary_provenance_of_data__fcm <dbl> …
## $ primary_provenance_of_data__hha <dbl> …
## $ primary_provenance_of_data__mald <dbl> …
## $ primary_provenance_of_data__mgs <dbl> …
## $ primary_provenance_of_data__misc <dbl> …
## $ primary_provenance_of_data__musassoc <dbl> …
## $ primary_provenance_of_data__wiki <dbl> …
## $ `primary_provenance_of_data__-OTHER` <dbl> …
## $ `area_deprivation_index__-Inf_4` <dbl> …
## $ area_deprivation_index__4_5 <dbl> …
## $ area_deprivation_index__5_7 <dbl> …
## $ area_deprivation_index__7_Inf <dbl> …
## $ `area_deprivation_index_crime__-Inf_3` <dbl> …
## $ area_deprivation_index_crime__3_6 <dbl> …
## $ area_deprivation_index_crime__6_8 <dbl> …
## $ area_deprivation_index_crime__8_Inf <dbl> …
## $ `area_deprivation_index_education__-Inf_4` <dbl> …
## $ area_deprivation_index_education__4_6 <dbl> …
## $ area_deprivation_index_education__6_8 <dbl> …
## $ area_deprivation_index_education__8_Inf <dbl> …
## $ `area_deprivation_index_employment__-Inf_4` <dbl> …
## $ area_deprivation_index_employment__4_6 <dbl> …
## $ area_deprivation_index_employment__6_8 <dbl> …
## $ area_deprivation_index_employment__8_Inf <dbl> …
## $ `area_deprivation_index_health__-Inf_4` <dbl> …
## $ area_deprivation_index_health__4_6 <dbl> …
## $ area_deprivation_index_health__6_8 <dbl> …
## $ area_deprivation_index_health__8_Inf <dbl> …
## $ `area_deprivation_index_housing__-Inf_3` <dbl> …
## $ area_deprivation_index_housing__3_6 <dbl> …
## $ area_deprivation_index_housing__6_Inf <dbl> …
## $ `area_deprivation_index_income__-Inf_4` <dbl> …
## $ area_deprivation_index_income__4_6 <dbl> …
## $ area_deprivation_index_income__6_8 <dbl> …
## $ area_deprivation_index_income__8_Inf <dbl> …
## $ `area_deprivation_index_services__-Inf_2` <dbl> …
## $ area_deprivation_index_services__2_4 <dbl> …
## $ area_deprivation_index_services__4_8 <dbl> …
## $ area_deprivation_index_services__8_Inf <dbl> …
## $ area_geodemographic_group__Country_Living <dbl> …
## $ area_geodemographic_group__English_and_Welsh_Countryside <dbl> …
## $ area_geodemographic_group__Ethnically_Diverse_Metropolitan_Living <dbl> …
## $ area_geodemographic_group__Larger_Towns_and_Cities <dbl> …
## $ area_geodemographic_group__London_Cosmopolitan <dbl> …
## $ area_geodemographic_group__Manufacturing_Traits <dbl> …
## $ area_geodemographic_group__Northern_Ireland_Countryside <dbl> …
## $ area_geodemographic_group__Remoter_Coastal_Living <dbl> …
## $ `area_geodemographic_group__Rural-Urban_Fringe` <dbl> …
## $ area_geodemographic_group__Scottish_Countryside <dbl> …
## $ area_geodemographic_group__Scottish_Industrial_Heritage <dbl> …
## $ area_geodemographic_group__Services_Manufacturing_and_Mining_Legacy <dbl> …
## $ area_geodemographic_group__Suburban_Traits <dbl> …
## $ area_geodemographic_group__Thriving_Rural <dbl> …
## $ area_geodemographic_group__Town_Living <dbl> …
## $ area_geodemographic_group__University_Towns_and_Cities <dbl> …
## $ `area_geodemographic_group__-OTHER` <dbl> …
## $ area_geodemographic_subgroup__Affluent_rural <dbl> …
## $ area_geodemographic_subgroup__Ageing_Coastal_Living <dbl> …
## $ area_geodemographic_subgroup__City_Periphery <dbl> …
## $ area_geodemographic_subgroup__Country_Living <dbl> …
## $ area_geodemographic_subgroup__Ethnically_Diverse_Metropolitan_Living <dbl> …
## $ area_geodemographic_subgroup__Expanding_Areas <dbl> …
## $ `area_geodemographic_subgroup__Industrial_and_Multi-ethnic` <dbl> …
## $ area_geodemographic_subgroup__Larger_Towns_and_Cities <dbl> …
## $ area_geodemographic_subgroup__London_Cosmopolitan <dbl> …
## $ area_geodemographic_subgroup__Manufacturing_Legacy <dbl> …
## $ area_geodemographic_subgroup__Mining_Legacy <dbl> …
## $ area_geodemographic_subgroup__Northern_Ireland_Countryside <dbl> …
## $ area_geodemographic_subgroup__Older_Farming_Communities <dbl> …
## $ area_geodemographic_subgroup__Prosperous_Towns <dbl> …
## $ area_geodemographic_subgroup__Rural_Growth_Areas <dbl> …
## $ `area_geodemographic_subgroup__Rural-Urban_Fringe` <dbl> …
## $ area_geodemographic_subgroup__Scottish_Countryside <dbl> …
## $ area_geodemographic_subgroup__Scottish_Industrial_Legacy <dbl> …
## $ area_geodemographic_subgroup__Seaside_Living <dbl> …
## $ area_geodemographic_subgroup__Service_Economy <dbl> …
## $ area_geodemographic_subgroup__Sparse_English_and_Welsh_Countryside <dbl> …
## $ area_geodemographic_subgroup__University_Towns_and_Cities <dbl> …
## $ area_geodemographic_subgroup__Urban_Living <dbl> …
## $ `area_geodemographic_subgroup__-OTHER` <dbl> …
## $ area_geodemographic_supergroup__Affluent_England <dbl> …
## $ area_geodemographic_supergroup__Business_Education_and_Heritage_Centres <dbl> …
## $ area_geodemographic_supergroup__Countryside_Living <dbl> …
## $ area_geodemographic_supergroup__Ethnically_Diverse_Metropolitan_Living <dbl> …
## $ area_geodemographic_supergroup__London_Cosmopolitan <dbl> …
## $ area_geodemographic_supergroup__Services_and_Industrial_Legacy <dbl> …
## $ area_geodemographic_supergroup__Town_and_Country_Living <dbl> …
## $ area_geodemographic_supergroup__Urban_Settlements <dbl> …
# Correlate every binarized feature with the "Accredited" indicator column,
# then draw the correlation funnel.
data_correlate <- correlate(data_binarized, target = accreditation__Accredited)
correlationfunnel::plot_correlation_funnel(data_correlate)
## Warning: ggrepel: 188 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Model
Split Data
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.1.0
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'recipes' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(usemodels)
# Fix the RNG so the subsample, the split and the folds are reproducible.
set.seed(1123)
# Work on a 100-row subsample (presumably to keep tuning fast -- confirm).
# slice_sample() replaces the superseded sample_n().
data_clean <- data %>% slice_sample(n = 100)
# Stratify on the outcome so both accreditation classes appear in proportion
# in the train/test split and in every cross-validation fold; with so few
# rows an unstratified fold can end up heavily skewed towards one class.
data_split <- initial_split(data_clean, strata = accreditation)
data_train <- training(data_split)
data_test <- testing(data_split)
data_cv <- rsample::vfold_cv(data_train, strata = accreditation)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [67/8]> Fold01
## 2 <split [67/8]> Fold02
## 3 <split [67/8]> Fold03
## 4 <split [67/8]> Fold04
## 5 <split [67/8]> Fold05
## 6 <split [68/7]> Fold06
## 7 <split [68/7]> Fold07
## 8 <split [68/7]> Fold08
## 9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
Preprocess data
library(themis)
library(textrecipes)
# Summarise the full cleaned `data` table again as a reference for choosing
# preprocessing steps (note: this is `data`, not the `data_train` subset).
skimr::skim(data)
Data summary
Name |
data |
Number of rows |
3966 |
Number of columns |
25 |
_______________________ |
|
Column type frequency: |
|
character |
15 |
numeric |
10 |
________________________ |
|
Group variables |
None |
Variable type: character
name_of_museum |
0 |
1 |
3 |
76 |
0 |
3965 |
0 |
village_town_or_city |
0 |
1 |
3 |
24 |
0 |
1639 |
0 |
postcode |
0 |
1 |
6 |
9 |
0 |
3725 |
0 |
admin_area |
0 |
1 |
16 |
137 |
0 |
392 |
0 |
accreditation |
0 |
1 |
10 |
12 |
0 |
2 |
0 |
governance |
0 |
1 |
7 |
41 |
0 |
13 |
0 |
size |
0 |
1 |
4 |
7 |
0 |
5 |
0 |
size_provenance |
0 |
1 |
2 |
29 |
0 |
16 |
0 |
subject_matter |
0 |
1 |
5 |
45 |
0 |
112 |
0 |
year_opened |
0 |
1 |
9 |
9 |
0 |
334 |
0 |
year_closed |
0 |
1 |
9 |
9 |
0 |
159 |
0 |
primary_provenance_of_data |
0 |
1 |
3 |
8 |
0 |
17 |
0 |
area_geodemographic_group |
0 |
1 |
11 |
40 |
0 |
17 |
0 |
area_geodemographic_subgroup |
0 |
1 |
12 |
39 |
0 |
25 |
0 |
area_geodemographic_supergroup |
0 |
1 |
16 |
39 |
0 |
8 |
0 |
Variable type: numeric
latitude |
0 |
1 |
52.93 |
1.95 |
49.20 |
51.48 |
52.46 |
53.94 |
60.79 |
▅▇▃▁▁ |
longitude |
0 |
1 |
-1.94 |
1.83 |
-8.09 |
-3.09 |
-1.86 |
-0.47 |
1.76 |
▁▂▇▇▅ |
area_deprivation_index |
0 |
1 |
5.46 |
2.48 |
1.00 |
4.00 |
5.00 |
7.00 |
10.00 |
▃▆▇▆▃ |
area_deprivation_index_crime |
0 |
1 |
5.43 |
3.07 |
1.00 |
3.00 |
6.00 |
8.00 |
10.00 |
▇▆▅▆▇ |
area_deprivation_index_education |
0 |
1 |
6.05 |
2.61 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▃▅▇▇▆ |
area_deprivation_index_employment |
0 |
1 |
6.08 |
2.77 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▅▆▇▇▇ |
area_deprivation_index_health |
0 |
1 |
6.02 |
2.82 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▅▆▆▇▇ |
area_deprivation_index_housing |
0 |
1 |
3.99 |
2.76 |
1.00 |
1.00 |
3.00 |
6.00 |
10.00 |
▇▅▃▃▂ |
area_deprivation_index_income |
0 |
1 |
6.00 |
2.63 |
1.00 |
4.00 |
6.00 |
8.00 |
10.00 |
▃▆▇▇▆ |
area_deprivation_index_services |
0 |
1 |
4.79 |
3.01 |
1.00 |
2.00 |
4.00 |
8.00 |
10.00 |
▇▅▅▅▅ |
# Preprocessing recipe: predict accreditation from all other columns.
xgboost_rec <- recipes::recipe(accreditation ~ ., data = data_train) %>%
  # Keep the museum name in the data but out of the predictor set.
  update_role(name_of_museum, new_role = "ID") %>%
  # Turn admin_area into term-frequency columns over at most 75 tokens.
  step_tokenize(admin_area) %>%
  step_tokenfilter(admin_area, max_tokens = 75) %>%
  step_tf(admin_area) %>%
  # Pool infrequent levels of the high-cardinality columns into "other".
  step_other(
    village_town_or_city,
    postcode,
    subject_matter,
    year_opened,
    year_closed
  ) %>%
  # Reserve a "new" level for categories not seen during training.
  step_novel(all_nominal_predictors()) %>%
  # Convert the categorical predictors to indicator columns.
  step_dummy(
    village_town_or_city,
    postcode,
    subject_matter,
    year_opened,
    year_closed,
    governance,
    size,
    size_provenance,
    primary_provenance_of_data,
    area_geodemographic_group,
    area_geodemographic_subgroup,
    area_geodemographic_supergroup
  )
# Preview the preprocessed training set. bake(new_data = NULL) returns the
# same processed training data as the superseded juice().
xgboost_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 75
## Columns: 174
## $ name_of_museum <fct> …
## $ latitude <dbl> …
## $ longitude <dbl> …
## $ area_deprivation_index <dbl> …
## $ area_deprivation_index_crime <dbl> …
## $ area_deprivation_index_education <dbl> …
## $ area_deprivation_index_employment <dbl> …
## $ area_deprivation_index_health <dbl> …
## $ area_deprivation_index_housing <dbl> …
## $ area_deprivation_index_income <dbl> …
## $ area_deprivation_index_services <dbl> …
## $ accreditation <fct> …
## $ tf_admin_area_aberdeenshire <int> …
## $ tf_admin_area_and <int> …
## $ tf_admin_area_antrim <int> …
## $ tf_admin_area_area <int> …
## $ tf_admin_area_argyll <int> …
## $ tf_admin_area_ayrshire <int> …
## $ tf_admin_area_barnsley <int> …
## $ tf_admin_area_bath <int> …
## $ tf_admin_area_borders <int> …
## $ tf_admin_area_borough <int> …
## $ tf_admin_area_burnley <int> …
## $ tf_admin_area_bute <int> …
## $ tf_admin_area_ca <int> …
## $ tf_admin_area_cambridgeshire <int> …
## $ tf_admin_area_carmarthenshire <int> …
## $ tf_admin_area_ceredigion <int> …
## $ tf_admin_area_charnwood <int> …
## $ tf_admin_area_city <int> …
## $ tf_admin_area_coastal <int> …
## $ tf_admin_area_cornwall <int> …
## $ tf_admin_area_cotswold <int> …
## $ tf_admin_area_council <int> …
## $ tf_admin_area_county <int> …
## $ tf_admin_area_coventry <int> …
## $ tf_admin_area_derbyshire <int> …
## $ tf_admin_area_district <int> …
## $ tf_admin_area_dorset <int> …
## $ tf_admin_area_dunbartonshire <int> …
## $ tf_admin_area_durham <int> …
## $ tf_admin_area_east <int> …
## $ tf_admin_area_edinburgh <int> …
## $ tf_admin_area_england <int> …
## $ tf_admin_area_english <int> …
## $ tf_admin_area_essex <int> …
## $ tf_admin_area_forest <int> …
## $ tf_admin_area_gloucestershire <int> …
## $ tf_admin_area_gov <int> …
## $ tf_admin_area_great <int> …
## $ tf_admin_area_guildford <int> …
## $ tf_admin_area_gwynedd <int> …
## $ tf_admin_area_harrogate <int> …
## $ tf_admin_area_heath <int> …
## $ tf_admin_area_herefordshire <int> …
## $ tf_admin_area_hertfordshire <int> …
## $ tf_admin_area_hertsmere <int> …
## $ tf_admin_area_highland <int> …
## $ tf_admin_area_holland <int> …
## $ tf_admin_area_humber <int> …
## $ tf_admin_area_kent <int> …
## $ tf_admin_area_lancashire <int> …
## $ tf_admin_area_lincolnshire <int> …
## $ tf_admin_area_london <int> …
## $ tf_admin_area_midlands <int> …
## $ tf_admin_area_norfolk <int> …
## $ tf_admin_area_north <int> …
## $ tf_admin_area_of <int> …
## $ tf_admin_area_or <int> …
## $ tf_admin_area_region <int> …
## $ tf_admin_area_scotland <int> …
## $ tf_admin_area_scottish <int> …
## $ tf_admin_area_sevenoaks <int> …
## $ tf_admin_area_sheffield <int> …
## $ tf_admin_area_somerset <int> …
## $ tf_admin_area_south <int> …
## $ tf_admin_area_suffolk <int> …
## $ tf_admin_area_surrey <int> …
## $ tf_admin_area_sussex <int> …
## $ tf_admin_area_the <int> …
## $ tf_admin_area_ua <int> …
## $ tf_admin_area_wales <int> …
## $ tf_admin_area_welsh <int> …
## $ tf_admin_area_west <int> …
## $ tf_admin_area_westminster <int> …
## $ tf_admin_area_wiltshire <int> …
## $ tf_admin_area_yorkshire <int> …
## $ village_town_or_city_other <dbl> …
## $ village_town_or_city_new <dbl> …
## $ postcode_other <dbl> …
## $ postcode_new <dbl> …
## $ subject_matter_Buildings.Houses.Large_houses <dbl> …
## $ subject_matter_Local_Histories <dbl> …
## $ subject_matter_other <dbl> …
## $ subject_matter_new <dbl> …
## $ year_opened_other <dbl> …
## $ year_opened_new <dbl> …
## $ year_closed_other <dbl> …
## $ year_closed_new <dbl> …
## $ governance_Government.National <dbl> …
## $ governance_Independent.English_Heritage <dbl> …
## $ governance_Independent.Historic_Environment_Scotland <dbl> …
## $ governance_Independent.National_Trust <dbl> …
## $ governance_Independent.National_Trust_for_Scotland <dbl> …
## $ governance_Independent.Not_for_profit <dbl> …
## $ governance_Independent.Private <dbl> …
## $ governance_Unknown <dbl> …
## $ governance_new <dbl> …
## $ size_medium <dbl> …
## $ size_small <dbl> …
## $ size_new <dbl> …
## $ size_provenance_aim_size_designation <dbl> …
## $ size_provenance_domus <dbl> …
## $ size_provenance_ma.fam. <dbl> …
## $ size_provenance_mm <dbl> …
## $ size_provenance_mm_prediction_random_forest <dbl> …
## $ size_provenance_unknown <dbl> …
## $ size_provenance_visitbritain <dbl> …
## $ size_provenance_new <dbl> …
## $ primary_provenance_of_data_aim <dbl> …
## $ primary_provenance_of_data_aim82M <dbl> …
## $ primary_provenance_of_data_aim82NM <dbl> …
## $ primary_provenance_of_data_domus <dbl> …
## $ primary_provenance_of_data_fcm <dbl> …
## $ primary_provenance_of_data_hha <dbl> …
## $ primary_provenance_of_data_mald <dbl> …
## $ primary_provenance_of_data_mgs <dbl> …
## $ primary_provenance_of_data_misc <dbl> …
## $ primary_provenance_of_data_Misc <dbl> …
## $ primary_provenance_of_data_musassoc <dbl> …
## $ primary_provenance_of_data_MusCal <dbl> …
## $ primary_provenance_of_data_wiki <dbl> …
## $ primary_provenance_of_data_new <dbl> …
## $ area_geodemographic_group_English.and.Welsh.Countryside <dbl> …
## $ area_geodemographic_group_Larger.Towns.and.Cities <dbl> …
## $ area_geodemographic_group_London.Cosmopolitan <dbl> …
## $ area_geodemographic_group_Manufacturing.Traits <dbl> …
## $ area_geodemographic_group_Northern.Ireland.Countryside <dbl> …
## $ area_geodemographic_group_Remoter.Coastal.Living <dbl> …
## $ area_geodemographic_group_Rural.Urban.Fringe <dbl> …
## $ area_geodemographic_group_Scottish.Countryside <dbl> …
## $ area_geodemographic_group_Scottish.Industrial.Heritage <dbl> …
## $ area_geodemographic_group_Services.Manufacturing.and.Mining.Legacy <dbl> …
## $ area_geodemographic_group_Thriving.Rural <dbl> …
## $ area_geodemographic_group_Town.Living <dbl> …
## $ area_geodemographic_group_University.Towns.and.Cities <dbl> …
## $ area_geodemographic_group_new <dbl> …
## $ area_geodemographic_subgroup_Ageing.Coastal.Living <dbl> …
## $ area_geodemographic_subgroup_Country.Living <dbl> …
## $ area_geodemographic_subgroup_Industrial.and.Multi.ethnic <dbl> …
## $ area_geodemographic_subgroup_Larger.Towns.and.Cities <dbl> …
## $ area_geodemographic_subgroup_London.Cosmopolitan <dbl> …
## $ area_geodemographic_subgroup_Manufacturing.Legacy <dbl> …
## $ area_geodemographic_subgroup_Mining.Legacy <dbl> …
## $ area_geodemographic_subgroup_Northern.Ireland.Countryside <dbl> …
## $ area_geodemographic_subgroup_Older.Farming.Communities <dbl> …
## $ area_geodemographic_subgroup_Prosperous.Semi.rural <dbl> …
## $ area_geodemographic_subgroup_Prosperous.Towns <dbl> …
## $ area_geodemographic_subgroup_Rural.Growth.Areas <dbl> …
## $ area_geodemographic_subgroup_Rural.Urban.Fringe <dbl> …
## $ area_geodemographic_subgroup_Scottish.Countryside <dbl> …
## $ area_geodemographic_subgroup_Scottish.Industrial.Legacy <dbl> …
## $ area_geodemographic_subgroup_Seaside.Living <dbl> …
## $ area_geodemographic_subgroup_Sparse.English.and.Welsh.Countryside <dbl> …
## $ area_geodemographic_subgroup_University.Towns.and.Cities <dbl> …
## $ area_geodemographic_subgroup_Urban.Living <dbl> …
## $ area_geodemographic_subgroup_new <dbl> …
## $ area_geodemographic_supergroup_Business.Education.and.Heritage.Centres <dbl> …
## $ area_geodemographic_supergroup_Countryside.Living <dbl> …
## $ area_geodemographic_supergroup_London.Cosmopolitan <dbl> …
## $ area_geodemographic_supergroup_Services.and.Industrial.Legacy <dbl> …
## $ area_geodemographic_supergroup_Town.and.Country.Living <dbl> …
## $ area_geodemographic_supergroup_Urban.Settlements <dbl> …
## $ area_geodemographic_supergroup_new <dbl> …
Specify model
# Boosted-tree classifier on the xgboost engine; the number of trees is left
# as a tuning parameter.
xgboost_spec <- boost_tree(trees = tune())
xgboost_spec <- set_mode(xgboost_spec, "classification")
xgboost_spec <- set_engine(xgboost_spec, "xgboost")

# Bundle the recipe and the model specification into one workflow.
xgboost_workflow <- workflow()
xgboost_workflow <- add_recipe(xgboost_workflow, xgboost_rec)
xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)
Tune hyperparameters
# Register a parallel backend for tuning. The call needs parentheses: the bare
# function name only prints the function definition and registers nothing.
doParallel::registerDoParallel()
## function (cl, cores = NULL, ...)
## {
## opts <- list(...)
## optnames <- names(opts)
## if (is.null(optnames))
## optnames <- rep("", length(opts))
## unnamed <- !nzchar(optnames)
## if (any(unnamed)) {
## warning("ignoring doParallel package option(s) specified with unnamed argument")
## opts <- opts[!unnamed]
## optnames <- optnames[!unnamed]
## }
## recog <- optnames %in% c("nocompile")
## if (any(!recog)) {
## warning(sprintf("ignoring unrecognized doParallel package option(s): %s",
## paste(optnames[!recog], collapse = ", ")), call. = FALSE)
## opts <- opts[recog]
## optnames <- optnames[recog]
## }
## old.optnames <- ls(.options, all.names = TRUE)
## rm(list = old.optnames, pos = .options)
## for (i in seq_along(opts)) {
## assign(optnames[i], opts[[i]], pos = .options)
## }
## if (missing(cl) || is.numeric(cl)) {
## if (.Platform$OS.type == "windows") {
## if (!missing(cl) && is.numeric(cl)) {
## cl <- makeCluster(cl)
## }
## else {
## if (!missing(cores) && is.numeric(cores)) {
## cl <- makeCluster(cores)
## }
## else {
## cl <- makeCluster(3)
## }
## }
## assign(".revoDoParCluster", cl, pos = .options)
## reg.finalizer(.options, function(e) {
## stopImplicitCluster()
## }, onexit = TRUE)
## setDoPar(doParallelSNOW, cl, snowinfo)
## }
## else {
## if (!missing(cl) && is.numeric(cl)) {
## cores <- cl
## }
## setDoPar(doParallelMC, cores, mcinfo)
## }
## }
## else {
## setDoPar(doParallelSNOW, cl, snowinfo)
## }
## }
## <bytecode: 0x11114f540>
## <environment: namespace:doParallel>
# Seed the RNG, then tune the workflow over a 5-candidate grid on the
# cross-validation folds.
set.seed(48291)
xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_cv, grid = 5)
## Warning: package 'xgboost' was built under R version 4.3.3