museums <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(museums)
Name | museums |
Number of rows | 4191 |
Number of columns | 35 |
_______________________ | |
Column type frequency: | |
character | 24 |
numeric | 11 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
museum_id | 0 | 1.00 | 8 | 15 | 0 | 4191 | 0 |
Name_of_museum | 0 | 1.00 | 3 | 76 | 0 | 4190 | 0 |
Address_line_1 | 441 | 0.89 | 3 | 61 | 0 | 3212 | 0 |
Address_line_2 | 2816 | 0.33 | 3 | 39 | 0 | 1167 | 0 |
Village,_Town_or_City | 4 | 1.00 | 3 | 24 | 0 | 1696 | 0 |
Postcode | 0 | 1.00 | 6 | 9 | 0 | 3918 | 0 |
Admin_area | 0 | 1.00 | 12 | 137 | 0 | 393 | 0 |
Accreditation | 0 | 1.00 | 10 | 12 | 0 | 2 | 0 |
Governance | 0 | 1.00 | 7 | 41 | 0 | 13 | 0 |
Size | 0 | 1.00 | 4 | 7 | 0 | 5 | 0 |
Size_provenance | 179 | 0.96 | 2 | 29 | 0 | 16 | 0 |
Subject_Matter | 0 | 1.00 | 5 | 45 | 0 | 114 | 0 |
Year_opened | 0 | 1.00 | 9 | 9 | 0 | 351 | 0 |
Year_closed | 0 | 1.00 | 9 | 9 | 0 | 170 | 0 |
DOMUS_Subject_Matter | 2788 | 0.33 | 5 | 27 | 0 | 21 | 0 |
Primary_provenance_of_data | 0 | 1.00 | 3 | 8 | 0 | 18 | 0 |
Identifier_used_in_primary_data_source | 2056 | 0.51 | 2 | 8 | 0 | 2134 | 0 |
Area_Geodemographic_group | 49 | 0.99 | 11 | 40 | 0 | 17 | 0 |
Area_Geodemographic_group_code | 49 | 0.99 | 3 | 3 | 0 | 16 | 0 |
Area_Geodemographic_subgroup | 49 | 0.99 | 12 | 39 | 0 | 25 | 0 |
Area_Geodemographic_subgroup_code | 49 | 0.99 | 4 | 4 | 0 | 24 | 0 |
Area_Geodemographic_supergroup | 49 | 0.99 | 16 | 39 | 0 | 8 | 0 |
Area_Geodemographic_supergroup_code | 49 | 0.99 | 2 | 2 | 0 | 8 | 0 |
Notes | 2980 | 0.29 | 12 | 751 | 0 | 956 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Latitude | 0 | 1.00 | 52.93 | 2.09 | 49.18 | 51.48 | 52.47 | 53.96 | 100.00 | ▇▁▁▁▁ |
Longitude | 0 | 1.00 | -1.96 | 1.84 | -8.09 | -3.10 | -1.87 | -0.48 | 1.76 | ▁▂▇▇▅ |
DOMUS_identifier | 2347 | 0.44 | 1303.45 | 1597.19 | 1.00 | 486.50 | 991.50 | 1470.25 | 7746.00 | ▇▂▁▁▁ |
Area_Deprivation_index | 49 | 0.99 | 5.44 | 2.48 | 1.00 | 4.00 | 5.00 | 7.00 | 10.00 | ▃▆▇▆▃ |
Area_Deprivation_index_crime | 49 | 0.99 | 5.43 | 3.07 | 1.00 | 3.00 | 6.00 | 8.00 | 10.00 | ▇▆▅▇▇ |
Area_Deprivation_index_education | 49 | 0.99 | 6.04 | 2.61 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▃▅▇▇▆ |
Area_Deprivation_index_employment | 49 | 0.99 | 6.08 | 2.76 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▅▆▇▇▇ |
Area_Deprivation_index_health | 49 | 0.99 | 6.02 | 2.82 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▅▆▆▇▇ |
Area_Deprivation_index_housing | 49 | 0.99 | 3.97 | 2.75 | 1.00 | 1.00 | 3.00 | 6.00 | 10.00 | ▇▅▃▂▂ |
Area_Deprivation_index_income | 49 | 0.99 | 5.99 | 2.62 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▃▆▇▇▆ |
Area_Deprivation_index_services | 49 | 0.99 | 4.78 | 3.01 | 1.00 | 2.00 | 4.00 | 7.00 | 10.00 | ▇▅▅▅▅ |
colnames(museums)[5] <- "Village_Town_or_City"
factors_vec <- museums %>% select(Name_of_museum, Village_Town_or_City, Accreditation, Governance, Subject_Matter) %>% names()
museums_clean <- museums %>%
mutate(across(where(is.character), factor)) %>%
separate_wider_delim(Year_opened, delim = ":", names = c("Year_opened", "Y_o_duplicate")) %>%
select(-museum_id, -Latitude, -Longitude, -Notes, -Address_line_2, -Postcode, DOMUS_Subject_Matter, -DOMUS_identifier,-Identifier_used_in_primary_data_source, Address_line_1, -Size_provenance, Primary_provenance_of_data, -Y_o_duplicate, -Year_closed, -Area_Deprivation_index, Area_Deprivation_index_crime, Area_Deprivation_index_education, Area_Deprivation_index_employment, Area_Deprivation_index_health, Area_Deprivation_index_housing, Area_Deprivation_index_income, Area_Deprivation_index_services, -Area_Geodemographic_group, -Area_Geodemographic_group_code, Area_Geodemographic_subgroup, -Area_Geodemographic_subgroup_code, -Area_Geodemographic_supergroup, -Area_Geodemographic_supergroup_code, Village_Town_or_City) %>%
na.omit()
museums_clean %>% glimpse()
## Rows: 1,370
## Columns: 19
## $ Name_of_museum <fct> "Prestongrange Industrial Heritage M…
## $ Address_line_1 <fct> "Prestongrange", "Chambers Street", …
## $ Village_Town_or_City <fct> "nr Prestonpans", "Edinburgh", "Cumn…
## $ Admin_area <fct> /Scotland/East Lothian (Scottish Cou…
## $ Accreditation <fct> Accredited, Accredited, Accredited, …
## $ Governance <fct> Government-Local_Authority, Governme…
## $ Size <fct> small, large, small, small, small, l…
## $ Subject_Matter <fct> Industry_and_manufacture-Mining_and_…
## $ Year_opened <chr> "1993", "1854", "1891", "1966", "198…
## $ DOMUS_Subject_Matter <fct> science and industry, mixed collecti…
## $ Primary_provenance_of_data <fct> domus, domus, domus, domus, domus, d…
## $ Area_Deprivation_index_crime <dbl> 3, 1, 1, 7, 10, 1, 6, 4, 9, 4, 7, 3,…
## $ Area_Deprivation_index_education <dbl> 7, 7, 4, 7, 6, 4, 3, 4, 8, 6, 7, 4, …
## $ Area_Deprivation_index_employment <dbl> 6, 6, 2, 7, 6, 4, 3, 6, 10, 4, 10, 4…
## $ Area_Deprivation_index_health <dbl> 8, 5, 2, 9, 5, 5, 5, 6, 10, 2, 10, 4…
## $ Area_Deprivation_index_housing <dbl> 9, 1, 6, 7, 7, 1, 10, 3, 2, 1, 10, 6…
## $ Area_Deprivation_index_income <dbl> 5, 7, 3, 8, 6, 6, 3, 6, 10, 5, 10, 4…
## $ Area_Deprivation_index_services <dbl> 3, 9, 9, 1, 1, 9, 9, 6, 1, 10, 2, 9,…
## $ Area_Geodemographic_subgroup <fct> Scottish Industrial Legacy, Larger T…
museums_clean %>% count(Accreditation)
## # A tibble: 2 × 2
## Accreditation n
## <fct> <int>
## 1 Accredited 1007
## 2 Unaccredited 363
museums_clean %>% distinct(Subject_Matter)
## # A tibble: 98 × 1
## Subject_Matter
## <fct>
## 1 Industry_and_manufacture-Mining_and_quarrying
## 2 Mixed-Encyclopaedic
## 3 Local_Histories
## 4 War_and_conflict-Military
## 5 Personality-Explorer
## 6 Archaeology-Roman
## 7 Personality-Literary
## 8 Sea_and_seafaring-Fishing
## 9 Science_and_technology-Other
## 10 Sea_and_seafaring-Boats_and_ships
## # ℹ 88 more rows
museums_clean %>% skimr::skim(Subject_Matter)
Name | Piped data |
Number of rows | 1370 |
Number of columns | 19 |
_______________________ | |
Column type frequency: | |
factor | 1 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Subject_Matter | 0 | 1 | FALSE | 98 | Loc: 397, Art: 101, War: 95, Bui: 71 |
museums_clean %>%
ggplot(aes(Accreditation)) +
geom_bar()
Accreditation vs. Governance
museums_clean %>%
ggplot(aes(Accreditation, Governance)) +
geom_boxplot()
Correlation Plot
# Step 1: Binarize
museums_binarized <- museums_clean %>%
select(-Name_of_museum) %>%
binarize()
museums_binarized %>% glimpse()
## Rows: 1,370
## Columns: 150
## $ Address_line_1__High_Street <dbl> …
## $ `Address_line_1__-OTHER` <dbl> …
## $ Village_Town_or_City__Edinburgh <dbl> …
## $ Village_Town_or_City__Leeds <dbl> …
## $ Village_Town_or_City__London <dbl> …
## $ Village_Town_or_City__Manchester <dbl> …
## $ Village_Town_or_City__York <dbl> …
## $ `Village_Town_or_City__-OTHER` <dbl> …
## $ `Admin_area__/England/London_(English_Region)/Camden_(London_Borough)` <dbl> …
## $ `Admin_area__/England/North_East_(English_Region)/North_East_(English_CA)/Northumberland_(English_District_or_Borough)` <dbl> …
## $ `Admin_area__/England/North_West_(English_Region)/Greater_Manchester_(English_CA)/Manchester_(English_District_or_Borough)` <dbl> …
## $ `Admin_area__/England/South_West_(English_Region)/Cornwall_(English_UA)` <dbl> …
## $ `Admin_area__/England/South_West_(English_Region)/Wiltshire_(English_UA)` <dbl> …
## $ `Admin_area__/England/Yorkshire_and_The_Humber_(English_Region)/West_Yorkshire_(English_CA)/Leeds_(English_District_or_Borough)` <dbl> …
## $ `Admin_area__/England/Yorkshire_and_The_Humber_(English_Region)/York_(English_UA)` <dbl> …
## $ `Admin_area__/Scotland/City_of_Edinburgh_(Scottish_Council_Area)` <dbl> …
## $ `Admin_area__/Scotland/Dumfries_and_Galloway_(Scottish_Council_Area)` <dbl> …
## $ `Admin_area__/Scotland/Highland_(Scottish_Council_Area)` <dbl> …
## $ `Admin_area__/Scotland/Scottish_Borders_(Scottish_Council_Area)` <dbl> …
## $ `Admin_area__-OTHER` <dbl> …
## $ Accreditation__Accredited <dbl> …
## $ Accreditation__Unaccredited <dbl> …
## $ `Governance__Government-Local_Authority` <dbl> …
## $ `Governance__Government-National` <dbl> …
## $ `Governance__Independent-English_Heritage` <dbl> …
## $ `Governance__Independent-National_Trust` <dbl> …
## $ `Governance__Independent-Not_for_profit` <dbl> …
## $ `Governance__Independent-Unknown` <dbl> …
## $ Governance__University <dbl> …
## $ `Governance__-OTHER` <dbl> …
## $ Size__large <dbl> …
## $ Size__medium <dbl> …
## $ Size__small <dbl> …
## $ `Size__-OTHER` <dbl> …
## $ `Subject_Matter__Archaeology-Roman` <dbl> …
## $ `Subject_Matter__Arts-Fine_and_decorative_arts` <dbl> …
## $ `Subject_Matter__Buildings-Houses-Large_houses` <dbl> …
## $ `Subject_Matter__Buildings-Houses-Medium_houses` <dbl> …
## $ `Subject_Matter__Buildings-Other` <dbl> …
## $ `Subject_Matter__Industry_and_manufacture-Mining_and_quarrying` <dbl> …
## $ Subject_Matter__Local_Histories <dbl> …
## $ `Subject_Matter__Mixed-Encyclopaedic` <dbl> …
## $ `Subject_Matter__Mixed-Other` <dbl> …
## $ `Subject_Matter__Personality-Literary` <dbl> …
## $ `Subject_Matter__Sea_and_seafaring-Mixed` <dbl> …
## $ `Subject_Matter__Transport-Trains_and_railways` <dbl> …
## $ `Subject_Matter__War_and_conflict-Castles_and_forts` <dbl> …
## $ `Subject_Matter__War_and_conflict-Regiment` <dbl> …
## $ `Subject_Matter__-OTHER` <dbl> …
## $ Year_opened__1927 <dbl> …
## $ Year_opened__1951 <dbl> …
## $ Year_opened__1961 <dbl> …
## $ Year_opened__1962 <dbl> …
## $ Year_opened__1963 <dbl> …
## $ Year_opened__1964 <dbl> …
## $ Year_opened__1966 <dbl> …
## $ Year_opened__1967 <dbl> …
## $ Year_opened__1968 <dbl> …
## $ Year_opened__1969 <dbl> …
## $ Year_opened__1972 <dbl> …
## $ Year_opened__1973 <dbl> …
## $ Year_opened__1974 <dbl> …
## $ Year_opened__1975 <dbl> …
## $ Year_opened__1976 <dbl> …
## $ Year_opened__1977 <dbl> …
## $ Year_opened__1978 <dbl> …
## $ Year_opened__1979 <dbl> …
## $ Year_opened__1980 <dbl> …
## $ Year_opened__1981 <dbl> …
## $ Year_opened__1982 <dbl> …
## $ Year_opened__1983 <dbl> …
## $ Year_opened__1984 <dbl> …
## $ Year_opened__1985 <dbl> …
## $ Year_opened__1986 <dbl> …
## $ Year_opened__1987 <dbl> …
## $ Year_opened__1988 <dbl> …
## $ Year_opened__1989 <dbl> …
## $ Year_opened__1990 <dbl> …
## $ Year_opened__1991 <dbl> …
## $ Year_opened__1992 <dbl> …
## $ Year_opened__1993 <dbl> …
## $ Year_opened__1994 <dbl> …
## $ `Year_opened__-OTHER` <dbl> …
## $ DOMUS_Subject_Matter__agriculture <dbl> …
## $ DOMUS_Subject_Matter__archaeology <dbl> …
## $ DOMUS_Subject_Matter__archives <dbl> …
## $ DOMUS_Subject_Matter__biology_and_natural_history <dbl> …
## $ DOMUS_Subject_Matter__costume_and_textiles <dbl> …
## $ DOMUS_Subject_Matter__decorative_and_applied_arts <dbl> …
## $ DOMUS_Subject_Matter__fine_art <dbl> …
## $ DOMUS_Subject_Matter__maritime <dbl> …
## $ DOMUS_Subject_Matter__military <dbl> …
## $ DOMUS_Subject_Matter__mixed_collection <dbl> …
## $ DOMUS_Subject_Matter__personalia <dbl> …
## $ DOMUS_Subject_Matter__science_and_industry <dbl> …
## $ DOMUS_Subject_Matter__social_history <dbl> …
## $ DOMUS_Subject_Matter__transport <dbl> …
## $ `DOMUS_Subject_Matter__-OTHER` <dbl> …
## $ Primary_provenance_of_data__domus <dbl> …
## $ `Primary_provenance_of_data__-OTHER` <dbl> …
## $ `Area_Deprivation_index_crime__-Inf_2` <dbl> …
## $ Area_Deprivation_index_crime__2_4 <dbl> …
## $ Area_Deprivation_index_crime__4_7 <dbl> …
## $ Area_Deprivation_index_crime__7_Inf <dbl> …
## $ `Area_Deprivation_index_education__-Inf_4` <dbl> …
## $ Area_Deprivation_index_education__4_6 <dbl> …
## $ Area_Deprivation_index_education__6_8 <dbl> …
## $ Area_Deprivation_index_education__8_Inf <dbl> …
## $ `Area_Deprivation_index_employment__-Inf_3` <dbl> …
## $ Area_Deprivation_index_employment__3_6 <dbl> …
## $ Area_Deprivation_index_employment__6_8 <dbl> …
## $ Area_Deprivation_index_employment__8_Inf <dbl> …
## $ `Area_Deprivation_index_health__-Inf_3` <dbl> …
## $ Area_Deprivation_index_health__3_5 <dbl> …
## $ Area_Deprivation_index_health__5_8 <dbl> …
## $ Area_Deprivation_index_health__8_Inf <dbl> …
## $ `Area_Deprivation_index_housing__-Inf_3` <dbl> …
## $ Area_Deprivation_index_housing__3_6 <dbl> …
## $ Area_Deprivation_index_housing__6_Inf <dbl> …
## $ `Area_Deprivation_index_income__-Inf_4` <dbl> …
## $ Area_Deprivation_index_income__4_6 <dbl> …
## $ Area_Deprivation_index_income__6_8 <dbl> …
## $ Area_Deprivation_index_income__8_Inf <dbl> …
## $ `Area_Deprivation_index_services__-Inf_3` <dbl> …
## $ Area_Deprivation_index_services__3_5 <dbl> …
## $ Area_Deprivation_index_services__5_8 <dbl> …
## $ Area_Deprivation_index_services__8_Inf <dbl> …
## $ Area_Geodemographic_subgroup__Affluent_rural <dbl> …
## $ Area_Geodemographic_subgroup__Ageing_Coastal_Living <dbl> …
## $ Area_Geodemographic_subgroup__Country_Living <dbl> …
## $ Area_Geodemographic_subgroup__Ethnically_Diverse_Metropolitan_Living <dbl> …
## $ Area_Geodemographic_subgroup__Expanding_Areas <dbl> …
## $ `Area_Geodemographic_subgroup__Industrial_and_Multi-ethnic` <dbl> …
## $ Area_Geodemographic_subgroup__Larger_Towns_and_Cities <dbl> …
## $ Area_Geodemographic_subgroup__London_Cosmopolitan <dbl> …
## $ Area_Geodemographic_subgroup__Manufacturing_Legacy <dbl> …
## $ Area_Geodemographic_subgroup__Mining_Legacy <dbl> …
## $ Area_Geodemographic_subgroup__Northern_Ireland_Countryside <dbl> …
## $ Area_Geodemographic_subgroup__Older_Farming_Communities <dbl> …
## $ Area_Geodemographic_subgroup__Prosperous_Towns <dbl> …
## $ `Area_Geodemographic_subgroup__Rural-Urban_Fringe` <dbl> …
## $ Area_Geodemographic_subgroup__Rural_Growth_Areas <dbl> …
## $ Area_Geodemographic_subgroup__Scottish_Countryside <dbl> …
## $ Area_Geodemographic_subgroup__Scottish_Industrial_Legacy <dbl> …
## $ Area_Geodemographic_subgroup__Seaside_Living <dbl> …
## $ Area_Geodemographic_subgroup__Service_Economy <dbl> …
## $ Area_Geodemographic_subgroup__Sparse_English_and_Welsh_Countryside <dbl> …
## $ Area_Geodemographic_subgroup__University_Towns_and_Cities <dbl> …
## $ Area_Geodemographic_subgroup__Urban_Living <dbl> …
## $ `Area_Geodemographic_subgroup__-OTHER` <dbl> …
# Step 2: Correlation
museums_correlation <- museums_binarized %>%
correlate(Accreditation__Accredited)
museums_correlation %>% glimpse()
## Rows: 150
## Columns: 3
## $ feature <fct> Accreditation, Accreditation, Governance, Size, Year_opene…
## $ bin <chr> "Accredited", "Unaccredited", "Independent-Unknown", "smal…
## $ correlation <dbl> 1.00000000, -1.00000000, -0.19751976, -0.15704734, 0.13432…
# Step 3: Plot
museums_correlation %>%
correlationfunnel::plot_correlation_funnel()
## Warning: ggrepel: 113 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Split Data
set.seed(1234)
museums_clean <- museums_clean %>% sample_n(100)
museums_split <- initial_split(museums_clean, strata = Accreditation)
museums_train <- training(museums_split)
museums_test <- testing(museums_split)
museums_cv <- rsample::vfold_cv(museums_train, strata = Accreditation)
museums_cv
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [66/8]> Fold01
## 2 <split [66/8]> Fold02
## 3 <split [66/8]> Fold03
## 4 <split [66/8]> Fold04
## 5 <split [67/7]> Fold05
## 6 <split [67/7]> Fold06
## 7 <split [67/7]> Fold07
## 8 <split [67/7]> Fold08
## 9 <split [67/7]> Fold09
## 10 <split [67/7]> Fold10
Preprocess Data Using Recipes Package
library(themis)
## Warning: package 'themis' was built under R version 4.4.1
# Convert all nominal predictors (characters and factors) to numbers.
museums_rec <- recipes::recipe(Accreditation ~ ., data = museums_train) %>%
update_role(Name_of_museum, new_role = "ID") %>%
step_other(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_smote(Accreditation)
museums_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 100
## Columns: 39
## $ Name_of_museum <fct> "Gro…
## $ Area_Deprivation_index_crime <dbl> 8, 6…
## $ Area_Deprivation_index_education <dbl> 8, 6…
## $ Area_Deprivation_index_employment <dbl> 8, 5…
## $ Area_Deprivation_index_health <dbl> 8, 6…
## $ Area_Deprivation_index_housing <dbl> 5, 3…
## $ Area_Deprivation_index_income <dbl> 8, 5…
## $ Area_Deprivation_index_services <dbl> 1, 8…
## $ Accreditation <fct> Accr…
## $ Address_line_1_other <dbl> 1, 1…
## $ Village_Town_or_City_other <dbl> 1, 1…
## $ Admin_area_other <dbl> 1, 1…
## $ Governance_Government.National <dbl> 0, 0…
## $ Governance_Independent.Not_for_profit <dbl> 1, 1…
## $ Governance_other <dbl> 0, 0…
## $ Size_medium <dbl> 0, 0…
## $ Size_small <dbl> 1, 1…
## $ Size_other <dbl> 0, 0…
## $ Subject_Matter_Local_Histories <dbl> 0, 1…
## $ Subject_Matter_War_and_conflict.Regiment <dbl> 0, 0…
## $ Subject_Matter_other <dbl> 1, 0…
## $ Year_opened_X1989 <dbl> 1, 0…
## $ Year_opened_other <dbl> 0, 1…
## $ DOMUS_Subject_Matter_decorative.and.applied.arts <dbl> 0, 0…
## $ DOMUS_Subject_Matter_fine.art <dbl> 0, 0…
## $ DOMUS_Subject_Matter_military <dbl> 0, 0…
## $ DOMUS_Subject_Matter_mixed.collection <dbl> 0, 0…
## $ DOMUS_Subject_Matter_science.and.industry <dbl> 0, 0…
## $ DOMUS_Subject_Matter_social.history <dbl> 0, 0…
## $ DOMUS_Subject_Matter_other <dbl> 1, 0…
## $ Primary_provenance_of_data_other <dbl> 0, 0…
## $ Area_Geodemographic_subgroup_Country.Living <dbl> 0, 0…
## $ Area_Geodemographic_subgroup_Larger.Towns.and.Cities <dbl> 0, 0…
## $ Area_Geodemographic_subgroup_London.Cosmopolitan <dbl> 0, 0…
## $ Area_Geodemographic_subgroup_Older.Farming.Communities <dbl> 0, 0…
## $ Area_Geodemographic_subgroup_Rural.Growth.Areas <dbl> 0, 0…
## $ Area_Geodemographic_subgroup_Scottish.Countryside <dbl> 1, 0…
## $ Area_Geodemographic_subgroup_Sparse.English.and.Welsh.Countryside <dbl> 0, 0…
## $ Area_Geodemographic_subgroup_other <dbl> 0, 0…
Specify Model
library(usemodels)
## Warning: package 'usemodels' was built under R version 4.4.1
#usemodels::use_xgboost(Attrition ~ ., data = museums_train)
xgboost_spec <-
boost_tree(trees = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(museums_rec) %>%
add_model(xgboost_spec)
Tune Hyperparameters
doParallel::registerDoParallel()
set.seed(47927)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = museums_cv,
grid = 5,
control = control_grid(save_pred = TRUE))
Identify Optimal Values for Hyperparameters
collect_metrics(xgboost_tune)
## # A tibble: 15 × 7
## trees .metric .estimator mean n std_err .config
## <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 102 accuracy binary 0.657 10 0.0703 Preprocessor1_Model1
## 2 102 brier_class binary 0.282 10 0.0492 Preprocessor1_Model1
## 3 102 roc_auc binary 0.53 10 0.0989 Preprocessor1_Model1
## 4 463 accuracy binary 0.629 10 0.0734 Preprocessor1_Model2
## 5 463 brier_class binary 0.296 10 0.0538 Preprocessor1_Model2
## 6 463 roc_auc binary 0.507 10 0.103 Preprocessor1_Model2
## 7 1088 accuracy binary 0.641 10 0.0688 Preprocessor1_Model3
## 8 1088 brier_class binary 0.298 10 0.0557 Preprocessor1_Model3
## 9 1088 roc_auc binary 0.537 10 0.103 Preprocessor1_Model3
## 10 1236 accuracy binary 0.629 10 0.0678 Preprocessor1_Model4
## 11 1236 brier_class binary 0.298 10 0.0562 Preprocessor1_Model4
## 12 1236 roc_auc binary 0.527 10 0.103 Preprocessor1_Model4
## 13 1945 accuracy binary 0.604 10 0.0754 Preprocessor1_Model5
## 14 1945 brier_class binary 0.298 10 0.0573 Preprocessor1_Model5
## 15 1945 roc_auc binary 0.51 10 0.104 Preprocessor1_Model5
collect_predictions(xgboost_tune) %>%
group_by(id) %>%
roc_curve(Accreditation, .pred_Accredited) %>%
autoplot()
Fit the Model for the Last Time
xgboost_last <- xgboost_workflow %>%
finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
last_fit(museums_split)
## Warning: package 'xgboost' was built under R version 4.4.1
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.577 Preprocessor1_Model1
## 2 roc_auc binary 0.497 Preprocessor1_Model1
## 3 brier_class binary 0.331 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
yardstick::conf_mat(Accreditation, .pred_class) %>%
autoplot()
Variable Importance
library(vip)
## Warning: package 'vip' was built under R version 4.4.1
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
xgboost_last %>%
workflows::extract_fit_engine() %>%
vip()