library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
# Import Data
# Read the TidyTuesday (2022-11-22) museums CSV directly from GitHub into a
# tibble; column types are guessed by readr.
museums <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv"
)
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-column summary of the raw data: type, missing counts, number of unique
# values and basic distribution statistics.
skimr::skim(museums)
Name | museums |
Number of rows | 4191 |
Number of columns | 35 |
_______________________ | |
Column type frequency: | |
character | 24 |
numeric | 11 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
museum_id | 0 | 1.00 | 8 | 15 | 0 | 4191 | 0 |
Name_of_museum | 0 | 1.00 | 3 | 76 | 0 | 4190 | 0 |
Address_line_1 | 441 | 0.89 | 3 | 61 | 0 | 3212 | 0 |
Address_line_2 | 2816 | 0.33 | 3 | 39 | 0 | 1167 | 0 |
Village,_Town_or_City | 4 | 1.00 | 3 | 24 | 0 | 1696 | 0 |
Postcode | 0 | 1.00 | 6 | 9 | 0 | 3918 | 0 |
Admin_area | 0 | 1.00 | 12 | 137 | 0 | 393 | 0 |
Accreditation | 0 | 1.00 | 10 | 12 | 0 | 2 | 0 |
Governance | 0 | 1.00 | 7 | 41 | 0 | 13 | 0 |
Size | 0 | 1.00 | 4 | 7 | 0 | 5 | 0 |
Size_provenance | 179 | 0.96 | 2 | 29 | 0 | 16 | 0 |
Subject_Matter | 0 | 1.00 | 5 | 45 | 0 | 114 | 0 |
Year_opened | 0 | 1.00 | 9 | 9 | 0 | 351 | 0 |
Year_closed | 0 | 1.00 | 9 | 9 | 0 | 170 | 0 |
DOMUS_Subject_Matter | 2788 | 0.33 | 5 | 27 | 0 | 21 | 0 |
Primary_provenance_of_data | 0 | 1.00 | 3 | 8 | 0 | 18 | 0 |
Identifier_used_in_primary_data_source | 2056 | 0.51 | 2 | 8 | 0 | 2134 | 0 |
Area_Geodemographic_group | 49 | 0.99 | 11 | 40 | 0 | 17 | 0 |
Area_Geodemographic_group_code | 49 | 0.99 | 3 | 3 | 0 | 16 | 0 |
Area_Geodemographic_subgroup | 49 | 0.99 | 12 | 39 | 0 | 25 | 0 |
Area_Geodemographic_subgroup_code | 49 | 0.99 | 4 | 4 | 0 | 24 | 0 |
Area_Geodemographic_supergroup | 49 | 0.99 | 16 | 39 | 0 | 8 | 0 |
Area_Geodemographic_supergroup_code | 49 | 0.99 | 2 | 2 | 0 | 8 | 0 |
Notes | 2980 | 0.29 | 12 | 751 | 0 | 956 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Latitude | 0 | 1.00 | 52.93 | 2.09 | 49.18 | 51.48 | 52.47 | 53.96 | 100.00 | ▇▁▁▁▁ |
Longitude | 0 | 1.00 | -1.96 | 1.84 | -8.09 | -3.10 | -1.87 | -0.48 | 1.76 | ▁▂▇▇▅ |
DOMUS_identifier | 2347 | 0.44 | 1303.45 | 1597.19 | 1.00 | 486.50 | 991.50 | 1470.25 | 7746.00 | ▇▂▁▁▁ |
Area_Deprivation_index | 49 | 0.99 | 5.44 | 2.48 | 1.00 | 4.00 | 5.00 | 7.00 | 10.00 | ▃▆▇▆▃ |
Area_Deprivation_index_crime | 49 | 0.99 | 5.43 | 3.07 | 1.00 | 3.00 | 6.00 | 8.00 | 10.00 | ▇▆▅▇▇ |
Area_Deprivation_index_education | 49 | 0.99 | 6.04 | 2.61 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▃▅▇▇▆ |
Area_Deprivation_index_employment | 49 | 0.99 | 6.08 | 2.76 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▅▆▇▇▇ |
Area_Deprivation_index_health | 49 | 0.99 | 6.02 | 2.82 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▅▆▆▇▇ |
Area_Deprivation_index_housing | 49 | 0.99 | 3.97 | 2.75 | 1.00 | 1.00 | 3.00 | 6.00 | 10.00 | ▇▅▃▂▂ |
Area_Deprivation_index_income | 49 | 0.99 | 5.99 | 2.62 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▃▆▇▇▆ |
Area_Deprivation_index_services | 49 | 0.99 | 4.78 | 3.01 | 1.00 | 2.00 | 4.00 | 7.00 | 10.00 | ▇▅▅▅▅ |
- Missing values: Address_line_2, Address_line_1, DOMUS_Subject_Matter, DOMUS_identifier, Notes
- Factors or numeric variables
- Zero-variance variables
- Character variables
- Unbalanced target variable
- ID variable: museum_id
# Number of museums in each accreditation class (the modelling target).
count(museums, Accreditation)
## # A tibble: 2 × 2
## Accreditation n
## <chr> <int>
## 1 Accredited 1720
## 2 Unaccredited 2471
# Bar chart of the museum count per accreditation class.
ggplot(data = museums, mapping = aes(x = Accreditation)) +
  geom_bar()
# Build the modelling table from the raw museums data:
#   1. keep the first four-digit run found in Year_opened as Year_opened_rev
#      (str_extract() returns text, so the column stays character);
#   2. paste geodemographic group and subgroup into one "group/subgroup" label;
#   3. drop identifiers, address/location fields, sparsely populated columns
#      and the geodemographic columns not used as predictors;
#   4. discard every row that still contains a missing value;
#   5. standardise column names with janitor (lower snake_case).
# NOTE(review): year_opened_rev is left as character, so downstream steps treat
# each year as a separate category -- confirm this is intended.
data <- museums %>%
  mutate(Year_opened_rev = str_extract(Year_opened, "\\d{4}")) %>%
  unite(
    col = "group_subgroup",
    Area_Geodemographic_group, Area_Geodemographic_subgroup,
    sep = "/",
    remove = FALSE
  ) %>%
  select(
    -c(
      museum_id,
      Address_line_1, Address_line_2, Postcode, Admin_area,
      Latitude, Longitude,
      Year_opened, Year_closed,
      DOMUS_Subject_Matter, DOMUS_identifier,
      Identifier_used_in_primary_data_source, Notes,
      Area_Geodemographic_group, Area_Geodemographic_group_code,
      Area_Geodemographic_subgroup, Area_Geodemographic_subgroup_code,
      Area_Geodemographic_supergroup, Area_Geodemographic_supergroup_code
    )
  ) %>%
  na.omit() %>%
  janitor::clean_names()
# Per-column summary of the cleaned modelling table.
skimr::skim(data)
Name | data |
Number of rows | 3966 |
Number of columns | 18 |
_______________________ | |
Column type frequency: | |
character | 10 |
numeric | 8 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
name_of_museum | 0 | 1 | 3 | 76 | 0 | 3965 | 0 |
village_town_or_city | 0 | 1 | 3 | 24 | 0 | 1639 | 0 |
accreditation | 0 | 1 | 10 | 12 | 0 | 2 | 0 |
governance | 0 | 1 | 7 | 41 | 0 | 13 | 0 |
size | 0 | 1 | 4 | 7 | 0 | 5 | 0 |
size_provenance | 0 | 1 | 2 | 29 | 0 | 16 | 0 |
subject_matter | 0 | 1 | 5 | 45 | 0 | 112 | 0 |
primary_provenance_of_data | 0 | 1 | 3 | 8 | 0 | 17 | 0 |
group_subgroup | 0 | 1 | 28 | 79 | 0 | 32 | 0 |
year_opened_rev | 0 | 1 | 4 | 4 | 0 | 211 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
area_deprivation_index | 0 | 1 | 5.46 | 2.48 | 1 | 4 | 5 | 7 | 10 | ▃▆▇▆▃ |
area_deprivation_index_crime | 0 | 1 | 5.43 | 3.07 | 1 | 3 | 6 | 8 | 10 | ▇▆▅▆▇ |
area_deprivation_index_education | 0 | 1 | 6.05 | 2.61 | 1 | 4 | 6 | 8 | 10 | ▃▅▇▇▆ |
area_deprivation_index_employment | 0 | 1 | 6.08 | 2.77 | 1 | 4 | 6 | 8 | 10 | ▅▆▇▇▇ |
area_deprivation_index_health | 0 | 1 | 6.02 | 2.82 | 1 | 4 | 6 | 8 | 10 | ▅▆▆▇▇ |
area_deprivation_index_housing | 0 | 1 | 3.99 | 2.76 | 1 | 1 | 3 | 6 | 10 | ▇▅▃▃▂ |
area_deprivation_index_income | 0 | 1 | 6.00 | 2.63 | 1 | 4 | 6 | 8 | 10 | ▃▆▇▇▆ |
area_deprivation_index_services | 0 | 1 | 4.79 | 3.01 | 1 | 2 | 4 | 8 | 10 | ▇▅▅▅▅ |
# Box plots comparing three deprivation indices between accredited and
# unaccredited museums.

# Employment deprivation index by accreditation status.
ggplot(data, aes(x = accreditation, y = area_deprivation_index_employment)) +
  geom_boxplot()

# Crime deprivation index by accreditation status.
ggplot(data, aes(x = accreditation, y = area_deprivation_index_crime)) +
  geom_boxplot()

# Health deprivation index by accreditation status.
ggplot(data, aes(x = accreditation, y = area_deprivation_index_health)) +
  geom_boxplot()
# Convert every column except the museum name into indicator columns with
# correlationfunnel::binarize(): numeric columns are split into bins and each
# retained categorical level gets its own column.
data_binarized <- binarize(select(data, -name_of_museum))

# List the generated indicator columns.
glimpse(data_binarized)
## Rows: 3,966
## Columns: 154
## $ village_town_or_city__Edinburgh <dbl> …
## $ village_town_or_city__London <dbl> …
## $ `village_town_or_city__-OTHER` <dbl> …
## $ accreditation__Accredited <dbl> …
## $ accreditation__Unaccredited <dbl> …
## $ `governance__Government-Local_Authority` <dbl> …
## $ `governance__Government-National` <dbl> …
## $ `governance__Independent-English_Heritage` <dbl> …
## $ `governance__Independent-National_Trust` <dbl> …
## $ `governance__Independent-Not_for_profit` <dbl> …
## $ `governance__Independent-Private` <dbl> …
## $ `governance__Independent-Unknown` <dbl> …
## $ governance__University <dbl> …
## $ governance__Unknown <dbl> …
## $ `governance__-OTHER` <dbl> …
## $ size__large <dbl> …
## $ size__medium <dbl> …
## $ size__small <dbl> …
## $ size__unknown <dbl> …
## $ `size__-OTHER` <dbl> …
## $ size_provenance__ace_size_designation <dbl> …
## $ size_provenance__aim_size_designation <dbl> …
## $ size_provenance__domus <dbl> …
## $ `size_provenance__ma(fam)` <dbl> …
## $ size_provenance__mm_manual_estimate_2018 <dbl> …
## $ size_provenance__mm_prediction_random_forest <dbl> …
## $ size_provenance__scottish_national_audit <dbl> …
## $ size_provenance__unknown <dbl> …
## $ size_provenance__visitbritain <dbl> …
## $ `size_provenance__-OTHER` <dbl> …
## $ `subject_matter__Archaeology-Roman` <dbl> …
## $ `subject_matter__Arts-Fine_and_decorative_arts` <dbl> …
## $ `subject_matter__Buildings-Houses-Large_houses` <dbl> …
## $ `subject_matter__Buildings-Houses-Medium_houses` <dbl> …
## $ `subject_matter__Industry_and_manufacture-Mining_and_quarrying` <dbl> …
## $ `subject_matter__Leisure_and_sport-Toys_and_models` <dbl> …
## $ subject_matter__Local_Histories <dbl> …
## $ `subject_matter__Mixed-Encyclopaedic` <dbl> …
## $ `subject_matter__Mixed-Other` <dbl> …
## $ subject_matter__Other <dbl> …
## $ `subject_matter__Personality-Literary` <dbl> …
## $ `subject_matter__Rural_Industry-Farming` <dbl> …
## $ `subject_matter__Sea_and_seafaring-Boats_and_ships` <dbl> …
## $ `subject_matter__Sea_and_seafaring-Mixed` <dbl> …
## $ `subject_matter__Transport-Cars_and_motorbikes` <dbl> …
## $ `subject_matter__Transport-Trains_and_railways` <dbl> …
## $ `subject_matter__War_and_conflict-Airforce` <dbl> …
## $ `subject_matter__War_and_conflict-Castles_and_forts` <dbl> …
## $ `subject_matter__War_and_conflict-Military` <dbl> …
## $ `subject_matter__War_and_conflict-Regiment` <dbl> …
## $ `subject_matter__-OTHER` <dbl> …
## $ primary_provenance_of_data__ace <dbl> …
## $ primary_provenance_of_data__aim <dbl> …
## $ primary_provenance_of_data__aim82M <dbl> …
## $ primary_provenance_of_data__aim82NM <dbl> …
## $ primary_provenance_of_data__domus <dbl> …
## $ primary_provenance_of_data__fcm <dbl> …
## $ primary_provenance_of_data__hha <dbl> …
## $ primary_provenance_of_data__mald <dbl> …
## $ primary_provenance_of_data__mgs <dbl> …
## $ primary_provenance_of_data__misc <dbl> …
## $ primary_provenance_of_data__musassoc <dbl> …
## $ primary_provenance_of_data__wiki <dbl> …
## $ `primary_provenance_of_data__-OTHER` <dbl> …
## $ `area_deprivation_index__-Inf_4` <dbl> …
## $ area_deprivation_index__4_5 <dbl> …
## $ area_deprivation_index__5_7 <dbl> …
## $ area_deprivation_index__7_Inf <dbl> …
## $ `area_deprivation_index_crime__-Inf_3` <dbl> …
## $ area_deprivation_index_crime__3_6 <dbl> …
## $ area_deprivation_index_crime__6_8 <dbl> …
## $ area_deprivation_index_crime__8_Inf <dbl> …
## $ `area_deprivation_index_education__-Inf_4` <dbl> …
## $ area_deprivation_index_education__4_6 <dbl> …
## $ area_deprivation_index_education__6_8 <dbl> …
## $ area_deprivation_index_education__8_Inf <dbl> …
## $ `area_deprivation_index_employment__-Inf_4` <dbl> …
## $ area_deprivation_index_employment__4_6 <dbl> …
## $ area_deprivation_index_employment__6_8 <dbl> …
## $ area_deprivation_index_employment__8_Inf <dbl> …
## $ `area_deprivation_index_health__-Inf_4` <dbl> …
## $ area_deprivation_index_health__4_6 <dbl> …
## $ area_deprivation_index_health__6_8 <dbl> …
## $ area_deprivation_index_health__8_Inf <dbl> …
## $ `area_deprivation_index_housing__-Inf_3` <dbl> …
## $ area_deprivation_index_housing__3_6 <dbl> …
## $ area_deprivation_index_housing__6_Inf <dbl> …
## $ `area_deprivation_index_income__-Inf_4` <dbl> …
## $ area_deprivation_index_income__4_6 <dbl> …
## $ area_deprivation_index_income__6_8 <dbl> …
## $ area_deprivation_index_income__8_Inf <dbl> …
## $ `area_deprivation_index_services__-Inf_2` <dbl> …
## $ area_deprivation_index_services__2_4 <dbl> …
## $ area_deprivation_index_services__4_8 <dbl> …
## $ area_deprivation_index_services__8_Inf <dbl> …
## $ `group_subgroup__Country_Living/Country_Living` <dbl> …
## $ `group_subgroup__English_and_Welsh_Countryside/Older_Farming_Communities` <dbl> …
## $ `group_subgroup__English_and_Welsh_Countryside/Sparse_English_and_Welsh_Countryside` <dbl> …
## $ `group_subgroup__Ethnically_Diverse_Metropolitan_Living/Ethnically_Diverse_Metropolitan_Living` <dbl> …
## $ `group_subgroup__Larger_Towns_and_Cities/Larger_Towns_and_Cities` <dbl> …
## $ `group_subgroup__London_Cosmopolitan/London_Cosmopolitan` <dbl> …
## $ `group_subgroup__Manufacturing_Traits/Industrial_and_Multi-ethnic` <dbl> …
## $ `group_subgroup__Manufacturing_Traits/Urban_Living` <dbl> …
## $ `group_subgroup__Northern_Ireland_Countryside/Northern_Ireland_Countryside` <dbl> …
## $ `group_subgroup__Remoter_Coastal_Living/Ageing_Coastal_Living` <dbl> …
## $ `group_subgroup__Remoter_Coastal_Living/Seaside_Living` <dbl> …
## $ `group_subgroup__Rural-Urban_Fringe/Rural-Urban_Fringe` <dbl> …
## $ `group_subgroup__Scottish_Countryside/Scottish_Countryside` <dbl> …
## $ `group_subgroup__Scottish_Industrial_Heritage/Scottish_Industrial_Legacy` <dbl> …
## $ `group_subgroup__Services_Manufacturing_and_Mining_Legacy/Manufacturing_Legacy` <dbl> …
## $ `group_subgroup__Services_Manufacturing_and_Mining_Legacy/Mining_Legacy` <dbl> …
## $ `group_subgroup__Services_Manufacturing_and_Mining_Legacy/Service_Economy` <dbl> …
## $ `group_subgroup__Suburban_Traits/City_Periphery` <dbl> …
## $ `group_subgroup__Suburban_Traits/Expanding_Areas` <dbl> …
## $ `group_subgroup__Thriving_Rural/Affluent_rural` <dbl> …
## $ `group_subgroup__Thriving_Rural/Rural_Growth_Areas` <dbl> …
## $ `group_subgroup__Town_Living/Prosperous_Towns` <dbl> …
## $ `group_subgroup__University_Towns_and_Cities/University_Towns_and_Cities` <dbl> …
## $ `group_subgroup__-OTHER` <dbl> …
## $ year_opened_rev__1945 <dbl> …
## $ year_opened_rev__1960 <dbl> …
## $ year_opened_rev__1972 <dbl> …
## $ year_opened_rev__1973 <dbl> …
## $ year_opened_rev__1974 <dbl> …
## $ year_opened_rev__1975 <dbl> …
## $ year_opened_rev__1976 <dbl> …
## $ year_opened_rev__1977 <dbl> …
## $ year_opened_rev__1978 <dbl> …
## $ year_opened_rev__1979 <dbl> …
## $ year_opened_rev__1980 <dbl> …
## $ year_opened_rev__1981 <dbl> …
## $ year_opened_rev__1982 <dbl> …
## $ year_opened_rev__1983 <dbl> …
## $ year_opened_rev__1984 <dbl> …
## $ year_opened_rev__1985 <dbl> …
## $ year_opened_rev__1986 <dbl> …
## $ year_opened_rev__1987 <dbl> …
## $ year_opened_rev__1988 <dbl> …
## $ year_opened_rev__1989 <dbl> …
## $ year_opened_rev__1990 <dbl> …
## $ year_opened_rev__1991 <dbl> …
## $ year_opened_rev__1992 <dbl> …
## $ year_opened_rev__1993 <dbl> …
## $ year_opened_rev__1994 <dbl> …
## $ year_opened_rev__1995 <dbl> …
## $ year_opened_rev__1996 <dbl> …
## $ year_opened_rev__1997 <dbl> …
## $ year_opened_rev__1998 <dbl> …
## $ year_opened_rev__1999 <dbl> …
## $ year_opened_rev__2000 <dbl> …
## $ year_opened_rev__2001 <dbl> …
## $ year_opened_rev__2002 <dbl> …
## $ year_opened_rev__2005 <dbl> …
## $ `year_opened_rev__-OTHER` <dbl> …
# Correlate each binarized feature with the "Accredited" indicator column.
data_correlate <- correlate(
  data_binarized,
  target = accreditation__Accredited
)

# Draw the correlation funnel for those correlations.
correlationfunnel::plot_correlation_funnel(data_correlate)
## Warning: ggrepel: 107 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.1.0
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'recipes' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(usemodels)
# Fix the RNG seed so the train/test split and the CV folds are reproducible.
set.seed(1123)

# data_clean <- data %>% sample_n(100)

# Train/test split with rsample's default settings.
data_split <- data %>% initial_split()
data_train <- data_split %>% training()
data_test <- data_split %>% testing()

# Cross-validation folds on the training set (default: 10 folds); print them.
data_cv <- data_train %>% rsample::vfold_cv()
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [2676/298]> Fold01
## 2 <split [2676/298]> Fold02
## 3 <split [2676/298]> Fold03
## 4 <split [2676/298]> Fold04
## 5 <split [2677/297]> Fold05
## 6 <split [2677/297]> Fold06
## 7 <split [2677/297]> Fold07
## 8 <split [2677/297]> Fold08
## 9 <split [2677/297]> Fold09
## 10 <split [2677/297]> Fold10
library(themis)
library(textrecipes)
# Per-column summary of the cleaned table, `data` (types, missingness,
# distributions).
skimr::skim(data)
Name | data |
Number of rows | 3966 |
Number of columns | 18 |
_______________________ | |
Column type frequency: | |
character | 10 |
numeric | 8 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
name_of_museum | 0 | 1 | 3 | 76 | 0 | 3965 | 0 |
village_town_or_city | 0 | 1 | 3 | 24 | 0 | 1639 | 0 |
accreditation | 0 | 1 | 10 | 12 | 0 | 2 | 0 |
governance | 0 | 1 | 7 | 41 | 0 | 13 | 0 |
size | 0 | 1 | 4 | 7 | 0 | 5 | 0 |
size_provenance | 0 | 1 | 2 | 29 | 0 | 16 | 0 |
subject_matter | 0 | 1 | 5 | 45 | 0 | 112 | 0 |
primary_provenance_of_data | 0 | 1 | 3 | 8 | 0 | 17 | 0 |
group_subgroup | 0 | 1 | 28 | 79 | 0 | 32 | 0 |
year_opened_rev | 0 | 1 | 4 | 4 | 0 | 211 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
area_deprivation_index | 0 | 1 | 5.46 | 2.48 | 1 | 4 | 5 | 7 | 10 | ▃▆▇▆▃ |
area_deprivation_index_crime | 0 | 1 | 5.43 | 3.07 | 1 | 3 | 6 | 8 | 10 | ▇▆▅▆▇ |
area_deprivation_index_education | 0 | 1 | 6.05 | 2.61 | 1 | 4 | 6 | 8 | 10 | ▃▅▇▇▆ |
area_deprivation_index_employment | 0 | 1 | 6.08 | 2.77 | 1 | 4 | 6 | 8 | 10 | ▅▆▇▇▇ |
area_deprivation_index_health | 0 | 1 | 6.02 | 2.82 | 1 | 4 | 6 | 8 | 10 | ▅▆▆▇▇ |
area_deprivation_index_housing | 0 | 1 | 3.99 | 2.76 | 1 | 1 | 3 | 6 | 10 | ▇▅▃▃▂ |
area_deprivation_index_income | 0 | 1 | 6.00 | 2.63 | 1 | 4 | 6 | 8 | 10 | ▃▆▇▇▆ |
area_deprivation_index_services | 0 | 1 | 4.79 | 3.01 | 1 | 2 | 4 | 8 | 10 | ▇▅▅▅▅ |
# Preprocessing recipe for the xgboost classifier: predict `accreditation`
# from every other column of the training set.
xgboost_rec <- recipes::recipe(accreditation ~ ., data = data_train) %>%
  # Keep the museum name for row identification only, not as a predictor.
  update_role(name_of_museum, new_role = "ID") %>%
  # Split subject_matter into tokens, keep at most 50 of them, and turn them
  # into term-frequency columns.
  step_tokenize(subject_matter) %>%
  step_tokenfilter(subject_matter, max_tokens = 50) %>%
  step_tf(subject_matter) %>%
  # Pool infrequent towns into a single "other" level.
  step_other(village_town_or_city) %>%
  # Reserve a "new" level for categories unseen at training time, then
  # dummy-encode all nominal predictors.
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  # Drop constant columns (e.g. the all-zero `*_new` dummies created by
  # step_novel()); they cannot be scaled and would otherwise make
  # step_normalize() warn about zero variance.
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  # Balance the outcome classes with SMOTE.
  step_smote(accreditation)

# Preview the processed training data. bake(new_data = NULL) returns the
# prepped training set and replaces the superseded juice().
xgboost_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Warning: ! The following columns have zero variance so scaling cannot be used:
## village_town_or_city_new, governance_new, size_new, size_provenance_new,
## primary_provenance_of_data_new, group_subgroup_new, and year_opened_rev_new.
## ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
## before normalizing.
## Rows: 3,368
## Columns: 336
## $ name_of_museum <fct> …
## $ area_deprivation_index <dbl> …
## $ area_deprivation_index_crime <dbl> …
## $ area_deprivation_index_education <dbl> …
## $ area_deprivation_index_employment <dbl> …
## $ area_deprivation_index_health <dbl> …
## $ area_deprivation_index_housing <dbl> …
## $ area_deprivation_index_income <dbl> …
## $ area_deprivation_index_services <dbl> …
## $ accreditation <fct> …
## $ tf_subject_matter_airforce <dbl> …
## $ tf_subject_matter_archaeology <dbl> …
## $ tf_subject_matter_arts <dbl> …
## $ tf_subject_matter_aviation <dbl> …
## $ tf_subject_matter_belief_and_identity <dbl> …
## $ tf_subject_matter_boats_and_ships <dbl> …
## $ tf_subject_matter_buildings <dbl> …
## $ tf_subject_matter_cars_and_motorbikes <dbl> …
## $ tf_subject_matter_castles_and_forts <dbl> …
## $ tf_subject_matter_costume_and_textiles <dbl> …
## $ tf_subject_matter_encyclopaedic <dbl> …
## $ tf_subject_matter_ethnic_group <dbl> …
## $ tf_subject_matter_event_or_site <dbl> …
## $ tf_subject_matter_farming <dbl> …
## $ tf_subject_matter_film_cinema_and_tv <dbl> …
## $ tf_subject_matter_fine_and_decorative_arts <dbl> …
## $ tf_subject_matter_food_and_drink <dbl> …
## $ tf_subject_matter_houses <dbl> …
## $ tf_subject_matter_industry_and_manufacture <dbl> …
## $ tf_subject_matter_large_houses <dbl> …
## $ tf_subject_matter_leisure_and_sport <dbl> …
## $ tf_subject_matter_literary <dbl> …
## $ tf_subject_matter_local_histories <dbl> …
## $ tf_subject_matter_medicine_and_health <dbl> …
## $ tf_subject_matter_medium_houses <dbl> …
## $ tf_subject_matter_military <dbl> …
## $ tf_subject_matter_mining_and_quarrying <dbl> …
## $ tf_subject_matter_mixed <dbl> …
## $ tf_subject_matter_music <dbl> …
## $ tf_subject_matter_natural_world <dbl> …
## $ tf_subject_matter_other <dbl> …
## $ tf_subject_matter_personality <dbl> …
## $ tf_subject_matter_police <dbl> …
## $ tf_subject_matter_prehistory <dbl> …
## $ tf_subject_matter_regiment <dbl> …
## $ tf_subject_matter_religious_buildings <dbl> …
## $ tf_subject_matter_roman <dbl> …
## $ tf_subject_matter_rural_industry <dbl> …
## $ tf_subject_matter_rural_life <dbl> …
## $ tf_subject_matter_sea_and_seafaring <dbl> …
## $ tf_subject_matter_services <dbl> …
## $ tf_subject_matter_small_houses <dbl> …
## $ tf_subject_matter_textiles <dbl> …
## $ tf_subject_matter_toys_and_models <dbl> …
## $ tf_subject_matter_trains_and_railways <dbl> …
## $ tf_subject_matter_transport <dbl> …
## $ tf_subject_matter_utilities <dbl> …
## $ tf_subject_matter_war_and_conflict <dbl> …
## $ tf_subject_matter_water_and_waste <dbl> …
## $ tf_subject_matter_watermills <dbl> …
## $ village_town_or_city_other <dbl> …
## $ village_town_or_city_new <dbl> …
## $ governance_Government.Local_Authority <dbl> …
## $ governance_Government.National <dbl> …
## $ governance_Government.Other <dbl> …
## $ governance_Independent.English_Heritage <dbl> …
## $ governance_Independent.Historic_Environment_Scotland <dbl> …
## $ governance_Independent.National_Trust <dbl> …
## $ governance_Independent.National_Trust_for_Scotland <dbl> …
## $ governance_Independent.Not_for_profit <dbl> …
## $ governance_Independent.Private <dbl> …
## $ governance_Independent.Unknown <dbl> …
## $ governance_University <dbl> …
## $ governance_Unknown <dbl> …
## $ governance_new <dbl> …
## $ size_large <dbl> …
## $ size_medium <dbl> …
## $ size_small <dbl> …
## $ size_unknown <dbl> …
## $ size_new <dbl> …
## $ size_provenance_aim_size_designation <dbl> …
## $ size_provenance_babbidge_ewles_and_smith_2006 <dbl> …
## $ size_provenance_domus <dbl> …
## $ size_provenance_ma.fam. <dbl> …
## $ size_provenance_ma.fam._year_stated <dbl> …
## $ size_provenance_ma.fam2. <dbl> …
## $ size_provenance_mm <dbl> …
## $ size_provenance_mm_manual_estimate_2018 <dbl> …
## $ size_provenance_mm_prediction_random_forest <dbl> …
## $ size_provenance_national_trust_annual_report_ <dbl> …
## $ size_provenance_nilm.vn. <dbl> …
## $ size_provenance_scottish_national_audit <dbl> …
## $ size_provenance_unknown <dbl> …
## $ size_provenance_visitbritain <dbl> …
## $ size_provenance_new <dbl> …
## $ primary_provenance_of_data_aim <dbl> …
## $ primary_provenance_of_data_aim82M <dbl> …
## $ primary_provenance_of_data_aim82NM <dbl> …
## $ primary_provenance_of_data_domus <dbl> …
## $ primary_provenance_of_data_fcm <dbl> …
## $ primary_provenance_of_data_hha <dbl> …
## $ primary_provenance_of_data_hud <dbl> …
## $ primary_provenance_of_data_mald <dbl> …
## $ primary_provenance_of_data_MDN <dbl> …
## $ primary_provenance_of_data_mgs <dbl> …
## $ primary_provenance_of_data_misc <dbl> …
## $ primary_provenance_of_data_Misc <dbl> …
## $ primary_provenance_of_data_musassoc <dbl> …
## $ primary_provenance_of_data_MusCal <dbl> …
## $ primary_provenance_of_data_nimc <dbl> …
## $ primary_provenance_of_data_wiki <dbl> …
## $ primary_provenance_of_data_new <dbl> …
## $ group_subgroup_English.and.Welsh.Countryside.Ethnically.Diverse.Metropolitan.Living <dbl> …
## $ group_subgroup_English.and.Welsh.Countryside.London.Cosmopolitan <dbl> …
## $ group_subgroup_English.and.Welsh.Countryside.Older.Farming.Communities <dbl> …
## $ group_subgroup_English.and.Welsh.Countryside.Sparse.English.and.Welsh.Countryside <dbl> …
## $ group_subgroup_Ethnically.Diverse.Metropolitan..Living.Ethnically.Diverse.Metropolitan..Living <dbl> …
## $ group_subgroup_Ethnically.Diverse.Metropolitan.Living.Ethnically.Diverse.Metropolitan.Living <dbl> …
## $ group_subgroup_Larger.Towns.and.Cities.Larger.Towns.and.Cities <dbl> …
## $ group_subgroup_London.Cosmopolitan.London.Cosmopolitan <dbl> …
## $ group_subgroup_Manufacturing.Traits.Industrial.and.Multi.ethnic <dbl> …
## $ group_subgroup_Manufacturing.Traits.Urban.Living <dbl> …
## $ group_subgroup_Northern.Ireland.Countryside.Northern.Ireland.Countryside <dbl> …
## $ group_subgroup_Remoter.Coastal.Living.Ageing.Coastal.Living <dbl> …
## $ group_subgroup_Remoter.Coastal.Living.Seaside.Living <dbl> …
## $ group_subgroup_Rural.Urban.Fringe.Ethnically.Diverse.Metropolitan.Living <dbl> …
## $ group_subgroup_Rural.Urban.Fringe.Rural.Urban.Fringe <dbl> …
## $ group_subgroup_Scottish.Countryside.Scottish.Countryside <dbl> …
## $ group_subgroup_Scottish.Industrial.Heritage.Scottish.Industrial.Legacy <dbl> …
## $ group_subgroup_Services.Manufacturing.and.Mining.Legacy.Affluent.rural <dbl> …
## $ group_subgroup_Services.Manufacturing.and.Mining.Legacy.Manufacturing.Legacy <dbl> …
## $ group_subgroup_Services.Manufacturing.and.Mining.Legacy.Mining.Legacy <dbl> …
## $ group_subgroup_Services.Manufacturing.and.Mining.Legacy.Service.Economy <dbl> …
## $ group_subgroup_Suburban.Traits.City.Periphery <dbl> …
## $ group_subgroup_Suburban.Traits.Expanding.Areas <dbl> …
## $ group_subgroup_Thriving.Rural.Affluent.rural <dbl> …
## $ group_subgroup_Thriving.Rural.Rural.Growth.Areas <dbl> …
## $ group_subgroup_Town.Living.Prosperous.Semi.rural <dbl> …
## $ group_subgroup_Town.Living.Prosperous.Towns <dbl> …
## $ group_subgroup_University.Towns.and.Cities.University.Towns.and.Cities <dbl> …
## $ group_subgroup_new <dbl> …
## $ year_opened_rev_X1653 <dbl> …
## $ year_opened_rev_X1676 <dbl> …
## $ year_opened_rev_X1683 <dbl> …
## $ year_opened_rev_X1728 <dbl> …
## $ year_opened_rev_X1739 <dbl> …
## $ year_opened_rev_X1750 <dbl> …
## $ year_opened_rev_X1761 <dbl> …
## $ year_opened_rev_X1771 <dbl> …
## $ year_opened_rev_X1796 <dbl> …
## $ year_opened_rev_X1800 <dbl> …
## $ year_opened_rev_X1807 <dbl> …
## $ year_opened_rev_X1812 <dbl> …
## $ year_opened_rev_X1814 <dbl> …
## $ year_opened_rev_X1815 <dbl> …
## $ year_opened_rev_X1816 <dbl> …
## $ year_opened_rev_X1817 <dbl> …
## $ year_opened_rev_X1818 <dbl> …
## $ year_opened_rev_X1819 <dbl> …
## $ year_opened_rev_X1820 <dbl> …
## $ year_opened_rev_X1821 <dbl> …
## $ year_opened_rev_X1823 <dbl> …
## $ year_opened_rev_X1824 <dbl> …
## $ year_opened_rev_X1826 <dbl> …
## $ year_opened_rev_X1830 <dbl> …
## $ year_opened_rev_X1832 <dbl> …
## $ year_opened_rev_X1833 <dbl> …
## $ year_opened_rev_X1835 <dbl> …
## $ year_opened_rev_X1836 <dbl> …
## $ year_opened_rev_X1837 <dbl> …
## $ year_opened_rev_X1839 <dbl> …
## $ year_opened_rev_X1842 <dbl> …
## $ year_opened_rev_X1843 <dbl> …
## $ year_opened_rev_X1844 <dbl> …
## $ year_opened_rev_X1845 <dbl> …
## $ year_opened_rev_X1846 <dbl> …
## $ year_opened_rev_X1847 <dbl> …
## $ year_opened_rev_X1848 <dbl> …
## $ year_opened_rev_X1849 <dbl> …
## $ year_opened_rev_X1850 <dbl> …
## $ year_opened_rev_X1852 <dbl> …
## $ year_opened_rev_X1856 <dbl> …
## $ year_opened_rev_X1857 <dbl> …
## $ year_opened_rev_X1858 <dbl> …
## $ year_opened_rev_X1859 <dbl> …
## $ year_opened_rev_X1860 <dbl> …
## $ year_opened_rev_X1862 <dbl> …
## $ year_opened_rev_X1864 <dbl> …
## $ year_opened_rev_X1865 <dbl> …
## $ year_opened_rev_X1867 <dbl> …
## $ year_opened_rev_X1868 <dbl> …
## $ year_opened_rev_X1869 <dbl> …
## $ year_opened_rev_X1870 <dbl> …
## $ year_opened_rev_X1871 <dbl> …
## $ year_opened_rev_X1872 <dbl> …
## $ year_opened_rev_X1874 <dbl> …
## $ year_opened_rev_X1875 <dbl> …
## $ year_opened_rev_X1876 <dbl> …
## $ year_opened_rev_X1878 <dbl> …
## $ year_opened_rev_X1880 <dbl> …
## $ year_opened_rev_X1881 <dbl> …
## $ year_opened_rev_X1882 <dbl> …
## $ year_opened_rev_X1883 <dbl> …
## $ year_opened_rev_X1884 <dbl> …
## $ year_opened_rev_X1885 <dbl> …
## $ year_opened_rev_X1886 <dbl> …
## $ year_opened_rev_X1887 <dbl> …
## $ year_opened_rev_X1888 <dbl> …
## $ year_opened_rev_X1890 <dbl> …
## $ year_opened_rev_X1891 <dbl> …
## $ year_opened_rev_X1892 <dbl> …
## $ year_opened_rev_X1893 <dbl> …
## $ year_opened_rev_X1894 <dbl> …
## $ year_opened_rev_X1895 <dbl> …
## $ year_opened_rev_X1896 <dbl> …
## $ year_opened_rev_X1897 <dbl> …
## $ year_opened_rev_X1898 <dbl> …
## $ year_opened_rev_X1899 <dbl> …
## $ year_opened_rev_X1900 <dbl> …
## $ year_opened_rev_X1901 <dbl> …
## $ year_opened_rev_X1902 <dbl> …
## $ year_opened_rev_X1903 <dbl> …
## $ year_opened_rev_X1904 <dbl> …
## $ year_opened_rev_X1905 <dbl> …
## $ year_opened_rev_X1906 <dbl> …
## $ year_opened_rev_X1907 <dbl> …
## $ year_opened_rev_X1908 <dbl> …
## $ year_opened_rev_X1909 <dbl> …
## $ year_opened_rev_X1910 <dbl> …
## $ year_opened_rev_X1911 <dbl> …
## $ year_opened_rev_X1912 <dbl> …
## $ year_opened_rev_X1913 <dbl> …
## $ year_opened_rev_X1914 <dbl> …
## $ year_opened_rev_X1915 <dbl> …
## $ year_opened_rev_X1917 <dbl> …
## $ year_opened_rev_X1919 <dbl> …
## $ year_opened_rev_X1920 <dbl> …
## $ year_opened_rev_X1921 <dbl> …
## $ year_opened_rev_X1922 <dbl> …
## $ year_opened_rev_X1923 <dbl> …
## $ year_opened_rev_X1924 <dbl> …
## $ year_opened_rev_X1925 <dbl> …
## $ year_opened_rev_X1926 <dbl> …
## $ year_opened_rev_X1927 <dbl> …
## $ year_opened_rev_X1928 <dbl> …
## $ year_opened_rev_X1929 <dbl> …
## $ year_opened_rev_X1930 <dbl> …
## $ year_opened_rev_X1931 <dbl> …
## $ year_opened_rev_X1932 <dbl> …
## $ year_opened_rev_X1933 <dbl> …
## $ year_opened_rev_X1934 <dbl> …
## $ year_opened_rev_X1935 <dbl> …
## $ year_opened_rev_X1936 <dbl> …
## $ year_opened_rev_X1937 <dbl> …
## $ year_opened_rev_X1938 <dbl> …
## $ year_opened_rev_X1939 <dbl> …
## $ year_opened_rev_X1940 <dbl> …
## $ year_opened_rev_X1941 <dbl> …
## $ year_opened_rev_X1942 <dbl> …
## $ year_opened_rev_X1943 <dbl> …
## $ year_opened_rev_X1944 <dbl> …
## $ year_opened_rev_X1945 <dbl> …
## $ year_opened_rev_X1946 <dbl> …
## $ year_opened_rev_X1947 <dbl> …
## $ year_opened_rev_X1948 <dbl> …
## $ year_opened_rev_X1949 <dbl> …
## $ year_opened_rev_X1950 <dbl> …
## $ year_opened_rev_X1951 <dbl> …
## $ year_opened_rev_X1952 <dbl> …
## $ year_opened_rev_X1953 <dbl> …
## $ year_opened_rev_X1954 <dbl> …
## $ year_opened_rev_X1955 <dbl> …
## $ year_opened_rev_X1956 <dbl> …
## $ year_opened_rev_X1957 <dbl> …
## $ year_opened_rev_X1958 <dbl> …
## $ year_opened_rev_X1959 <dbl> …
## $ year_opened_rev_X1960 <dbl> …
## $ year_opened_rev_X1961 <dbl> …
## $ year_opened_rev_X1962 <dbl> …
## $ year_opened_rev_X1963 <dbl> …
## $ year_opened_rev_X1964 <dbl> …
## $ year_opened_rev_X1965 <dbl> …
## $ year_opened_rev_X1966 <dbl> …
## $ year_opened_rev_X1967 <dbl> …
## $ year_opened_rev_X1968 <dbl> …
## $ year_opened_rev_X1969 <dbl> …
## $ year_opened_rev_X1970 <dbl> …
## $ year_opened_rev_X1971 <dbl> …
## $ year_opened_rev_X1972 <dbl> …
## $ year_opened_rev_X1973 <dbl> …
## $ year_opened_rev_X1974 <dbl> …
## $ year_opened_rev_X1975 <dbl> …
## $ year_opened_rev_X1976 <dbl> …
## $ year_opened_rev_X1977 <dbl> …
## $ year_opened_rev_X1978 <dbl> …
## $ year_opened_rev_X1979 <dbl> …
## $ year_opened_rev_X1980 <dbl> …
## $ year_opened_rev_X1981 <dbl> …
## $ year_opened_rev_X1982 <dbl> …
## $ year_opened_rev_X1983 <dbl> …
## $ year_opened_rev_X1984 <dbl> …
## $ year_opened_rev_X1985 <dbl> …
## $ year_opened_rev_X1986 <dbl> …
## $ year_opened_rev_X1987 <dbl> …
## $ year_opened_rev_X1988 <dbl> …
## $ year_opened_rev_X1989 <dbl> …
## $ year_opened_rev_X1990 <dbl> …
## $ year_opened_rev_X1991 <dbl> …
## $ year_opened_rev_X1992 <dbl> …
## $ year_opened_rev_X1993 <dbl> …
## $ year_opened_rev_X1994 <dbl> …
## $ year_opened_rev_X1995 <dbl> …
## $ year_opened_rev_X1996 <dbl> …
## $ year_opened_rev_X1997 <dbl> …
## $ year_opened_rev_X1998 <dbl> …
## $ year_opened_rev_X1999 <dbl> …
## $ year_opened_rev_X2000 <dbl> …
## $ year_opened_rev_X2001 <dbl> …
## $ year_opened_rev_X2002 <dbl> …
## $ year_opened_rev_X2003 <dbl> …
## $ year_opened_rev_X2004 <dbl> …
## $ year_opened_rev_X2005 <dbl> …
## $ year_opened_rev_X2006 <dbl> …
## $ year_opened_rev_X2007 <dbl> …
## $ year_opened_rev_X2008 <dbl> …
## $ year_opened_rev_X2009 <dbl> …
## $ year_opened_rev_X2010 <dbl> …
## $ year_opened_rev_X2011 <dbl> …
## $ year_opened_rev_X2012 <dbl> …
## $ year_opened_rev_X2013 <dbl> …
## $ year_opened_rev_X2014 <dbl> …
## $ year_opened_rev_X2015 <dbl> …
## $ year_opened_rev_X2016 <dbl> …
## $ year_opened_rev_X2017 <dbl> …
## $ year_opened_rev_X2018 <dbl> …
## $ year_opened_rev_new <dbl> …
# Boosted-tree classification model fit with the xgboost engine. The number of
# trees, mtry, and the learning rate are left as tune() placeholders so their
# values are chosen during tuning.
xgboost_spec <- boost_tree(
  trees = tune(),
  mtry = tune(),
  learn_rate = tune()
) %>%
  set_mode(mode = "classification") %>%
  set_engine(engine = "xgboost")
# Combine the preprocessing recipe and the model specification in a single
# workflow object so they are tuned and fitted together.
xgboost_workflow <- workflow() %>%
  add_recipe(recipe = xgboost_rec) %>%
  add_model(spec = xgboost_spec)
# Register the doParallel backend, then fix the RNG seed before tuning.
doParallel::registerDoParallel()
set.seed(48291)

# Tune the workflow over the resamples in `data_cv` using 5 automatically
# generated candidate parameter sets. `save_pred = TRUE` keeps the held-out
# predictions so they can be retrieved later with collect_predictions().
xgboost_tune <- tune_grid(
  object = xgboost_workflow,
  resamples = data_cv,
  grid = 5,
  control = control_grid(save_pred = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## Warning: ! The following columns have zero variance so scaling cannot be used:
## village_town_or_city_new, governance_new, size_new, size_provenance_new,
## primary_provenance_of_data_new, group_subgroup_new, and year_opened_rev_new.
## ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
## before normalizing.
## Warning: package 'xgboost' was built under R version 4.3.3
# Resampled performance metrics for every tuned candidate.
xgboost_tune %>%
  collect_metrics()
## # A tibble: 15 × 9
## mtry trees learn_rate .metric .estimator mean n std_err .config
## <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 9 1186 0.00128 accuracy binary 0.830 10 0.00894 Preprocess…
## 2 9 1186 0.00128 brier_class binary 0.158 10 0.00229 Preprocess…
## 3 9 1186 0.00128 roc_auc binary 0.894 10 0.00688 Preprocess…
## 4 69 388 0.133 accuracy binary 0.846 10 0.00782 Preprocess…
## 5 69 388 0.133 brier_class binary 0.114 10 0.00491 Preprocess…
## 6 69 388 0.133 roc_auc binary 0.913 10 0.00589 Preprocess…
## 7 174 1960 0.00657 accuracy binary 0.855 10 0.00554 Preprocess…
## 8 174 1960 0.00657 brier_class binary 0.108 10 0.00403 Preprocess…
## 9 174 1960 0.00657 roc_auc binary 0.918 10 0.00542 Preprocess…
## 10 265 777 0.0651 accuracy binary 0.842 10 0.00826 Preprocess…
## 11 265 777 0.0651 brier_class binary 0.118 10 0.00511 Preprocess…
## 12 265 777 0.0651 roc_auc binary 0.911 10 0.00555 Preprocess…
## 13 326 1334 0.0237 accuracy binary 0.843 10 0.00818 Preprocess…
## 14 326 1334 0.0237 brier_class binary 0.113 10 0.00468 Preprocess…
## 15 326 1334 0.0237 roc_auc binary 0.915 10 0.00521 Preprocess…
# Plot one ROC curve per resample for a single tuned configuration.
# Without the `parameters` filter, collect_predictions() returns the held-out
# predictions of every candidate in the grid, so each fold's curve would pool
# predictions from all of the tuned models. Restrict to the candidate chosen
# by accuracy, the same criterion used to finalize the workflow.
collect_predictions(
  xgboost_tune,
  parameters = select_best(xgboost_tune, metric = "accuracy")
) %>%
  group_by(id) %>%
  roc_curve(accreditation, .pred_Accredited) %>%
  autoplot()
# Plug the best-by-accuracy hyperparameters into the workflow, then run the
# final fit and evaluation on `data_split` with last_fit().
xgboost_last <- xgboost_workflow %>%
  finalize_workflow(
    parameters = select_best(xgboost_tune, metric = "accuracy")
  ) %>%
  last_fit(split = data_split)
## → A | warning: ! The following columns have zero variance so scaling cannot be used:
## village_town_or_city_new, governance_new, size_new, size_provenance_new,
## primary_provenance_of_data_new, group_subgroup_new, and year_opened_rev_new.
## ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
## before normalizing.
##
There were issues with some computations A: x1
There were issues with some computations A: x1
# Performance metrics computed by last_fit() for the finalized workflow.
xgboost_last %>%
  collect_metrics()
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.837 Preprocessor1_Model1
## 2 roc_auc binary 0.912 Preprocessor1_Model1
## 3 brier_class binary 0.118 Preprocessor1_Model1
# Confusion matrix of the final fit: true `accreditation` versus the predicted
# class, drawn with autoplot().
xgboost_last %>%
  collect_predictions() %>%
  yardstick::conf_mat(truth = accreditation, estimate = .pred_class) %>%
  autoplot()
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
# Variable-importance plot built from the engine-level fit extracted from the
# final workflow.
vip(
  workflows::extract_fit_engine(xgboost_last)
)
The previous model (in Apply 7) had accuracy: 0.815 and AUC: 0.890. After cleaning more of the data, the model had accuracy: 0.812 and AUC: 0.896.
Feature transformation: normalized numeric data. It resulted in accuracy: 0.820 and AUC: 0.896, but adding step_normalize produced warnings.
Feature transformation: Yeo-Johnson transformation. No improvement.
Feature selection: PCA. No improvement in accuracy at multiple thresholds.
Algorithm tuning: added grid regular, mtry, and learn_rate. It resulted in accuracy: 0.820 and AUC: 0.912. There was no improvement with grid regular, but there was an improvement with more hyperparameters. Before adding min_n the AUC was 0.912, but after adding it the AUC dropped to 0.896. Even though the accuracy is higher after adding min_n, I believe AUC is more valuable than accuracy, so having a higher AUC is better.