museums <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
museums %>% skimr::skim()
Name | Piped data |
Number of rows | 4191 |
Number of columns | 35 |
_______________________ | |
Column type frequency: | |
character | 24 |
numeric | 11 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
museum_id | 0 | 1.00 | 8 | 15 | 0 | 4191 | 0 |
Name_of_museum | 0 | 1.00 | 3 | 76 | 0 | 4190 | 0 |
Address_line_1 | 441 | 0.89 | 3 | 61 | 0 | 3212 | 0 |
Address_line_2 | 2816 | 0.33 | 3 | 39 | 0 | 1167 | 0 |
Village,_Town_or_City | 4 | 1.00 | 3 | 24 | 0 | 1696 | 0 |
Postcode | 0 | 1.00 | 6 | 9 | 0 | 3918 | 0 |
Admin_area | 0 | 1.00 | 12 | 137 | 0 | 393 | 0 |
Accreditation | 0 | 1.00 | 10 | 12 | 0 | 2 | 0 |
Governance | 0 | 1.00 | 7 | 41 | 0 | 13 | 0 |
Size | 0 | 1.00 | 4 | 7 | 0 | 5 | 0 |
Size_provenance | 179 | 0.96 | 2 | 29 | 0 | 16 | 0 |
Subject_Matter | 0 | 1.00 | 5 | 45 | 0 | 114 | 0 |
Year_opened | 0 | 1.00 | 9 | 9 | 0 | 351 | 0 |
Year_closed | 0 | 1.00 | 9 | 9 | 0 | 170 | 0 |
DOMUS_Subject_Matter | 2788 | 0.33 | 5 | 27 | 0 | 21 | 0 |
Primary_provenance_of_data | 0 | 1.00 | 3 | 8 | 0 | 18 | 0 |
Identifier_used_in_primary_data_source | 2056 | 0.51 | 2 | 8 | 0 | 2134 | 0 |
Area_Geodemographic_group | 49 | 0.99 | 11 | 40 | 0 | 17 | 0 |
Area_Geodemographic_group_code | 49 | 0.99 | 3 | 3 | 0 | 16 | 0 |
Area_Geodemographic_subgroup | 49 | 0.99 | 12 | 39 | 0 | 25 | 0 |
Area_Geodemographic_subgroup_code | 49 | 0.99 | 4 | 4 | 0 | 24 | 0 |
Area_Geodemographic_supergroup | 49 | 0.99 | 16 | 39 | 0 | 8 | 0 |
Area_Geodemographic_supergroup_code | 49 | 0.99 | 2 | 2 | 0 | 8 | 0 |
Notes | 2980 | 0.29 | 12 | 751 | 0 | 956 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Latitude | 0 | 1.00 | 52.93 | 2.09 | 49.18 | 51.48 | 52.47 | 53.96 | 100.00 | ▇▁▁▁▁ |
Longitude | 0 | 1.00 | -1.96 | 1.84 | -8.09 | -3.10 | -1.87 | -0.48 | 1.76 | ▁▂▇▇▅ |
DOMUS_identifier | 2347 | 0.44 | 1303.45 | 1597.19 | 1.00 | 486.50 | 991.50 | 1470.25 | 7746.00 | ▇▂▁▁▁ |
Area_Deprivation_index | 49 | 0.99 | 5.44 | 2.48 | 1.00 | 4.00 | 5.00 | 7.00 | 10.00 | ▃▆▇▆▃ |
Area_Deprivation_index_crime | 49 | 0.99 | 5.43 | 3.07 | 1.00 | 3.00 | 6.00 | 8.00 | 10.00 | ▇▆▅▇▇ |
Area_Deprivation_index_education | 49 | 0.99 | 6.04 | 2.61 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▃▅▇▇▆ |
Area_Deprivation_index_employment | 49 | 0.99 | 6.08 | 2.76 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▅▆▇▇▇ |
Area_Deprivation_index_health | 49 | 0.99 | 6.02 | 2.82 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▅▆▆▇▇ |
Area_Deprivation_index_housing | 49 | 0.99 | 3.97 | 2.75 | 1.00 | 1.00 | 3.00 | 6.00 | 10.00 | ▇▅▃▂▂ |
Area_Deprivation_index_income | 49 | 0.99 | 5.99 | 2.62 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▃▆▇▇▆ |
Area_Deprivation_index_services | 49 | 0.99 | 4.78 | 3.01 | 1.00 | 2.00 | 4.00 | 7.00 | 10.00 | ▇▅▅▅▅ |
Notes about the dataset
**Village,_Town_or_City** (, can’t be used for variable name): clean variable names using janitor::clean_names()
Supergroup vs Group vs subgroup (e.g., Geodemographic_group and Geodemographic_subgroup): Keep only one, as they contain overlapping information. Retain the group variable if the subgroup does not provide additional valuable insights, and vice versa.
Variables with too many missing values: Remove the following:
notes
Address_line_2
DOMUS_identifier
DOMUS_subject_matter
Identifier_used_in_primary_data_source
Redundant Variables (e.g., Area_Geodemographic_group & Area_Geodemographic_group_code): Keep only one from each pair, as they contain the same information.
Location Variables (Address_line_1 through Admin_area): Select only one or two essential location variables, as these are redundant.
Year_opened, Year_closed: These variables contain two years separated by a colon, which requires clarification. Additionally, the value “9999” in Year_closed likely indicates “not closed.” Decide whether to keep these variables. If you keep them, consider data cleaning. For instance, one approach would be to retain only the first year from Year_opened.
Inconsistent white spaces between words (e.g., area_geodemographic_group has “Ethnically Diverse Metropolitan Living” and “Ethnically Diverse Metropolitan Living”): Revise the character variables so that they have one white space between words, using stringr::str_squish()
Character variables (These are not strings, like notes): Convert them to factors.
ID variables (museum_id and name_of_museum): Keep museum_id as name_of_museum has duplicates
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.2
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
data_clean <- museums %>%
# Village,_Town_or_City (, can't be used for variable name): clean variable names
janitor::clean_names() %>%
# supergroup vs groups vs subgroups: run the model with either groups or subgroups and choose one based on the model performance
select(-area_geodemographic_group, -area_geodemographic_subgroup) %>%
# Variables with too many missing values: Remove
select(-notes, -address_line_2, -domus_identifier, -domus_subject_matter, -identifier_used_in_primary_data_source) %>%
# Redundant Variables (e.g., Area_Geodemographic_group & Area_Geodemographic_group_code):** Keep only one from each pair, as they contain the same information.
select(-ends_with("_code")) %>%
# Location Variables (Address_line_1 through Admin_area): Select only one
select(-address_line_1, -village_town_or_city, -postcode, -latitude, -longitude) %>%
# Extract provinces from admin_area
mutate(admin_area = admin_area %>% str_remove("\\/")) %>%
mutate(admin_area = admin_area %>% str_remove("\\/.*")) %>%
# Year_opened, Year_closed: Drop Year_closed and consider adding later
select(-year_closed) %>%
mutate(year_opened = year_opened %>% str_remove("\\:.*") %>% as.numeric()) %>%
# inconsistent white spaces between words (e.g., area_geodemographic_group has "Ethnically Diverse Metropolitan Living" and "Ethnically Diverse Metropolitan Living"): Revise the character variables so that they have one white space between words
mutate(across(where(is.character), str_squish)) %>%
# Character variables: Convert them to factor
mutate(across(where(is.character), factor)) %>%
# id variables (museum_id and name_of_museum): Keep museum_id
select(-name_of_museum) %>%
# Remove all rows with missing values
na.omit()
# Check whether the target variable is unbalanced
data_clean %>% count(accreditation)
## # A tibble: 2 × 2
## accreditation n
## <fct> <int>
## 1 Accredited 1703
## 2 Unaccredited 2267
# correlation funnel analysis
# Step 1: Prepare Data
data_binarized_tbl <- data_clean %>%
# Throw in all predictors except for id and strings
select(-museum_id) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 3,970
## Columns: 109
## $ admin_area__England <dbl> …
## $ admin_area__Northern_Ireland <dbl> …
## $ admin_area__Scotland <dbl> …
## $ admin_area__Wales <dbl> …
## $ `admin_area__-OTHER` <dbl> …
## $ accreditation__Accredited <dbl> …
## $ accreditation__Unaccredited <dbl> …
## $ `governance__Government-Local_Authority` <dbl> …
## $ `governance__Government-National` <dbl> …
## $ `governance__Independent-English_Heritage` <dbl> …
## $ `governance__Independent-National_Trust` <dbl> …
## $ `governance__Independent-Not_for_profit` <dbl> …
## $ `governance__Independent-Private` <dbl> …
## $ `governance__Independent-Unknown` <dbl> …
## $ governance__University <dbl> …
## $ governance__Unknown <dbl> …
## $ `governance__-OTHER` <dbl> …
## $ size__large <dbl> …
## $ size__medium <dbl> …
## $ size__small <dbl> …
## $ size__unknown <dbl> …
## $ `size__-OTHER` <dbl> …
## $ size_provenance__ace_size_designation <dbl> …
## $ size_provenance__aim_size_designation <dbl> …
## $ size_provenance__domus <dbl> …
## $ `size_provenance__ma(fam)` <dbl> …
## $ size_provenance__mm_manual_estimate_2018 <dbl> …
## $ size_provenance__mm_prediction_random_forest <dbl> …
## $ size_provenance__scottish_national_audit <dbl> …
## $ size_provenance__unknown <dbl> …
## $ size_provenance__visitbritain <dbl> …
## $ `size_provenance__-OTHER` <dbl> …
## $ `subject_matter__Archaeology-Roman` <dbl> …
## $ `subject_matter__Arts-Fine_and_decorative_arts` <dbl> …
## $ `subject_matter__Buildings-Houses-Large_houses` <dbl> …
## $ `subject_matter__Buildings-Houses-Medium_houses` <dbl> …
## $ `subject_matter__Industry_and_manufacture-Mining_and_quarrying` <dbl> …
## $ `subject_matter__Leisure_and_sport-Toys_and_models` <dbl> …
## $ subject_matter__Local_Histories <dbl> …
## $ `subject_matter__Mixed-Encyclopaedic` <dbl> …
## $ `subject_matter__Mixed-Other` <dbl> …
## $ subject_matter__Other <dbl> …
## $ `subject_matter__Personality-Literary` <dbl> …
## $ `subject_matter__Rural_Industry-Farming` <dbl> …
## $ `subject_matter__Sea_and_seafaring-Boats_and_ships` <dbl> …
## $ `subject_matter__Sea_and_seafaring-Mixed` <dbl> …
## $ `subject_matter__Transport-Cars_and_motorbikes` <dbl> …
## $ `subject_matter__Transport-Trains_and_railways` <dbl> …
## $ `subject_matter__War_and_conflict-Airforce` <dbl> …
## $ `subject_matter__War_and_conflict-Castles_and_forts` <dbl> …
## $ `subject_matter__War_and_conflict-Military` <dbl> …
## $ `subject_matter__War_and_conflict-Regiment` <dbl> …
## $ `subject_matter__-OTHER` <dbl> …
## $ `year_opened__-Inf_1957` <dbl> …
## $ year_opened__1957_1977 <dbl> …
## $ year_opened__1977_1992 <dbl> …
## $ year_opened__1992_Inf <dbl> …
## $ primary_provenance_of_data__ace <dbl> …
## $ primary_provenance_of_data__aim <dbl> …
## $ primary_provenance_of_data__aim82M <dbl> …
## $ primary_provenance_of_data__aim82NM <dbl> …
## $ primary_provenance_of_data__domus <dbl> …
## $ primary_provenance_of_data__fcm <dbl> …
## $ primary_provenance_of_data__hha <dbl> …
## $ primary_provenance_of_data__mald <dbl> …
## $ primary_provenance_of_data__mgs <dbl> …
## $ primary_provenance_of_data__misc <dbl> …
## $ primary_provenance_of_data__musassoc <dbl> …
## $ primary_provenance_of_data__wiki <dbl> …
## $ `primary_provenance_of_data__-OTHER` <dbl> …
## $ `area_deprivation_index__-Inf_4` <dbl> …
## $ area_deprivation_index__4_5 <dbl> …
## $ area_deprivation_index__5_7 <dbl> …
## $ area_deprivation_index__7_Inf <dbl> …
## $ `area_deprivation_index_crime__-Inf_3` <dbl> …
## $ area_deprivation_index_crime__3_6 <dbl> …
## $ area_deprivation_index_crime__6_8 <dbl> …
## $ area_deprivation_index_crime__8_Inf <dbl> …
## $ `area_deprivation_index_education__-Inf_4` <dbl> …
## $ area_deprivation_index_education__4_6 <dbl> …
## $ area_deprivation_index_education__6_8 <dbl> …
## $ area_deprivation_index_education__8_Inf <dbl> …
## $ `area_deprivation_index_employment__-Inf_4` <dbl> …
## $ area_deprivation_index_employment__4_6 <dbl> …
## $ area_deprivation_index_employment__6_8 <dbl> …
## $ area_deprivation_index_employment__8_Inf <dbl> …
## $ `area_deprivation_index_health__-Inf_4` <dbl> …
## $ area_deprivation_index_health__4_6 <dbl> …
## $ area_deprivation_index_health__6_8 <dbl> …
## $ area_deprivation_index_health__8_Inf <dbl> …
## $ `area_deprivation_index_housing__-Inf_3` <dbl> …
## $ area_deprivation_index_housing__3_6 <dbl> …
## $ area_deprivation_index_housing__6_Inf <dbl> …
## $ `area_deprivation_index_income__-Inf_4` <dbl> …
## $ area_deprivation_index_income__4_6 <dbl> …
## $ area_deprivation_index_income__6_8 <dbl> …
## $ area_deprivation_index_income__8_Inf <dbl> …
## $ `area_deprivation_index_services__-Inf_2` <dbl> …
## $ area_deprivation_index_services__2_4 <dbl> …
## $ area_deprivation_index_services__4_8 <dbl> …
## $ area_deprivation_index_services__8_Inf <dbl> …
## $ area_geodemographic_supergroup__Affluent_England <dbl> …
## $ area_geodemographic_supergroup__Business_Education_and_Heritage_Centres <dbl> …
## $ area_geodemographic_supergroup__Countryside_Living <dbl> …
## $ area_geodemographic_supergroup__Ethnically_Diverse_Metropolitan_Living <dbl> …
## $ area_geodemographic_supergroup__London_Cosmopolitan <dbl> …
## $ area_geodemographic_supergroup__Services_and_Industrial_Legacy <dbl> …
## $ area_geodemographic_supergroup__Town_and_Country_Living <dbl> …
## $ area_geodemographic_supergroup__Urban_Settlements <dbl> …
# Step 2: Correlate to the Target
data_corr_tbl <- data_binarized_tbl %>%
correlate(accreditation__Accredited)
data_corr_tbl
## # A tibble: 109 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 accreditation Accredited 1
## 2 accreditation Unaccredited -1
## 3 primary_provenance_of_data domus 0.571
## 4 size_provenance mm_prediction_random_forest -0.516
## 5 size_provenance domus 0.458
## 6 governance Independent-Private -0.387
## 7 size small -0.377
## 8 primary_provenance_of_data wiki -0.295
## 9 primary_provenance_of_data ace 0.261
## 10 size large 0.256
## # ℹ 99 more rows
# Step 3: Plot
data_corr_tbl %>%
plot_correlation_funnel()
## Warning: ggrepel: 56 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps