Import Data

museums <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
museums %>% skimr::skim()
Data summary
Name Piped data
Number of rows 4191
Number of columns 35
_______________________
Column type frequency:
character 24
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
museum_id 0 1.00 8 15 0 4191 0
Name_of_museum 0 1.00 3 76 0 4190 0
Address_line_1 441 0.89 3 61 0 3212 0
Address_line_2 2816 0.33 3 39 0 1167 0
Village,_Town_or_City 4 1.00 3 24 0 1696 0
Postcode 0 1.00 6 9 0 3918 0
Admin_area 0 1.00 12 137 0 393 0
Accreditation 0 1.00 10 12 0 2 0
Governance 0 1.00 7 41 0 13 0
Size 0 1.00 4 7 0 5 0
Size_provenance 179 0.96 2 29 0 16 0
Subject_Matter 0 1.00 5 45 0 114 0
Year_opened 0 1.00 9 9 0 351 0
Year_closed 0 1.00 9 9 0 170 0
DOMUS_Subject_Matter 2788 0.33 5 27 0 21 0
Primary_provenance_of_data 0 1.00 3 8 0 18 0
Identifier_used_in_primary_data_source 2056 0.51 2 8 0 2134 0
Area_Geodemographic_group 49 0.99 11 40 0 17 0
Area_Geodemographic_group_code 49 0.99 3 3 0 16 0
Area_Geodemographic_subgroup 49 0.99 12 39 0 25 0
Area_Geodemographic_subgroup_code 49 0.99 4 4 0 24 0
Area_Geodemographic_supergroup 49 0.99 16 39 0 8 0
Area_Geodemographic_supergroup_code 49 0.99 2 2 0 8 0
Notes 2980 0.29 12 751 0 956 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Latitude 0 1.00 52.93 2.09 49.18 51.48 52.47 53.96 100.00 ▇▁▁▁▁
Longitude 0 1.00 -1.96 1.84 -8.09 -3.10 -1.87 -0.48 1.76 ▁▂▇▇▅
DOMUS_identifier 2347 0.44 1303.45 1597.19 1.00 486.50 991.50 1470.25 7746.00 ▇▂▁▁▁
Area_Deprivation_index 49 0.99 5.44 2.48 1.00 4.00 5.00 7.00 10.00 ▃▆▇▆▃
Area_Deprivation_index_crime 49 0.99 5.43 3.07 1.00 3.00 6.00 8.00 10.00 ▇▆▅▇▇
Area_Deprivation_index_education 49 0.99 6.04 2.61 1.00 4.00 6.00 8.00 10.00 ▃▅▇▇▆
Area_Deprivation_index_employment 49 0.99 6.08 2.76 1.00 4.00 6.00 8.00 10.00 ▅▆▇▇▇
Area_Deprivation_index_health 49 0.99 6.02 2.82 1.00 4.00 6.00 8.00 10.00 ▅▆▆▇▇
Area_Deprivation_index_housing 49 0.99 3.97 2.75 1.00 1.00 3.00 6.00 10.00 ▇▅▃▂▂
Area_Deprivation_index_income 49 0.99 5.99 2.62 1.00 4.00 6.00 8.00 10.00 ▃▆▇▇▆
Area_Deprivation_index_services 49 0.99 4.78 3.01 1.00 2.00 4.00 7.00 10.00 ▇▅▅▅▅

Notes about the dataset

Clean Data

library(janitor)
## Warning: package 'janitor' was built under R version 4.4.2
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
data_clean <- museums %>%
    
    # Village,_Town_or_City (, can't be used for variable name): clean variable names
    janitor::clean_names() %>%
    
    # supergroup vs groups vs subgroups: run the model with either groups or subgroups and choose one based on the model performance
    select(-area_geodemographic_group, -area_geodemographic_subgroup) %>%
    
    # Variables with too many missing values: Remove
    select(-notes, -address_line_2, -domus_identifier, -domus_subject_matter, -identifier_used_in_primary_data_source) %>%
    
    # Redundant Variables (e.g., Area_Geodemographic_group & Area_Geodemographic_group_code):** Keep only one from each pair, as they contain the same information.
    select(-ends_with("_code")) %>%
    
    # Location Variables (Address_line_1 through Admin_area): Select only one
    select(-address_line_1, -village_town_or_city, -postcode, -latitude, -longitude) %>%
    
    # Extract provinces from admin_area
    mutate(admin_area = admin_area %>% str_remove("\\/")) %>% 
    mutate(admin_area = admin_area %>% str_remove("\\/.*")) %>%
    
    # Year_opened, Year_closed: Drop Year_closed and consider adding later
    select(-year_closed) %>%
    mutate(year_opened = year_opened %>% str_remove("\\:.*") %>% as.numeric()) %>%
    
    # inconsistent white spaces between words (e.g., area_geodemographic_group has "Ethnically Diverse Metropolitan Living" and "Ethnically Diverse Metropolitan  Living"): Revise the character variables so that they have one white space between words
    mutate(across(where(is.character), str_squish)) %>%
    
    # Character variables: Convert them to factor
    mutate(across(where(is.character), factor)) %>%
    
    # id variables (museum_id and name_of_museum): Keep museum_id
    select(-name_of_museum) %>%
    
    # Remove all rows with missing values
    na.omit()

Explore Data

# Check whether the target variable is unbalanced
data_clean %>% count(accreditation)
## # A tibble: 2 × 2
##   accreditation     n
##   <fct>         <int>
## 1 Accredited     1703
## 2 Unaccredited   2267
# correlation funnel analysis

# Step 1: Prepare Data
data_binarized_tbl <- data_clean %>%
    
    # Throw in all predictors except for id and strings
    select(-museum_id) %>%
    binarize()

data_binarized_tbl %>% glimpse()
## Rows: 3,970
## Columns: 109
## $ admin_area__England                                                     <dbl> …
## $ admin_area__Northern_Ireland                                            <dbl> …
## $ admin_area__Scotland                                                    <dbl> …
## $ admin_area__Wales                                                       <dbl> …
## $ `admin_area__-OTHER`                                                    <dbl> …
## $ accreditation__Accredited                                               <dbl> …
## $ accreditation__Unaccredited                                             <dbl> …
## $ `governance__Government-Local_Authority`                                <dbl> …
## $ `governance__Government-National`                                       <dbl> …
## $ `governance__Independent-English_Heritage`                              <dbl> …
## $ `governance__Independent-National_Trust`                                <dbl> …
## $ `governance__Independent-Not_for_profit`                                <dbl> …
## $ `governance__Independent-Private`                                       <dbl> …
## $ `governance__Independent-Unknown`                                       <dbl> …
## $ governance__University                                                  <dbl> …
## $ governance__Unknown                                                     <dbl> …
## $ `governance__-OTHER`                                                    <dbl> …
## $ size__large                                                             <dbl> …
## $ size__medium                                                            <dbl> …
## $ size__small                                                             <dbl> …
## $ size__unknown                                                           <dbl> …
## $ `size__-OTHER`                                                          <dbl> …
## $ size_provenance__ace_size_designation                                   <dbl> …
## $ size_provenance__aim_size_designation                                   <dbl> …
## $ size_provenance__domus                                                  <dbl> …
## $ `size_provenance__ma(fam)`                                              <dbl> …
## $ size_provenance__mm_manual_estimate_2018                                <dbl> …
## $ size_provenance__mm_prediction_random_forest                            <dbl> …
## $ size_provenance__scottish_national_audit                                <dbl> …
## $ size_provenance__unknown                                                <dbl> …
## $ size_provenance__visitbritain                                           <dbl> …
## $ `size_provenance__-OTHER`                                               <dbl> …
## $ `subject_matter__Archaeology-Roman`                                     <dbl> …
## $ `subject_matter__Arts-Fine_and_decorative_arts`                         <dbl> …
## $ `subject_matter__Buildings-Houses-Large_houses`                         <dbl> …
## $ `subject_matter__Buildings-Houses-Medium_houses`                        <dbl> …
## $ `subject_matter__Industry_and_manufacture-Mining_and_quarrying`         <dbl> …
## $ `subject_matter__Leisure_and_sport-Toys_and_models`                     <dbl> …
## $ subject_matter__Local_Histories                                         <dbl> …
## $ `subject_matter__Mixed-Encyclopaedic`                                   <dbl> …
## $ `subject_matter__Mixed-Other`                                           <dbl> …
## $ subject_matter__Other                                                   <dbl> …
## $ `subject_matter__Personality-Literary`                                  <dbl> …
## $ `subject_matter__Rural_Industry-Farming`                                <dbl> …
## $ `subject_matter__Sea_and_seafaring-Boats_and_ships`                     <dbl> …
## $ `subject_matter__Sea_and_seafaring-Mixed`                               <dbl> …
## $ `subject_matter__Transport-Cars_and_motorbikes`                         <dbl> …
## $ `subject_matter__Transport-Trains_and_railways`                         <dbl> …
## $ `subject_matter__War_and_conflict-Airforce`                             <dbl> …
## $ `subject_matter__War_and_conflict-Castles_and_forts`                    <dbl> …
## $ `subject_matter__War_and_conflict-Military`                             <dbl> …
## $ `subject_matter__War_and_conflict-Regiment`                             <dbl> …
## $ `subject_matter__-OTHER`                                                <dbl> …
## $ `year_opened__-Inf_1957`                                                <dbl> …
## $ year_opened__1957_1977                                                  <dbl> …
## $ year_opened__1977_1992                                                  <dbl> …
## $ year_opened__1992_Inf                                                   <dbl> …
## $ primary_provenance_of_data__ace                                         <dbl> …
## $ primary_provenance_of_data__aim                                         <dbl> …
## $ primary_provenance_of_data__aim82M                                      <dbl> …
## $ primary_provenance_of_data__aim82NM                                     <dbl> …
## $ primary_provenance_of_data__domus                                       <dbl> …
## $ primary_provenance_of_data__fcm                                         <dbl> …
## $ primary_provenance_of_data__hha                                         <dbl> …
## $ primary_provenance_of_data__mald                                        <dbl> …
## $ primary_provenance_of_data__mgs                                         <dbl> …
## $ primary_provenance_of_data__misc                                        <dbl> …
## $ primary_provenance_of_data__musassoc                                    <dbl> …
## $ primary_provenance_of_data__wiki                                        <dbl> …
## $ `primary_provenance_of_data__-OTHER`                                    <dbl> …
## $ `area_deprivation_index__-Inf_4`                                        <dbl> …
## $ area_deprivation_index__4_5                                             <dbl> …
## $ area_deprivation_index__5_7                                             <dbl> …
## $ area_deprivation_index__7_Inf                                           <dbl> …
## $ `area_deprivation_index_crime__-Inf_3`                                  <dbl> …
## $ area_deprivation_index_crime__3_6                                       <dbl> …
## $ area_deprivation_index_crime__6_8                                       <dbl> …
## $ area_deprivation_index_crime__8_Inf                                     <dbl> …
## $ `area_deprivation_index_education__-Inf_4`                              <dbl> …
## $ area_deprivation_index_education__4_6                                   <dbl> …
## $ area_deprivation_index_education__6_8                                   <dbl> …
## $ area_deprivation_index_education__8_Inf                                 <dbl> …
## $ `area_deprivation_index_employment__-Inf_4`                             <dbl> …
## $ area_deprivation_index_employment__4_6                                  <dbl> …
## $ area_deprivation_index_employment__6_8                                  <dbl> …
## $ area_deprivation_index_employment__8_Inf                                <dbl> …
## $ `area_deprivation_index_health__-Inf_4`                                 <dbl> …
## $ area_deprivation_index_health__4_6                                      <dbl> …
## $ area_deprivation_index_health__6_8                                      <dbl> …
## $ area_deprivation_index_health__8_Inf                                    <dbl> …
## $ `area_deprivation_index_housing__-Inf_3`                                <dbl> …
## $ area_deprivation_index_housing__3_6                                     <dbl> …
## $ area_deprivation_index_housing__6_Inf                                   <dbl> …
## $ `area_deprivation_index_income__-Inf_4`                                 <dbl> …
## $ area_deprivation_index_income__4_6                                      <dbl> …
## $ area_deprivation_index_income__6_8                                      <dbl> …
## $ area_deprivation_index_income__8_Inf                                    <dbl> …
## $ `area_deprivation_index_services__-Inf_2`                               <dbl> …
## $ area_deprivation_index_services__2_4                                    <dbl> …
## $ area_deprivation_index_services__4_8                                    <dbl> …
## $ area_deprivation_index_services__8_Inf                                  <dbl> …
## $ area_geodemographic_supergroup__Affluent_England                        <dbl> …
## $ area_geodemographic_supergroup__Business_Education_and_Heritage_Centres <dbl> …
## $ area_geodemographic_supergroup__Countryside_Living                      <dbl> …
## $ area_geodemographic_supergroup__Ethnically_Diverse_Metropolitan_Living  <dbl> …
## $ area_geodemographic_supergroup__London_Cosmopolitan                     <dbl> …
## $ area_geodemographic_supergroup__Services_and_Industrial_Legacy          <dbl> …
## $ area_geodemographic_supergroup__Town_and_Country_Living                 <dbl> …
## $ area_geodemographic_supergroup__Urban_Settlements                       <dbl> …
# Step 2: Correlate to the Target
data_corr_tbl <- data_binarized_tbl %>%
    correlate(accreditation__Accredited)

data_corr_tbl
## # A tibble: 109 × 3
##    feature                    bin                         correlation
##    <fct>                      <chr>                             <dbl>
##  1 accreditation              Accredited                        1    
##  2 accreditation              Unaccredited                     -1    
##  3 primary_provenance_of_data domus                             0.571
##  4 size_provenance            mm_prediction_random_forest      -0.516
##  5 size_provenance            domus                             0.458
##  6 governance                 Independent-Private              -0.387
##  7 size                       small                            -0.377
##  8 primary_provenance_of_data wiki                             -0.295
##  9 primary_provenance_of_data ace                               0.261
## 10 size                       large                             0.256
## # ℹ 99 more rows
# Step 3: Plot
data_corr_tbl %>% 
    plot_correlation_funnel()
## Warning: ggrepel: 56 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps