Goal is to predict attrition, employees who are likely to leave the company.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ correlationfunnel Tip #1 ════════════════════════════════════════════════════
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)
library(dplyr)
data <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(data)
Name | data |
Number of rows | 4191 |
Number of columns | 35 |
_______________________ | |
Column type frequency: | |
character | 24 |
numeric | 11 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
museum_id | 0 | 1.00 | 8 | 15 | 0 | 4191 | 0 |
Name_of_museum | 0 | 1.00 | 3 | 76 | 0 | 4190 | 0 |
Address_line_1 | 441 | 0.89 | 3 | 61 | 0 | 3212 | 0 |
Address_line_2 | 2816 | 0.33 | 3 | 39 | 0 | 1167 | 0 |
Village,_Town_or_City | 4 | 1.00 | 3 | 24 | 0 | 1696 | 0 |
Postcode | 0 | 1.00 | 6 | 9 | 0 | 3918 | 0 |
Admin_area | 0 | 1.00 | 12 | 137 | 0 | 393 | 0 |
Accreditation | 0 | 1.00 | 10 | 12 | 0 | 2 | 0 |
Governance | 0 | 1.00 | 7 | 41 | 0 | 13 | 0 |
Size | 0 | 1.00 | 4 | 7 | 0 | 5 | 0 |
Size_provenance | 179 | 0.96 | 2 | 29 | 0 | 16 | 0 |
Subject_Matter | 0 | 1.00 | 5 | 45 | 0 | 114 | 0 |
Year_opened | 0 | 1.00 | 9 | 9 | 0 | 351 | 0 |
Year_closed | 0 | 1.00 | 9 | 9 | 0 | 170 | 0 |
DOMUS_Subject_Matter | 2788 | 0.33 | 5 | 27 | 0 | 21 | 0 |
Primary_provenance_of_data | 0 | 1.00 | 3 | 8 | 0 | 18 | 0 |
Identifier_used_in_primary_data_source | 2056 | 0.51 | 2 | 8 | 0 | 2134 | 0 |
Area_Geodemographic_group | 49 | 0.99 | 11 | 40 | 0 | 17 | 0 |
Area_Geodemographic_group_code | 49 | 0.99 | 3 | 3 | 0 | 16 | 0 |
Area_Geodemographic_subgroup | 49 | 0.99 | 12 | 39 | 0 | 25 | 0 |
Area_Geodemographic_subgroup_code | 49 | 0.99 | 4 | 4 | 0 | 24 | 0 |
Area_Geodemographic_supergroup | 49 | 0.99 | 16 | 39 | 0 | 8 | 0 |
Area_Geodemographic_supergroup_code | 49 | 0.99 | 2 | 2 | 0 | 8 | 0 |
Notes | 2980 | 0.29 | 12 | 751 | 0 | 956 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Latitude | 0 | 1.00 | 52.93 | 2.09 | 49.18 | 51.48 | 52.47 | 53.96 | 100.00 | ▇▁▁▁▁ |
Longitude | 0 | 1.00 | -1.96 | 1.84 | -8.09 | -3.10 | -1.87 | -0.48 | 1.76 | ▁▂▇▇▅ |
DOMUS_identifier | 2347 | 0.44 | 1303.45 | 1597.19 | 1.00 | 486.50 | 991.50 | 1470.25 | 7746.00 | ▇▂▁▁▁ |
Area_Deprivation_index | 49 | 0.99 | 5.44 | 2.48 | 1.00 | 4.00 | 5.00 | 7.00 | 10.00 | ▃▆▇▆▃ |
Area_Deprivation_index_crime | 49 | 0.99 | 5.43 | 3.07 | 1.00 | 3.00 | 6.00 | 8.00 | 10.00 | ▇▆▅▇▇ |
Area_Deprivation_index_education | 49 | 0.99 | 6.04 | 2.61 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▃▅▇▇▆ |
Area_Deprivation_index_employment | 49 | 0.99 | 6.08 | 2.76 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▅▆▇▇▇ |
Area_Deprivation_index_health | 49 | 0.99 | 6.02 | 2.82 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▅▆▆▇▇ |
Area_Deprivation_index_housing | 49 | 0.99 | 3.97 | 2.75 | 1.00 | 1.00 | 3.00 | 6.00 | 10.00 | ▇▅▃▂▂ |
Area_Deprivation_index_income | 49 | 0.99 | 5.99 | 2.62 | 1.00 | 4.00 | 6.00 | 8.00 | 10.00 | ▃▆▇▇▆ |
Area_Deprivation_index_services | 49 | 0.99 | 4.78 | 3.01 | 1.00 | 2.00 | 4.00 | 7.00 | 10.00 | ▇▅▅▅▅ |
Issues with data * Missing values * Factors or numeric variables * Governance, Size, Subject_Matter, Area_Geodemographic_group, Area_Geodemographic_group_code, Area_Deprivation_index, Area_Deprivation_index_crime, Area_Deprivation_index_education, Area_Deprivation_index_employment, Area_Deprivation_index_health, Area_Deprivation_index_housing, Area_Deprivation_index_income, Area_Deprivation_index_services * Zero variance variables * Character variables: Convert them to numbers in recipe step * Unbalanced target variable: Accreditation * ID variable: museum_id
library(stringr)
data_clean <- data %>%
# Remove unnecessary columns
select(-c(Size_provenance, DOMUS_Subject_Matter, Year_closed, Primary_provenance_of_data,
Identifier_used_in_primary_data_source, Area_Geodemographic_subgroup,
Area_Geodemographic_subgroup_code, Area_Geodemographic_supergroup,
Area_Geodemographic_supergroup_code, Notes, Latitude, Longitude, DOMUS_identifier,
Address_line_2, Postcode, Admin_area, Area_Geodemographic_group, Name_of_museum)) %>%
# Remove rows with missing values
na.omit() %>%
# Change column name and adjust Years opened
rename("Village_Town_City" = "Village,_Town_or_City") %>%
mutate(Year_opened = str_sub(Year_opened, 1, 4)) %>%
# Convert selected columns to factors
mutate(across(c(Accreditation, Governance, Size, Subject_Matter,
Area_Geodemographic_group_code, Area_Deprivation_index,
Area_Deprivation_index_crime, Area_Deprivation_index_education,
Area_Deprivation_index_employment, Area_Deprivation_index_health,
Area_Deprivation_index_housing, Area_Deprivation_index_income,
Area_Deprivation_index_services), as.factor)) %>%
# Recode Accreditation
mutate(Accreditation = if_else(Accreditation == "Accredited", "Yes", "No"))
# Preview the cleaned data
glimpse(data_clean)
## Rows: 3,708
## Columns: 17
## $ museum_id <chr> "mm.New.1", "mm.domus.WM019", "mm.ai…
## $ Address_line_1 <chr> "1 Olympic Way", "Warwick College of…
## $ Village_Town_City <chr> "Belfast", "Moreton Morrell", "Chelt…
## $ Accreditation <chr> "No", "No", "Yes", "No", "No", "Yes"…
## $ Governance <fct> Independent-Not_for_profit, Governme…
## $ Size <fct> large, medium, medium, small, small,…
## $ Subject_Matter <fct> Sea_and_seafaring-Boats_and_ships, R…
## $ Year_opened <chr> "2012", "1984", "2013", "1996", "198…
## $ Area_Deprivation_index <fct> 2, 8, 8, 2, 6, 6, 5, 6, 3, 7, 5, 8, …
## $ Area_Deprivation_index_crime <fct> 3, 9, 10, 1, 10, 3, 1, 10, 1, 7, 10,…
## $ Area_Deprivation_index_education <fct> 1, 8, 7, 6, 8, 7, 7, 6, 4, 7, 6, 6, …
## $ Area_Deprivation_index_employment <fct> 2, 10, 7, 3, 7, 6, 6, 7, 2, 7, 6, 8,…
## $ Area_Deprivation_index_health <fct> 1, 8, 8, 2, 7, 8, 5, 7, 2, 9, 5, 8, …
## $ Area_Deprivation_index_housing <fct> 4, 5, 7, 1, 8, 9, 1, 7, 6, 7, 7, 5, …
## $ Area_Deprivation_index_income <fct> 5, 8, 8, 3, 5, 5, 7, 5, 3, 8, 6, 8, …
## $ Area_Deprivation_index_services <fct> 5, 1, 4, 4, 2, 3, 9, 1, 9, 1, 1, 2, …
## $ Area_Geodemographic_group_code <fct> 2ar, 3ar, 7ar, 5ar, 3cr, 6br, 2ar, 3…
Accredited vs. Unaccredited
data_clean %>% count(Accreditation)
## # A tibble: 2 × 2
## Accreditation n
## <chr> <int>
## 1 No 2040
## 2 Yes 1668
data_clean %>%
ggplot(aes(Accreditation)) +
geom_bar()
Most common Subjects
data_clean %>% count(Subject_Matter) %>% arrange(desc(n))
## # A tibble: 114 × 2
## Subject_Matter n
## <fct> <int>
## 1 Local_Histories 853
## 2 Buildings-Houses-Large_houses 200
## 3 Arts-Fine_and_decorative_arts 186
## 4 War_and_conflict-Regiment 128
## 5 Transport-Trains_and_railways 127
## 6 Mixed-Encyclopaedic 115
## 7 Personality-Literary 80
## 8 Mixed-Other 78
## 9 Other 71
## 10 Buildings-Houses-Medium_houses 67
## # ℹ 104 more rows
data_clean %>%
ggplot(aes(x= Subject_Matter)) +
geom_bar()
Years museums opened
data_clean %>%
ggplot(aes(Year_opened, Accreditation)) +
geom_count()
correlation plot
# Step 1: binarize
data_binarized <- data_clean %>%
select(-museum_id) %>%
binarize()
data_binarized %>% glimpse()
## Rows: 3,708
## Columns: 175
## $ Address_line_1__High_Street <dbl> 0, 0, …
## $ `Address_line_1__-OTHER` <dbl> 1, 1, …
## $ Village_Town_City__Edinburgh <dbl> 0, 0, …
## $ Village_Town_City__London <dbl> 0, 0, …
## $ `Village_Town_City__-OTHER` <dbl> 1, 1, …
## $ Accreditation__No <dbl> 1, 1, …
## $ Accreditation__Yes <dbl> 0, 0, …
## $ `Governance__Government-Local_Authority` <dbl> 0, 1, …
## $ `Governance__Government-National` <dbl> 0, 0, …
## $ `Governance__Independent-English_Heritage` <dbl> 0, 0, …
## $ `Governance__Independent-National_Trust` <dbl> 0, 0, …
## $ `Governance__Independent-Not_for_profit` <dbl> 1, 0, …
## $ `Governance__Independent-Private` <dbl> 0, 0, …
## $ `Governance__Independent-Unknown` <dbl> 0, 0, …
## $ Governance__University <dbl> 0, 0, …
## $ Governance__Unknown <dbl> 0, 0, …
## $ `Governance__-OTHER` <dbl> 0, 0, …
## $ Size__large <dbl> 1, 0, …
## $ Size__medium <dbl> 0, 1, …
## $ Size__small <dbl> 0, 0, …
## $ Size__unknown <dbl> 0, 0, …
## $ `Size__-OTHER` <dbl> 0, 0, …
## $ `Subject_Matter__Archaeology-Roman` <dbl> 0, 0, …
## $ `Subject_Matter__Arts-Fine_and_decorative_arts` <dbl> 0, 0, …
## $ `Subject_Matter__Buildings-Houses-Large_houses` <dbl> 0, 0, …
## $ `Subject_Matter__Buildings-Houses-Medium_houses` <dbl> 0, 0, …
## $ `Subject_Matter__Industry_and_manufacture-Mining_and_quarrying` <dbl> 0, 0, …
## $ `Subject_Matter__Leisure_and_sport-Toys_and_models` <dbl> 0, 0, …
## $ Subject_Matter__Local_Histories <dbl> 0, 0, …
## $ `Subject_Matter__Mixed-Encyclopaedic` <dbl> 0, 0, …
## $ `Subject_Matter__Mixed-Other` <dbl> 0, 0, …
## $ Subject_Matter__Other <dbl> 0, 0, …
## $ `Subject_Matter__Personality-Literary` <dbl> 0, 0, …
## $ `Subject_Matter__Rural_Industry-Farming` <dbl> 0, 1, …
## $ `Subject_Matter__Sea_and_seafaring-Boats_and_ships` <dbl> 1, 0, …
## $ `Subject_Matter__Sea_and_seafaring-Mixed` <dbl> 0, 0, …
## $ `Subject_Matter__Transport-Cars_and_motorbikes` <dbl> 0, 0, …
## $ `Subject_Matter__Transport-Trains_and_railways` <dbl> 0, 0, …
## $ `Subject_Matter__War_and_conflict-Airforce` <dbl> 0, 0, …
## $ `Subject_Matter__War_and_conflict-Castles_and_forts` <dbl> 0, 0, …
## $ `Subject_Matter__War_and_conflict-Military` <dbl> 0, 0, …
## $ `Subject_Matter__War_and_conflict-Regiment` <dbl> 0, 0, …
## $ `Subject_Matter__-OTHER` <dbl> 0, 0, …
## $ Year_opened__1945 <dbl> 0, 0, …
## $ Year_opened__1960 <dbl> 0, 0, …
## $ Year_opened__1972 <dbl> 0, 0, …
## $ Year_opened__1973 <dbl> 0, 0, …
## $ Year_opened__1974 <dbl> 0, 0, …
## $ Year_opened__1975 <dbl> 0, 0, …
## $ Year_opened__1976 <dbl> 0, 0, …
## $ Year_opened__1977 <dbl> 0, 0, …
## $ Year_opened__1978 <dbl> 0, 0, …
## $ Year_opened__1979 <dbl> 0, 0, …
## $ Year_opened__1980 <dbl> 0, 0, …
## $ Year_opened__1981 <dbl> 0, 0, …
## $ Year_opened__1982 <dbl> 0, 0, …
## $ Year_opened__1983 <dbl> 0, 0, …
## $ Year_opened__1984 <dbl> 0, 1, …
## $ Year_opened__1985 <dbl> 0, 0, …
## $ Year_opened__1986 <dbl> 0, 0, …
## $ Year_opened__1987 <dbl> 0, 0, …
## $ Year_opened__1988 <dbl> 0, 0, …
## $ Year_opened__1989 <dbl> 0, 0, …
## $ Year_opened__1990 <dbl> 0, 0, …
## $ Year_opened__1991 <dbl> 0, 0, …
## $ Year_opened__1992 <dbl> 0, 0, …
## $ Year_opened__1993 <dbl> 0, 0, …
## $ Year_opened__1994 <dbl> 0, 0, …
## $ Year_opened__1995 <dbl> 0, 0, …
## $ Year_opened__1996 <dbl> 0, 0, …
## $ Year_opened__1997 <dbl> 0, 0, …
## $ Year_opened__1998 <dbl> 0, 0, …
## $ Year_opened__1999 <dbl> 0, 0, …
## $ Year_opened__2000 <dbl> 0, 0, …
## $ Year_opened__2002 <dbl> 0, 0, …
## $ Year_opened__2005 <dbl> 0, 0, …
## $ Year_opened__2012 <dbl> 1, 0, …
## $ Year_opened__2014 <dbl> 0, 0, …
## $ `Year_opened__-OTHER` <dbl> 0, 0, …
## $ Area_Deprivation_index__1 <dbl> 0, 0, …
## $ Area_Deprivation_index__2 <dbl> 1, 0, …
## $ Area_Deprivation_index__3 <dbl> 0, 0, …
## $ Area_Deprivation_index__4 <dbl> 0, 0, …
## $ Area_Deprivation_index__5 <dbl> 0, 0, …
## $ Area_Deprivation_index__6 <dbl> 0, 0, …
## $ Area_Deprivation_index__7 <dbl> 0, 0, …
## $ Area_Deprivation_index__8 <dbl> 0, 1, …
## $ Area_Deprivation_index__9 <dbl> 0, 0, …
## $ Area_Deprivation_index__10 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__1 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__2 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__3 <dbl> 1, 0, …
## $ Area_Deprivation_index_crime__4 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__5 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__6 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__7 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__8 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__9 <dbl> 0, 1, …
## $ Area_Deprivation_index_crime__10 <dbl> 0, 0, …
## $ Area_Deprivation_index_education__1 <dbl> 1, 0, …
## $ Area_Deprivation_index_education__2 <dbl> 0, 0, …
## $ Area_Deprivation_index_education__3 <dbl> 0, 0, …
## $ Area_Deprivation_index_education__4 <dbl> 0, 0, …
## $ Area_Deprivation_index_education__5 <dbl> 0, 0, …
## $ Area_Deprivation_index_education__6 <dbl> 0, 0, …
## $ Area_Deprivation_index_education__7 <dbl> 0, 0, …
## $ Area_Deprivation_index_education__8 <dbl> 0, 1, …
## $ Area_Deprivation_index_education__9 <dbl> 0, 0, …
## $ Area_Deprivation_index_education__10 <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__1 <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__2 <dbl> 1, 0, …
## $ Area_Deprivation_index_employment__3 <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__4 <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__5 <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__6 <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__7 <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__8 <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__9 <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__10 <dbl> 0, 1, …
## $ Area_Deprivation_index_health__1 <dbl> 1, 0, …
## $ Area_Deprivation_index_health__2 <dbl> 0, 0, …
## $ Area_Deprivation_index_health__3 <dbl> 0, 0, …
## $ Area_Deprivation_index_health__4 <dbl> 0, 0, …
## $ Area_Deprivation_index_health__5 <dbl> 0, 0, …
## $ Area_Deprivation_index_health__6 <dbl> 0, 0, …
## $ Area_Deprivation_index_health__7 <dbl> 0, 0, …
## $ Area_Deprivation_index_health__8 <dbl> 0, 1, …
## $ Area_Deprivation_index_health__9 <dbl> 0, 0, …
## $ Area_Deprivation_index_health__10 <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__1 <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__2 <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__3 <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__4 <dbl> 1, 0, …
## $ Area_Deprivation_index_housing__5 <dbl> 0, 1, …
## $ Area_Deprivation_index_housing__6 <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__7 <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__8 <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__9 <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__10 <dbl> 0, 0, …
## $ Area_Deprivation_index_income__1 <dbl> 0, 0, …
## $ Area_Deprivation_index_income__2 <dbl> 0, 0, …
## $ Area_Deprivation_index_income__3 <dbl> 0, 0, …
## $ Area_Deprivation_index_income__4 <dbl> 0, 0, …
## $ Area_Deprivation_index_income__5 <dbl> 1, 0, …
## $ Area_Deprivation_index_income__6 <dbl> 0, 0, …
## $ Area_Deprivation_index_income__7 <dbl> 0, 0, …
## $ Area_Deprivation_index_income__8 <dbl> 0, 1, …
## $ Area_Deprivation_index_income__9 <dbl> 0, 0, …
## $ Area_Deprivation_index_income__10 <dbl> 0, 0, …
## $ Area_Deprivation_index_services__1 <dbl> 0, 1, …
## $ Area_Deprivation_index_services__2 <dbl> 0, 0, …
## $ Area_Deprivation_index_services__3 <dbl> 0, 0, …
## $ Area_Deprivation_index_services__4 <dbl> 0, 0, …
## $ Area_Deprivation_index_services__5 <dbl> 1, 0, …
## $ Area_Deprivation_index_services__6 <dbl> 0, 0, …
## $ Area_Deprivation_index_services__7 <dbl> 0, 0, …
## $ Area_Deprivation_index_services__8 <dbl> 0, 0, …
## $ Area_Deprivation_index_services__9 <dbl> 0, 0, …
## $ Area_Deprivation_index_services__10 <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__1ar <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__1br <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__2ar <dbl> 1, 0, …
## $ Area_Geodemographic_group_code__2br <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__3ar <dbl> 0, 1, …
## $ Area_Geodemographic_group_code__3br <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__3cr <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__4ar <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__5ar <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__6ar <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__6br <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__7ar <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__7br <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__7cr <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__8ar <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__8br <dbl> 0, 0, …
# Step 2: correlate
data_correlation <- data_binarized %>%
correlate(Accreditation__Yes)
data_correlation
## # A tibble: 175 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 Accreditation No -1
## 2 Accreditation Yes 1
## 3 Governance Independent-Private -0.353
## 4 Size small -0.326
## 5 Size medium 0.253
## 6 Size large 0.242
## 7 Governance Government-Local_Authority 0.221
## 8 Governance Independent-Unknown -0.202
## 9 Size unknown -0.166
## 10 Subject_Matter -OTHER -0.147
## # ℹ 165 more rows
# Step 3: Plot
data_correlation %>%
correlationfunnel::plot_correlation_funnel()
## Warning: ggrepel: 161 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.0.10
## Warning: package 'modeldata' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
set.seed(1234)
data_clean <- data_clean %>% sample_n(1000)
data_split <- initial_split(data_clean, strata = Accreditation)
data_train <- training(data_split)
data_test <- testing(data_split)
data_cv <- rsample::vfold_cv(data_train, strata = Accreditation)
data_cv
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [674/75]> Fold01
## 2 <split [674/75]> Fold02
## 3 <split [674/75]> Fold03
## 4 <split [674/75]> Fold04
## 5 <split [674/75]> Fold05
## 6 <split [674/75]> Fold06
## 7 <split [674/75]> Fold07
## 8 <split [674/75]> Fold08
## 9 <split [674/75]> Fold09
## 10 <split [675/74]> Fold10
{r} # library(usemodels) # use_xgboost(like_count ~ ., data = data_train) #
skimr::skim(data_clean)
Name | data_clean |
Number of rows | 1000 |
Number of columns | 17 |
_______________________ | |
Column type frequency: | |
character | 5 |
factor | 12 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
museum_id | 0 | 1 | 8 | 15 | 0 | 1000 | 0 |
Address_line_1 | 0 | 1 | 4 | 52 | 0 | 925 | 0 |
Village_Town_City | 0 | 1 | 3 | 24 | 0 | 618 | 0 |
Accreditation | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
Year_opened | 0 | 1 | 4 | 4 | 0 | 161 | 0 |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Governance | 0 | 1 | FALSE | 12 | Ind: 439, Gov: 251, Ind: 119, Ind: 40 |
Size | 0 | 1 | FALSE | 5 | sma: 553, med: 258, lar: 136, unk: 51 |
Subject_Matter | 0 | 1 | FALSE | 108 | Loc: 244, Art: 57, Bui: 50, War: 34 |
Area_Deprivation_index | 0 | 1 | FALSE | 10 | 6: 133, 4: 132, 5: 122, 7: 114 |
Area_Deprivation_index_crime | 0 | 1 | FALSE | 10 | 1: 162, 2: 118, 4: 102, 10: 99 |
Area_Deprivation_index_education | 0 | 1 | FALSE | 10 | 7: 146, 5: 116, 8: 113, 6: 111 |
Area_Deprivation_index_employment | 0 | 1 | FALSE | 10 | 10: 147, 5: 119, 7: 110, 4: 109 |
Area_Deprivation_index_health | 0 | 1 | FALSE | 10 | 10: 126, 5: 123, 8: 116, 7: 106 |
Area_Deprivation_index_housing | 0 | 1 | FALSE | 10 | 1: 261, 2: 150, 3: 127, 4: 86 |
Area_Deprivation_index_income | 0 | 1 | FALSE | 10 | 6: 127, 5: 121, 4: 119, 7: 115 |
Area_Deprivation_index_services | 0 | 1 | FALSE | 10 | 1: 166, 9: 107, 2: 106, 3: 99 |
Area_Geodemographic_group_code | 0 | 1 | FALSE | 16 | 3ar: 164, 2ar: 106, 3br: 102, 7ar: 96 |
library(themis)
library(recipes)
library(textrecipes)
xgboost_recipe <- recipes::recipe(Accreditation ~ ., data = data_train) %>%
update_role(museum_id, new_role = "ID") %>%
step_tokenize(Village_Town_City) %>%
step_tokenfilter(Village_Town_City, max_tokens = 50) %>%
step_tf(Village_Town_City) %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(Address_line_1_new, Governance_new, Size_new, Subject_Matter_Archaeology.Other, Subject_Matter_Communications,
Subject_Matter_Natural_world, Subject_Matter_Natural_world.Zoology, Subject_Matter_Services.RNLI, Subject_Matter_War_and_conflict.Other, Subject_Matter_new, Year_opened_new,
Area_Deprivation_index_new, Area_Deprivation_index_crime_new, Area_Deprivation_index_education_new, Area_Deprivation_index_employment_new, Area_Deprivation_index_health_new,
Area_Deprivation_index_housing_new, Area_Deprivation_index_income_new, Area_Deprivation_index_services_new, Area_Geodemographic_group_code_new) %>%
step_normalize(all_numeric_predictors()) %>%
# step_pca(all_numeric_predictors(), threshold = .99) %>%
step_smote(Accreditation)
# xgboost_recipe %>% prep() %>% juice() %>% glimpse()
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
loss_reduction = tune(), sample_size = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
doParallel::registerDoParallel()
set.seed(17375)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5,
control = control_grid(save_pred = TRUE))
## Warning: package 'xgboost' was built under R version 4.3.3
collect_metrics(xgboost_tune)
## # A tibble: 15 × 12
## trees min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 677 7 14 0.00195 5.18e- 4 0.477 accuracy
## 2 677 7 14 0.00195 5.18e- 4 0.477 brier_class
## 3 677 7 14 0.00195 5.18e- 4 0.477 roc_auc
## 4 1016 11 4 0.00394 3.13e-10 0.723 accuracy
## 5 1016 11 4 0.00394 3.13e-10 0.723 brier_class
## 6 1016 11 4 0.00394 3.13e-10 0.723 roc_auc
## 7 1626 19 8 0.0202 2.06e- 7 0.257 accuracy
## 8 1626 19 8 0.0202 2.06e- 7 0.257 brier_class
## 9 1626 19 8 0.0202 2.06e- 7 0.257 roc_auc
## 10 1483 30 5 0.0873 5.18e- 3 0.401 accuracy
## 11 1483 30 5 0.0873 5.18e- 3 0.401 brier_class
## 12 1483 30 5 0.0873 5.18e- 3 0.401 roc_auc
## 13 111 39 12 0.238 4.91e- 1 0.850 accuracy
## 14 111 39 12 0.238 4.91e- 1 0.850 brier_class
## 15 111 39 12 0.238 4.91e- 1 0.850 roc_auc
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## # .config <chr>
collect_predictions(xgboost_tune) %>%
group_by(id) %>%
roc_curve(Accreditation, .pred_Yes) %>%
autoplot()
xgboost_last <- xgboost_workflow %>%
finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
last_fit(data_split)
## New names:
## → A | warning: ! The following columns have zero variance so scaling cannot be
## used: Governance_Government.Other, Subject_Matter_Belief_and_identity.Other,
## and Subject_Matter_Services.Other. ℹ Consider using ?step_zv
## (`?recipes::step_zv()`) to remove those columns before normalizing.
## There were issues with some computations A: x1
## New names:
## There were issues with some computations A: x1
##
## • `Address_line_1_Units.1...2` -> `Address_line_1_Units.1`
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.705 Preprocessor1_Model1
## 2 roc_auc binary 0.777 Preprocessor1_Model1
## 3 brier_class binary 0.191 Preprocessor1_Model1
collect_predictions((xgboost_last)) %>%
#mutate(.pred_Accredited = as.factor(.pred_Accredited)) %>%
yardstick::conf_mat(Accreditation, .pred_class) %>%
autoplot()
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
xgboost_last %>%
workflows::extract_fit_engine() %>%
vip()
The previous model had accuracy 0.693 and AUC of 0.773.