Goal is to predict attrition, employees who are likely to leave the company.

Import Data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(correlationfunnel)

## ══ correlationfunnel Tip #1 ════════════════════════════════════════════════════
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)

library(dplyr)

data <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')

## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore data

skimr::skim(data)

Data summary
Name	data
Number of rows	4191
Number of columns	35
_______________________
Column type frequency:
character	24
numeric	11
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
museum_id	0	1.00	8	15	4191
Name_of_museum	0	1.00	3	76	4190
Address_line_1	441	0.89	3	61	3212
Address_line_2	2816	0.33	3	39	1167
Village,_Town_or_City	4	1.00	3	24	1696
Postcode	0	1.00	6	9	3918
Admin_area	0	1.00	12	137	393
Accreditation	0	1.00	10	12	2
Governance	0	1.00	7	41	13
Size	0	1.00	4	7	5
Size_provenance	179	0.96	2	29	16
Subject_Matter	0	1.00	5	45	114
Year_opened	0	1.00	9	9	351
Year_closed	0	1.00	9	9	170
DOMUS_Subject_Matter	2788	0.33	5	27	21
Primary_provenance_of_data	0	1.00	3	8	18
Identifier_used_in_primary_data_source	2056	0.51	2	8	2134
Area_Geodemographic_group	49	0.99	11	40	17
Area_Geodemographic_group_code	49	0.99	3	3	16
Area_Geodemographic_subgroup	49	0.99	12	39	25
Area_Geodemographic_subgroup_code	49	0.99	4	4	24
Area_Geodemographic_supergroup	49	0.99	16	39	8
Area_Geodemographic_supergroup_code	49	0.99	2	2	8
Notes	2980	0.29	12	751	956

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Latitude	0	1.00	52.93	2.09	49.18	51.48	52.47	53.96	100.00	▇▁▁▁▁
Longitude	0	1.00	-1.96	1.84	-8.09	-3.10	-1.87	-0.48	1.76	▁▂▇▇▅
DOMUS_identifier	2347	0.44	1303.45	1597.19	1.00	486.50	991.50	1470.25	7746.00	▇▂▁▁▁
Area_Deprivation_index	49	0.99	5.44	2.48	1.00	4.00	5.00	7.00	10.00	▃▆▇▆▃
Area_Deprivation_index_crime	49	0.99	5.43	3.07	1.00	3.00	6.00	8.00	10.00	▇▆▅▇▇
Area_Deprivation_index_education	49	0.99	6.04	2.61	1.00	4.00	6.00	8.00	10.00	▃▅▇▇▆
Area_Deprivation_index_employment	49	0.99	6.08	2.76	1.00	4.00	6.00	8.00	10.00	▅▆▇▇▇
Area_Deprivation_index_health	49	0.99	6.02	2.82	1.00	4.00	6.00	8.00	10.00	▅▆▆▇▇
Area_Deprivation_index_housing	49	0.99	3.97	2.75	1.00	1.00	3.00	6.00	10.00	▇▅▃▂▂
Area_Deprivation_index_income	49	0.99	5.99	2.62	1.00	4.00	6.00	8.00	10.00	▃▆▇▇▆
Area_Deprivation_index_services	49	0.99	4.78	3.01	1.00	2.00	4.00	7.00	10.00	▇▅▅▅▅

Issues with data * Missing values * Factors or numeric variables * Governance, Size, Subject_Matter, Area_Geodemographic_group, Area_Geodemographic_group_code, Area_Deprivation_index, Area_Deprivation_index_crime, Area_Deprivation_index_education, Area_Deprivation_index_employment, Area_Deprivation_index_health, Area_Deprivation_index_housing, Area_Deprivation_index_income, Area_Deprivation_index_services * Zero variance variables * Character variables: Convert them to numbers in recipe step * Unbalanced target variable: Accreditation * ID variable: museum_id

library(stringr)

data_clean <- data %>%
    # Remove unnecessary columns
    select(-c(Size_provenance, DOMUS_Subject_Matter, Year_closed, Primary_provenance_of_data, 
              Identifier_used_in_primary_data_source, Area_Geodemographic_subgroup, 
              Area_Geodemographic_subgroup_code, Area_Geodemographic_supergroup, 
              Area_Geodemographic_supergroup_code, Notes, Latitude, Longitude, DOMUS_identifier, 
              Address_line_2, Postcode, Admin_area, Area_Geodemographic_group, Name_of_museum)) %>%
    # Remove rows with missing values
    na.omit() %>%
    
    # Change column name and adjust Years opened
    rename("Village_Town_City" = "Village,_Town_or_City") %>%
    mutate(Year_opened = str_sub(Year_opened, 1, 4)) %>%
    
    # Convert selected columns to factors
    mutate(across(c(Accreditation, Governance, Size, Subject_Matter, 
                    Area_Geodemographic_group_code, Area_Deprivation_index, 
                    Area_Deprivation_index_crime, Area_Deprivation_index_education, 
                    Area_Deprivation_index_employment, Area_Deprivation_index_health, 
                    Area_Deprivation_index_housing, Area_Deprivation_index_income, 
                    Area_Deprivation_index_services), as.factor)) %>%
    # Recode Accreditation
    mutate(Accreditation = if_else(Accreditation == "Accredited", "Yes", "No"))

# Preview the cleaned data
glimpse(data_clean)

## Rows: 3,708
## Columns: 17
## $ museum_id                         <chr> "mm.New.1", "mm.domus.WM019", "mm.ai…
## $ Address_line_1                    <chr> "1 Olympic Way", "Warwick College of…
## $ Village_Town_City                 <chr> "Belfast", "Moreton Morrell", "Chelt…
## $ Accreditation                     <chr> "No", "No", "Yes", "No", "No", "Yes"…
## $ Governance                        <fct> Independent-Not_for_profit, Governme…
## $ Size                              <fct> large, medium, medium, small, small,…
## $ Subject_Matter                    <fct> Sea_and_seafaring-Boats_and_ships, R…
## $ Year_opened                       <chr> "2012", "1984", "2013", "1996", "198…
## $ Area_Deprivation_index            <fct> 2, 8, 8, 2, 6, 6, 5, 6, 3, 7, 5, 8, …
## $ Area_Deprivation_index_crime      <fct> 3, 9, 10, 1, 10, 3, 1, 10, 1, 7, 10,…
## $ Area_Deprivation_index_education  <fct> 1, 8, 7, 6, 8, 7, 7, 6, 4, 7, 6, 6, …
## $ Area_Deprivation_index_employment <fct> 2, 10, 7, 3, 7, 6, 6, 7, 2, 7, 6, 8,…
## $ Area_Deprivation_index_health     <fct> 1, 8, 8, 2, 7, 8, 5, 7, 2, 9, 5, 8, …
## $ Area_Deprivation_index_housing    <fct> 4, 5, 7, 1, 8, 9, 1, 7, 6, 7, 7, 5, …
## $ Area_Deprivation_index_income     <fct> 5, 8, 8, 3, 5, 5, 7, 5, 3, 8, 6, 8, …
## $ Area_Deprivation_index_services   <fct> 5, 1, 4, 4, 2, 3, 9, 1, 9, 1, 1, 2, …
## $ Area_Geodemographic_group_code    <fct> 2ar, 3ar, 7ar, 5ar, 3cr, 6br, 2ar, 3…

Explore data

Accredited vs. Unaccredited

data_clean %>% count(Accreditation)

## # A tibble: 2 × 2
##   Accreditation     n
##   <chr>         <int>
## 1 No             2040
## 2 Yes            1668

data_clean %>%
    ggplot(aes(Accreditation)) +
    geom_bar()

Most common Subjects

data_clean %>% count(Subject_Matter) %>% arrange(desc(n))

## # A tibble: 114 × 2
##    Subject_Matter                     n
##    <fct>                          <int>
##  1 Local_Histories                  853
##  2 Buildings-Houses-Large_houses    200
##  3 Arts-Fine_and_decorative_arts    186
##  4 War_and_conflict-Regiment        128
##  5 Transport-Trains_and_railways    127
##  6 Mixed-Encyclopaedic              115
##  7 Personality-Literary              80
##  8 Mixed-Other                       78
##  9 Other                             71
## 10 Buildings-Houses-Medium_houses    67
## # ℹ 104 more rows

data_clean %>%
    ggplot(aes(x= Subject_Matter)) +
    geom_bar()

Years museums opened

data_clean %>%
    ggplot(aes(Year_opened, Accreditation)) +
    geom_count()

correlation plot

# Step 1: binarize
data_binarized <- data_clean %>%
    select(-museum_id) %>%
    binarize()

data_binarized %>% glimpse()

## Rows: 3,708
## Columns: 175
## $ Address_line_1__High_Street                                     <dbl> 0, 0, …
## $ `Address_line_1__-OTHER`                                        <dbl> 1, 1, …
## $ Village_Town_City__Edinburgh                                    <dbl> 0, 0, …
## $ Village_Town_City__London                                       <dbl> 0, 0, …
## $ `Village_Town_City__-OTHER`                                     <dbl> 1, 1, …
## $ Accreditation__No                                               <dbl> 1, 1, …
## $ Accreditation__Yes                                              <dbl> 0, 0, …
## $ `Governance__Government-Local_Authority`                        <dbl> 0, 1, …
## $ `Governance__Government-National`                               <dbl> 0, 0, …
## $ `Governance__Independent-English_Heritage`                      <dbl> 0, 0, …
## $ `Governance__Independent-National_Trust`                        <dbl> 0, 0, …
## $ `Governance__Independent-Not_for_profit`                        <dbl> 1, 0, …
## $ `Governance__Independent-Private`                               <dbl> 0, 0, …
## $ `Governance__Independent-Unknown`                               <dbl> 0, 0, …
## $ Governance__University                                          <dbl> 0, 0, …
## $ Governance__Unknown                                             <dbl> 0, 0, …
## $ `Governance__-OTHER`                                            <dbl> 0, 0, …
## $ Size__large                                                     <dbl> 1, 0, …
## $ Size__medium                                                    <dbl> 0, 1, …
## $ Size__small                                                     <dbl> 0, 0, …
## $ Size__unknown                                                   <dbl> 0, 0, …
## $ `Size__-OTHER`                                                  <dbl> 0, 0, …
## $ `Subject_Matter__Archaeology-Roman`                             <dbl> 0, 0, …
## $ `Subject_Matter__Arts-Fine_and_decorative_arts`                 <dbl> 0, 0, …
## $ `Subject_Matter__Buildings-Houses-Large_houses`                 <dbl> 0, 0, …
## $ `Subject_Matter__Buildings-Houses-Medium_houses`                <dbl> 0, 0, …
## $ `Subject_Matter__Industry_and_manufacture-Mining_and_quarrying` <dbl> 0, 0, …
## $ `Subject_Matter__Leisure_and_sport-Toys_and_models`             <dbl> 0, 0, …
## $ Subject_Matter__Local_Histories                                 <dbl> 0, 0, …
## $ `Subject_Matter__Mixed-Encyclopaedic`                           <dbl> 0, 0, …
## $ `Subject_Matter__Mixed-Other`                                   <dbl> 0, 0, …
## $ Subject_Matter__Other                                           <dbl> 0, 0, …
## $ `Subject_Matter__Personality-Literary`                          <dbl> 0, 0, …
## $ `Subject_Matter__Rural_Industry-Farming`                        <dbl> 0, 1, …
## $ `Subject_Matter__Sea_and_seafaring-Boats_and_ships`             <dbl> 1, 0, …
## $ `Subject_Matter__Sea_and_seafaring-Mixed`                       <dbl> 0, 0, …
## $ `Subject_Matter__Transport-Cars_and_motorbikes`                 <dbl> 0, 0, …
## $ `Subject_Matter__Transport-Trains_and_railways`                 <dbl> 0, 0, …
## $ `Subject_Matter__War_and_conflict-Airforce`                     <dbl> 0, 0, …
## $ `Subject_Matter__War_and_conflict-Castles_and_forts`            <dbl> 0, 0, …
## $ `Subject_Matter__War_and_conflict-Military`                     <dbl> 0, 0, …
## $ `Subject_Matter__War_and_conflict-Regiment`                     <dbl> 0, 0, …
## $ `Subject_Matter__-OTHER`                                        <dbl> 0, 0, …
## $ Year_opened__1945                                               <dbl> 0, 0, …
## $ Year_opened__1960                                               <dbl> 0, 0, …
## $ Year_opened__1972                                               <dbl> 0, 0, …
## $ Year_opened__1973                                               <dbl> 0, 0, …
## $ Year_opened__1974                                               <dbl> 0, 0, …
## $ Year_opened__1975                                               <dbl> 0, 0, …
## $ Year_opened__1976                                               <dbl> 0, 0, …
## $ Year_opened__1977                                               <dbl> 0, 0, …
## $ Year_opened__1978                                               <dbl> 0, 0, …
## $ Year_opened__1979                                               <dbl> 0, 0, …
## $ Year_opened__1980                                               <dbl> 0, 0, …
## $ Year_opened__1981                                               <dbl> 0, 0, …
## $ Year_opened__1982                                               <dbl> 0, 0, …
## $ Year_opened__1983                                               <dbl> 0, 0, …
## $ Year_opened__1984                                               <dbl> 0, 1, …
## $ Year_opened__1985                                               <dbl> 0, 0, …
## $ Year_opened__1986                                               <dbl> 0, 0, …
## $ Year_opened__1987                                               <dbl> 0, 0, …
## $ Year_opened__1988                                               <dbl> 0, 0, …
## $ Year_opened__1989                                               <dbl> 0, 0, …
## $ Year_opened__1990                                               <dbl> 0, 0, …
## $ Year_opened__1991                                               <dbl> 0, 0, …
## $ Year_opened__1992                                               <dbl> 0, 0, …
## $ Year_opened__1993                                               <dbl> 0, 0, …
## $ Year_opened__1994                                               <dbl> 0, 0, …
## $ Year_opened__1995                                               <dbl> 0, 0, …
## $ Year_opened__1996                                               <dbl> 0, 0, …
## $ Year_opened__1997                                               <dbl> 0, 0, …
## $ Year_opened__1998                                               <dbl> 0, 0, …
## $ Year_opened__1999                                               <dbl> 0, 0, …
## $ Year_opened__2000                                               <dbl> 0, 0, …
## $ Year_opened__2002                                               <dbl> 0, 0, …
## $ Year_opened__2005                                               <dbl> 0, 0, …
## $ Year_opened__2012                                               <dbl> 1, 0, …
## $ Year_opened__2014                                               <dbl> 0, 0, …
## $ `Year_opened__-OTHER`                                           <dbl> 0, 0, …
## $ Area_Deprivation_index__1                                       <dbl> 0, 0, …
## $ Area_Deprivation_index__2                                       <dbl> 1, 0, …
## $ Area_Deprivation_index__3                                       <dbl> 0, 0, …
## $ Area_Deprivation_index__4                                       <dbl> 0, 0, …
## $ Area_Deprivation_index__5                                       <dbl> 0, 0, …
## $ Area_Deprivation_index__6                                       <dbl> 0, 0, …
## $ Area_Deprivation_index__7                                       <dbl> 0, 0, …
## $ Area_Deprivation_index__8                                       <dbl> 0, 1, …
## $ Area_Deprivation_index__9                                       <dbl> 0, 0, …
## $ Area_Deprivation_index__10                                      <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__1                                 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__2                                 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__3                                 <dbl> 1, 0, …
## $ Area_Deprivation_index_crime__4                                 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__5                                 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__6                                 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__7                                 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__8                                 <dbl> 0, 0, …
## $ Area_Deprivation_index_crime__9                                 <dbl> 0, 1, …
## $ Area_Deprivation_index_crime__10                                <dbl> 0, 0, …
## $ Area_Deprivation_index_education__1                             <dbl> 1, 0, …
## $ Area_Deprivation_index_education__2                             <dbl> 0, 0, …
## $ Area_Deprivation_index_education__3                             <dbl> 0, 0, …
## $ Area_Deprivation_index_education__4                             <dbl> 0, 0, …
## $ Area_Deprivation_index_education__5                             <dbl> 0, 0, …
## $ Area_Deprivation_index_education__6                             <dbl> 0, 0, …
## $ Area_Deprivation_index_education__7                             <dbl> 0, 0, …
## $ Area_Deprivation_index_education__8                             <dbl> 0, 1, …
## $ Area_Deprivation_index_education__9                             <dbl> 0, 0, …
## $ Area_Deprivation_index_education__10                            <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__1                            <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__2                            <dbl> 1, 0, …
## $ Area_Deprivation_index_employment__3                            <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__4                            <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__5                            <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__6                            <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__7                            <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__8                            <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__9                            <dbl> 0, 0, …
## $ Area_Deprivation_index_employment__10                           <dbl> 0, 1, …
## $ Area_Deprivation_index_health__1                                <dbl> 1, 0, …
## $ Area_Deprivation_index_health__2                                <dbl> 0, 0, …
## $ Area_Deprivation_index_health__3                                <dbl> 0, 0, …
## $ Area_Deprivation_index_health__4                                <dbl> 0, 0, …
## $ Area_Deprivation_index_health__5                                <dbl> 0, 0, …
## $ Area_Deprivation_index_health__6                                <dbl> 0, 0, …
## $ Area_Deprivation_index_health__7                                <dbl> 0, 0, …
## $ Area_Deprivation_index_health__8                                <dbl> 0, 1, …
## $ Area_Deprivation_index_health__9                                <dbl> 0, 0, …
## $ Area_Deprivation_index_health__10                               <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__1                               <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__2                               <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__3                               <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__4                               <dbl> 1, 0, …
## $ Area_Deprivation_index_housing__5                               <dbl> 0, 1, …
## $ Area_Deprivation_index_housing__6                               <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__7                               <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__8                               <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__9                               <dbl> 0, 0, …
## $ Area_Deprivation_index_housing__10                              <dbl> 0, 0, …
## $ Area_Deprivation_index_income__1                                <dbl> 0, 0, …
## $ Area_Deprivation_index_income__2                                <dbl> 0, 0, …
## $ Area_Deprivation_index_income__3                                <dbl> 0, 0, …
## $ Area_Deprivation_index_income__4                                <dbl> 0, 0, …
## $ Area_Deprivation_index_income__5                                <dbl> 1, 0, …
## $ Area_Deprivation_index_income__6                                <dbl> 0, 0, …
## $ Area_Deprivation_index_income__7                                <dbl> 0, 0, …
## $ Area_Deprivation_index_income__8                                <dbl> 0, 1, …
## $ Area_Deprivation_index_income__9                                <dbl> 0, 0, …
## $ Area_Deprivation_index_income__10                               <dbl> 0, 0, …
## $ Area_Deprivation_index_services__1                              <dbl> 0, 1, …
## $ Area_Deprivation_index_services__2                              <dbl> 0, 0, …
## $ Area_Deprivation_index_services__3                              <dbl> 0, 0, …
## $ Area_Deprivation_index_services__4                              <dbl> 0, 0, …
## $ Area_Deprivation_index_services__5                              <dbl> 1, 0, …
## $ Area_Deprivation_index_services__6                              <dbl> 0, 0, …
## $ Area_Deprivation_index_services__7                              <dbl> 0, 0, …
## $ Area_Deprivation_index_services__8                              <dbl> 0, 0, …
## $ Area_Deprivation_index_services__9                              <dbl> 0, 0, …
## $ Area_Deprivation_index_services__10                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__1ar                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__1br                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__2ar                             <dbl> 1, 0, …
## $ Area_Geodemographic_group_code__2br                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__3ar                             <dbl> 0, 1, …
## $ Area_Geodemographic_group_code__3br                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__3cr                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__4ar                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__5ar                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__6ar                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__6br                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__7ar                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__7br                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__7cr                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__8ar                             <dbl> 0, 0, …
## $ Area_Geodemographic_group_code__8br                             <dbl> 0, 0, …

# Step 2: correlate
data_correlation <- data_binarized %>%
    correlate(Accreditation__Yes)

data_correlation

## # A tibble: 175 × 3
##    feature        bin                        correlation
##    <fct>          <chr>                            <dbl>
##  1 Accreditation  No                              -1    
##  2 Accreditation  Yes                              1    
##  3 Governance     Independent-Private             -0.353
##  4 Size           small                           -0.326
##  5 Size           medium                           0.253
##  6 Size           large                            0.242
##  7 Governance     Government-Local_Authority       0.221
##  8 Governance     Independent-Unknown             -0.202
##  9 Size           unknown                         -0.166
## 10 Subject_Matter -OTHER                          -0.147
## # ℹ 165 more rows

# Step 3: Plot
data_correlation %>%
    correlationfunnel::plot_correlation_funnel()

## Warning: ggrepel: 161 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model Building

Split Data

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10

## Warning: package 'modeldata' was built under R version 4.3.3

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

set.seed(1234)
data_clean <- data_clean %>% sample_n(1000)

data_split <- initial_split(data_clean, strata = Accreditation)
data_train <- training(data_split)
data_test <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train, strata = Accreditation)
data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [674/75]> Fold01
##  2 <split [674/75]> Fold02
##  3 <split [674/75]> Fold03
##  4 <split [674/75]> Fold04
##  5 <split [674/75]> Fold05
##  6 <split [674/75]> Fold06
##  7 <split [674/75]> Fold07
##  8 <split [674/75]> Fold08
##  9 <split [674/75]> Fold09
## 10 <split [675/74]> Fold10

`{r} # library(usemodels) # use_xgboost(like_count ~ ., data = data_train) #`

Preprocess data

skimr::skim(data_clean)

Data summary
Name	data_clean
Number of rows	1000
Number of columns	17
_______________________
Column type frequency:
character	5
factor	12
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
museum_id	1	8	15	1000
Address_line_1	1	4	52	925
Village_Town_City	1	3	24	618
Accreditation	1	2	3	2
Year_opened	1	4	4	161

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
Governance	1	FALSE	12	Ind: 439, Gov: 251, Ind: 119, Ind: 40
Size	1	FALSE	5	sma: 553, med: 258, lar: 136, unk: 51
Subject_Matter	1	FALSE	108	Loc: 244, Art: 57, Bui: 50, War: 34
Area_Deprivation_index	1	FALSE	10	6: 133, 4: 132, 5: 122, 7: 114
Area_Deprivation_index_crime	1	FALSE	10	1: 162, 2: 118, 4: 102, 10: 99
Area_Deprivation_index_education	1	FALSE	10	7: 146, 5: 116, 8: 113, 6: 111
Area_Deprivation_index_employment	1	FALSE	10	10: 147, 5: 119, 7: 110, 4: 109
Area_Deprivation_index_health	1	FALSE	10	10: 126, 5: 123, 8: 116, 7: 106
Area_Deprivation_index_housing	1	FALSE	10	1: 261, 2: 150, 3: 127, 4: 86
Area_Deprivation_index_income	1	FALSE	10	6: 127, 5: 121, 4: 119, 7: 115
Area_Deprivation_index_services	1	FALSE	10	1: 166, 9: 107, 2: 106, 3: 99
Area_Geodemographic_group_code	1	FALSE	16	3ar: 164, 2ar: 106, 3br: 102, 7ar: 96

library(themis)
library(recipes)
library(textrecipes)

xgboost_recipe <- recipes::recipe(Accreditation ~ ., data = data_train) %>%
    update_role(museum_id, new_role = "ID") %>%
    step_tokenize(Village_Town_City) %>%
    step_tokenfilter(Village_Town_City, max_tokens = 50) %>%
    step_tf(Village_Town_City) %>%
    step_novel(all_nominal_predictors()) %>%
    step_dummy(all_nominal_predictors()) %>%
    step_zv(Address_line_1_new, Governance_new, Size_new, Subject_Matter_Archaeology.Other, Subject_Matter_Communications,
  Subject_Matter_Natural_world, Subject_Matter_Natural_world.Zoology, Subject_Matter_Services.RNLI, Subject_Matter_War_and_conflict.Other, Subject_Matter_new, Year_opened_new,
  Area_Deprivation_index_new, Area_Deprivation_index_crime_new, Area_Deprivation_index_education_new, Area_Deprivation_index_employment_new, Area_Deprivation_index_health_new,
  Area_Deprivation_index_housing_new, Area_Deprivation_index_income_new, Area_Deprivation_index_services_new, Area_Geodemographic_group_code_new) %>%
    step_normalize(all_numeric_predictors()) %>%
    # step_pca(all_numeric_predictors(), threshold = .99) %>%
    step_smote(Accreditation)
    

# xgboost_recipe %>% prep() %>% juice() %>% glimpse()

Specify model

xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
    loss_reduction = tune(), sample_size = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec)

Tune hyperparameters

doParallel::registerDoParallel()

set.seed(17375)
xgboost_tune <-
  tune_grid(xgboost_workflow,
            resamples = data_cv,
            grid = 5,
            control = control_grid(save_pred = TRUE))

## Warning: package 'xgboost' was built under R version 4.3.3

Model evaluation

Identify optimal values for hyperparameters

collect_metrics(xgboost_tune)

## # A tibble: 15 × 12
##    trees min_n tree_depth learn_rate loss_reduction sample_size .metric    
##    <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>      
##  1   677     7         14    0.00195       5.18e- 4       0.477 accuracy   
##  2   677     7         14    0.00195       5.18e- 4       0.477 brier_class
##  3   677     7         14    0.00195       5.18e- 4       0.477 roc_auc    
##  4  1016    11          4    0.00394       3.13e-10       0.723 accuracy   
##  5  1016    11          4    0.00394       3.13e-10       0.723 brier_class
##  6  1016    11          4    0.00394       3.13e-10       0.723 roc_auc    
##  7  1626    19          8    0.0202        2.06e- 7       0.257 accuracy   
##  8  1626    19          8    0.0202        2.06e- 7       0.257 brier_class
##  9  1626    19          8    0.0202        2.06e- 7       0.257 roc_auc    
## 10  1483    30          5    0.0873        5.18e- 3       0.401 accuracy   
## 11  1483    30          5    0.0873        5.18e- 3       0.401 brier_class
## 12  1483    30          5    0.0873        5.18e- 3       0.401 roc_auc    
## 13   111    39         12    0.238         4.91e- 1       0.850 accuracy   
## 14   111    39         12    0.238         4.91e- 1       0.850 brier_class
## 15   111    39         12    0.238         4.91e- 1       0.850 roc_auc    
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>

collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(Accreditation, .pred_Yes) %>%
    autoplot()

Fit the model for the last time

xgboost_last <- xgboost_workflow %>%
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)

## New names:
## → A | warning: !  The following columns have zero variance so scaling cannot be
## used: Governance_Government.Other, Subject_Matter_Belief_and_identity.Other,
## and Subject_Matter_Services.Other. ℹ Consider using ?step_zv
## (`?recipes::step_zv()`) to remove those columns before normalizing.
## There were issues with some computations A: x1
## New names:
## There were issues with some computations A: x1
## 
## • `Address_line_1_Units.1...2` -> `Address_line_1_Units.1`

collect_metrics(xgboost_last)

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.705 Preprocessor1_Model1
## 2 roc_auc     binary         0.777 Preprocessor1_Model1
## 3 brier_class binary         0.191 Preprocessor1_Model1

collect_predictions((xgboost_last)) %>%
    #mutate(.pred_Accredited = as.factor(.pred_Accredited)) %>%
    yardstick::conf_mat(Accreditation, .pred_class) %>%
    autoplot()

Variable Importance

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()

Conclusion

The previous model had accuracy 0.693 and AUC of 0.773.

step_normalize() braught a small improvement in accuracy.
There was no improvement with step_YeopJohnson.
Feature selection: PCA didn’t make an improvement.

Apply to your Data 8

Max Haussmann

2024-10-31

Import Data

Explore data

Explore data

Model Building

Split Data

`{r} # library(usemodels) # use_xgboost(like_count ~ ., data = data_train) #`

Preprocess data

Specify model

Tune hyperparameters

Model evaluation

Identify optimal values for hyperparameters

Fit the model for the last time

Variable Importance

Conclusion