library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
museums <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-22/museums.csv')
## Rows: 4191 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): museum_id, Name_of_museum, Address_line_1, Address_line_2, Village...
## dbl (11): Latitude, Longitude, DOMUS_identifier, Area_Deprivation_index, Are...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
museums %>%
count(Accreditation)
## # A tibble: 2 × 2
## Accreditation n
## <chr> <int>
## 1 Accredited 1720
## 2 Unaccredited 2471
top_subjects <- museums %>%
count(Subject_Matter, sort = TRUE) %>%
slice_max(n, n = 6) %>%
pull(Subject_Matter)
museums %>%
filter(Subject_Matter %in% top_subjects) %>%
count(Subject_Matter, Accreditation) %>%
ggplot(aes(Accreditation, n, fill = Accreditation)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(Subject_Matter), scales = "free")
museums %>%
count(Accreditation, Size)
## # A tibble: 10 × 3
## Accreditation Size n
## <chr> <chr> <int>
## 1 Accredited huge 11
## 2 Accredited large 402
## 3 Accredited medium 644
## 4 Accredited small 650
## 5 Accredited unknown 13
## 6 Unaccredited huge 1
## 7 Unaccredited large 142
## 8 Unaccredited medium 381
## 9 Unaccredited small 1751
## 10 Unaccredited unknown 196
top_gov <- museums %>%
count(Governance, sort = TRUE) %>%
slice_max(n, n = 4) %>%
pull(Governance)
museums %>%
filter(Governance %in% top_gov) %>%
count(Governance, Accreditation) %>%
ggplot(aes(Accreditation, n, fill = Accreditation)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(Governance), scales = "free_y")
museum_parsed <- museums %>%
select(museum_id, Accreditation, Governance, Size, Subject_Matter, Year_opened, Year_closed, Area_Deprivation_index) %>%
mutate(Year_opened = parse_number(Year_opened),
Closed = if_else(Year_closed == "9999:9999", "Open", "Closed")) %>%
select(-Year_closed) %>%
na.omit() %>%
mutate_if(is.character, as.factor) %>%
mutate(museum_id = as.character(museum_id))
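As a quick sanity check on the cleanup (a small sketch, not part of the original output), glimpse() confirms the column types and count() shows how the outcome breaks down against the new Open/Closed variable:

# Sketch: verify types after parsing and factor conversion
glimpse(museum_parsed)
# Sketch: cross-tabulate the outcome with the derived Open/Closed variable
museum_parsed %>%
  count(Accreditation, Closed)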
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
set.seed(123)
museum_split <- initial_split(museum_parsed, strata = Accreditation)
museum_train <- training(museum_split)
museum_test <- testing(museum_split)
set.seed(234)
museum_folds <- vfold_cv(museum_train, strata = Accreditation)
museum_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [2795/311]> Fold01
## 2 <split [2795/311]> Fold02
## 3 <split [2795/311]> Fold03
## 4 <split [2795/311]> Fold04
## 5 <split [2795/311]> Fold05
## 6 <split [2795/311]> Fold06
## 7 <split [2795/311]> Fold07
## 8 <split [2796/310]> Fold08
## 9 <split [2796/310]> Fold09
## 10 <split [2797/309]> Fold10
library(embed)
museum_rec <-
recipe(Accreditation ~ ., data = museum_train) %>%
update_role(museum_id, new_role = "id") %>%
step_lencode_glm(Subject_Matter, outcome = vars(Accreditation)) %>%
  step_dummy(all_nominal_predictors())
museum_rec
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 6
## id: 1
##
## ── Operations
## • Linear embedding for factors via GLM for: Subject_Matter
## • Dummy variables from: all_nominal_predictors()
prep(museum_rec) %>%
tidy(number = 1) %>%
filter(level == "..new")
## # A tibble: 1 × 4
## level value terms id
## <chr> <dbl> <chr> <chr>
## 1 ..new -0.909 Subject_Matter lencode_glm_kFEsv
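The `..new` row is the pooled score the recipe will assign to subject-matter levels it has never seen. The same tidy output also holds a learned score for every observed level; a sketch like the following (using only the recipes/dplyr calls already in play) surfaces the strongest effects in either direction:

# Sketch: inspect the most extreme GLM effect encodings for Subject_Matter
prep(museum_rec) %>%
  tidy(number = 1) %>%
  slice_max(abs(value), n = 5)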
xgb_spec <-
boost_tree(
trees = tune(),
min_n = tune(),
mtry = tune(),
learn_rate = 0.01) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_wf <- workflow(museum_rec, xgb_spec)
library(finetune)
doParallel::registerDoParallel()
set.seed(345)
xgb_rs <- tune_race_anova(
xgb_wf,
resamples = museum_folds,
grid = 15,
control = control_race(verbose_elim = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will maximize the roc_auc metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 10 eliminated; 5 candidates remain.
##
## ℹ Fold07: All but one parameter combination were eliminated.
xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 5
## splits id .order .metrics .notes
## <list> <chr> <int> <list> <list>
## 1 <split [2795/311]> Fold01 2 <tibble [30 × 7]> <tibble [0 × 3]>
## 2 <split [2795/311]> Fold02 3 <tibble [30 × 7]> <tibble [0 × 3]>
## 3 <split [2797/309]> Fold10 1 <tibble [30 × 7]> <tibble [0 × 3]>
## 4 <split [2795/311]> Fold07 4 <tibble [10 × 7]> <tibble [0 × 3]>
## 5 <split [2795/311]> Fold03 5 <tibble [2 × 7]> <tibble [0 × 3]>
## 6 <split [2795/311]> Fold04 8 <tibble [2 × 7]> <tibble [0 × 3]>
## 7 <split [2795/311]> Fold05 6 <tibble [2 × 7]> <tibble [0 × 3]>
## 8 <split [2795/311]> Fold06 9 <tibble [2 × 7]> <tibble [0 × 3]>
## 9 <split [2796/310]> Fold08 10 <tibble [2 × 7]> <tibble [0 × 3]>
## 10 <split [2796/310]> Fold09 7 <tibble [2 × 7]> <tibble [0 × 3]>
plot_race(xgb_rs)
collect_metrics(xgb_rs)
## # A tibble: 2 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2 599 8 accuracy binary 0.797 10 0.00791 Preprocessor1_Model…
## 2 2 599 8 roc_auc binary 0.885 10 0.00549 Preprocessor1_Model…
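To rank the surviving candidates by a single metric rather than reading the combined summary, the standard tune helper show_best() also works on racing results (a sketch, not run in the original):

# Sketch: best remaining candidate(s) by ROC AUC
show_best(xgb_rs, metric = "roc_auc")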
xgb_last <- xgb_wf %>%
finalize_workflow(select_best(xgb_rs, metric = "accuracy")) %>%
last_fit(museum_split)
xgb_last
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [3106/1036]> train/test split <tibble> <tibble> <tibble> <workflow>
collect_metrics(xgb_last)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.810 Preprocessor1_Model1
## 2 roc_auc binary 0.891 Preprocessor1_Model1
collect_predictions(xgb_last) %>%
conf_mat(Accreditation, .pred_class)
## Truth
## Prediction Accredited Unaccredited
## Accredited 353 120
## Unaccredited 77 486
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
xgb_last %>%
extract_fit_engine() %>%
vip()
#2. Data Exploration and Transformation: - The transformed data has a new Open/Closed variable derived from Year_closed; columns unnecessary for this analysis (such as latitude/longitude) are dropped; character columns are converted to factors; and rows with NAs are omitted.
- A few steps make up this data prep and modeling section: update_role() (turns museum_id into an identifier instead of data used in the modeling), step_lencode_glm() (creates a specification of a recipe step that converts a nominal predictor into a single set of scores from a generalized linear model), and step_dummy() (creates a specification of a recipe step that converts nominal data into numeric binary terms); the sketch just below shows what these steps produce.
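To see the result of those steps (a sketch, not part of the original analysis), prep() and bake() return the fully transformed training set, with Subject_Matter collapsed to a single numeric score and the remaining nominal predictors expanded into binary dummy columns:

# Sketch: apply the trained recipe to the training data and inspect the result
prep(museum_rec) %>%
  bake(new_data = NULL) %>%
  glimpse()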
#4. Model Evaluation: - Looking at the confusion matrix and the race plot, we can see that the model does a good job of predicting accreditation, reaching 81.0% accuracy and a ROC AUC of 0.891 on the test set.
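Beyond accuracy, class-level metrics and the ROC curve round out the evaluation; a sketch using standard yardstick helpers (`.pred_Accredited` is the probability column yardstick names after the first factor level):

# Sketch: sensitivity, specificity, and related metrics from the confusion matrix
collect_predictions(xgb_last) %>%
  conf_mat(Accreditation, .pred_class) %>%
  summary()

# Sketch: ROC curve on the held-out test predictions
collect_predictions(xgb_last) %>%
  roc_curve(Accreditation, .pred_Accredited) %>%
  autoplot()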