Data summary
Name	Piped data
Number of rows	7431
Number of columns	8
_______________________
Column type frequency:
character	2
Date	1
numeric	5
________________________
Group variables	None

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	1	74	0	7172	0
author	4	1	4	73	0	2205	0

skim_variable	n_missing	complete_rate	min	max	median	n_unique
first_week	0	1	1931-10-12	2020-12-06	2000-06-25	3348

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	3715.00	2145.29	0	1857.5	3715	5572.5	7430	▇▇▇▇▇
year	1	1989.61	26.23	1931	1968.0	2000	2011.0	2020	▂▂▂▃▇
total_weeks	1	8.13	11.21	1	2.0	4	10.0	178	▇▁▁▁▁
debut_rank	1	7.90	4.57	1	4.0	8	12.0	17	▇▆▅▅▅
best_rank	1	6.91	4.57	1	3.0	6	10.0	17	▇▅▃▃▂

Explore Data

*** Identify variables with correlation with the target variable.***

* Total weeks

data %>% count(total_weeks)

## # A tibble: 90 × 2
##    total_weeks     n
##          <dbl> <int>
##  1           1  1684
##  2           2   816
##  3           3   700
##  4           4   599
##  5           5   478
##  6           6   406
##  7           7   298
##  8           8   287
##  9           9   198
## 10          10   202
## # ℹ 80 more rows

Debut rank

data %>% 
    ggplot(aes(total_weeks, debut_rank)) +
    geom_point(color= "steelblue") +
  geom_smooth(method = "lm", 
              formula = y ~ poly(x, 2), 
              color = "indianred3")

Best rank

data %>% 
    ggplot(aes(total_weeks, best_rank)) +
    geom_point(color= "steelblue") +
  geom_smooth(color = "tomato")

We will use debut rank as a predictor

Tune on xgboost model

set.seed(123)
data_split <- data %>%
    initial_split(strata = total_weeks)

data_train <- training(data_split)

data_test <- testing(data_split)

set.seed(156)
data_fold <- vfold_cv(data_train, strata = total_weeks)
data_fold

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5010/559]> Fold01
##  2 <split [5011/558]> Fold02
##  3 <split [5011/558]> Fold03
##  4 <split [5011/558]> Fold04
##  5 <split [5012/557]> Fold05
##  6 <split [5013/556]> Fold06
##  7 <split [5013/556]> Fold07
##  8 <split [5013/556]> Fold08
##  9 <split [5013/556]> Fold09
## 10 <split [5014/555]> Fold10

split_category <- function(x) {
    x %>%
        str_split(", ") %>%
        map(str_remove_all, "[:punct:]") %>%
        map(str_squish) %>%
        map(str_to_lower) %>%
        map(str_replace_all, " ", "_")
}

data_rec <-
    recipe(total_weeks ~ ., data = data_train) %>%
    step_tokenize(debut_rank, custom_token = split_category) %>%
    step_tokenfilter(debut_rank, max_tokens = 30) %>%
    step_tf(debut_rank)

xgb_spec <-
  boost_tree(
    trees = tune(),
    mtry = tune(),
    min_n = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

xgb_wf <- workflows::workflow(data_rec, xgb_spec)
xgb_wf

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_tokenize()
## • step_tokenfilter()
## • step_tf()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = tune()
##   trees = tune()
##   min_n = tune()
##   learn_rate = 0.01
## 
## Computational engine: xgboost

Apply to data 3

Jacob Brown

2023-10-18

Clean Data

Notes

Explore Data

* Total weeks

Debut rank

Best rank

We will use debut rank as a predictor

Tune on xgboost model