Apply 2

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
library(skimr)

## Warning: package 'skimr' was built under R version 4.5.2

library(correlationfunnel)

## Warning: package 'correlationfunnel' was built under R version 4.5.2

## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.5.2

## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.10     ✔ rsample      1.3.2 
## ✔ dials        1.4.2      ✔ tailor       0.1.0 
## ✔ infer        1.1.0      ✔ tune         2.0.1 
## ✔ modeldata    1.5.1      ✔ workflows    1.3.0 
## ✔ parsnip      1.4.1      ✔ workflowsets 1.1.1 
## ✔ recipes      1.3.1      ✔ yardstick    1.3.2

## Warning: package 'dials' was built under R version 4.5.2

## Warning: package 'infer' was built under R version 4.5.2

## Warning: package 'modeldata' was built under R version 4.5.2

## Warning: package 'parsnip' was built under R version 4.5.2

## Warning: package 'recipes' was built under R version 4.5.2

## Warning: package 'rsample' was built under R version 4.5.2

## Warning: package 'tailor' was built under R version 4.5.2

## Warning: package 'tune' was built under R version 4.5.2

## Warning: package 'workflows' was built under R version 4.5.2

## Warning: package 'workflowsets' was built under R version 4.5.2

## Warning: package 'yardstick' was built under R version 4.5.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()

# Load Data
youtube <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv"
)

## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl   (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl   (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm  (1): published_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Clean Data
data <- youtube %>%
  select(-superbowl_ads_dot_com_url,
         -youtube_url,
         -thumbnail,
         -title,
         -description,
         -etag,
         -id,
         -kind,
         -channel_title) %>%
  na.omit() %>%
  filter(view_count <= quantile(view_count, 0.99, na.rm = TRUE)) %>%
  mutate(
    view_count    = log(view_count),
    like_count    = log(like_count + 1),
    dislike_count = log(dislike_count + 1),
    comment_count = log(comment_count + 1),
    high_views    = view_count > log(1000000)
  )

# Explore
ggplot(data, aes(year, view_count)) + geom_point()

ggplot(data, aes(like_count, year)) + geom_point()

# Correlation Funnel
data_binarized_tbl <- data %>%
  select(-brand, -published_at) %>%
  binarize()

target_col <- names(data_binarized_tbl)[
  stringr::str_detect(names(data_binarized_tbl), "^high_views") &
  stringr::str_detect(names(data_binarized_tbl), "__1$|_1$")
][1]

data_corr_tbl <- data_binarized_tbl %>%
  correlate(!!rlang::sym(target_col))

plot_correlation_funnel(data_corr_tbl)

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Train/Test Split
set.seed(1234)
data_split <- initial_split(data)
data_train <- training(data_split)
data_test  <- testing(data_split)

# Recipe
xgboost_recipe <- recipe(view_count ~ ., data = data_train) %>%
  step_rm(brand, published_at, high_views) %>%
  step_mutate_at(all_logical_predictors(), fn = as.numeric) %>%
  step_zv(all_predictors())

# Model
xgboost_spec <- boost_tree(
  trees = 500,
  tree_depth = 3,
  min_n = 5,
  learn_rate = 0.05,
  sample_size = 0.8
) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

# Workflow + Fit
xgboost_workflow <- workflow() %>%
  add_recipe(xgboost_recipe) %>%
  add_model(xgboost_spec)

xgboost_fit <- fit(xgboost_workflow, data = data_train)

# Evaluate
test_preds <- predict(xgboost_fit, data_test) %>%
  bind_cols(data_test)

rmse(test_preds, truth = view_count, estimate = .pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.720

rsq(test_preds, truth = view_count, estimate = .pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.922

ggplot(test_preds, aes(view_count, .pred)) +
  geom_point() +
  geom_abline()

Apply 2

Matthew Plutzner

2026-03-03