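Goal: Identify good predictors of how many weeks a title stays on the New York Times bestseller list (`total_weeks`). Click here for the data (the file is read from the TidyTuesday URL in the import step below).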

Import Data

# Read the tab-separated NYT titles file (TidyTuesday, 2022-05-10) straight from GitHub.
nyt_titles <- read_tsv(
    file = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv'
)

# Print a per-column overview of nyt_titles: column types, missing counts and summary statistics.
skimr::skim(nyt_titles)
Data summary
Name nyt_titles
Number of rows 7431
Number of columns 8
_______________________
Column type frequency:
character 2
Date 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 74 0 7172 0
author 4 1 4 73 0 2205 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
first_week 0 1 1931-10-12 2020-12-06 2000-06-25 3348

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 3715.00 2145.29 0 1857.5 3715 5572.5 7430 ▇▇▇▇▇
year 0 1 1989.61 26.23 1931 1968.0 2000 2011.0 2020 ▂▂▂▃▇
total_weeks 0 1 8.13 11.21 1 2.0 4 10.0 178 ▇▁▁▁▁
debut_rank 0 1 7.90 4.57 1 4.0 8 12.0 17 ▇▆▅▅▅
best_rank 0 1 6.91 4.57 1 3.0 6 10.0 17 ▇▅▃▃▂
# Build the analysis table from the raw titles.
data <- nyt_titles %>%

    # `author` is the only column reporting missing entries in the skim summary,
    # so drop it; na.omit() then removes any row that still contains an NA.
    select(!author) %>%
    na.omit() %>%

    # total_weeks is strongly right-skewed, so store its natural log as a new column.
    # Note: the new column is named `best`, but it holds log(total_weeks).
    mutate(best = log(total_weeks))

Explore Data

Identify good predictors.

best rank

# Scatter plot of best_rank (log10 y-axis) against total_weeks.
ggplot(data, aes(x = total_weeks, y = best_rank)) +
    geom_point() +
    scale_y_log10()

# Box plots of total_weeks, one box per debut_rank value.
ggplot(data, aes(x = total_weeks, y = as.factor(debut_rank))) +
    geom_boxplot()

# Which title words go with the longest stays on the list?
data %>%

    # Split each title into one row per word
    unnest_tokens(output = word, input = title) %>%

    # For every word: mean total_weeks and how many token rows it has
    group_by(word) %>%
    summarise(
        weeks = mean(total_weeks),
        n     = n()
    ) %>%
    ungroup() %>%

    # Keep words seen more than 10 times that contain no digit
    filter(n > 10) %>%
    filter(!str_detect(word, "\\d")) %>%

    # Retain the 20 words with the highest mean weeks
    slice_max(order_by = weeks, n = 20) %>%

    # Dot plot with words ordered by their mean weeks
    ggplot(aes(x = weeks, y = fct_reorder(word, weeks))) +
    geom_point() +
    labs(y = "Words in Title")

EDA shortcut

# Step 1: Prepare data
# Drop the id, title and first_week columns, then binarize the remaining columns.
# NOTE(review): total_weeks is kept here and `best` is log(total_weeks), so the
# total_weeks bins duplicate the `best` bins — confirm this is intended.
data_binarized_tbl <- data %>%
    select(-c(id, title, first_week)) %>%
    binarize()

# Show the names and first values of the binarized columns
glimpse(data_binarized_tbl)
## Rows: 7,431
## Columns: 20
## $ `year__-Inf_1968`                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ year__1968_2000                          <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,…
## $ year__2000_2011                          <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,…
## $ year__2011_Inf                           <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,…
## $ `total_weeks__-Inf_2`                    <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,…
## $ total_weeks__2_4                         <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,…
## $ total_weeks__4_10                        <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,…
## $ total_weeks__10_Inf                      <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ `debut_rank__-Inf_4`                     <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,…
## $ debut_rank__4_8                          <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,…
## $ debut_rank__8_12                         <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,…
## $ debut_rank__12_Inf                       <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `best_rank__-Inf_3`                      <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ best_rank__3_6                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ best_rank__6_10                          <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,…
## $ best_rank__10_Inf                        <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,…
## $ `best__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,…
## $ best__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,…
## $ best__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,…
## $ best__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
# Step 2: Correlate
# Correlate every binarized column with the top bin of `best`.
data_corr_tblNY <- correlate(
    data_binarized_tbl,
    target = best__2.30258509299405_Inf
)

data_corr_tblNY
## # A tibble: 20 × 3
##    feature     bin                                correlation
##    <fct>       <chr>                                    <dbl>
##  1 total_weeks 10_Inf                                 1      
##  2 best        2.30258509299405_Inf                   1      
##  3 total_weeks -Inf_2                                -0.397  
##  4 best        -Inf_0.693147180559945                -0.397  
##  5 best_rank   -Inf_3                                 0.343  
##  6 total_weeks 4_10                                  -0.323  
##  7 best        1.38629436111989_2.30258509299405     -0.323  
##  8 best_rank   10_Inf                                -0.313  
##  9 total_weeks 2_4                                   -0.257  
## 10 best        0.693147180559945_1.38629436111989    -0.257  
## 11 year        1968_2000                              0.242  
## 12 year        2011_Inf                              -0.235  
## 13 year        2000_2011                             -0.230  
## 14 year        -Inf_1968                              0.217  
## 15 best_rank   6_10                                  -0.130  
## 16 best_rank   3_6                                    0.0795 
## 17 debut_rank  4_8                                   -0.0319 
## 18 debut_rank  8_12                                   0.0202 
## 19 debut_rank  -Inf_4                                 0.0148 
## 20 debut_rank  12_Inf                                -0.00269
# Step 3: Plot
# Draw the correlation funnel from the correlation table.
plot_correlation_funnel(data_corr_tblNY)