Import Data

nyt_titles <- read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')

skimr::skim(nyt_titles)

Data summary
Name	nyt_titles
Number of rows	7431
Number of columns	8
_______________________
Column type frequency:
character	2
Date	1
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	1	74	0	7172	0
author	4	1	4	73	0	2205	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
first_week	0	1	1931-10-12	2020-12-06	2000-06-25	3348

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	3715.00	2145.29	0	1857.5	3715	5572.5	7430	▇▇▇▇▇
year	1	1989.61	26.23	1931	1968.0	2000	2011.0	2020	▂▂▂▃▇
total_weeks	1	8.13	11.21	1	2.0	4	10.0	178	▇▁▁▁▁
debut_rank	1	7.90	4.57	1	4.0	8	12.0	17	▇▆▅▅▅
best_rank	1	6.91	4.57	1	3.0	6	10.0	17	▇▅▃▃▂

data <- nyt_titles %>%
    
    # Treat missing values
    select(-author) %>%
    na.omit() %>%

    # log transform variables with pos-skewed distribution
    mutate(best = log(best_rank))

Explore Data

Identify good predictors.

total weeks

data %>%
    ggplot(aes(best, total_weeks)) +
    scale_y_log10() +
    geom_point()

data %>%
    ggplot(aes(best, as.factor(debut_rank))) +
    geom_boxplot()

data %>%
    
    # tokenize title
    unnest_tokens(output = word, input = title) %>%
    
    # calculate avg best rank per week
    group_by(word) %>%
    summarise(best = mean(best_rank),
              n = n()) %>%
    ungroup() %>%

    filter(n > 10, !str_detect(word, "\\d")) %>%
    slice_max(order_by = best, n = 20) %>%
        
    # Plot
    ggplot(aes(best, fct_reorder(word, best))) +
    geom_point() +
    
    labs(y = "Words in Title")

EDA shortcut

# Step 1: Prepare data
data_binarized_tbl <- data %>%
    select(-id, -title, -first_week) %>%
    binarize()

data_binarized_tbl %>% glimpse()

## Rows: 7,431
## Columns: 20
## $ `year__-Inf_1968`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__1968_2000                         <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, …
## $ year__2000_2011                         <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, …
## $ year__2011_Inf                          <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ `total_weeks__-Inf_2`                   <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, …
## $ total_weeks__2_4                        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, …
## $ total_weeks__4_10                       <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, …
## $ total_weeks__10_Inf                     <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `debut_rank__-Inf_4`                    <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ debut_rank__4_8                         <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ debut_rank__8_12                        <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, …
## $ debut_rank__12_Inf                      <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `best_rank__-Inf_3`                     <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ best_rank__3_6                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ best_rank__6_10                         <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, …
## $ best_rank__10_Inf                       <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ `best__-Inf_1.09861228866811`           <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ best__1.09861228866811_1.79175946922805 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ best__1.79175946922805_2.30258509299405 <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, …
## $ best__2.30258509299405_Inf              <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, …

# Step 2: Correate
data_corr_tblNY <- data_binarized_tbl %>%
    correlate(best__2.30258509299405_Inf)

data_corr_tblNY

## # A tibble: 20 × 3
##    feature     bin                               correlation
##    <fct>       <chr>                                   <dbl>
##  1 best_rank   10_Inf                                1      
##  2 best        2.30258509299405_Inf                  1      
##  3 total_weeks -Inf_2                                0.469  
##  4 best_rank   -Inf_3                               -0.381  
##  5 best        -Inf_1.09861228866811                -0.381  
##  6 best_rank   6_10                                 -0.317  
##  7 best        1.79175946922805_2.30258509299405    -0.317  
##  8 total_weeks 10_Inf                               -0.313  
##  9 best_rank   3_6                                  -0.302  
## 10 best        1.09861228866811_1.79175946922805    -0.302  
## 11 total_weeks 4_10                                 -0.196  
## 12 debut_rank  12_Inf                                0.0901 
## 13 debut_rank  4_8                                  -0.0582 
## 14 year        1968_2000                            -0.0228 
## 15 debut_rank  8_12                                 -0.0153 
## 16 year        2011_Inf                              0.0129 
## 17 debut_rank  -Inf_4                               -0.0102 
## 18 total_weeks 2_4                                  -0.00828
## 19 year        -Inf_1968                             0.00569
## 20 year        2000_2011                             0.00460

# Step 3: Plot
data_corr_tblNY %>%
    plot_correlation_funnel()

Apply 1: NY Times Best Sellers

Erin McEvoy

2024-09-09

Import Data

Explore Data