Apply 1: nyt

Goal: To predict total weeks on best sellers list

Click[here for the data] (‘https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv’)

nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')

skimr::skim(nyt_titles)

Data summary
Name	nyt_titles
Number of rows	7431
Number of columns	8
_______________________
Column type frequency:
character	2
Date	1
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	1	74	0	7172	0
author	4	1	4	73	0	2205	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
first_week	0	1	1931-10-12	2020-12-06	2000-06-25	3348

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	3715.00	2145.29	0	1857.5	3715	5572.5	7430	▇▇▇▇▇
year	1	1989.61	26.23	1931	1968.0	2000	2011.0	2020	▂▂▂▃▇
total_weeks	1	8.13	11.21	1	2.0	4	10.0	178	▇▁▁▁▁
debut_rank	1	7.90	4.57	1	4.0	8	12.0	17	▇▆▅▅▅
best_rank	1	6.91	4.57	1	3.0	6	10.0	17	▇▅▃▃▂

data <- nyt_titles %>%
  
  # Treat missing values 
    select(-id) %>%
    filter(!is.na(author)) %>%
    filter(total_weeks < 100) %>%
    mutate(total_weeks = log(total_weeks)) %>%
    mutate(decade = year %/% 10 * 10)

#Explore Data

Identify Good Predictors

best_rank

data%>%
  ggplot(aes(total_weeks, best_rank)) +
  scale_y_log10() +
  geom_point()

debut_rank

data%>%
  ggplot(aes(total_weeks, as.factor(debut_rank))) +
  geom_boxplot()

author

data %>%
    
    group_by(author) %>%
    summarise(total_weeks_avg = mean(total_weeks)) %>% ungroup() %>%
    
    slice_max(order_by = total_weeks_avg, n = 20) %>%

    ggplot(aes(total_weeks_avg, fct_reorder(author, total_weeks_avg))) +
    geom_col() +

labs(title = "Best Author by Total Weeks", y = NULL)

Title

data %>%
    
    #tokenize title
    unnest_tokens(output = word, input = title) %>%
    
    #calculate avg rent per word
    group_by(word) %>%
    summarise(total_weeks = mean(total_weeks),
              n     = n()) %>%
    ungroup() %>%
    
    filter(n > 10, !str_detect(word, "\\a")) %>%
    slice_max(order_by = total_weeks, n = 20) %>%
    
    #plot
    ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
    geom_point() +
    
    labs(y = "Words in Title")

EDA shortcut

# step 1 - prepare data
data_binarized_tbl <- data %>%
  select(-author, -title, -first_week) %>%
  binarize()

data_binarized_tbl %>% glimpse()

## Rows: 7,415
## Columns: 20
## $ `year__-Inf_1968`                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000                                 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf                                  <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4`                            <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12                                <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf                              <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3`                             <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_11                                 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__11_Inf                               <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `decade__-Inf_1960`                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ decade__1960_2000                               <dbl> 1, 1, 1, 0, 1, 0, 1, 1…
## $ decade__2000_2010                               <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ decade__2010_Inf                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0…

# step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
  correlate(total_weeks__0.693147180559945_1.38629436111989)

data_corr_tbl

## # A tibble: 20 × 3
##    feature     bin                                correlation
##    <fct>       <chr>                                    <dbl>
##  1 total_weeks 0.693147180559945_1.38629436111989     1      
##  2 total_weeks -Inf_0.693147180559945                -0.329  
##  3 total_weeks 1.38629436111989_2.30258509299405     -0.268  
##  4 total_weeks 2.30258509299405_Inf                  -0.256  
##  5 year        2000_2011                              0.121  
##  6 year        1968_2000                             -0.0907 
##  7 decade      -Inf_1960                             -0.0804 
##  8 year        -Inf_1968                             -0.0781 
##  9 best_rank   3_6                                    0.0710 
## 10 decade      2000_2010                              0.0681 
## 11 best_rank   -Inf_3                                -0.0606 
## 12 year        2011_Inf                               0.0493 
## 13 best_rank   11_Inf                                -0.0240 
## 14 debut_rank  4_8                                    0.0218 
## 15 debut_rank  8_12                                  -0.0209 
## 16 best_rank   6_11                                   0.0188 
## 17 decade      1960_2000                              0.0103 
## 18 debut_rank  12_Inf                                -0.00370
## 19 decade      2010_Inf                              -0.00176
## 20 debut_rank  -Inf_4                                 0.00154

# Step 3: Plot
data_corr_tbl %>%
  plot_correlation_funnel()

Apply 1: nyt_titles

Reed Wilson

2024-02-07