Import Data

# Read the tab-separated NYT titles file from the TidyTuesday 2022-05-10 folder.
nyt_titles <- readr::read_tsv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_titles.tsv"
)
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): title, author
## dbl  (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a per-column summary of nyt_titles (column types, missing counts,
# and distribution statistics) before any cleaning is applied.
skimr::skim(nyt_titles)
Data summary
Name nyt_titles
Number of rows 7431
Number of columns 8
_______________________
Column type frequency:
character 2
Date 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 74 0 7172 0
author 4 1 4 73 0 2205 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
first_week 0 1 1931-10-12 2020-12-06 2000-06-25 3348

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 3715.00 2145.29 0 1857.5 3715 5572.5 7430 ▇▇▇▇▇
year 0 1 1989.61 26.23 1931 1968.0 2000 2011.0 2020 ▂▂▂▃▇
total_weeks 0 1 8.13 11.21 1 2.0 4 10.0 178 ▇▁▁▁▁
debut_rank 0 1 7.90 4.57 1 4.0 8 12.0 17 ▇▆▅▅▅
best_rank 0 1 6.91 4.57 1 3.0 6 10.0 17 ▇▅▃▃▂
data <- nyt_titles %>%

    # Drop the id and title columns
    select(!c(id, title)) %>%

    # Remove rows that contain a missing value in any remaining column
    na.omit() %>%

    # Keep only titles with at most 100 total weeks
    filter(total_weeks <= 100) %>%

    # total_weeks is strongly right-skewed, so work with its natural log
    mutate(across(total_weeks, log))

# Show the cleaned tibble
data
## # A tibble: 7,416 × 6
##    author                year total_weeks first_week debut_rank best_rank
##    <chr>                <dbl>       <dbl> <date>          <dbl>     <dbl>
##  1 Sue Grafton           1991        2.71 1991-05-05          1         2
##  2 Sue Grafton           1992        2.40 1992-04-26         14         2
##  3 Sue Grafton           1990        1.79 1990-05-06          4         8
##  4 W. Bruce Cameron      2012        0    2012-05-27          3        14
##  5 Kimberla Lawson Roby  2006        0    2006-02-19         11        14
##  6 Patricia Cornwell     2016        1.10 2016-12-04          1         7
##  7 Frank Herbert         1985        2.77 1985-04-21          9         2
##  8 Sandra Brown          1994        1.61 1994-05-01          7        10
##  9 John Jakes            2002        1.39 2002-08-25          7        12
## 10 Sebastian Faulks      1999        0    1999-03-14         12        17
## # ℹ 7,406 more rows

Explore Data

# Scatter plot of best_rank (y) against log-transformed total_weeks (x)
ggplot(data, aes(x = total_weeks, y = best_rank)) +
    geom_point()

# Scatter plot of year (y) against log-transformed total_weeks (x)
ggplot(data, aes(x = total_weeks, y = year)) +
    geom_point()

# Binarize every column except first_week; each remaining column is expanded
# into 0/1 indicator columns (binned ranges for numerics, levels for author).
data_binarized_tbl <- binarize(select(data, !first_week))

# Inspect the generated indicator columns
glimpse(data_binarized_tbl)
## Rows: 7,416
## Columns: 18
## $ author__Danielle_Steel                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `author__-OTHER`                                <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `year__-Inf_1968`                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000                                 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf                                  <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4`                            <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12                                <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf                              <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3`                             <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_11                                 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__11_Inf                               <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
# Correlate every indicator column with the highest total_weeks bin.
# The bin's lower edge, 2.30258509299405, is log(10), so the target flags
# titles with more than 10 total weeks (total_weeks was log-transformed above).
data_corr_tbl <- correlate(
    data_binarized_tbl,
    target = total_weeks__2.30258509299405_Inf
)

data_corr_tbl
## # A tibble: 18 × 3
##    feature     bin                                correlation
##    <fct>       <chr>                                    <dbl>
##  1 total_weeks 2.30258509299405_Inf                   1      
##  2 total_weeks -Inf_0.693147180559945                -0.396  
##  3 best_rank   -Inf_3                                 0.341  
##  4 total_weeks 1.38629436111989_2.30258509299405     -0.322  
##  5 best_rank   11_Inf                                -0.283  
##  6 total_weeks 0.693147180559945_1.38629436111989    -0.256  
##  7 year        1968_2000                              0.243  
##  8 year        2011_Inf                              -0.236  
##  9 year        2000_2011                             -0.231  
## 10 year        -Inf_1968                              0.217  
## 11 best_rank   6_11                                  -0.168  
## 12 best_rank   3_6                                    0.0816 
## 13 debut_rank  4_8                                   -0.0312 
## 14 debut_rank  8_12                                   0.0199 
## 15 debut_rank  -Inf_4                                 0.0160 
## 16 debut_rank  12_Inf                                -0.00443
## 17 author      Danielle_Steel                         0.00425
## 18 author      -OTHER                                -0.00425
# Draw the correlation funnel for the correlations computed above
plot_correlation_funnel(data_corr_tbl)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.