Goal: Click here for the data
nyt_titles <- read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
skimr::skim(nyt_titles)
Name | nyt_titles |
Number of rows | 7431 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- nyt_titles %>%
# Treat missing values
select(-author) %>%
na.omit() %>%
# log transform variables with pos-skewed distribution
mutate(best = log(best_rank))
Identify good predictors.
total weeks
data %>%
ggplot(aes(best, total_weeks)) +
scale_y_log10() +
geom_point()
data %>%
ggplot(aes(best, as.factor(debut_rank))) +
geom_boxplot()
data %>%
# tokenize title
unnest_tokens(output = word, input = title) %>%
# calculate avg best rank per week
group_by(word) %>%
summarise(best = mean(best_rank),
n = n()) %>%
ungroup() %>%
filter(n > 10, !str_detect(word, "\\d")) %>%
slice_max(order_by = best, n = 20) %>%
# Plot
ggplot(aes(best, fct_reorder(word, best))) +
geom_point() +
labs(y = "Words in Title")
EDA shortcut
# Step 1: Prepare data
data_binarized_tbl <- data %>%
select(-id, -title, -first_week) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 7,431
## Columns: 20
## $ `year__-Inf_1968` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__1968_2000 <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, …
## $ year__2000_2011 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, …
## $ year__2011_Inf <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ `total_weeks__-Inf_2` <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, …
## $ total_weeks__2_4 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, …
## $ total_weeks__4_10 <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, …
## $ total_weeks__10_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `debut_rank__-Inf_4` <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ debut_rank__4_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ debut_rank__8_12 <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, …
## $ debut_rank__12_Inf <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `best_rank__-Inf_3` <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ best_rank__3_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ best_rank__6_10 <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, …
## $ best_rank__10_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ `best__-Inf_1.09861228866811` <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ best__1.09861228866811_1.79175946922805 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ best__1.79175946922805_2.30258509299405 <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, …
## $ best__2.30258509299405_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, …
# Step 2: Correate
data_corr_tblNY <- data_binarized_tbl %>%
correlate(best__2.30258509299405_Inf)
data_corr_tblNY
## # A tibble: 20 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 best_rank 10_Inf 1
## 2 best 2.30258509299405_Inf 1
## 3 total_weeks -Inf_2 0.469
## 4 best_rank -Inf_3 -0.381
## 5 best -Inf_1.09861228866811 -0.381
## 6 best_rank 6_10 -0.317
## 7 best 1.79175946922805_2.30258509299405 -0.317
## 8 total_weeks 10_Inf -0.313
## 9 best_rank 3_6 -0.302
## 10 best 1.09861228866811_1.79175946922805 -0.302
## 11 total_weeks 4_10 -0.196
## 12 debut_rank 12_Inf 0.0901
## 13 debut_rank 4_8 -0.0582
## 14 year 1968_2000 -0.0228
## 15 debut_rank 8_12 -0.0153
## 16 year 2011_Inf 0.0129
## 17 debut_rank -Inf_4 -0.0102
## 18 total_weeks 2_4 -0.00828
## 19 year -Inf_1968 0.00569
## 20 year 2000_2011 0.00460
# Step 3: Plot
data_corr_tblNY %>%
plot_correlation_funnel()