Goal: To predict total weeks on best sellers list
Click[here for the data] (‘https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv’)
nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
skimr::skim(nyt_titles)
Name | nyt_titles |
Number of rows | 7431 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- nyt_titles %>%
# Treat missing values
select(-id) %>%
filter(!is.na(author)) %>%
filter(total_weeks < 100) %>%
mutate(total_weeks = log(total_weeks)) %>%
mutate(decade = year %/% 10 * 10)
#Explore Data
Identify Good Predictors
best_rank
data%>%
ggplot(aes(total_weeks, best_rank)) +
scale_y_log10() +
geom_point()
debut_rank
data%>%
ggplot(aes(total_weeks, as.factor(debut_rank))) +
geom_boxplot()
author
data %>%
group_by(author) %>%
summarise(total_weeks_avg = mean(total_weeks)) %>% ungroup() %>%
slice_max(order_by = total_weeks_avg, n = 20) %>%
ggplot(aes(total_weeks_avg, fct_reorder(author, total_weeks_avg))) +
geom_col() +
labs(title = "Best Author by Total Weeks", y = NULL)
Title
data %>%
#tokenize title
unnest_tokens(output = word, input = title) %>%
#calculate avg rent per word
group_by(word) %>%
summarise(total_weeks = mean(total_weeks),
n = n()) %>%
ungroup() %>%
filter(n > 10, !str_detect(word, "\\a")) %>%
slice_max(order_by = total_weeks, n = 20) %>%
#plot
ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
geom_point() +
labs(y = "Words in Title")
EDA shortcut
# step 1 - prepare data
data_binarized_tbl <- data %>%
select(-author, -title, -first_week) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 7,415
## Columns: 20
## $ `year__-Inf_1968` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945` <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405 <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4` <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12 <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3` <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_11 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__11_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `decade__-Inf_1960` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ decade__1960_2000 <dbl> 1, 1, 1, 0, 1, 0, 1, 1…
## $ decade__2000_2010 <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ decade__2010_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
# step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
correlate(total_weeks__0.693147180559945_1.38629436111989)
data_corr_tbl
## # A tibble: 20 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 total_weeks 0.693147180559945_1.38629436111989 1
## 2 total_weeks -Inf_0.693147180559945 -0.329
## 3 total_weeks 1.38629436111989_2.30258509299405 -0.268
## 4 total_weeks 2.30258509299405_Inf -0.256
## 5 year 2000_2011 0.121
## 6 year 1968_2000 -0.0907
## 7 decade -Inf_1960 -0.0804
## 8 year -Inf_1968 -0.0781
## 9 best_rank 3_6 0.0710
## 10 decade 2000_2010 0.0681
## 11 best_rank -Inf_3 -0.0606
## 12 year 2011_Inf 0.0493
## 13 best_rank 11_Inf -0.0240
## 14 debut_rank 4_8 0.0218
## 15 debut_rank 8_12 -0.0209
## 16 best_rank 6_11 0.0188
## 17 decade 1960_2000 0.0103
## 18 debut_rank 12_Inf -0.00370
## 19 decade 2010_Inf -0.00176
## 20 debut_rank -Inf_4 0.00154
# Step 3: Plot
data_corr_tbl %>%
plot_correlation_funnel()