Goal: predict the total number of weeks a title spends on the best-sellers list.
Click [here for the data](https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv).
# Read the tab-separated titles file straight from the TidyTuesday repository.
nyt_titles <- readr::read_tsv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv"
)

# Print a per-column summary (missingness, ranges, quantiles) of the raw data.
skimr::skim(nyt_titles)
Name | nyt_titles |
Number of rows | 7431 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
# Build the modelling data set from the raw titles.
data <- nyt_titles %>%
  # The row identifier is dropped.
  select(-id) %>%
  # Keep rows with a known author and fewer than 100 total weeks on the list.
  filter(!is.na(author), total_weeks < 100) %>%
  mutate(
    # Work with total_weeks on the natural-log scale.
    total_weeks = log(total_weeks),
    # Floor the year to the start of its decade (e.g. 1987 -> 1980).
    decade = year %/% 10 * 10
  )
# Explore Data
Identify Good Predictors
best_rank
# Box plot of total_weeks (already on the log scale) for each best_rank value.
ggplot(data, aes(x = as.factor(best_rank), y = total_weeks)) +
  geom_boxplot()
debut_rank
# Box plot of debut_rank for each distinct total_weeks value.
ggplot(data, aes(x = as.factor(total_weeks), y = debut_rank)) +
  geom_boxplot() +
  # Label only every other unique total_weeks value (in order of appearance)
  # so the x-axis tick labels are less crowded.
  scale_x_discrete(breaks = unique(data$total_weeks)[c(TRUE, FALSE)]) +
  # debut_rank is drawn on a log10 axis.
  scale_y_log10()
author
# Bar chart of the 20 authors with the highest mean total_weeks.
data %>%
  # Mean of total_weeks per author.
  group_by(author) %>%
  summarise(total_weeks_avg = mean(total_weeks)) %>%
  ungroup() %>%
  # Keep the top 20 authors by that mean (ties are retained by slice_max).
  slice_max(order_by = total_weeks_avg, n = 20) %>%
  # Order the bars by the mean so the largest sits at the top.
  ggplot(
    aes(
      x = total_weeks_avg,
      y = fct_reorder(author, total_weeks_avg)
    )
  ) +
  geom_col() +
  labs(title = "Best Author by Total Weeks", y = NULL)
Title
# Dot plot of the 20 title words associated with the highest mean total_weeks.
data %>%
  # Tokenize each title into one row per word.
  unnest_tokens(output = word, input = title) %>%
  # Mean total_weeks and number of occurrences for each word.
  group_by(word) %>%
  summarise(
    total_weeks = mean(total_weeks),
    n = n()
  ) %>%
  ungroup() %>%
  # Keep words that occur more than 10 times and contain no digits.
  # The previous pattern "\\a" matches only the BELL control character in
  # ICU regex, so it never excluded anything; "\\d" drops numeric tokens.
  filter(n > 10, !str_detect(word, "\\d")) %>%
  slice_max(order_by = total_weeks, n = 20) %>%
  # Plot the words ordered by their mean total_weeks.
  ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
  geom_point() +
  labs(y = "Words in Title")