Goal: to predict total weeks on bestseller list Click here for the data
Need Help with the log transform! Code is not working.
nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_titles.tsv')
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): title, author
## dbl (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(nyt_titles)
| Name | nyt_titles |
| Number of rows | 7431 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| Date | 1 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
| author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
| year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
| total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
| debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
| best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- nyt_titles %>%
na.omit() %>%
mutate(total_weeks = log(total_weeks))
Debut Rank
data %>%
ggplot(aes(total_weeks, as.factor(debut_rank))) +
geom_boxplot()
Best Rank
data %>%
ggplot(aes(total_weeks, as.factor(best_rank))) +
geom_boxplot()
data %>%
ggplot(aes(year, total_weeks)) +
geom_col()
EDA Shortcut
cols.dont.want <- "first_week"
data <- data[, ! names(data) %in% cols.dont.want, drop = F]
data_binarized_tbl <- data %>%
select(-id, -title) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 7,427
## Columns: 18
## $ author__Danielle_Steel <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `author__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `year__-Inf_1968` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945` <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405 <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4` <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12 <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3` <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_10.5 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__10.5_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
data_corr_tbl <- data_binarized_tbl %>%
correlate(total_weeks__2.30258509299405_Inf)
data_corr_tbl
## # A tibble: 18 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 total_weeks 2.30258509299405_Inf 1
## 2 total_weeks -Inf_0.693147180559945 -0.397
## 3 best_rank -Inf_3 0.344
## 4 total_weeks 1.38629436111989_2.30258509299405 -0.323
## 5 best_rank 10.5_Inf -0.314
## 6 total_weeks 0.693147180559945_1.38629436111989 -0.256
## 7 year 1968_2000 0.242
## 8 year 2011_Inf -0.235
## 9 year 2000_2011 -0.230
## 10 year -Inf_1968 0.217
## 11 best_rank 6_10.5 -0.130
## 12 best_rank 3_6 0.0800
## 13 debut_rank 4_8 -0.0316
## 14 debut_rank 8_12 0.0201
## 15 debut_rank -Inf_4 0.0148
## 16 author -OTHER -0.00390
## 17 author Danielle_Steel 0.00390
## 18 debut_rank 12_Inf -0.00285
data_corr_tbl %>%
plot_correlation_funnel()