This dataset includes Amazon’s Top 50 Bestselling Books for each year from 2009 to 2019 (click here for the source).

Setting up my environment

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

Import data and clean

Rename columns with clean_names(), replace blanks with NA, remove rows containing NA, and remove duplicate rows

# Load the bestsellers CSV, rename the columns with clean_names(), turn empty
# strings into NA, then drop incomplete rows and exact duplicate rows.
book <- read_csv("bestsellers _with_categories.csv") %>%
  clean_names() %>%
  # Only character columns can contain ""; limiting na_if() to them avoids
  # comparing numeric columns with a string (a type error in dplyr >= 1.1)
  # and replaces the superseded mutate_all().
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  na.omit() %>%
  distinct()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Name = col_character(),
##   Author = col_character(),
##   `User Rating` = col_double(),
##   Reviews = col_double(),
##   Price = col_double(),
##   Year = col_double(),
##   Genre = col_character()
## )
# Print numbers with 2 significant digits from here on (global option)
options(digits = 2)
# Preview the first rows of the cleaned data
head(book)
## # A tibble: 6 x 7
##   name                       author       user_rating reviews price  year genre 
##   <chr>                      <chr>              <dbl>   <dbl> <dbl> <dbl> <chr> 
## 1 10-Day Green Smoothie Cle… JJ Smith             4.7   17350     8  2016 Non F…
## 2 11/22/63: A Novel          Stephen King         4.6    2052    22  2011 Ficti…
## 3 12 Rules for Life: An Ant… Jordan B. P…         4.7   18979    15  2018 Non F…
## 4 1984 (Signet Classics)     George Orwe…         4.7   21424     6  2017 Ficti…
## 5 5,000 Awesome Facts (Abou… National Ge…         4.8    7665    12  2019 Non F…
## 6 A Dance with Dragons (A S… George R. R…         4.4   12643    11  2011 Ficti…

Data Exploration

Here we will perform different summaries and visualizations of the data

Find the number of books, average user rating, average number of reviews, and average price of fiction and non-fiction books for every year

# Per-year, per-genre summary: how many books made the list, plus the mean
# user rating, mean review count and mean price of those books.
book_g <- group_by(book, year, genre) %>%
  summarise(
    num = n(),
    avg_rate = mean(user_rating),
    avg_rev = mean(reviews),
    avg_price = mean(price)
  )
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
# Preview the first rows of the per-year, per-genre summary
head(book_g)
## # A tibble: 6 x 6
## # Groups:   year [3]
##    year genre         num avg_rate avg_rev avg_price
##   <dbl> <chr>       <int>    <dbl>   <dbl>     <dbl>
## 1  2009 Fiction        24     4.59   6534.      15.6
## 2  2009 Non Fiction    26     4.58   3026.      15.2
## 3  2010 Fiction        20     4.62   8409.       9.7
## 4  2010 Non Fiction    30     4.52   3527.      16  
## 5  2011 Fiction        21     4.62  10335.      11.6
## 6  2011 Non Fiction    29     4.51   6483.      17.6
# Plot, for each genre, the number of books per year and their average price.
# The secondary y axis is an identity copy of the primary one (~.), so both
# series are read against the same scale. Legend labels are spelled out in
# full ("Number of books", "Average Price").
ggplot(book_g, aes(x = year)) +
  geom_point(aes(y = num, color = "Number of books"), shape = 18, size = 4) +
  geom_point(aes(y = avg_price, color = "Average Price"), size = 3) +
  labs(
    y = "Number of books",
    x = "Year",
    title = "Number of Books vs. Average Price"
  ) +
  scale_x_continuous(breaks = 2009:2019) +
  scale_y_continuous(sec.axis = sec_axis(~., name = "Average Price")) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    axis.text.y = element_text(angle = 45),
    axis.text.x = element_text(angle = 90)
  ) +
  facet_grid(cols = vars(genre))

The average price of non fiction books is higher than that of fiction books

From 2014-2015 there was a significant decrease in the number of fiction books while during that same period the number of non fiction books increased

From 2014-2019 the average price of non fiction books decreased as the number of fiction books increased

# Stacked bars of the average number of reviews per book, by year and genre.
# avg_rev is a per-book mean (not a total), so the labels say "Average".
# geom_col() is the direct equivalent of geom_bar(stat = "identity").
ggplot(book_g, aes(x = year, y = avg_rev, fill = genre)) +
  geom_col(color = "#c3ddf7") +
  labs(
    y = "Average Reviews per Book",
    x = "Year",
    title = "Average Reviews: Fiction vs. Non Fiction"
  ) +
  scale_x_continuous(breaks = 2009:2019) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    axis.text.y = element_text(angle = 45),
    axis.text.x = element_text(angle = 90)
  )

On average, fiction books received more reviews than non fiction books; however, both genres showed a steady increase in reviews from 2009-2019

# Build a two-column data frame (rating, genre) holding every user rating,
# fiction rows first, so the two genres can be drawn side by side.
subf <- data.frame(rating = subset(book, genre == "Fiction")$user_rating)
subnf <- data.frame(rating = subset(book, genre == "Non Fiction")$user_rating)
subf$genre <- "Fiction"
subnf$genre <- "Non_fiction"
urate_all <- rbind(subf, subnf)

# geom_bar() counts the rows at each rating value, so the y axis is a count
# rather than a density (swap in geom_density(alpha = 0.2) for a smoothed
# density view). `position` is spelled out instead of relying on partial
# argument matching of `pos`.
ggplot(urate_all, aes(rating, fill = genre)) +
  geom_bar(position = "dodge") +
  labs(y = "Count", x = "User Rating", title = "Distribution of User Rating") +
  xlim(3.1, 5.1)

Fiction user ratings are left-skewed, with a large majority of the ratings > 4.6

Non Fiction user ratings are left-skewed, with a large majority of the ratings > 4.8

List the authors of the 10 most reviewed fiction books

  • Delia Owens
  • Paula Hawkins
  • Gillian Flynn
  • John Green
  • Kristin Hannah
  • E L James
  • Andy Weir
  • Anthony Doerr
  • Paulo Coelho
  • Donna Tartt
# Ten most reviewed fiction titles: average each title's rating and review
# count over its appearances, then keep the ten highest by mean reviews.
book_f <- filter(book, genre == "Fiction") %>%
  group_by(name, author) %>%
  summarise(
    avg_rate = mean(user_rating),
    avg_rev = mean(reviews)
  ) %>%
  ungroup() %>%
  arrange(desc(avg_rev)) %>%
  slice_head(n = 10)
## `summarise()` has grouped output by 'name'. You can override using the `.groups` argument.
# head() shows only the first 6 of the 10 rows kept in book_f
head(book_f)
## # A tibble: 6 x 4
##   name                                               author     avg_rate avg_rev
##   <chr>                                              <chr>         <dbl>   <dbl>
## 1 Where the Crawdads Sing                            Delia Owe…      4.8   87841
## 2 The Girl on the Train                              Paula Haw…      4.1   79446
## 3 Gone Girl                                          Gillian F…      4     57271
## 4 The Fault in Our Stars                             John Green      4.7   50482
## 5 The Nightingale: A Novel                           Kristin H…      4.8   49288
## 6 Fifty Shades of Grey: Book One of the Fifty Shade… E L James       3.8   47265

List the authors of the 10 most reviewed non-fiction books

  • Michelle Obama
  • Laura Hillenbrand
  • Tara Westover
  • Mark Manson
  • Gary Chapman
  • Dale Carnegie
  • Daniel James Brown
  • Don Miguel Ruiz
  • School Zone
  • Marie Kondō
# Ten most reviewed non-fiction titles: average each title's rating and review
# count over its appearances, then keep the ten highest by mean reviews.
book_nf <- filter(book, genre == "Non Fiction") %>%
  group_by(name, author) %>%
  summarise(
    avg_rate = mean(user_rating),
    avg_rev = mean(reviews)
  ) %>%
  ungroup() %>%
  arrange(desc(avg_rev)) %>%
  slice_head(n = 10)
## `summarise()` has grouped output by 'name'. You can override using the `.groups` argument.
# head() shows only the first 6 of the 10 rows kept in book_nf
head(book_nf)
## # A tibble: 6 x 4
##   name                                             author       avg_rate avg_rev
##   <chr>                                            <chr>           <dbl>   <dbl>
## 1 Becoming                                         Michelle Ob…      4.8   61133
## 2 Unbroken: A World War II Story of Survival, Res… Laura Hille…      4.8   29673
## 3 Educated: A Memoir                               Tara Westov…      4.7   28729
## 4 The Subtle Art of Not Giving a F*ck: A Counteri… Mark Manson       4.6   26490
## 5 The 5 Love Languages: The Secret to Love that L… Gary Chapman      4.8   25554
## 6 How to Win Friends & Influence People            Dale Carneg…      4.7   25001