Introduction

Data

This raw data was taken from Kaggle (https://www.kaggle.com/datasets/sootersaalu/amazon-top-50-bestselling-books-2009-2019)

Library

library(tidyverse)
library(skimr)
library(corrplot)
library(RColorBrewer)
library(factoextra)

Data Processing

# Load the bestseller CSV into a data frame
book <- utils::read.csv(file = "bestsellers with categories.csv")

# Preview the first five rows
utils::head(book, n = 5)

# Preview the last five rows
utils::tail(book, n = 5)

# Report the number of rows and columns
dim(book)

#> [1] 550   7

From our inspection we can conclude: the book data contains 550 rows and 7 columns.

Data Cleansing and Corrections

# Count the missing values in every column
book %>%
  is.na() %>%
  colSums()

#>        Name      Author User.Rating     Reviews       Price        Year 
#>           0           0           0           0           0           0 
#>       Genre 
#>           0

# TRUE if at least one cell of the data frame is missing
anyNA(book)

#> [1] FALSE

No missing values were found in this data.

# List the column names
colnames(book)

#> [1] "Name"        "Author"      "User.Rating" "Reviews"     "Price"      
#> [6] "Year"        "Genre"

# Show the storage type and a few sample values for every column
utils::str(book)

#> 'data.frame':    550 obs. of  7 variables:
#>  $ Name       : chr  "10-Day Green Smoothie Cleanse" "11/22/63: A Novel" "12 Rules for Life: An Antidote to Chaos" "1984 (Signet Classics)" ...
#>  $ Author     : chr  "JJ Smith" "Stephen King" "Jordan B. Peterson" "George Orwell" ...
#>  $ User.Rating: num  4.7 4.6 4.7 4.7 4.8 4.4 4.7 4.7 4.7 4.6 ...
#>  $ Reviews    : int  17350 2052 18979 21424 7665 12643 19735 19699 5983 23848 ...
#>  $ Price      : int  8 22 15 6 12 11 30 15 3 8 ...
#>  $ Year       : int  2016 2011 2018 2017 2019 2011 2014 2017 2018 2016 ...
#>  $ Genre      : chr  "Non Fiction" "Fiction" "Non Fiction" "Fiction" ...

Checking unique values in each column

# Distinct genre labels present in the data
unique(book[["Genre"]])

#> [1] "Non Fiction" "Fiction"

# Number of distinct authors
dplyr::n_distinct(book[["Author"]])

#> [1] 248

Checking data distribution

# Per-column summary statistics, grouped by column type
skimr::skim(book)

Data summary
Name	book
Number of rows	550
Number of columns	7
_______________________
Column type frequency:
character	3
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
Name	1	4	121	351
Author	1	2	34	248
Genre	1	7	11	2

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
User.Rating	1	4.62	0.23	3.3	4.5	4.7	4.80	4.9	▁▁▁▂▇
Reviews	1	11953.28	11731.13	37.0	4058.0	8580.0	17253.25	87841.0	▇▂▁▁▁
Price	1	13.10	10.84	0.0	7.0	11.0	16.00	105.0	▇▁▁▁▁
Year	1	2014.00	3.17	2009.0	2011.0	2014.0	2017.00	2019.0	▇▅▅▅▅

Data Visualization

Genres

Bestseller books distribution by genre

# Number of bestseller entries per genre
book_genre <- tally(group_by(book, Genre), name = "count")

# Pie chart: a single stacked bar bent into polar coordinates
plot1 <-
  ggplot(data = book_genre,
         mapping = aes(x = "", y = count, fill = Genre)) +
  geom_col(width = 1, colour = "white") +
  coord_polar(theta = "y", start = 0) +
  scale_fill_manual(values = c("#ff9900", "#000000")) +
  theme_void()

plot1

Bestsellers distribution by genre and year

# Number of bestseller entries per genre and year.
# `.groups = "drop"` returns an ungrouped tibble; without it the result stays
# grouped by Genre and dplyr prints a regrouping message.
book_genre_year <-
  book %>%
  group_by(Genre, Year) %>%
  summarise(count = n(), .groups = "drop")

# Stacked columns: one bar per year, split by genre
plot2 <-
  book_genre_year %>%
  ggplot(aes(x = Year,
             y = count,
             fill = Genre)) +
  geom_col() +
  theme_minimal() +
  # One tick per year covered by the data (2009-2019)
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  scale_fill_manual(values = c("#ff9900", "#000000")) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.y = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
plot2

Correlation Analysis

Is there any correlation seen between the different numerical variables?

# Correlation matrix of the four numeric columns, drawn as a colour grid.
# corrplot() draws the plot as a side effect and returns a list
# ($corr, $corrPos, $arg); that list is what ends up in `book_cor`.
book_cor <-
  book %>%
  select(User.Rating, Reviews, Price, Year) %>%
  cor() %>%
  corrplot::corrplot(
    type = 'lower',
    order = 'hclust',
    method = 'color',
    addgrid.col = 'darkgray',
    # Spelled out as TRUE: `T` is an ordinary variable that can be reassigned
    outline = TRUE,
    tl.cex = 1,
    tl.col = 'black',
    col = brewer.pal(n = 6,
                     name = 'RdGy')
  )

# Print the returned list, which includes the numeric correlations
book_cor

#> $corr
#>                  Price  User.Rating      Reviews       Year
#> Price        1.0000000 -0.133086287 -0.109181883 -0.1539786
#> User.Rating -0.1330863  1.000000000 -0.001729014  0.2423830
#> Reviews     -0.1091819 -0.001729014  1.000000000  0.2635596
#> Year        -0.1539786  0.242382960  0.263559604  1.0000000
#> 
#> $corrPos
#>          xName       yName x y         corr
#> 1        Price       Price 1 4  1.000000000
#> 2        Price User.Rating 1 3 -0.133086287
#> 3        Price     Reviews 1 2 -0.109181883
#> 4        Price        Year 1 1 -0.153978582
#> 5  User.Rating User.Rating 2 3  1.000000000
#> 6  User.Rating     Reviews 2 2 -0.001729014
#> 7  User.Rating        Year 2 1  0.242382960
#> 8      Reviews     Reviews 3 2  1.000000000
#> 9      Reviews        Year 3 1  0.263559604
#> 10        Year        Year 4 1  1.000000000
#> 
#> $arg
#> $arg$type
#> [1] "lower"

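We can say that: - None of the numeric variables is strongly correlated with another. - Year and Price have only a weak negative correlation (r ≈ -0.15): bestsellers from more recent years tend to be slightly cheaper.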

User Reviews vs Year

Most reviewed books

# The 20 most-reviewed books.
# A book that was a bestseller in several years has one row per year, so the
# duplicates are removed BEFORE taking the top 20; deduplicating afterwards
# would let the repeated rows use up the 20 slots and return fewer books.
book_review <-
  book %>%
  distinct(Name, Reviews) %>%
  arrange(desc(Reviews)) %>%
  head(20)
book_review

Is the number of reviews changing by year?

Linear Model

# Regress the number of reviews on the year. Reviews is the response because
# the question is whether review counts change over time; this also matches
# the y ~ x (Reviews ~ Year) trend line drawn in the plot of this section.
# NOTE: the console output pasted below this chunk came from the earlier
# Year ~ Reviews fit and must be regenerated by re-knitting the report.
book %>%
  lm(formula = Reviews ~ Year) %>%
  summary()

#> 
#> Call:
#> lm(formula = Year ~ Reviews, data = .)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -5.5523 -2.5991 -0.3255  2.5354  5.6549 
#> 
#> Coefficients:
#>              Estimate Std. Error   t value Pr(>|t|)    
#> (Intercept) 2.013e+03  1.861e-01 10816.121  < 2e-16 ***
#> Reviews     7.111e-05  1.112e-05     6.396 3.42e-10 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 3.056 on 548 degrees of freedom
#> Multiple R-squared:  0.06946,    Adjusted R-squared:  0.06777 
#> F-statistic: 40.91 on 1 and 548 DF,  p-value: 3.423e-10

# Review counts per year for fiction and non-fiction books,
# with a separate linear trend line for each genre
book_dist <-
  ggplot(data = book,
         mapping = aes(x = Year, y = Reviews, colour = Genre)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x, method = "lm") +
  scale_x_continuous(breaks = 2009:2019) +
  scale_colour_manual(values = c("#ff9900", "#000000")) +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    legend.title = element_blank(),
    panel.grid.minor = element_blank()
  )
book_dist

Reviews Distribution

# Histogram of review counts on a density scale, overlaid with a kernel
# density estimate. after_stat(density) replaces the `..density..` notation,
# which ggplot2 3.4.0 deprecated.
book_review_dist <-
  book %>%
  ggplot(aes(x = Reviews,
             y = after_stat(density))) +
  geom_histogram(fill = "#ff9900",
                 color = 'black',
                 binwidth = 1000) +
  geom_density(alpha = 0.5, fill = "#ff9900") +
  theme_minimal() +
  theme(panel.grid.major = element_blank())
book_review_dist

Are users rating the bestsellers differently by year?

Linear Model

# Regress the user rating on the year to check whether ratings change over
# time. The earlier code refit Year ~ Reviews here, a copy of the model from
# the reviews section that does not involve User.Rating at all.
# NOTE: the console output pasted below this chunk still shows that old
# Year ~ Reviews fit and must be regenerated by re-knitting the report.
book %>%
  lm(formula = User.Rating ~ Year) %>%
  summary()

#> 
#> Call:
#> lm(formula = Year ~ Reviews, data = .)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -5.5523 -2.5991 -0.3255  2.5354  5.6549 
#> 
#> Coefficients:
#>              Estimate Std. Error   t value Pr(>|t|)    
#> (Intercept) 2.013e+03  1.861e-01 10816.121  < 2e-16 ***
#> Reviews     7.111e-05  1.112e-05     6.396 3.42e-10 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 3.056 on 548 degrees of freedom
#> Multiple R-squared:  0.06946,    Adjusted R-squared:  0.06777 
#> F-statistic: 40.91 on 1 and 548 DF,  p-value: 3.423e-10

# User ratings per year for fiction and non-fiction books,
# with a separate linear trend line for each genre
book_dist <-
  ggplot(data = book,
         mapping = aes(x = Year, y = User.Rating, colour = Genre)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x, method = "lm") +
  scale_x_continuous(breaks = 2009:2019) +
  scale_colour_manual(values = c("#ff9900", "#000000")) +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    legend.title = element_blank(),
    panel.grid.minor = element_blank()
  )
book_dist

Ratings Distribution

# Histogram of user ratings on a density scale, overlaid with a kernel
# density estimate. after_stat(density) replaces the `..density..` notation,
# which ggplot2 3.4.0 deprecated.
book_rating_dist <-
  book %>%
  ggplot(aes(x = User.Rating,
             y = after_stat(density))) +
  geom_histogram(fill = "#ff9900",
                 color = 'black',
                 binwidth = 0.1) +
  geom_density(alpha = 0.5, fill = "#ff9900") +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank()
  )
book_rating_dist

Ratings vs Reviews

# Rating against review count; point colour marks the genre and point size
# also scales with the review count
rating_review <-
  book %>%
  ggplot(aes(x = Reviews,
             y = User.Rating,
             colour = Genre,
             size = Reviews)) +
  geom_jitter(alpha = 0.45) +
  scale_colour_manual(values = c(`Non Fiction` = "#000000",
                                 Fiction = "#ff9900")) +
  theme_minimal()
rating_review

Bestseller

Based on Authors

# Ten authors with the most bestseller entries
book_authors <-
  book %>%
  group_by(Author) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(10)

# Horizontal bar chart; the author(s) with the highest count are highlighted.
# The fill colours are matched by NAME. The earlier unnamed vector was matched
# to the labels "grey"/"red" in alphabetical order, which hid the real colours
# and would have painted every bar black if all counts tied for the maximum.
plot_authors <-
  book_authors %>%
  ggplot(aes(x = reorder(Author, count),
             y = count,
             fill = ifelse(count == max(count),
                           "top", "other"))) +
  scale_y_continuous(breaks = seq(0, 13, by = 1)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = c(top = "#ff9900", other = "#000000")) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    axis.title.y = element_blank()
  )
plot_authors

Based on Book Title

# Ten titles that appear most often in the bestseller list
book_title <-
  book %>%
  group_by(Name) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(10)

# Horizontal bar chart; the title(s) with the highest count are highlighted.
# The fill colours are matched by NAME. The earlier unnamed vector was matched
# to the labels "grey"/"red" in alphabetical order, which hid the real colours
# and would have painted every bar black if all counts tied for the maximum.
plot_title <-
  book_title %>%
  ggplot(aes(x = reorder(Name, count),
             y = count,
             fill = ifelse(count == max(count), "top", "other"))) +
  scale_y_continuous(breaks = seq(0, 13, by = 1)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = c(top = "#ff9900", other = "#000000")) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    axis.title.y = element_blank()
  )
plot_title

Amazon Bestseller Books

Aisah Taufik Hidayat Abdullah

August 2022