This raw data was taken from Kaggle (https://www.kaggle.com/datasets/sootersaalu/amazon-top-50-bestselling-books-2009-2019)
library(tidyverse)
library(skimr)
library(corrplot)
library(RColorBrewer)
library(factoextra)

# Read the data (the path is resolved against the current working directory).
book <- read.csv(file = "bestsellers with categories.csv")

# Show the first five rows ...
head(x = book, n = 5)

# ... and the last five rows.
tail(x = book, n = 5)

# Check the data shape: c(rows, columns).
dim(book)
#> [1] 550 7
From our inspection we can conclude: the book data contains 550 rows and 7 columns.
# Data Cleansing and Corrections ----

# Number of missing values in each column.
book %>%
  is.na() %>%
  colSums()
#>        Name      Author User.Rating     Reviews       Price        Year
#>           0           0           0           0           0           0
#>       Genre
#>           0

# TRUE when at least one cell of the data frame is NA.
book %>%
  is.na() %>%
  any()
#> [1] FALSE
No missing values were found in this data.
# Structural overview of `book`. Console output copied from the original post
# is kept as `#>` comments so that it no longer collides with the calls that
# produced it (the skim() table used to be fused onto the call itself).

# describe column names
names(book)
#> [1] "Name" "Author" "User.Rating" "Reviews" "Price"
#> [6] "Year" "Genre"

# check data type for each column
str(book)
#> 'data.frame': 550 obs. of 7 variables:
#> $ Name : chr "10-Day Green Smoothie Cleanse" "11/22/63: A Novel" "12 Rules for Life: An Antidote to Chaos" "1984 (Signet Classics)" ...
#> $ Author : chr "JJ Smith" "Stephen King" "Jordan B. Peterson" "George Orwell" ...
#> $ User.Rating: num 4.7 4.6 4.7 4.7 4.8 4.4 4.7 4.7 4.7 4.6 ...
#> $ Reviews : int 17350 2052 18979 21424 7665 12643 19735 19699 5983 23848 ...
#> $ Price : int 8 22 15 6 12 11 30 15 3 8 ...
#> $ Year : int 2016 2011 2018 2017 2019 2011 2014 2017 2018 2016 ...
#> $ Genre : chr "Non Fiction" "Fiction" "Non Fiction" "Fiction" ...

# distinct genre labels
unique(book$Genre)
#> [1] "Non Fiction" "Fiction"

# number of distinct authors
length(unique(book$Author))
#> [1] 248

# per-column summary statistics (skimr)
skim(book)
#> | Name | book |
#> | Number of rows | 550 |
#> | Number of columns | 7 |
#> | _______________________ | |
#> | Column type frequency: | |
#> | character | 3 |
#> | numeric | 4 |
#> | ________________________ | |
#> | Group variables | None |
#> Variable type: character
#> | skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
#> |---|---|---|---|---|---|---|---|
#> | Name | 0 | 1 | 4 | 121 | 0 | 351 | 0 |
#> | Author | 0 | 1 | 2 | 34 | 0 | 248 | 0 |
#> | Genre | 0 | 1 | 7 | 11 | 0 | 2 | 0 |
#> Variable type: numeric
#> | skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
#> |---|---|---|---|---|---|---|---|---|---|---|
#> | User.Rating | 0 | 1 | 4.62 | 0.23 | 3.3 | 4.5 | 4.7 | 4.80 | 4.9 | ▁▁▁▂▇ |
#> | Reviews | 0 | 1 | 11953.28 | 11731.13 | 37.0 | 4058.0 | 8580.0 | 17253.25 | 87841.0 | ▇▂▁▁▁ |
#> | Price | 0 | 1 | 13.10 | 10.84 | 0.0 | 7.0 | 11.0 | 16.00 | 105.0 | ▇▁▁▁▁ |
#> | Year | 0 | 1 | 2014.00 | 3.17 | 2009.0 | 2011.0 | 2014.0 | 2017.00 | 2019.0 | ▇▅▅▅▅ |
# Bestseller books distribution by genre ----

# One row per genre with the number of bestseller entries in `count`.
book_genre <-
  book %>%
  group_by(Genre) %>%
  summarise(count = n())

# Pie chart: a single stacked bar (constant x) bent into a circle by
# coord_polar("y"), so each genre's slice angle is proportional to `count`.
plot1 <-
  book_genre %>%
  ggplot(aes(x = "", y = count, fill = Genre)) +
  geom_col(width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void() +
  # Named values pin each colour to its genre instead of relying on the
  # alphabetical order of the genre labels.
  scale_fill_manual(
    values = c(Fiction = "#ff9900", `Non Fiction` = "#000000")
  )
plot1

# Bestsellers distribution by genre and year
# Number of bestseller entries for every (Genre, Year) pair.
# `.groups = "drop"` returns an ungrouped result; without it summarise() keeps
# the table grouped by Genre and prints a regrouping message.
book_genre_year <-
  book %>%
  group_by(Genre, Year) %>%
  summarise(count = n(), .groups = "drop")

# Stacked bars: one bar per year, split by genre.
plot2 <-
  book_genre_year %>%
  ggplot(aes(x = Year, y = count, fill = Genre)) +
  geom_col() +
  theme_minimal() +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  # Named values pin each colour to its genre instead of relying on the
  # alphabetical order of the genre labels.
  scale_fill_manual(
    values = c(Fiction = "#ff9900", `Non Fiction` = "#000000")
  ) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.y = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
plot2

# Is there any correlation seen between the different numerical variables?
# Pairwise correlations of the four numeric columns, passed straight to
# corrplot(). `book_cor` therefore holds corrplot()'s return value (the list
# with `corr`, `corrPos` and `arg` printed below), not a bare matrix.
book_cor <-
  book %>%
  select(User.Rating, Reviews, Price, Year) %>%
  cor() %>%
  corrplot::corrplot(
    type = "lower",
    order = "hclust",
    method = "color",
    addgrid.col = "darkgray",
    outline = TRUE, # spelled out: `T` is an ordinary, reassignable variable
    tl.cex = 1,
    tl.col = "black",
    col = brewer.pal(n = 6, name = "RdGy")
  )
book_cor
#> $corr
#> Price User.Rating Reviews Year
#> Price 1.0000000 -0.133086287 -0.109181883 -0.1539786
#> User.Rating -0.1330863 1.000000000 -0.001729014 0.2423830
#> Reviews -0.1091819 -0.001729014 1.000000000 0.2635596
#> Year -0.1539786 0.242382960 0.263559604 1.0000000
#>
#> $corrPos
#> xName yName x y corr
#> 1 Price Price 1 4 1.000000000
#> 2 Price User.Rating 1 3 -0.133086287
#> 3 Price Reviews 1 2 -0.109181883
#> 4 Price Year 1 1 -0.153978582
#> 5 User.Rating User.Rating 2 3 1.000000000
#> 6 User.Rating Reviews 2 2 -0.001729014
#> 7 User.Rating Year 2 1 0.242382960
#> 8 Reviews Reviews 3 2 1.000000000
#> 9 Reviews Year 3 1 0.263559604
#> 10 Year Year 4 1 1.000000000
#>
#> $arg
#> $arg$type
#> [1] "lower"
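We can say that: - Year and Price have only a weak negative correlation (r ≈ -0.15): books on the more recent lists tend to be slightly cheaper, while books from the earlier years tend to be slightly more expensive.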
# Most reviewed books ----

# The 20 most-reviewed books. Titles repeat across rows (351 unique names in
# 550 rows), so duplicates are removed BEFORE truncating; taking head(20)
# first and de-duplicating afterwards would return fewer than 20 books.
book_review <-
  book %>%
  select(Name, Reviews) %>%
  distinct() %>%
  arrange(desc(Reviews)) %>%
  head(20)
book_review

# Are the number of reviews changing by Year?
Linear Model
# Fit a simple linear regression and print its summary. Because `formula` is
# passed by name, the piped `book` is matched to lm()'s `data` argument.
# NOTE(review): this regresses Year on Reviews, whereas the question above
# treats Reviews as the response -- consider `Reviews ~ Year`. TODO confirm.
book %>%
lm(formula = Year ~ Reviews) %>%
summary()#>
#> Call:
#> lm(formula = Year ~ Reviews, data = .)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -5.5523 -2.5991 -0.3255 2.5354 5.6549
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 2.013e+03 1.861e-01 10816.121 < 2e-16 ***
#> Reviews 7.111e-05 1.112e-05 6.396 3.42e-10 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 3.056 on 548 degrees of freedom
#> Multiple R-squared: 0.06946, Adjusted R-squared: 0.06777
#> F-statistic: 40.91 on 1 and 548 DF, p-value: 3.423e-10
# Reviews distribution on fiction and non fiction book over the year
# Jittered scatter of Reviews against Year, coloured by genre, with one
# straight-line (lm) fit per genre.
book_dist <-
  book %>%
  ggplot(aes(x = Year, y = Reviews, color = Genre)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal() +
  # Named values pin each colour to its genre instead of relying on the
  # alphabetical order of the genre labels.
  scale_color_manual(
    values = c(Fiction = "#ff9900", `Non Fiction` = "#000000")
  ) +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  theme(
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
book_dist

# Reviews Distribution
# Histogram of review counts (bins of 1000 reviews) on the density scale, with
# a kernel density curve drawn over it.
# after_stat(density) replaces the deprecated `..density..` notation.
book_review_dist <-
  book %>%
  ggplot(aes(x = Reviews, y = after_stat(density))) +
  geom_histogram(
    fill = "#ff9900",
    color = "black",
    binwidth = 1000
  ) +
  geom_density(alpha = 0.5, fill = "#ff9900") +
  theme_minimal() +
  theme(panel.grid.major = element_blank())
book_review_dist

# Are users rating the bestsellers differently by year?
Linear Model
# Regress User.Rating on Year to check whether ratings change over the years.
# The original call was a verbatim copy of the reviews model
# (`Year ~ Reviews`) and never involved User.Rating.
# Because `formula` is passed by name, the piped `book` becomes lm()'s `data`.
book %>%
  lm(formula = User.Rating ~ Year) %>%
  summary()
# NOTE: the `#>` transcript that follows in this file was produced by the old
# `Year ~ Reviews` call and has to be regenerated for this model.
#>
#> Call:
#> lm(formula = Year ~ Reviews, data = .)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -5.5523 -2.5991 -0.3255 2.5354 5.6549
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 2.013e+03 1.861e-01 10816.121 < 2e-16 ***
#> Reviews 7.111e-05 1.112e-05 6.396 3.42e-10 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 3.056 on 548 degrees of freedom
#> Multiple R-squared: 0.06946, Adjusted R-squared: 0.06777
#> F-statistic: 40.91 on 1 and 548 DF, p-value: 3.423e-10
# User Rating distribution on fiction and non fiction book over the year
# Jittered scatter of User.Rating against Year, coloured by genre, with one
# straight-line (lm) fit per genre. Reuses (overwrites) the name `book_dist`.
book_dist <-
  book %>%
  ggplot(aes(x = Year, y = User.Rating, color = Genre)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal() +
  # Named values pin each colour to its genre instead of relying on the
  # alphabetical order of the genre labels.
  scale_color_manual(
    values = c(Fiction = "#ff9900", `Non Fiction` = "#000000")
  ) +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  theme(
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
book_dist

# Ratings Distribution
# Histogram of user ratings (bins of 0.1) on the density scale, with a kernel
# density curve drawn over it.
# after_stat(density) replaces the deprecated `..density..` notation.
book_rating_dist <-
  book %>%
  ggplot(aes(x = User.Rating, y = after_stat(density))) +
  geom_histogram(
    fill = "#ff9900",
    color = "black",
    binwidth = 0.1
  ) +
  geom_density(alpha = 0.5, fill = "#ff9900") +
  theme_minimal() +
  theme(panel.grid.major = element_blank())
book_rating_dist

# Ratings vs Reviews
# Bubble chart of User.Rating against Reviews: colour encodes genre, and point
# size repeats the review count already shown on the x axis.
rating_review <-
  ggplot(book) +
  aes(
    x = Reviews,
    y = User.Rating,
    colour = Genre,
    size = Reviews
  ) +
  geom_jitter(alpha = 0.45) +
  scale_color_manual(
    values = c(
      Fiction = "#ff9900",
      `Non Fiction` = "#000000"
    )
  ) +
  theme_minimal()
rating_review

# Based on Authors
# The ten authors with the most rows in `book`, most frequent first.
book_authors <-
  book %>%
  group_by(Author) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(10)

# Horizontal bar chart of those authors; the bar(s) with the highest count are
# highlighted in orange, all others are black.
plot_authors <-
  book_authors %>%
  ggplot(aes(
    x = reorder(Author, count),
    y = count,
    fill = ifelse(count == max(count), "top", "other")
  )) +
  scale_y_continuous(breaks = seq(0, 13, by = 1)) +
  geom_col() +
  coord_flip() +
  # Named values bind each colour to its label. The previous unnamed vector
  # only worked because "grey" happens to sort before "red", and neither
  # label matched the colour actually drawn.
  scale_fill_manual(values = c(top = "#ff9900", other = "#000000")) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    axis.title.y = element_blank()
  )
plot_authors

# Based on Book Title
# The ten titles with the most rows in `book`, most frequent first.
book_title <-
  book %>%
  group_by(Name) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(10)

# Horizontal bar chart of those titles; the bar(s) with the highest count are
# highlighted in orange, all others are black.
plot_title <-
  book_title %>%
  ggplot(aes(
    x = reorder(Name, count),
    y = count,
    fill = ifelse(count == max(count), "top", "other")
  )) +
  scale_y_continuous(breaks = seq(0, 13, by = 1)) +
  geom_col() +
  coord_flip() +
  # Named values bind each colour to its label. The previous unnamed vector
  # only worked because "grey" happens to sort before "red", and neither
  # label matched the colour actually drawn.
  scale_fill_manual(values = c(top = "#ff9900", other = "#000000")) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    axis.title.y = element_blank()
  )
plot_title