# Set the default chunk option `echo = TRUE` so the knitted report shows the
# source code of each chunk alongside its output.
knitr::opts_chunk$set(echo = TRUE)

Load necessary libraries

library(tidyverse) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2) 
library(ggthemes)

Load the dataset

# Read the bestsellers table from the working directory into a data frame.
books <- read.csv(file = "bestsellers.csv")

Check the structure of the dataset

# Compact overview: dimensions, column names, column types and first values.
str(books)
## 'data.frame':    550 obs. of  7 variables:
##  $ Name       : chr  "10-Day Green Smoothie Cleanse" "11/22/63: A Novel" "12 Rules for Life: An Antidote to Chaos" "1984 (Signet Classics)" ...
##  $ Author     : chr  "JJ Smith" "Stephen King" "Jordan B. Peterson" "George Orwell" ...
##  $ User.Rating: num  4.7 4.6 4.7 4.7 4.8 4.4 4.7 4.7 4.7 4.6 ...
##  $ Reviews    : int  17350 2052 18979 21424 7665 12643 19735 19699 5983 23848 ...
##  $ Price      : int  8 22 15 6 12 11 30 15 3 8 ...
##  $ Year       : int  2016 2011 2018 2017 2019 2011 2014 2017 2018 2016 ...
##  $ Genre      : chr  "Non Fiction" "Fiction" "Non Fiction" "Fiction" ...

Display a few rows of the dataset

# Preview the first six rows (the default of head(), spelled out here).
head(books, n = 6)
##                                                                 Name
## 1                                      10-Day Green Smoothie Cleanse
## 2                                                  11/22/63: A Novel
## 3                            12 Rules for Life: An Antidote to Chaos
## 4                                             1984 (Signet Classics)
## 5 5,000 Awesome Facts (About Everything!) (National Geographic Kids)
## 6                      A Dance with Dragons (A Song of Ice and Fire)
##                     Author User.Rating Reviews Price Year       Genre
## 1                 JJ Smith         4.7   17350     8 2016 Non Fiction
## 2             Stephen King         4.6    2052    22 2011     Fiction
## 3       Jordan B. Peterson         4.7   18979    15 2018 Non Fiction
## 4            George Orwell         4.7   21424     6 2017     Fiction
## 5 National Geographic Kids         4.8    7665    12 2019 Non Fiction
## 6      George R. R. Martin         4.4   12643    11 2011     Fiction

Numeric summary of User Rating and Reviews

# Five-number summary plus mean of the user ratings.
summary(books[["User.Rating"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.300   4.500   4.700   4.618   4.800   4.900
# Five-number summary plus mean of the review counts.
summary(books[["Reviews"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      37    4058    8580   11953   17253   87841

Unique values and counts for Genre

# Number of rows per genre.
table(books[["Genre"]])
## 
##     Fiction Non Fiction 
##         240         310

Unique values and counts for Year

# Number of rows per year.
table(books[["Year"]])
## 
## 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 
##   50   50   50   50   50   50   50   50   50   50   50

Summary statistics for Price

# Five-number summary plus mean of the prices.
summary(books[["Price"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     7.0    11.0    13.1    16.0   105.0

Novel Questions to Investigate

1. What is the average user rating for fiction versus non-fiction books?
2. Is there a relationship between the number of reviews and the price of the book?
3. How has the average user rating changed over the years for both fiction and non-fiction books?

Aggregating to Address Question 1

Aggregate data to calculate the average user rating for fiction and non-fiction books

# Mean user rating per genre; missing ratings are ignored.
genre_avg_rating <- books %>%
  group_by(Genre) %>%
  summarise(avg_rating = mean(User.Rating, na.rm = TRUE))

# Print the resulting one-row-per-genre table.
genre_avg_rating
## # A tibble: 2 × 2
##   Genre       avg_rating
##   <chr>            <dbl>
## 1 Fiction           4.65
## 2 Non Fiction       4.60

Distribution of User Rating

# Histogram of user ratings in bins of width 0.1.
ggplot(books, aes(x = User.Rating)) +
  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black") +
  labs(
    title = "Distribution of User Rating",
    x = "User Rating",
    y = "Frequency"
  ) +
  theme_minimal()

Distribution of Reviews by Genre

# Overlaid, semi-transparent density curves of review counts, one per genre.
ggplot(books, aes(x = Reviews, fill = Genre)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Distribution of Reviews by Genre",
    x = "Number of Reviews",
    y = "Density"
  ) +
  scale_fill_manual(values = c("skyblue", "salmon")) +
  theme_minimal()

Relationship between Price and Number of Reviews

# Scatter plot of review count against price with a linear fit (no
# confidence band) overlaid in red.
ggplot(books, aes(x = Price, y = Reviews)) +
  geom_point(alpha = 0.5, color = "darkblue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Relationship between Price and Number of Reviews",
    x = "Price",
    y = "Number of Reviews"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Average User Rating Over Years for Fiction and Non-Fiction Books

# Average the ratings per year and genre before plotting. The data holds many
# books per year, so drawing lines through the raw ratings would zigzag between
# individual books within a year instead of showing the yearly average that the
# title and axis label promise.
yearly_genre_rating <- books %>%
  group_by(Year, Genre) %>%
  summarise(avg_rating = mean(User.Rating, na.rm = TRUE), .groups = "drop")

# One line per genre through the yearly means; breaks are placed on whole
# years so the axis does not show fractional values.
ggplot(yearly_genre_rating, aes(x = Year, y = avg_rating, color = Genre)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = sort(unique(yearly_genre_rating$Year))) +
  labs(
    title = "Average User Rating Over Years",
    x = "Year",
    y = "Average User Rating"
  ) +
  theme_minimal()