knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(ggthemes)
# Read the bestseller data set from the working directory into a data frame.
books <- utils::read.csv(file = "bestsellers.csv")
# Compact structure dump: dimensions, column types, and leading values.
utils::str(books)
## 'data.frame': 550 obs. of 7 variables:
## $ Name : chr "10-Day Green Smoothie Cleanse" "11/22/63: A Novel" "12 Rules for Life: An Antidote to Chaos" "1984 (Signet Classics)" ...
## $ Author : chr "JJ Smith" "Stephen King" "Jordan B. Peterson" "George Orwell" ...
## $ User.Rating: num 4.7 4.6 4.7 4.7 4.8 4.4 4.7 4.7 4.7 4.6 ...
## $ Reviews : int 17350 2052 18979 21424 7665 12643 19735 19699 5983 23848 ...
## $ Price : int 8 22 15 6 12 11 30 15 3 8 ...
## $ Year : int 2016 2011 2018 2017 2019 2011 2014 2017 2018 2016 ...
## $ Genre : chr "Non Fiction" "Fiction" "Non Fiction" "Fiction" ...
# Show the leading rows of the data frame.
utils::head(books)
## Name
## 1 10-Day Green Smoothie Cleanse
## 2 11/22/63: A Novel
## 3 12 Rules for Life: An Antidote to Chaos
## 4 1984 (Signet Classics)
## 5 5,000 Awesome Facts (About Everything!) (National Geographic Kids)
## 6 A Dance with Dragons (A Song of Ice and Fire)
## Author User.Rating Reviews Price Year Genre
## 1 JJ Smith 4.7 17350 8 2016 Non Fiction
## 2 Stephen King 4.6 2052 22 2011 Fiction
## 3 Jordan B. Peterson 4.7 18979 15 2018 Non Fiction
## 4 George Orwell 4.7 21424 6 2017 Fiction
## 5 National Geographic Kids 4.8 7665 12 2019 Non Fiction
## 6 George R. R. Martin 4.4 12643 11 2011 Fiction
# Quick univariate checks on the columns used by the analysis below.
# Numeric columns get a summary(); Genre and Year get frequency tables.
summary(books[["User.Rating"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.300 4.500 4.700 4.618 4.800 4.900
summary(books[["Reviews"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 37 4058 8580 11953 17253 87841
table(books[["Genre"]])
##
## Fiction Non Fiction
## 240 310
table(books[["Year"]])
##
## 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
## 50 50 50 50 50 50 50 50 50 50 50
summary(books[["Price"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 7.0 11.0 13.1 16.0 105.0
# Novel Questions to Investigate ----
# 1. What is the average user rating for fiction versus non-fiction books?
# 2. Is there a relationship between the number of reviews and the price of a book?
# 3. How has the average user rating changed over the years for fiction and non-fiction books?
# Aggregating to Address Question 1 ----
# Mean user rating per genre (one output row per Genre value).
# Missing ratings are ignored via na.rm = TRUE.
genre_avg_rating <- summarise(
  group_by(books, Genre),
  avg_rating = mean(User.Rating, na.rm = TRUE)
)
genre_avg_rating
## # A tibble: 2 × 2
## Genre avg_rating
## <chr> <dbl>
## 1 Fiction 4.65
## 2 Non Fiction 4.60
# Histogram of user ratings over all books, using 0.1-wide bins.
ggplot(data = books, mapping = aes(x = User.Rating)) +
  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black") +
  labs(
    title = "Distribution of User Rating",
    x = "User Rating",
    y = "Frequency"
  ) +
  theme_minimal()

# Semi-transparent density curves of review counts, one curve per genre.
ggplot(data = books, mapping = aes(x = Reviews, fill = Genre)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Distribution of Reviews by Genre",
    x = "Number of Reviews",
    y = "Density"
  ) +
  scale_fill_manual(values = c("skyblue", "salmon")) +
  theme_minimal()

# Scatter of review count against price, overlaid with a linear-model fit
# line (se = FALSE suppresses the confidence band).
ggplot(data = books, mapping = aes(x = Price, y = Reviews)) +
  geom_point(alpha = 0.5, color = "darkblue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Relationship between Price and Number of Reviews",
    x = "Price",
    y = "Number of Reviews"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Trend of the mean user rating per year, drawn separately for each genre.
# `books` holds many rows for every Year/Genre combination, so the ratings are
# averaged first; feeding the raw rows to geom_line() would connect every
# individual book and produce a zigzag rather than one average per year.
books %>%
  group_by(Year, Genre) %>%
  # .groups = "drop" returns an ungrouped tibble for plotting.
  summarise(avg_rating = mean(User.Rating, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = Year, y = avg_rating, color = Genre)) +
  geom_line() +
  geom_point() +
  # Year is an integer column; put a tick on each observed year instead of
  # the fractional breaks a continuous axis would otherwise pick.
  scale_x_continuous(breaks = sort(unique(books$Year))) +
  labs(
    title = "Average User Rating Over Years",
    x = "Year",
    y = "Average User Rating"
  ) +
  theme_minimal()