# Set the default chunk option so every chunk's source code is echoed in the
# knitted output.
knitr::opts_chunk$set(echo = TRUE)

Load necessary libraries

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2) 
library(ggthemes)

Load the dataset

# Read the bestseller data; the path is relative to the working directory.
books <- read.csv(file = "bestsellers.csv")

Check the structure of the dataset

# Compact overview: dimensions plus the type and leading values of each column.
str(object = books)

## 'data.frame':    550 obs. of  7 variables:
##  $ Name       : chr  "10-Day Green Smoothie Cleanse" "11/22/63: A Novel" "12 Rules for Life: An Antidote to Chaos" "1984 (Signet Classics)" ...
##  $ Author     : chr  "JJ Smith" "Stephen King" "Jordan B. Peterson" "George Orwell" ...
##  $ User.Rating: num  4.7 4.6 4.7 4.7 4.8 4.4 4.7 4.7 4.7 4.6 ...
##  $ Reviews    : int  17350 2052 18979 21424 7665 12643 19735 19699 5983 23848 ...
##  $ Price      : int  8 22 15 6 12 11 30 15 3 8 ...
##  $ Year       : int  2016 2011 2018 2017 2019 2011 2014 2017 2018 2016 ...
##  $ Genre      : chr  "Non Fiction" "Fiction" "Non Fiction" "Fiction" ...

Display a few rows of the dataset

# Preview the first six rows (the default for head()).
head(x = books, n = 6L)

##                                                                 Name
## 1                                      10-Day Green Smoothie Cleanse
## 2                                                  11/22/63: A Novel
## 3                            12 Rules for Life: An Antidote to Chaos
## 4                                             1984 (Signet Classics)
## 5 5,000 Awesome Facts (About Everything!) (National Geographic Kids)
## 6                      A Dance with Dragons (A Song of Ice and Fire)
##                     Author User.Rating Reviews Price Year       Genre
## 1                 JJ Smith         4.7   17350     8 2016 Non Fiction
## 2             Stephen King         4.6    2052    22 2011     Fiction
## 3       Jordan B. Peterson         4.7   18979    15 2018 Non Fiction
## 4            George Orwell         4.7   21424     6 2017     Fiction
## 5 National Geographic Kids         4.8    7665    12 2019 Non Fiction
## 6      George R. R. Martin         4.4   12643    11 2011     Fiction

Numeric summary of User Rating and Reviews

# Quartiles, mean and range of the user ratings.
summary(books[["User.Rating"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.300   4.500   4.700   4.618   4.800   4.900

# Quartiles, mean and range of the review counts.
summary(books[["Reviews"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      37    4058    8580   11953   17253   87841

Unique values and counts for Genre

# Number of rows per genre.
table(books[["Genre"]])

## 
##     Fiction Non Fiction 
##         240         310

Unique values and counts for Year

# Number of rows per year.
table(books[["Year"]])

## 
## 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 
##   50   50   50   50   50   50   50   50   50   50   50

Summary statistics for Price

# Quartiles, mean and range of the prices.
summary(books[["Price"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     7.0    11.0    13.1    16.0   105.0

Novel Questions to Investigate

What is the average user rating for fiction versus non-fiction books?
Is there a relationship between the number of reviews and the price of the book?
How has the average user rating changed over the years for both fiction and non-fiction books?

Aggregating to Address Question 1

Aggregate data to calculate average user rating for fiction and non-fiction books

# Mean user rating per genre, ignoring missing ratings.
genre_avg_rating <- books %>%
  group_by(Genre) %>%
  summarise(
    avg_rating = mean(User.Rating, na.rm = TRUE)
  )

# Print the aggregated table.
genre_avg_rating

## # A tibble: 2 × 2
##   Genre       avg_rating
##   <chr>            <dbl>
## 1 Fiction           4.65
## 2 Non Fiction       4.60

Distribution of User Rating

# Histogram of user ratings using bins that are 0.1 wide.
ggplot(data = books, mapping = aes(x = User.Rating)) +
  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black") +
  labs(
    title = "Distribution of User Rating",
    x = "User Rating",
    y = "Frequency"
  ) +
  theme_minimal()

Distribution of Reviews by Genre

# Overlaid, semi-transparent density curves of review counts, one per genre.
ggplot(data = books, mapping = aes(x = Reviews, fill = Genre)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Distribution of Reviews by Genre",
    x = "Number of Reviews",
    y = "Density"
  ) +
  scale_fill_manual(values = c("skyblue", "salmon")) +
  theme_minimal()

Relationship between Price and Number of Reviews

# Scatter plot of review count against price with a straight-line (lm) fit;
# the confidence band is switched off.
ggplot(data = books, mapping = aes(x = Price, y = Reviews)) +
  geom_point(alpha = 0.5, color = "darkblue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Relationship between Price and Number of Reviews",
    x = "Price",
    y = "Number of Reviews"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Average User Rating Over Years for Fiction and Non-Fiction Books

# Average the ratings per year and genre before plotting. The raw data holds
# many books for each Year/Genre pair, so drawing geom_line() over the
# individual rows traces a zig-zag through every book instead of showing the
# yearly mean that the title and axis label promise.
books %>%
  group_by(Year, Genre) %>%
  # .groups = "drop" returns an ungrouped table so no grouping leaks into ggplot.
  summarise(avg_rating = mean(User.Rating, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = Year, y = avg_rating, color = Genre)) +
  geom_line() +
  geom_point() +
  # One tick per calendar year; the default breaks can land on fractional years.
  scale_x_continuous(breaks = sort(unique(books$Year))) +
  labs(
    title = "Average User Rating Over Years",
    x = "Year",
    y = "Average User Rating"
  ) +
  theme_minimal()

Week 2 | Data Dive — Summaries

Shresta

2024-02-18

Load necessary libraries

Load the dataset

Check the structure of the dataset

Display a few rows of the dataset

Numeric summary of User Rating and Reviews

Unique values and counts for Genre

Unique values and counts for Year

Summary statistics for Price

Aggregate data to calculate average user rating for fiction and non-fiction books

Distribution of User Rating

Distribution of Reviews by Genre

Relationship between Price and Number of Reviews

Average User Rating Over Years for Fiction and Non-Fiction Books