library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
Rename the columns with clean_names(), replace blank strings with NA, drop rows that contain NA, and remove duplicate rows.
# Load the bestsellers CSV and tidy it:
#   1. clean_names() converts the headers to snake_case
#      (e.g. `User Rating` -> user_rating).
#   2. Blank strings in character columns become NA. Only character columns
#      are touched: comparing a numeric column with "" is meaningless, and
#      newer dplyr releases (1.1+) reject that comparison with a type error.
#   3. Rows containing any NA are dropped.
#   4. Exact duplicate rows are removed.
book <- read_csv("bestsellers _with_categories.csv") %>%
  clean_names() %>%
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  drop_na() %>%
  distinct()
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Name = col_character(),
## Author = col_character(),
## `User Rating` = col_double(),
## Reviews = col_double(),
## Price = col_double(),
## Year = col_double(),
## Genre = col_character()
## )
# Ask R to print numbers with two significant digits from here on.
options(digits = 2)
# Preview the first rows of the cleaned data.
book %>% head()
## # A tibble: 6 x 7
## name author user_rating reviews price year genre
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 10-Day Green Smoothie Cle… JJ Smith 4.7 17350 8 2016 Non F…
## 2 11/22/63: A Novel Stephen King 4.6 2052 22 2011 Ficti…
## 3 12 Rules for Life: An Ant… Jordan B. P… 4.7 18979 15 2018 Non F…
## 4 1984 (Signet Classics) George Orwe… 4.7 21424 6 2017 Ficti…
## 5 5,000 Awesome Facts (Abou… National Ge… 4.8 7665 12 2019 Non F…
## 6 A Dance with Dragons (A S… George R. R… 4.4 12643 11 2011 Ficti…
Here we compute several summaries of the data and visualize them.
# Per-year, per-genre summary of the bestseller list:
#   num       - number of books in the group
#   avg_rate  - mean user rating
#   avg_rev   - mean number of reviews
#   avg_price - mean price
# `.groups = "drop"` returns an ungrouped tibble, so later verbs do not
# silently operate per year, and it also silences the summarise() grouping
# message.
book_g <- book %>%
  group_by(year, genre) %>%
  summarize(
    num = n(),
    avg_rate = mean(user_rating),
    avg_rev = mean(reviews),
    avg_price = mean(price),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
# Preview the first rows of the per-year, per-genre summary.
book_g %>% head()
## # A tibble: 6 x 6
## # Groups: year [3]
## year genre num avg_rate avg_rev avg_price
## <dbl> <chr> <int> <dbl> <dbl> <dbl>
## 1 2009 Fiction 24 4.59 6534. 15.6
## 2 2009 Non Fiction 26 4.58 3026. 15.2
## 3 2010 Fiction 20 4.62 8409. 9.7
## 4 2010 Non Fiction 30 4.52 3527. 16
## 5 2011 Fiction 21 4.62 10335. 11.6
## 6 2011 Non Fiction 29 4.51 6483. 17.6
# Number of books and average price per year, one facet per genre.
# Both series share one y scale; the secondary axis is an identity transform
# (~.) that only repeats the scale under the "Average Price" name.
# The colour aesthetics are constant strings, so they act purely as legend
# labels for the two point layers.
ggplot(book_g, aes(x = year)) +
  geom_point(aes(y = num, color = "Number of books"), shape = 18, size = 4) +
  geom_point(aes(y = avg_price, color = "Average Price"), size = 3) +
  labs(
    y = "Number of books",
    x = "Year",
    title = "Number of Books vs. Average Price"
  ) +
  scale_x_continuous(breaks = 2009:2019) +
  scale_y_continuous(sec.axis = sec_axis(~., name = "Average Price")) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    axis.text.y = element_text(angle = 45),
    axis.text.x = element_text(angle = 90)
  ) +
  facet_grid(cols = vars(genre))
# Average number of reviews per book, by year and genre.
# `avg_rev` is a per-group mean, so the labels say "Average" rather than
# "Total", and the bars are dodged: stacking would add the two genre means
# into a number that has no interpretation.
# NOTE(review): if true totals were intended, map y = avg_rev * num and
# stack instead - confirm with the author.
ggplot(book_g, aes(x = year, y = avg_rev, fill = genre)) +
  geom_col(position = "dodge", color = "#c3ddf7") +
  labs(
    y = "Average Reviews per Book",
    x = "Year",
    title = "Average Reviews: Fiction vs. Non Fiction"
  ) +
  scale_x_continuous(breaks = 2009:2019) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    axis.text.y = element_text(angle = 45),
    axis.text.x = element_text(angle = 90)
  )
# Collect the per-book user ratings of each genre into one long data frame
# (columns: rating, genre) so both genres can be drawn on the same axes.
subf <- data.frame(rating = subset(book, genre == "Fiction")$user_rating)
subnf <- data.frame(rating = subset(book, genre == "Non Fiction")$user_rating)
subf$genre <- "Fiction"
# NOTE(review): this label differs from the "Non Fiction" spelling used in
# `book`; left unchanged in case later code matches on it.
subnf$genre <- "Non_fiction"
urate_all <- rbind(subf, subnf)

# Side-by-side bar chart of how many books received each rating.
# geom_bar() counts rows, so the y axis is labelled "Count"; a density view
# would need geom_density(alpha = 0.2) instead.
ggplot(urate_all, aes(rating, fill = genre)) +
  geom_bar(position = "dodge") +
  labs(
    y = "Count",
    x = "User Rating",
    title = "Distribution of User Rating"
  ) +
  xlim(3.1, 5.1)