# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the dataset
books_data <- read.csv("bestsellers.csv")
# View the structure of the dataset
str(books_data)
## 'data.frame': 550 obs. of 7 variables:
## $ Name : chr "10-Day Green Smoothie Cleanse" "11/22/63: A Novel" "12 Rules for Life: An Antidote to Chaos" "1984 (Signet Classics)" ...
## $ Author : chr "JJ Smith" "Stephen King" "Jordan B. Peterson" "George Orwell" ...
## $ User.Rating: num 4.7 4.6 4.7 4.7 4.8 4.4 4.7 4.7 4.7 4.6 ...
## $ Reviews : int 17350 2052 18979 21424 7665 12643 19735 19699 5983 23848 ...
## $ Price : int 8 22 15 6 12 11 30 15 3 8 ...
## $ Year : int 2016 2011 2018 2017 2019 2011 2014 2017 2018 2016 ...
## $ Genre : chr "Non Fiction" "Fiction" "Non Fiction" "Fiction" ...
# Preview the first few rows of the dataset
head(books_data)
## Name
## 1 10-Day Green Smoothie Cleanse
## 2 11/22/63: A Novel
## 3 12 Rules for Life: An Antidote to Chaos
## 4 1984 (Signet Classics)
## 5 5,000 Awesome Facts (About Everything!) (National Geographic Kids)
## 6 A Dance with Dragons (A Song of Ice and Fire)
## Author User.Rating Reviews Price Year Genre
## 1 JJ Smith 4.7 17350 8 2016 Non Fiction
## 2 Stephen King 4.6 2052 22 2011 Fiction
## 3 Jordan B. Peterson 4.7 18979 15 2018 Non Fiction
## 4 George Orwell 4.7 21424 6 2017 Fiction
## 5 National Geographic Kids 4.8 7665 12 2019 Non Fiction
## 6 George R. R. Martin 4.4 12643 11 2011 Fiction
# Grouping by Genre and Year
genre_year_summary <- books_data |>
  group_by(Genre, Year) |>
  summarise(
    Avg_Rating = mean(User.Rating),
    Avg_Reviews = mean(Reviews),
    Median_Price = median(Price)
  )
## `summarise()` has grouped output by 'Genre'. You can override using the
## `.groups` argument.
# Heatmap for Average Rating Over Years by Genre
genre_year_heatmap <- ggplot(genre_year_summary, aes(x = Year, y = Genre, fill = Avg_Rating)) +
  geom_tile() +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Average Rating of Books Over Years by Genre",
       x = "Year",
       y = "Genre",
       fill = "Average Rating") +
  theme_minimal()
# Display the heatmap
print(genre_year_heatmap)
Fiction books tend to have higher average ratings than non-fiction books: in the heatmap, the fiction averages sit around 4.8 while the non-fiction averages are closer to 4.5. For both genres the average rating has remained relatively stable over the years, with no clear upward or downward trend.
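To put exact numbers on that reading (a quick check added here, not part of the original write-up), we can compute the overall mean rating per genre and reuse genre_year_summary for the year-by-year view:
# Overall average rating per genre
books_data |>
  group_by(Genre) |>
  summarise(Overall_Avg_Rating = mean(User.Rating))
# Year-by-year averages behind the heatmap
print(genre_year_summary, n = Inf)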
# Grouping by Price Range and Genre
price_genre_summary <- books_data |>
  # include.lowest = TRUE keeps any $0 books in the first bin instead of turning them into NA
  mutate(Price_Range = cut(Price, breaks = c(0, 10, 20, 30, 40, Inf),
                           labels = c("0-10", "10-20", "20-30", "30-40", "40+"),
                           include.lowest = TRUE)) |>
  group_by(Price_Range, Genre) |>
  summarise(
    Avg_Rating = mean(User.Rating),
    Avg_Reviews = mean(Reviews)
  )
## `summarise()` has grouped output by 'Price_Range'. You can override using the
## `.groups` argument.
# Stacked Bar Chart for Average Rating by Price Range and Genre
price_genre_stacked_bar <- ggplot(price_genre_summary, aes(x = Price_Range, y = Avg_Rating, fill = Genre)) +
  geom_bar(stat = "identity") +
  labs(title = "Average Rating by Price Range and Genre",
       x = "Price Range",
       y = "Average Rating",
       fill = "Genre") +
  theme_minimal()
# Display the stacked bar chart
print(price_genre_stacked_bar)
In general, more expensive books tend to have higher average ratings, and this holds for both fiction and non-fiction; the comparison here is between the price bins defined above, from $0-$10 up to $40+. Because the bars are stacked, the within-bin genre comparison comes from the heights of the segments rather than the total bar height.
Fiction books tend to have higher average ratings than non-fiction books across all price ranges; for example, fiction books in the $0-$10 bin are rated higher on average than non-fiction books in the same bin.
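For a view that makes the within-bin genre comparison easier to read (an alternative sketch added here, not part of the original analysis), the same summary can be plotted with dodged rather than stacked bars:
# Dodged version of the same chart: bars for the two genres sit side by side
price_genre_dodged_bar <- ggplot(price_genre_summary, aes(x = Price_Range, y = Avg_Rating, fill = Genre)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average Rating by Price Range and Genre (Dodged)",
       x = "Price Range",
       y = "Average Rating",
       fill = "Genre") +
  theme_minimal()
print(price_genre_dodged_bar)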
Now, let’s identify the lowest-probability group(s), i.e. the genre that accounts for the smallest share of the 550 titles, and tag those rows accordingly.
# Identifying lowest probability groups
lowest_prob_groups <- books_data |>
  group_by(Genre) |>
  summarise(Probability = n() / nrow(books_data)) |>
  arrange(Probability) |>
  slice(1)
# Tagging lowest probability groups in the original data
books_data <- books_data |>
  mutate(Lowest_Probability_Group = ifelse(Genre %in% lowest_prob_groups$Genre, "Lowest Probability", "Normal"))
# View the tagged data
head(books_data)
## Name
## 1 10-Day Green Smoothie Cleanse
## 2 11/22/63: A Novel
## 3 12 Rules for Life: An Antidote to Chaos
## 4 1984 (Signet Classics)
## 5 5,000 Awesome Facts (About Everything!) (National Geographic Kids)
## 6 A Dance with Dragons (A Song of Ice and Fire)
## Author User.Rating Reviews Price Year Genre
## 1 JJ Smith 4.7 17350 8 2016 Non Fiction
## 2 Stephen King 4.6 2052 22 2011 Fiction
## 3 Jordan B. Peterson 4.7 18979 15 2018 Non Fiction
## 4 George Orwell 4.7 21424 6 2017 Fiction
## 5 National Geographic Kids 4.8 7665 12 2019 Non Fiction
## 6 George R. R. Martin 4.4 12643 11 2011 Fiction
## Lowest_Probability_Group
## 1 Normal
## 2 Lowest Probability
## 3 Normal
## 4 Lowest Probability
## 5 Normal
## 6 Lowest Probability
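As a quick sanity check (an addition, not part of the original write-up), we can count how many rows received each tag; with only two genres, the "Lowest Probability" rows are simply the titles in the less common genre:
# Count how many titles fall under each tag
books_data |>
  count(Lowest_Probability_Group)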
Let’s pick the categorical variables “Genre” and “Year” for this task. First, let’s identify whether there are any combinations of “Genre” and “Year” that do not exist in the dataset:
# Get unique combinations of Genre and Year in the dataset
existing_combinations <- unique(select(books_data, Genre, Year))
# Create a dataframe of all possible combinations of Genre and Year
all_combinations <- expand.grid(Genre = unique(books_data$Genre), Year = unique(books_data$Year))
# Find missing combinations
missing_combinations <- anti_join(all_combinations, existing_combinations)
## Joining with `by = join_by(Genre, Year)`
# Print missing combinations
print(missing_combinations)
## [1] Genre Year
## <0 rows> (or 0-length row.names)
There are no missing combinations :)
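For a programmatic version of that check (a small addition to the original code), the row count of the anti-join says it directly:
# TRUE when every Genre-Year combination appears at least once in the data
nrow(missing_combinations) == 0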
Next, let’s analyze which combinations are the most and least common:
# Count occurrences of each combination
combination_counts <- books_data %>%
  group_by(Genre, Year) %>%
  summarise(Count = n())
## `summarise()` has grouped output by 'Genre'. You can override using the
## `.groups` argument.
# Order combinations by count
combination_counts <- combination_counts %>%
  arrange(desc(Count))
# Print combinations with counts
print(combination_counts)
## # A tibble: 22 × 3
## # Groups: Genre [2]
## Genre Year Count
## <chr> <int> <int>
## 1 Non Fiction 2015 33
## 2 Non Fiction 2016 31
## 3 Non Fiction 2010 30
## 4 Non Fiction 2019 30
## 5 Fiction 2014 29
## 6 Non Fiction 2011 29
## 7 Non Fiction 2012 29
## 8 Non Fiction 2018 29
## 9 Non Fiction 2009 26
## 10 Non Fiction 2013 26
## # ℹ 12 more rows
These counts show which genre-year combinations occur frequently and which are relatively rare; from there we can investigate why certain combinations are more prevalent than others, as sketched below.
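One starting point (a sketch added here, not part of the original analysis) is each genre's share of each year's list, which shows directly how the Fiction/Non Fiction balance shifts over time:
# Share of each genre within each year's list
genre_share_by_year <- books_data |>
  count(Year, Genre) |>
  group_by(Year) |>
  mutate(Share = n / sum(n)) |>
  ungroup()
print(genre_share_by_year, n = Inf)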
# Visualize most common combinations
top_combinations <- combination_counts |>
  ungroup() |> # drop the Genre grouping so slice_max() returns the overall top 10, not the top 10 within each genre
  slice_max(order_by = Count, n = 10) # Adjust n as needed
# Bar plot for most common combinations
top_combinations_plot <- ggplot(top_combinations, aes(x = fct_reorder(paste(Genre, Year), Count), y = Count)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Top 10 Most Common Genre-Year Combinations",
       x = "Genre-Year Combination",
       y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
print(top_combinations_plot)
Non Fiction, not Fiction, is the more frequent genre: Fiction was tagged as the lowest-probability group earlier, and nine of the ten highest genre-year counts belong to Non Fiction, with Fiction in 2014 the only fiction entry near the top.
No single year dominates the top of the list; the most common combinations are spread across 2009-2019, which suggests the genre split, rather than any particular year, drives how often a combination appears.
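A direct count of titles per genre (a quick check added here, not in the original write-up) confirms the overall balance:
# Total number of bestseller entries per genre across all years
books_data |>
  count(Genre, sort = TRUE)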
# Visualize least common combinations
bottom_combinations <- combination_counts |>
  ungroup() |> # drop the Genre grouping so slice_min() returns the overall bottom 10
  slice_min(order_by = Count, n = 10) # Adjust n as needed
# Bar plot for least common combinations
bottom_combinations_plot <- ggplot(bottom_combinations, aes(x = fct_reorder(paste(Genre, Year), Count), y = Count)) +
  geom_bar(stat = "identity", fill = "lightcoral") +
  labs(title = "10 Least Common Genre-Year Combinations",
       x = "Genre-Year Combination",
       y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
print(bottom_combinations_plot)