skim_without_charts(assessment.data)
| Name | assessment.data |
| Number of rows | 5000 |
| Number of columns | 33 |
| _______________________ | |
| Column type frequency: | |
| character | 12 |
| logical | 1 |
| numeric | 18 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| country | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
| formattype | 0 | 1.00 | 5 | 5 | 0 | 1 | 0 |
| asin | 0 | 1.00 | 10 | 10 | 0 | 5000 | 0 |
| isbn13 | 0 | 1.00 | 13 | 13 | 0 | 5000 | 0 |
| isbn10 | 0 | 1.00 | 10 | 10 | 0 | 5000 | 0 |
| title | 0 | 1.00 | 17 | 52 | 0 | 4959 | 0 |
| author | 0 | 1.00 | 6 | 25 | 0 | 4809 | 0 |
| narrator | 0 | 1.00 | 8 | 25 | 0 | 4842 | 0 |
| publisherlabel | 0 | 1.00 | 6 | 34 | 0 | 4494 | 0 |
| formats | 0 | 1.00 | 15 | 31 | 0 | 3 | 0 |
| Berrydistributor | 4566 | 0.09 | 9 | 9 | 0 | 1 | 0 |
| junglecategories | 91 | 0.98 | 9 | 11053 | 0 | 4879 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| maryandwilliamdistributor | 5000 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| pagelength | 0 | 1 | 811.66 | 647.48 | 0 | 498.00 | 668.00 | 928.00 | 15878.00 |
| jungletotalreviews | 0 | 1 | 14142.31 | 27627.10 | 0 | 1113.75 | 5023.00 | 15083.25 | 627087.00 |
| jungleaveragerating | 0 | 1 | 4.43 | 0.74 | 0 | 4.40 | 4.60 | 4.70 | 9.90 |
| seriesorder | 0 | 1 | 1.21 | 3.48 | 0 | 0.00 | 0.00 | 1.00 | 58.00 |
| serieslength | 0 | 1 | 2.27 | 5.50 | 0 | 0.00 | 0.00 | 3.00 | 60.00 |
| Berrytotalreviews | 0 | 1 | 8.02 | 46.75 | 0 | 0.00 | 0.00 | 0.00 | 1500.00 |
| Berryaveragerating | 0 | 1 | 46.01 | 135.32 | -12 | 0.00 | 0.00 | 0.00 | 500.00 |
| maryandwilliamtotalreviews | 0 | 1 | 0.02 | 1.41 | 0 | 0.00 | 0.00 | 0.00 | 100.00 |
| maryandwilliamaveragerating | 0 | 1 | 0.00 | 0.01 | 0 | 0.00 | 0.00 | 0.00 | 1.00 |
| price | 0 | 1 | 21.28 | 9.21 | 0 | 15.54 | 19.80 | 25.01 | 104.10 |
| jungleprice | 0 | 1 | 20.89 | 8.66 | -1 | 16.40 | 19.69 | 23.88 | 85.81 |
| Berryprice | 0 | 1 | 2.41 | 6.54 | 0 | 0.00 | 0.00 | 0.00 | 42.99 |
| maryandwilliamprice | 0 | 1 | 0.00 | 0.01 | 0 | 0.00 | 0.00 | 0.00 | 1.00 |
| listprice | 1 | 1 | 0.00 | 0.13 | 0 | 0.00 | 0.00 | 0.00 | 9.00 |
| junglelistprice | 0 | 1 | -0.02 | 1.40 | -99 | 0.00 | 0.00 | 0.00 | 0.00 |
| Berrylistprice | 1 | 1 | 0.00 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 |
| maryandwilliamlistprice | 0 | 1 | 0.00 | 0.11 | 0 | 0.00 | 0.00 | 0.00 | 8.00 |
| junglesalesrank | 0 | 1 | 3646.21 | 5610.62 | -99999 | 1245.19 | 2906.51 | 4897.89 | 191574.91 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| filedate | 0 | 1 | 2021-05-09 | 2021-05-09 | 2021-05-09 | 1 |
| datepublished | 0 | 1 | 1998-12-27 | 2024-05-31 | 2021-06-29 | 1726 |
sapply(assessment.data[sapply(assessment.data, is.numeric)], range, na.rm = TRUE)
## pagelength jungletotalreviews jungleaveragerating seriesorder serieslength
## [1,] 0 0 0.0 0 0
## [2,] 15878 627087 9.9 58 60
## Berrytotalreviews Berryaveragerating maryandwilliamtotalreviews
## [1,] 0 -12 0
## [2,] 1500 500 100
## maryandwilliamaveragerating price jungleprice Berryprice
## [1,] 0 0.0 -1.00 0.00
## [2,] 1 104.1 85.81 42.99
## maryandwilliamprice listprice junglelistprice Berrylistprice
## [1,] 0 0 -99 0
## [2,] 1 9 0 0
## maryandwilliamlistprice junglesalesrank
## [1,] 0 -99999.0
## [2,] 8 191574.9
# assessment.data contains 5,000 rows and 33 columns
# all filedates are 2021-05-09
# datepublished range 1998-12-27 to 2024-05-31
# strange results:
# maryandwilliamdistributor is completely empty
# junglelistprice has a mean of -0.0244
# Berryaveragerating contains a row with -12
# jungleprice contains a row with -1
# junglesalesrank contains rows with -1 and -99999
# all price column values are missing or incorrect for 23 rows
# explore columns
table(assessment.data$country) # looks like all from aus
##
## aus
## 5000
table(assessment.data$formattype) # all audio
##
## audio
## 5000
# data frame to see how authors are distributed. Some authors represented more than once
authors.count <- data.frame(sort(table(assessment.data$author, useNA = 'always'), decreasing = TRUE))
# doing the same for narrators. Michael Smith is the most prominent narrator with 6 books
narrators.count <- data.frame(sort(table(assessment.data$narrator, useNA = 'always'),decreasing = TRUE))
table(duplicated(assessment.data$isbn10)) # isbn10 is unique
##
## FALSE
## 5000
table(duplicated(assessment.data$isbn13)) # isbn13 also unique
##
## FALSE
## 5000
length(unique(assessment.data$isbn10, assessment.data$isbn13)) # 5,000 unique rows for both isbns
## [1] 5000
# isbn10 and isbn13 are effectively the same for this analysis
table(assessment.data$listprice) #... strange result
##
## 0 9
## 4998 1
range(assessment.data$listprice) # strange again "NA NA"
## [1] NA NA
hist(assessment.data$listprice) # x must be numeric?
class(assessment.data$listprice) # was imported as list... Let's look for something strange in the source data
## [1] "numeric"
# one cell of ListPrice has "&&" in it. Let's delete that and reimport the file.
# delete empty columns:
books_clean <- assessment.data %>%
select(-c(maryandwilliamdistributor))
# delete incorrect value columns:
books_clean <- books_clean %>%
select(-c(
"listprice",
"junglelistprice",
"Berrylistprice",
"maryandwilliamlistprice"))
# change missing and incorrect values to NA
columns_to_edit <- c(
"Berryaveragerating",
"jungleaveragerating",
"maryandwilliamaveragerating",
"jungleprice",
"price",
"Berryprice",
"maryandwilliamprice",
"junglesalesrank",
"pagelength"
)
books_clean <- books_clean %>%
mutate(
across(all_of(columns_to_edit),
~ ifelse(.x < 0.01, NA, .x))
)
# ~15 books have an average rating score but 0 reviews. I'll change these rating scores to NA
books_clean$jungleaveragerating[which(books_clean$jungletotalreviews == 0)] <- NA
# 3 books have an average rating higher than 5. Change to NA
books_clean$jungleaveragerating[which(books_clean$jungleaveragerating > 5)] <- NA
skim_without_charts(books_clean)
| Name | books_clean |
| Number of rows | 5000 |
| Number of columns | 28 |
| _______________________ | |
| Column type frequency: | |
| character | 12 |
| numeric | 14 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| country | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
| formattype | 0 | 1.00 | 5 | 5 | 0 | 1 | 0 |
| asin | 0 | 1.00 | 10 | 10 | 0 | 5000 | 0 |
| isbn13 | 0 | 1.00 | 13 | 13 | 0 | 5000 | 0 |
| isbn10 | 0 | 1.00 | 10 | 10 | 0 | 5000 | 0 |
| title | 0 | 1.00 | 17 | 52 | 0 | 4959 | 0 |
| author | 0 | 1.00 | 6 | 25 | 0 | 4809 | 0 |
| narrator | 0 | 1.00 | 8 | 25 | 0 | 4842 | 0 |
| publisherlabel | 0 | 1.00 | 6 | 34 | 0 | 4494 | 0 |
| formats | 0 | 1.00 | 15 | 31 | 0 | 3 | 0 |
| Berrydistributor | 4566 | 0.09 | 9 | 9 | 0 | 1 | 0 |
| junglecategories | 91 | 0.98 | 9 | 11053 | 0 | 4879 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| pagelength | 91 | 0.98 | 826.71 | 643.86 | 24.00 | 507.00 | 674.00 | 937.00 | 15878.00 |
| jungletotalreviews | 0 | 1.00 | 14142.31 | 27627.10 | 0.00 | 1113.75 | 5023.00 | 15083.25 | 627087.00 |
| jungleaveragerating | 132 | 0.97 | 4.53 | 0.22 | 2.50 | 4.40 | 4.60 | 4.70 | 5.00 |
| seriesorder | 0 | 1.00 | 1.21 | 3.48 | 0.00 | 0.00 | 0.00 | 1.00 | 58.00 |
| serieslength | 0 | 1.00 | 2.27 | 5.50 | 0.00 | 0.00 | 0.00 | 3.00 | 60.00 |
| Berrytotalreviews | 0 | 1.00 | 8.02 | 46.75 | 0.00 | 0.00 | 0.00 | 0.00 | 1500.00 |
| Berryaveragerating | 4478 | 0.10 | 440.73 | 37.46 | 100.00 | 430.00 | 450.00 | 460.00 | 500.00 |
| maryandwilliamtotalreviews | 0 | 1.00 | 0.02 | 1.41 | 0.00 | 0.00 | 0.00 | 0.00 | 100.00 |
| maryandwilliamaveragerating | 4999 | 0.00 | 1.00 | NA | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
| price | 23 | 1.00 | 21.38 | 9.12 | 1.02 | 15.61 | 19.83 | 25.02 | 104.10 |
| jungleprice | 114 | 0.98 | 21.38 | 8.14 | 1.25 | 17.05 | 19.69 | 24.04 | 85.81 |
| Berryprice | 4325 | 0.14 | 17.86 | 6.37 | 3.99 | 14.99 | 16.99 | 19.99 | 42.99 |
| maryandwilliamprice | 4999 | 0.00 | 1.00 | NA | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
| junglesalesrank | 115 | 0.98 | 3752.51 | 5449.70 | 1.10 | 1357.38 | 2980.19 | 4956.94 | 191574.91 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| filedate | 0 | 1 | 2021-05-09 | 2021-05-09 | 2021-05-09 | 1 |
| datepublished | 0 | 1 | 1998-12-27 | 2024-05-31 | 2021-06-29 | 1726 |
# View(books_clean[,c("jungleaveragerating","jungletotalreviews")])
Genre Binning
# extract list of unique categories from junglecategories
all_genres <- books_clean$junglecategories %>%
str_split(";") %>% # split each row on ;
unlist() %>% # flatten to one vector
str_trim() %>% # remove leading/trailing values
unique() # keep only unique values
# genre still has irrelevant leading values
# remove "See Top 100"
cleaned_genres <- all_genres %>%
str_remove("See Top 100.*") %>%
str_trim() %>%
purrr::discard(~ is.na(.x) || .x == "") # remove NA and empty strings
genre_df <- data.frame(genre = cleaned_genres, stringsAsFactors = FALSE) # turn into a df
genre_df_clean <- genre_df %>%
mutate(
genre = str_remove(genre, "^Books\\s*>\\s*") # remove leading "Books >"
)
genre_df_clean <- unique(genre_df_clean) # df of unique genres
# there are still unneeded "Space Books & Originals" etc in some rows that I want to remove
genre_df_clean <- genre_df_clean %>%
mutate(
genre = str_remove_all(genre, "Space Books & Originals\\s*>\\s*"), #AI helped with syntax
genre = str_remove_all(genre, "\\(Space Books & Originals\\)"),
genre = str_remove_all(genre, "Space Audiobooks\\s*>\\s*"),
genre = str_replace_all(genre, "^Space Books & Originals$", ""), # removes entire rows
genre = str_replace_all(genre, "^Space Audiobooks$", ""), # removes entire rows
genre = str_replace_all(genre, "^Art$", ""),
genre = str_squish(genre) # clean up extra spaces
) %>%
filter(!is.na(genre), genre != "") %>% # remove NA and empty rows
distinct() # remove duplicate rows
# I am left with 4965 genres down from 6030
# I want to count how many times each unique genre appears in books_clean$junglecategories
# grepl returns true or false whether the phrase matchs the specific expression
genre_df_clean$genre_count <- sapply(genre_df_clean$genre, function(g) sum(grepl(g, books_clean$junglecategories, ignore.case = TRUE)))
# Still some vague values listed: "Books", "Suspense", "Teens", "Science Fiction" (should be Science Fiction & Fantasy) and "Science" I'll remove those rows.
remove_genres <- c(
"Books", "Suspense", "Teens", "Science Fiction", "Fantasy", "Science", "Art")
# filter genres that are not an exact match for values in remove_genres
genre_df_clean <- genre_df_clean %>%
filter(!genre %in% remove_genres) %>%
arrange(desc(genre_count))
# find top main genres (everyting before the >)
top_genres <- genre_df_clean %>%
filter(!str_detect(genre, ">")) %>%
filter(!genre %in% c("Fiction & Literature", "Thrillers & Suspense",
"Relationships", "Mysteries", "Alternate History")) %>%
slice_max(genre_count, n = 10)
ggplot(top_genres, aes(x = reorder(genre, genre_count), y = genre_count)) +
geom_segment(aes(xend = genre, yend = 0), color = "gray50") +
geom_point(size = 4, color = "#e40058") +
coord_flip() +
scale_y_continuous(
limits = c(0, 3000),
breaks = seq(from = 0, to = 3000, by = 500)
) +
labs(
title = "This Graph is Mostly Fiction",
x = "Category",
y = "Number of Audiobooks"
) +
theme(
plot.background = element_rect(fill = "black", color = "black"),
panel.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "#444444"),
panel.grid.minor = element_blank(),
text = element_text(color = "white"),
plot.title = element_text(color = "white", hjust = 0.5),
axis.title = element_text(color = "white"),
axis.text = element_text(color = "white"),
axis.text.y = element_text(angle = 0, hjust = 1), # Right-justify the text
plot.margin = margin(t = 10, b = 10, l = 25,)
)
top_genres$genre_precent_of_whole <- (top_genres$genre_count / 5000)*100
The makeup of our 5,000 row list of unique audiobooks is more than 55% Literature & Fiction and 34% Science Fiction & Fantasy.
# find average jungle ranking by genre
# select top 100
genre_rank <- books_clean %>%
slice_min(junglesalesrank, n = 100) %>%
select(c("isbn13","author", "narrator", "title", "publisherlabel", "datepublished",
"jungletotalreviews", "jungleaveragerating", "junglecategories",
"price", "junglesalesrank"))
# create column counting the number of times the top 10 most popular genres appear in the junglecategories column for the top 100 junglesalesrank books
top_genres$genre_rank_count <- sapply(top_genres$genre, function(g) sum(grepl(g, genre_rank$junglecategories, ignore.case = TRUE)))
# sum(top_genres$genre_rank_count) # sum is 269. Some genres must appear twice.
# convert to long format for plotting
top_genres_long <- top_genres %>%
select(-genre_count) %>%
pivot_longer(
cols = c(genre_rank_count, genre_precent_of_whole),
names_to = "metric",
values_to = "value")
# plotting each genre as a percent of the entire dataset against its representation in the top 100 best selling books
ggplot(top_genres_long, aes(x = value, y = genre)) +
geom_col(aes(color = metric, fill = metric), position = position_dodge(0.8), width = 0.7) +
scale_color_manual(
name = "Metric",
values = c("#8278ff", "#e40058"),
labels = c("% of data", "% of Top Ranked 100"),
guide = "none"
) +
scale_fill_manual(
name = "Metric",
values = c("#8278ff", "#e40058"),
labels = c("% of All Books", "% of Top Ranked 100")
) +
labs(
title = "Genre Composition",
x = "%",
y = "Genre"
) +
# Apply the dark theme, including legend styling
theme(
# Set background colors to black
plot.background = element_rect(fill = "black", color = "black"),
panel.background = element_rect(fill = "black", color = "black"),
# Set grid colors to a dark gray
panel.grid.major = element_line(color = "#444444"),
panel.grid.minor = element_blank(),
# Set text colors to white
text = element_text(color = "white"),
plot.title = element_text(color = "white", hjust = 0.5),
axis.title = element_text(color = "white"),
axis.text = element_text(color = "white"),
# Style the legend to match the dark theme
legend.background = element_rect(fill = "black"),
legend.key = element_rect(fill = "black"),
legend.title = element_text(color = "white"),
legend.text = element_text(color = "white")
)
# do book series outperform standalone novels?
# create new column labeling series or standalone books
books_clean <- books_clean %>%
mutate(book_type = case_when(
serieslength > 1 ~ "Series",
TRUE ~ "Standalone"
))
# create performance summary with average rating, total reviews, sales rank, price
# I only want to consider books with > 50 reviews
performacnce_summary <- books_clean %>%
filter(!is.na(jungleaveragerating), !is.na(junglesalesrank), !is.na(jungletotalreviews)) %>%
filter(jungleaveragerating > 0, junglesalesrank > 0, jungletotalreviews > 50) %>%
group_by(book_type) %>%
summarise(
book_count = n(),
median_avg_rating = median(jungleaveragerating),
mean_avg_rating = mean(jungleaveragerating),
median_total_reviews = median(jungletotalreviews),
median_sales_rank = median(junglesalesrank),
mean_sales_rank = mean(junglesalesrank)
)
print(performacnce_summary)
## # A tibble: 2 × 7
## book_type book_count median_avg_rating mean_avg_rating median_total_reviews
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Series 1442 4.6 4.55 6974.
## 2 Standalone 3213 4.6 4.53 5283
## # ℹ 2 more variables: median_sales_rank <dbl>, mean_sales_rank <dbl>
# plot the average differences between series and standalone books
performance_long <- performacnce_summary %>%
pivot_longer(
cols = c(median_total_reviews, median_sales_rank),
names_to = "metric",
values_to = "value"
) %>%
# Make the metric names more readable for the plot
mutate(metric = recode(metric,
"median_total_reviews" = "Median Reviews",
"median_sales_rank" = "Median Sales Rank"))
# Create the plot using facet_wrap to handle different scales
ggplot(performance_long, aes(x = book_type, y = value, color = book_type)) +
geom_segment(aes(xend = book_type, yend = 0), size = 1.5) +
geom_point(size = 7, stroke = 1.5) +
facet_wrap(~ metric, scales = "free_y") +
scale_color_manual(values = c("Series" = "#8278ff", "Standalone" = "#e40058")) +
labs(
title = "Series vs. Standalone Book Performance",
subtitle = "Note: Lower Sales Rank is Better",
x = "Book Type",
y = "Median Value"
) +
theme(
plot.background = element_rect(fill = "black", color = "black"),
panel.background = element_rect(fill = "black", color = "black"),
strip.background = element_rect(fill = "#333333"),
panel.grid.major = element_line(color = "#444444"),
panel.grid.minor = element_blank(),
panel.spacing = unit(2, "lines"), # Add space between panels
text = element_text(color = "white"),
plot.title = element_text(color = "white", hjust = 0.5, size = 16),
plot.subtitle = element_text(color = "white", hjust = 0.5),
axis.title = element_text(color = "white"),
axis.text = element_text(color = "white", size = 10),
strip.text = element_text(color = "white", size = 12),
legend.position = "none" # Remove legend as the color is redundant with x-axis
)
# remove extra columns
books_concise <- books_clean %>%
select(c("isbn13", "title", "author", "narrator", "publisherlabel", "pagelength",
"datepublished", "jungletotalreviews", "jungleaveragerating",
"junglecategories", "seriesorder", "serieslength", "price", "junglesalesrank",
"book_type"))
# see if review count is associated with sales rank
books_to_plot <- books_concise %>%
filter(jungletotalreviews > 0, junglesalesrank > 0,
!is.na(jungletotalreviews), !is.na(junglesalesrank)) %>%
filter(jungleaveragerating < 5)
# plot sales rank vs total reviews
ggplot(books_to_plot, aes(x = jungletotalreviews, y = junglesalesrank)) +
geom_point(alpha = 0.2, color = "gray80") +
geom_smooth(method = "lm", formula = y ~ x, color = "#e40058") +
scale_x_log10(labels = scales::comma) +
scale_y_log10(labels = scales::comma) +
labs(
title = NULL,
# subtitle = "Books with a better sales rank have more reviews",
x = "Jungle Total Reviews (Log Scale)",
y = "Jungle Sales Rank (Log Scale)"
) +
theme(
plot.background = element_rect(fill = "black", color = "black"),
panel.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "#444444"),
panel.grid.minor = element_blank(),
text = element_text(color = "white"),
plot.title = element_text(color = "white", hjust = 0.5),
plot.subtitle = element_text(color = "white", hjust = 0.5),
axis.title = element_text(color = "white"),
axis.text = element_text(color = "white")
)
# plot sales rank vs rating
ggplot(books_to_plot, aes(x = jungleaveragerating, y = junglesalesrank)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm", formula = y ~ x, color = "#e40058") + # linear trend line
scale_x_log10(labels = scales::comma) + # Log scale for reviews
scale_y_log10(labels = scales::comma) + # Log scale for rank
labs(
title = "Sales Rank vs. Average Rating",
subtitle = "Books with a better sales rank have slightly higher ratings",
x = "Jungle Average Rating (Log Scale)",
y = "Jungle Sales Rank (Log Scale)"
) +
theme_minimal()
# heatmap to find best predictor
# use only numeric data
numeric_data <- books_concise %>%
select(where(is.numeric))
# create matrix to feed to heatmap
# pairwise handles missing values well
# spearman nicer to outliers
cor_matrix <- cor(numeric_data, method = "spearman", use = "pairwise.complete.obs")
# crete correlation chart for numerical data
long_cor_data <- cor_matrix %>%
as.data.frame() %>%
rownames_to_column("variable1") %>%
pivot_longer(
cols = -variable1,
names_to = "variable2",
values_to = "correlation"
)
heatmap_colors <- c("#e40058", "white", "#8278ff")
ggplot(long_cor_data, aes(x = variable1, y = variable2, fill = correlation)) +
# Create the heatmap tiles
geom_tile(color = "black") + # Add black borders for tile separation
# Add the correlation numbers on top
geom_text(
aes(label = sprintf("%.2f", correlation)),
color = "black",
size = 3
) +
# Use a diverging color scale for correlation data
scale_fill_gradient2(
low = heatmap_colors[1], # Negative correlation color
mid = heatmap_colors[2], # Zero correlation color
high = heatmap_colors[3], # Positive correlation color
midpoint = 0,
limits = c(-1, 1),
name = "Correlation"
) +
# Add title and labels
labs(
title = "Feature Relationships",
x = "",
y = ""
) +
# Apply the custom dark theme
theme(
# Set background colors
plot.background = element_rect(fill = "black", color = "black"),
panel.background = element_rect(fill = "black", color = "black"),
legend.background = element_rect(fill = "black"),
legend.key = element_rect(fill = "black"),
# Remove panel grid and axis ticks
panel.grid = element_blank(),
axis.ticks = element_blank(),
# Set text colors to white
text = element_text(color = "white"),
plot.title = element_text(color = "white", hjust = 0.5, size = 16),
axis.text = element_text(color = "white", size = 10),
legend.title = element_text(color = "white"),
legend.text = element_text(color = "white"),
# Rotate x-axis labels to prevent overlap
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
)
# 0.0 - 0.3: Weak
# 0.3 - 0.6 Moderate
# 0.6 - 1.0 Strong
# Not many high correlation values. Series length and series order have the highest cor.
# sales rank and total reviews have the 2nd highest but is still a weak relationship
# price has a 0.0 correlation with sales rank!
# higher rated books are not typically more expensive
Average rating of a Literature & Fiction Book
# change incorrect ratings above 5 to NA
# books_clean$jungleaveragerating[books_clean$jungleaveragerating > 5] <- NA
# function to quickly create dfs for all major categories
create_genre_df <- function(df, genre_name) {
genre_df <- df %>%
select(c("isbn13", "title", "author", "narrator", "publisherlabel", "pagelength",
"datepublished", "jungletotalreviews", "jungleaveragerating",
"junglecategories", "seriesorder", "serieslength", "price", "junglesalesrank",
"book_type")) %>%
filter(grepl(genre_name, junglecategories, ignore.case = TRUE)) %>%
mutate(genre = genre_name)
return(genre_df)
}
# create dfs for top 10 genres
lit_fic <- create_genre_df(books_clean, "Literature & Fiction")
scifi_fant <- create_genre_df(books_clean, "Science Fiction & Fantasy")
myst_thrill <- create_genre_df(books_clean, "Mystery, Thriller & Suspense")
romance <- create_genre_df(books_clean, "Romance")
hist_fict <- create_genre_df(books_clean, "Historical Fiction")
history <- create_genre_df(books_clean, "History >")
rela_pd <- create_genre_df(books_clean, "Relationships, Parenting & Personal Development")
self_help <- create_genre_df(books_clean, "Self-Help")
health <- create_genre_df(books_clean, "Health, Fitness & Dieting")
humor <- create_genre_df(books_clean, "Humor")
# find means for each genre
fic_means <- colMeans(lit_fic[, sapply(lit_fic, is.numeric)], na.rm = TRUE)
scifi_means <- colMeans(scifi_fant[, sapply(scifi_fant, is.numeric)], na.rm = TRUE)
myst_means <- colMeans(myst_thrill[, sapply(myst_thrill, is.numeric)], na.rm = TRUE)
romance_means <- colMeans(romance[, sapply(romance, is.numeric)], na.rm = TRUE)
hist_fic_means <- colMeans(hist_fict[, sapply(hist_fict, is.numeric)], na.rm = TRUE)
history_means <- colMeans(history[, sapply(history, is.numeric)], na.rm = TRUE)
rela_pd_means <- colMeans(rela_pd[, sapply(rela_pd, is.numeric)], na.rm = TRUE)
self_help_means <- colMeans(self_help[, sapply(self_help, is.numeric)], na.rm = TRUE)
health_means <- colMeans(health[, sapply(health, is.numeric)], na.rm = TRUE)
humor_means <- colMeans(humor[, sapply(humor, is.numeric)], na.rm = TRUE)
# combine results into one beautiful df
all_cat_means <- rbind(fic_means, scifi_means, myst_means, romance_means,
hist_fic_means, history_means, rela_pd_means,
self_help_means, health_means, humor_means)
cleaned_means_df <- all_cat_means %>%
as.data.frame() %>%
rownames_to_column(var = "genre_code") %>%
mutate(
Genre = case_when(
genre_code == "fic_means" ~ "Literature & Fiction",
genre_code == "scifi_means" ~ "Science Fiction",
genre_code == "myst_means" ~ "Mystery & Suspense",
genre_code == "romance_means" ~ "Romance",
genre_code == "hist_fic_means"~ "Historical Fiction",
genre_code == "history_means" ~ "History",
genre_code == "rela_pd_means" ~ "Religion & Spirituality",
genre_code == "self_help_means" ~ "Self-Help",
genre_code == "health_means" ~ "Health, Fitness & Dieting",
genre_code == "humor_means" ~ "Humor",
TRUE ~ genre_code
),
.before = 1
) %>%
select(-genre_code)
# create kable table
# Define base palette
pink_scale <- c("black", "#e40058")
# reverse scale for salesrank
rev_pink_scale <- rev(pink_scale)
# function for kable styling
colored_cell <- function(x, palette, reverse = FALSE) {
pal <- if (reverse) rev(palette) else palette
cell_spec(
round(x, 2),
format = "html",
color = "white",
background = col_numeric(pal, domain = range(x, na.rm = TRUE))(x),
extra_css = "display: inline-block; width: 100%; padding: 2px;"
)
}
# Apply styling column by column
styled_df <- cleaned_means_df %>%
mutate(
pagelength = colored_cell(pagelength, pink_scale),
jungletotalreviews = colored_cell(jungletotalreviews, pink_scale),
jungleaveragerating = colored_cell(jungleaveragerating, pink_scale),
seriesorder = colored_cell(seriesorder, pink_scale),
serieslength = colored_cell(serieslength, pink_scale),
price = colored_cell(price, pink_scale),
junglesalesrank = colored_cell(junglesalesrank, pink_scale, reverse = TRUE)
)
# Display kable
Performance_table <- styled_df %>%
kbl(
escape = FALSE,
caption = "Mean Performance Metrics Across Book Genres",
digits = 2
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE
)
Top Ranking Titles, Narrators, Authors in Sales Ranking and Average Review Score
books_clean %>% filter(!is.na(junglesalesrank)) %>%
arrange(junglesalesrank) %>%
head(., 2)
## # A tibble: 2 × 29
## filedate country formattype asin isbn13 isbn10 title author
## <dttm> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2021-05-09 00:00:00 aus audio E3J1A9E0SX 8HAAOIW… XSELU… Syne… Marga…
## 2 2021-05-09 00:00:00 aus audio IQ0K6SHW5H G2L4KWJ… H2OWD… Opti… Dr. C…
## # ℹ 21 more variables: narrator <chr>, publisherlabel <chr>, formats <chr>,
## # pagelength <dbl>, datepublished <dttm>, Berrydistributor <chr>,
## # jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>,
## # Berrytotalreviews <dbl>, Berryaveragerating <dbl>,
## # maryandwilliamtotalreviews <dbl>, maryandwilliamaveragerating <dbl>,
## # price <dbl>, jungleprice <dbl>, Berryprice <dbl>, …
lit_fic %>% arrange(junglesalesrank) %>%
head(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 SMB1TZL1Y… Face… Annet… Debra S… Wilcox LLC 897 2024-02-06 00:00:00
## 2 UEJ53TIQG… Open… Lacey… William… Rodriguez, Ca… 1282 2023-05-02 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
scifi_fant %>% arrange(junglesalesrank) %>%
head(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 FOJSOZJUD… User… Charl… Lisa Ba… Young, Moore … 1696 2023-11-07 00:00:00
## 2 UEJ53TIQG… Open… Lacey… William… Rodriguez, Ca… 1282 2023-05-02 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
myst_thrill %>% arrange(junglesalesrank) %>%
head(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 7H9G6YYN0… Fund… Miche… Jason T… Parker-Johnson 556 2024-01-02 00:00:00
## 2 I98JWEAQM… Gras… Amand… Jessica… Robinson, New… 1200 2024-05-21 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
romance %>% arrange(junglesalesrank) %>%
head(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 8HAAOIWK2… Syne… Marga… Joseph … Boyd-Mason 683 2024-04-23 00:00:00
## 2 1P2PVJFWL… Mand… Arthu… Joe Sol… Burnett Group 1782 2024-01-30 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
hist_fict %>% arrange(junglesalesrank) %>%
head(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 SMB1TZL1Y… Face… Annet… Debra S… Wilcox LLC 897 2024-02-06 00:00:00
## 2 XKA2XED83… Assi… Allis… Bonnie … Thompson, Mas… 586 2022-04-26 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
history %>% arrange(junglesalesrank) %>%
head(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 Z68QR1268… Down… Amand… Patrici… Perez, Petty … 335 2018-10-16 00:00:00
## 2 A4KS2CXGJ… Impl… Dana … Randy G… Webb LLC 1262 2006-12-31 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
rela_pd %>% arrange(junglesalesrank) %>%
head(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 G2L4KWJSN… Opti… Dr. C… Chelsey… Warren and So… 344 2022-04-01 00:00:00
## 2 Z68QR1268… Down… Amand… Patrici… Perez, Petty … 335 2018-10-16 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
self_help %>% arrange(junglesalesrank) %>%
head(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 G2L4KWJSN… Opti… Dr. C… Chelsey… Warren and So… 344 2022-04-01 00:00:00
## 2 Z68QR1268… Down… Amand… Patrici… Perez, Petty … 335 2018-10-16 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
health %>% arrange(junglesalesrank) %>%
head(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 Z68QR1268… Down… Amand… Patrici… Perez, Petty … 335 2018-10-16 00:00:00
## 2 TFJO646GN… Dece… Charl… Randy J… Morgan-Douglas 317 2016-09-13 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
humor %>% arrange(junglesalesrank) %>%
head(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 A4KS2CXGJ… Impl… Dana … Randy G… Webb LLC 1262 2006-12-31 00:00:00
## 2 SVYFXMKII… Trip… Kelly… Donna S… Carr, Davidso… 1670 2020-03-03 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
# worst performing books by genre
# make cover art and display on slide
books_clean %>% filter(!is.na(junglesalesrank)) %>%
arrange(junglesalesrank) %>%
tail(., 2)
## # A tibble: 2 × 29
## filedate country formattype asin isbn13 isbn10 title author
## <dttm> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2021-05-09 00:00:00 aus audio UUO6XFLEP5 EUQSXRT… 7HABX… Mult… Rober…
## 2 2021-05-09 00:00:00 aus audio IKF6P5OODD EBHF1W8… 7KIES… Seam… Natal…
## # ℹ 21 more variables: narrator <chr>, publisherlabel <chr>, formats <chr>,
## # pagelength <dbl>, datepublished <dttm>, Berrydistributor <chr>,
## # jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>,
## # Berrytotalreviews <dbl>, Berryaveragerating <dbl>,
## # maryandwilliamtotalreviews <dbl>, maryandwilliamaveragerating <dbl>,
## # price <dbl>, jungleprice <dbl>, Berryprice <dbl>, …
lit_fic %>% filter(!is.na(junglesalesrank)) %>%
arrange(junglesalesrank) %>%
tail(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 K8WM0EGXK… Moni… Jerry… Rebecca… Adams LLC 463 2024-05-31 00:00:00
## 2 3UX6KOMA8… Stan… James… Nicole … Phillips-Mill… 1109 2024-05-20 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
scifi_fant %>% filter(!is.na(junglesalesrank)) %>%
arrange(junglesalesrank) %>%
tail(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 9GJDEP056… Orga… Sarah… Katelyn… Sims Group 829 2021-05-14 00:00:00
## 2 EUQSXRTOD… Mult… Rober… Dustin … Lawson Ltd 1217 2024-05-30 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
myst_thrill %>% filter(!is.na(junglesalesrank)) %>%
arrange(junglesalesrank) %>%
tail(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 BKW4C8XM7… User… John … Maurice… Griffin PLC 653 2023-06-06 00:00:00
## 2 K8WM0EGXK… Moni… Jerry… Rebecca… Adams LLC 463 2024-05-31 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
romance %>% filter(!is.na(junglesalesrank)) %>%
arrange(junglesalesrank) %>%
tail(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 QOHR0JFTH… Mult… Saman… Kylie J… Harrington, J… 447 2024-05-31 00:00:00
## 2 3UX6KOMA8… Stan… James… Nicole … Phillips-Mill… 1109 2024-05-20 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
hist_fict %>% filter(!is.na(junglesalesrank)) %>%
arrange(junglesalesrank) %>%
tail(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 MVOV5F9FR… Futu… Keith… William… Kidd LLC 787 2019-08-20 00:00:00
## 2 XOBOL5RAQ… Stre… Emily… Joseph … Dougherty, Ga… 650 2018-05-22 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
history %>% filter(!is.na(junglesalesrank)) %>%
arrange(junglesalesrank) %>%
tail(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 M2IOA0J8J… Trip… Baile… Carrie … Schroeder-Lop… 716 2024-01-16 00:00:00
## 2 91JN5YUCW… Busi… Felic… Kelly M… Lopez-Pena 1000 2022-07-12 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
rela_pd %>% filter(!is.na(junglesalesrank)) %>%
arrange(junglesalesrank) %>%
tail(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 AN0J797KT… Reve… Richa… Tony Wa… Lopez-Hill 660 2024-05-07 00:00:00
## 2 9QXV2I76A… Busi… Georg… Patty W… Patrick, Oroz… 373 2016-05-03 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
self_help %>% filter(!is.na(junglesalesrank)) %>%
arrange(junglesalesrank) %>%
tail(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 9QXV2I76A… Busi… Georg… Patty W… Patrick, Oroz… 373 2016-05-03 00:00:00
## 2 EBHF1W8KR… Seam… Natal… Travis … Wagner, Spear… 52 2019-06-05 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>
health %>% filter(!is.na(junglesalesrank)) %>%
arrange(junglesalesrank) %>%
tail(., 2)
## # A tibble: 2 × 16
## isbn13 title author narrator publisherlabel pagelength datepublished
## <chr> <chr> <chr> <chr> <chr> <dbl> <dttm>
## 1 Q8S4JZCLS… Func… Edwar… Pamela … Shaw-Nelson 292 2021-03-02 00:00:00
## 2 L93JKHIC8… Digi… Micha… Richard… Morris and So… 465 2024-05-14 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## # junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## # junglesalesrank <dbl>, book_type <chr>, genre <chr>