Audiobook Data Assessment

skim_without_charts(assessment.data)

Data summary
Name	assessment.data
Number of rows	5000
Number of columns	33
_______________________
Column type frequency:
character	12
logical	1
numeric	18
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
country	0	1.00	3	3	1
formattype	0	1.00	5	5	1
asin	0	1.00	10	10	5000
isbn13	0	1.00	13	13	5000
isbn10	0	1.00	10	10	5000
title	0	1.00	17	52	4959
author	0	1.00	6	25	4809
narrator	0	1.00	8	25	4842
publisherlabel	0	1.00	6	34	4494
formats	0	1.00	15	31	3
Berrydistributor	4566	0.09	9	9	1
junglecategories	91	0.98	9	11053	4879

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
maryandwilliamdistributor	5000	0	NaN	:

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
pagelength	0	1	811.66	647.48	0	498.00	668.00	928.00	15878.00
jungletotalreviews	0	1	14142.31	27627.10	0	1113.75	5023.00	15083.25	627087.00
jungleaveragerating	0	1	4.43	0.74	0	4.40	4.60	4.70	9.90
seriesorder	0	1	1.21	3.48	0	0.00	0.00	1.00	58.00
serieslength	0	1	2.27	5.50	0	0.00	0.00	3.00	60.00
Berrytotalreviews	0	1	8.02	46.75	0	0.00	0.00	0.00	1500.00
Berryaveragerating	0	1	46.01	135.32	-12	0.00	0.00	0.00	500.00
maryandwilliamtotalreviews	0	1	0.02	1.41	0	0.00	0.00	0.00	100.00
maryandwilliamaveragerating	0	1	0.00	0.01	0	0.00	0.00	0.00	1.00
price	0	1	21.28	9.21	0	15.54	19.80	25.01	104.10
jungleprice	0	1	20.89	8.66	-1	16.40	19.69	23.88	85.81
Berryprice	0	1	2.41	6.54	0	0.00	0.00	0.00	42.99
maryandwilliamprice	0	1	0.00	0.01	0	0.00	0.00	0.00	1.00
listprice	1	1	0.00	0.13	0	0.00	0.00	0.00	9.00
junglelistprice	0	1	-0.02	1.40	-99	0.00	0.00	0.00	0.00
Berrylistprice	1	1	0.00	0.00	0	0.00	0.00	0.00	0.00
maryandwilliamlistprice	0	1	0.00	0.11	0	0.00	0.00	0.00	8.00
junglesalesrank	0	1	3646.21	5610.62	-99999	1245.19	2906.51	4897.89	191574.91

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
filedate	0	1	2021-05-09	2021-05-09	2021-05-09	1
datepublished	0	1	1998-12-27	2024-05-31	2021-06-29	1726

sapply(assessment.data[sapply(assessment.data, is.numeric)], range, na.rm = TRUE)

##      pagelength jungletotalreviews jungleaveragerating seriesorder serieslength
## [1,]          0                  0                 0.0           0            0
## [2,]      15878             627087                 9.9          58           60
##      Berrytotalreviews Berryaveragerating maryandwilliamtotalreviews
## [1,]                 0                -12                          0
## [2,]              1500                500                        100
##      maryandwilliamaveragerating price jungleprice Berryprice
## [1,]                           0   0.0       -1.00       0.00
## [2,]                           1 104.1       85.81      42.99
##      maryandwilliamprice listprice junglelistprice Berrylistprice
## [1,]                   0         0             -99              0
## [2,]                   1         9               0              0
##      maryandwilliamlistprice junglesalesrank
## [1,]                       0        -99999.0
## [2,]                       8        191574.9

# assessment.data contains 5,000 rows and 33 columns
# all filedates are 2021-05-09
# datepublished range 1998-12-27 to 2024-05-31

# strange results:
# maryandwilliamdistributor is completely empty
# junglelistprice has a mean of -0.0244
# Berryaveragerating contains a row with -12
# jungleprice contains a row with -1
# junglesalesrank contains rows with -1 and -99999
# all price column values are missing or incorrect for 23 rows

# explore columns
table(assessment.data$country) # looks like all from aus

## 
##  aus 
## 5000

table(assessment.data$formattype) # all audio

## 
## audio 
##  5000

# data frame to see how authors are distributed. Some authors represented more than once
authors.count <- data.frame(sort(table(assessment.data$author, useNA = 'always'), decreasing = TRUE))

# doing the same for narrators. Michael Smith is the most prominent narrator with 6 books
narrators.count <- data.frame(sort(table(assessment.data$narrator, useNA = 'always'),decreasing = TRUE))

table(duplicated(assessment.data$isbn10)) # isbn10 is unique

## 
## FALSE 
##  5000

table(duplicated(assessment.data$isbn13)) # isbn13 also unique

## 
## FALSE 
##  5000

length(unique(assessment.data$isbn10, assessment.data$isbn13)) # 5,000 unique rows for both isbns

## [1] 5000

# isbn10 and isbn13 are effectively the same for this analysis

table(assessment.data$listprice) #... strange result

## 
##    0    9 
## 4998    1

range(assessment.data$listprice) # strange again "NA NA"

## [1] NA NA

hist(assessment.data$listprice) # x must be numeric?

class(assessment.data$listprice) # was imported as list... Let's look for something strange in the source data

## [1] "numeric"

# one cell of ListPrice has "&&" in it. Let's delete that and reimport the file.

Data Cleanup

# delete empty columns: 
books_clean <- assessment.data %>% 
  select(-c(maryandwilliamdistributor))

# delete incorrect value columns:
books_clean <- books_clean %>% 
  select(-c(
      "listprice",
      "junglelistprice", 
      "Berrylistprice", 
      "maryandwilliamlistprice"))

# change missing and incorrect values to NA
columns_to_edit <- c(
      "Berryaveragerating",
      "jungleaveragerating",
      "maryandwilliamaveragerating",
      "jungleprice",
      "price",
      "Berryprice",
      "maryandwilliamprice",
      "junglesalesrank",
      "pagelength"
)

books_clean <- books_clean %>% 
  mutate(
    across(all_of(columns_to_edit),
           ~ ifelse(.x < 0.01, NA, .x))
  )

# ~15 books have an average rating score but 0 reviews. I'll change these rating scores to NA
books_clean$jungleaveragerating[which(books_clean$jungletotalreviews == 0)] <- NA

# 3 books have an average rating higher than 5. Change to NA
books_clean$jungleaveragerating[which(books_clean$jungleaveragerating > 5)] <- NA
skim_without_charts(books_clean)

Data summary
Name	books_clean
Number of rows	5000
Number of columns	28
_______________________
Column type frequency:
character	12
numeric	14
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
country	0	1.00	3	3	1
formattype	0	1.00	5	5	1
asin	0	1.00	10	10	5000
isbn13	0	1.00	13	13	5000
isbn10	0	1.00	10	10	5000
title	0	1.00	17	52	4959
author	0	1.00	6	25	4809
narrator	0	1.00	8	25	4842
publisherlabel	0	1.00	6	34	4494
formats	0	1.00	15	31	3
Berrydistributor	4566	0.09	9	9	1
junglecategories	91	0.98	9	11053	4879

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
pagelength	91	0.98	826.71	643.86	24.00	507.00	674.00	937.00	15878.00
jungletotalreviews	0	1.00	14142.31	27627.10	0.00	1113.75	5023.00	15083.25	627087.00
jungleaveragerating	132	0.97	4.53	0.22	2.50	4.40	4.60	4.70	5.00
seriesorder	0	1.00	1.21	3.48	0.00	0.00	0.00	1.00	58.00
serieslength	0	1.00	2.27	5.50	0.00	0.00	0.00	3.00	60.00
Berrytotalreviews	0	1.00	8.02	46.75	0.00	0.00	0.00	0.00	1500.00
Berryaveragerating	4478	0.10	440.73	37.46	100.00	430.00	450.00	460.00	500.00
maryandwilliamtotalreviews	0	1.00	0.02	1.41	0.00	0.00	0.00	0.00	100.00
maryandwilliamaveragerating	4999	0.00	1.00	NA	1.00	1.00	1.00	1.00	1.00
price	23	1.00	21.38	9.12	1.02	15.61	19.83	25.02	104.10
jungleprice	114	0.98	21.38	8.14	1.25	17.05	19.69	24.04	85.81
Berryprice	4325	0.14	17.86	6.37	3.99	14.99	16.99	19.99	42.99
maryandwilliamprice	4999	0.00	1.00	NA	1.00	1.00	1.00	1.00	1.00
junglesalesrank	115	0.98	3752.51	5449.70	1.10	1357.38	2980.19	4956.94	191574.91

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
filedate	0	1	2021-05-09	2021-05-09	2021-05-09	1
datepublished	0	1	1998-12-27	2024-05-31	2021-06-29	1726

# View(books_clean[,c("jungleaveragerating","jungletotalreviews")])

Genre Binning

# extract list of unique categories from junglecategories
all_genres <- books_clean$junglecategories %>% 
  str_split(";") %>% # split each row on ;
  unlist() %>%       # flatten to one vector
  str_trim() %>%     # remove leading/trailing values
  unique()          # keep only unique values

# genre still has irrelevant leading values
# remove "See Top 100"
cleaned_genres <- all_genres %>% 
  str_remove("See Top 100.*") %>% 
  str_trim() %>% 
  purrr::discard(~ is.na(.x) || .x == "") # remove NA and empty strings

genre_df <- data.frame(genre = cleaned_genres, stringsAsFactors = FALSE) # turn into a df

genre_df_clean <- genre_df %>% 
  mutate(
    genre = str_remove(genre, "^Books\\s*>\\s*") # remove leading "Books >"
  )

genre_df_clean <- unique(genre_df_clean) # df of unique genres

# there are still unneeded "Space Books & Originals" etc in some rows that I want to remove
genre_df_clean <- genre_df_clean %>% 
  mutate(
    genre = str_remove_all(genre, "Space Books & Originals\\s*>\\s*"), #AI helped with syntax 
    genre = str_remove_all(genre, "\\(Space Books & Originals\\)"),
    genre = str_remove_all(genre, "Space Audiobooks\\s*>\\s*"),
    genre = str_replace_all(genre, "^Space Books & Originals$", ""), # removes entire rows 
    genre = str_replace_all(genre, "^Space Audiobooks$", ""), # removes entire rows
    genre = str_replace_all(genre, "^Art$", ""),
    genre = str_squish(genre)  # clean up extra spaces
  ) %>%
  filter(!is.na(genre), genre != "") %>%  # remove NA and empty rows
  distinct()  # remove duplicate rows
# I am left with 4965 genres down from 6030

# I want to count how many times each unique genre appears in books_clean$junglecategories
# grepl returns true or false whether the phrase matchs the specific expression
genre_df_clean$genre_count <- sapply(genre_df_clean$genre, function(g) sum(grepl(g, books_clean$junglecategories, ignore.case = TRUE)))

# Still some vague values listed: "Books", "Suspense", "Teens", "Science Fiction" (should be Science Fiction & Fantasy) and "Science" I'll remove those rows.
remove_genres <- c(
  "Books", "Suspense", "Teens", "Science Fiction", "Fantasy", "Science", "Art")

# filter genres that are not an exact match for values in remove_genres 
genre_df_clean <- genre_df_clean %>% 
  filter(!genre %in% remove_genres) %>% 
  arrange(desc(genre_count))

# find top main genres (everyting before the >)
top_genres <- genre_df_clean %>% 
  filter(!str_detect(genre, ">")) %>% 
  filter(!genre %in% c("Fiction & Literature", "Thrillers & Suspense",
                    "Relationships", "Mysteries", "Alternate History")) %>% 
  slice_max(genre_count, n = 10)

Insights

ggplot(top_genres, aes(x = reorder(genre, genre_count), y = genre_count)) +
  geom_segment(aes(xend = genre, yend = 0), color = "gray50") +
  geom_point(size = 4, color = "#e40058") +
  coord_flip() +
  scale_y_continuous(
    limits = c(0, 3000), 
    breaks = seq(from = 0, to = 3000, by = 500)
  ) +
  labs(
    title = "This Graph is Mostly Fiction",
    x = "Category",
    y = "Number of Audiobooks"
  ) +
  theme(
    plot.background = element_rect(fill = "black", color = "black"),
    panel.background = element_rect(fill = "black", color = "black"),
    panel.grid.major = element_line(color = "#444444"),
    panel.grid.minor = element_blank(),

    text = element_text(color = "white"),
    plot.title = element_text(color = "white", hjust = 0.5),
    axis.title = element_text(color = "white"),
    axis.text = element_text(color = "white"),

        axis.text.y = element_text(angle = 0, hjust = 1), # Right-justify the text
    plot.margin = margin(t = 10, b = 10, l = 25,)
  )

top_genres$genre_precent_of_whole <- (top_genres$genre_count / 5000)*100

The makeup of our 5,000 row list of unique audiobooks is more than 55% Literature & Fiction and 34% Science Fiction & Fantasy.

# find average jungle ranking by genre
# select top 100 
genre_rank <- books_clean %>% 
  slice_min(junglesalesrank, n = 100) %>% 
  select(c("isbn13","author", "narrator", "title", "publisherlabel", "datepublished",
           "jungletotalreviews", "jungleaveragerating", "junglecategories",
           "price", "junglesalesrank"))

# create column counting the number of times the top 10 most popular genres appear in the junglecategories column for the top 100 junglesalesrank books
top_genres$genre_rank_count <- sapply(top_genres$genre, function(g) sum(grepl(g, genre_rank$junglecategories, ignore.case = TRUE)))

# sum(top_genres$genre_rank_count) # sum is 269. Some genres must appear twice.

# convert to long format for plotting
top_genres_long <- top_genres %>% 
  select(-genre_count) %>% 
  pivot_longer(
  cols = c(genre_rank_count, genre_precent_of_whole),
               names_to = "metric",
               values_to = "value")

# plotting each genre as a percent of the entire dataset against its representation in the top 100 best selling books

ggplot(top_genres_long, aes(x = value, y = genre)) +
  geom_col(aes(color = metric, fill = metric), position = position_dodge(0.8), width = 0.7) +
    scale_color_manual(
    name = "Metric",
    values = c("#8278ff", "#e40058"),
    labels = c("% of data", "% of Top Ranked 100"),
    guide = "none"
  ) +
  scale_fill_manual(
    name = "Metric",
    values = c("#8278ff", "#e40058"),
    labels = c("% of All Books", "% of Top Ranked 100")
  ) +
  labs(
    title = "Genre Composition", 
    x = "%", 
    y = "Genre"
  ) +
  
 # Apply the dark theme, including legend styling
  theme(
    # Set background colors to black
    plot.background = element_rect(fill = "black", color = "black"),
    panel.background = element_rect(fill = "black", color = "black"),

    # Set grid colors to a dark gray
    panel.grid.major = element_line(color = "#444444"),
    panel.grid.minor = element_blank(),

    # Set text colors to white
    text = element_text(color = "white"),
    plot.title = element_text(color = "white", hjust = 0.5),
    axis.title = element_text(color = "white"),
    axis.text = element_text(color = "white"),

    # Style the legend to match the dark theme
    legend.background = element_rect(fill = "black"),
    legend.key = element_rect(fill = "black"),
    legend.title = element_text(color = "white"),
    legend.text = element_text(color = "white")
  )

# do book series outperform standalone novels?
# create new column labeling series or standalone books
books_clean <- books_clean %>% 
  mutate(book_type = case_when(
    serieslength > 1 ~ "Series",
    TRUE ~ "Standalone"
  ))

# create performance summary with average rating, total reviews, sales rank, price
# I only want to consider books with > 50 reviews
performacnce_summary <- books_clean %>% 
  filter(!is.na(jungleaveragerating), !is.na(junglesalesrank), !is.na(jungletotalreviews)) %>% 
  filter(jungleaveragerating > 0, junglesalesrank > 0, jungletotalreviews > 50) %>% 
  group_by(book_type) %>% 
  summarise(
    book_count = n(),
    median_avg_rating = median(jungleaveragerating),
    mean_avg_rating = mean(jungleaveragerating),
    median_total_reviews = median(jungletotalreviews),
    median_sales_rank = median(junglesalesrank),
    mean_sales_rank = mean(junglesalesrank)
  )

print(performacnce_summary)

## # A tibble: 2 × 7
##   book_type  book_count median_avg_rating mean_avg_rating median_total_reviews
##   <chr>           <int>             <dbl>           <dbl>                <dbl>
## 1 Series           1442               4.6            4.55                6974.
## 2 Standalone       3213               4.6            4.53                5283 
## # ℹ 2 more variables: median_sales_rank <dbl>, mean_sales_rank <dbl>

# plot the average differences between series and standalone books
performance_long <- performacnce_summary %>%
  pivot_longer(
    cols = c(median_total_reviews, median_sales_rank),
    names_to = "metric",
    values_to = "value"
  ) %>%
  # Make the metric names more readable for the plot
  mutate(metric = recode(metric,
                         "median_total_reviews" = "Median Reviews",
                         "median_sales_rank" = "Median Sales Rank"))

# Create the plot using facet_wrap to handle different scales
ggplot(performance_long, aes(x = book_type, y = value, color = book_type)) +
  geom_segment(aes(xend = book_type, yend = 0), size = 1.5) +
  geom_point(size = 7, stroke = 1.5) +
  facet_wrap(~ metric, scales = "free_y") +
  scale_color_manual(values = c("Series" = "#8278ff", "Standalone" = "#e40058")) +

  labs(
    title = "Series vs. Standalone Book Performance",
    subtitle = "Note: Lower Sales Rank is Better",
    x = "Book Type",
    y = "Median Value"
  ) +
  theme(
    plot.background = element_rect(fill = "black", color = "black"),
    panel.background = element_rect(fill = "black", color = "black"),
    strip.background = element_rect(fill = "#333333"),

    panel.grid.major = element_line(color = "#444444"),
    panel.grid.minor = element_blank(),
    panel.spacing = unit(2, "lines"), # Add space between panels

    text = element_text(color = "white"),
    plot.title = element_text(color = "white", hjust = 0.5, size = 16),
    plot.subtitle = element_text(color = "white", hjust = 0.5),
    axis.title = element_text(color = "white"),
    axis.text = element_text(color = "white", size = 10),
    strip.text = element_text(color = "white", size = 12),
    legend.position = "none" # Remove legend as the color is redundant with x-axis
  )

What are the attributes of a top ranked book?

# remove extra columns
books_concise <- books_clean %>% 
  select(c("isbn13", "title", "author", "narrator", "publisherlabel", "pagelength",
           "datepublished", "jungletotalreviews", "jungleaveragerating",
           "junglecategories", "seriesorder", "serieslength", "price", "junglesalesrank",
           "book_type"))

# see if review count is associated with sales rank
books_to_plot <- books_concise %>%
  filter(jungletotalreviews > 0, junglesalesrank > 0, 
         !is.na(jungletotalreviews), !is.na(junglesalesrank)) %>% 
  filter(jungleaveragerating < 5)

# plot sales rank vs total reviews
ggplot(books_to_plot, aes(x = jungletotalreviews, y = junglesalesrank)) +
  geom_point(alpha = 0.2, color = "gray80") + 
  geom_smooth(method = "lm", formula = y ~ x, color = "#e40058") +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  labs(
    title = NULL,
    # subtitle = "Books with a better sales rank have more reviews",
    x = "Jungle Total Reviews (Log Scale)",
    y = "Jungle Sales Rank (Log Scale)"
  ) +
  theme(
    plot.background = element_rect(fill = "black", color = "black"),
    panel.background = element_rect(fill = "black", color = "black"),
    panel.grid.major = element_line(color = "#444444"),
    panel.grid.minor = element_blank(),
    text = element_text(color = "white"),
    plot.title = element_text(color = "white", hjust = 0.5),
    plot.subtitle = element_text(color = "white", hjust = 0.5),
    axis.title = element_text(color = "white"),
    axis.text = element_text(color = "white")
  )

# plot sales rank vs rating
ggplot(books_to_plot, aes(x = jungleaveragerating, y = junglesalesrank)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", formula = y ~ x, color = "#e40058") + # linear trend line
  scale_x_log10(labels = scales::comma) + # Log scale for reviews
  scale_y_log10(labels = scales::comma) + # Log scale for rank
  labs(
    title = "Sales Rank vs. Average Rating",
    subtitle = "Books with a better sales rank have slightly higher ratings",
    x = "Jungle Average Rating (Log Scale)",
    y = "Jungle Sales Rank (Log Scale)"
  ) +
  theme_minimal()

# heatmap to find best predictor
# use only numeric data
numeric_data <- books_concise %>% 
    select(where(is.numeric))

# create matrix to feed to heatmap
# pairwise handles missing values well
# spearman nicer to outliers
cor_matrix <- cor(numeric_data, method = "spearman", use = "pairwise.complete.obs")

# crete correlation chart for numerical data
long_cor_data <- cor_matrix %>%
  as.data.frame() %>%
  rownames_to_column("variable1") %>%
  pivot_longer(
    cols = -variable1,
    names_to = "variable2",
    values_to = "correlation"
  )
heatmap_colors <- c("#e40058", "white", "#8278ff")

ggplot(long_cor_data, aes(x = variable1, y = variable2, fill = correlation)) +
  # Create the heatmap tiles
  geom_tile(color = "black") + # Add black borders for tile separation
  
  # Add the correlation numbers on top
  geom_text(
    aes(label = sprintf("%.2f", correlation)),
    color = "black",
    size = 3
  ) +
  
  # Use a diverging color scale for correlation data
  scale_fill_gradient2(
    low = heatmap_colors[1],   # Negative correlation color
    mid = heatmap_colors[2],   # Zero correlation color
    high = heatmap_colors[3],  # Positive correlation color
    midpoint = 0,
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  
  # Add title and labels
  labs(
    title = "Feature Relationships",
    x = "", 
    y = ""
  ) +
  
  # Apply the custom dark theme
  theme(
    # Set background colors
    plot.background = element_rect(fill = "black", color = "black"),
    panel.background = element_rect(fill = "black", color = "black"),
    legend.background = element_rect(fill = "black"),
    legend.key = element_rect(fill = "black"),

    # Remove panel grid and axis ticks
    panel.grid = element_blank(),
    axis.ticks = element_blank(),
    
    # Set text colors to white
    text = element_text(color = "white"),
    plot.title = element_text(color = "white", hjust = 0.5, size = 16),
    axis.text = element_text(color = "white", size = 10),
    legend.title = element_text(color = "white"),
    legend.text = element_text(color = "white"),

    # Rotate x-axis labels to prevent overlap
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
  )

# 0.0 - 0.3: Weak
# 0.3 - 0.6 Moderate
# 0.6 - 1.0 Strong 
# Not many high correlation values. Series length and series order have the highest cor.
# sales rank and total reviews have the 2nd highest but is still a weak relationship
# price has a 0.0 correlation with sales rank!
# higher rated books are not typically more expensive

Average rating of a Literature & Fiction Book

# change incorrect ratings above 5 to NA
# books_clean$jungleaveragerating[books_clean$jungleaveragerating > 5] <- NA  

# function to quickly create dfs for all major categories 
create_genre_df <- function(df, genre_name) {

  genre_df <- df %>%
    select(c("isbn13", "title", "author", "narrator", "publisherlabel", "pagelength",
              "datepublished", "jungletotalreviews", "jungleaveragerating",
              "junglecategories", "seriesorder", "serieslength", "price", "junglesalesrank",
              "book_type")) %>%
    filter(grepl(genre_name, junglecategories, ignore.case = TRUE)) %>%
        mutate(genre = genre_name)

  return(genre_df)
}

# create dfs for top 10 genres
lit_fic <- create_genre_df(books_clean, "Literature & Fiction")
scifi_fant <- create_genre_df(books_clean, "Science Fiction & Fantasy")
myst_thrill <- create_genre_df(books_clean, "Mystery, Thriller & Suspense")
romance <- create_genre_df(books_clean, "Romance")
hist_fict <- create_genre_df(books_clean, "Historical Fiction")
history <- create_genre_df(books_clean, "History >")
rela_pd <- create_genre_df(books_clean, "Relationships, Parenting & Personal Development")
self_help <- create_genre_df(books_clean, "Self-Help")
health <- create_genre_df(books_clean, "Health, Fitness & Dieting")
humor <- create_genre_df(books_clean, "Humor")


# find means for each genre
fic_means <- colMeans(lit_fic[, sapply(lit_fic, is.numeric)], na.rm = TRUE)
scifi_means <- colMeans(scifi_fant[, sapply(scifi_fant, is.numeric)], na.rm = TRUE)
myst_means <- colMeans(myst_thrill[, sapply(myst_thrill, is.numeric)], na.rm = TRUE)
romance_means <- colMeans(romance[, sapply(romance, is.numeric)], na.rm = TRUE)
hist_fic_means <- colMeans(hist_fict[, sapply(hist_fict, is.numeric)], na.rm = TRUE)
history_means <- colMeans(history[, sapply(history, is.numeric)], na.rm = TRUE)
rela_pd_means <- colMeans(rela_pd[, sapply(rela_pd, is.numeric)], na.rm = TRUE)
self_help_means <- colMeans(self_help[, sapply(self_help, is.numeric)], na.rm = TRUE)
health_means <- colMeans(health[, sapply(health, is.numeric)], na.rm = TRUE)
humor_means <- colMeans(humor[, sapply(humor, is.numeric)], na.rm = TRUE)


# combine results into one beautiful df
all_cat_means <- rbind(fic_means, scifi_means, myst_means, romance_means, 
                       hist_fic_means, history_means, rela_pd_means,
                       self_help_means, health_means, humor_means)

cleaned_means_df <- all_cat_means %>%
  as.data.frame() %>%
  rownames_to_column(var = "genre_code") %>%
  mutate(
    Genre = case_when(
      genre_code == "fic_means"      ~ "Literature & Fiction",
      genre_code == "scifi_means"   ~ "Science Fiction",
      genre_code == "myst_means"    ~ "Mystery & Suspense",
      genre_code == "romance_means"  ~ "Romance",
      genre_code == "hist_fic_means"~ "Historical Fiction",
      genre_code == "history_means" ~ "History",
      genre_code == "rela_pd_means" ~ "Religion & Spirituality",
      genre_code == "self_help_means" ~ "Self-Help",
      genre_code == "health_means" ~ "Health, Fitness & Dieting",
      genre_code == "humor_means"  ~ "Humor",

      TRUE                            ~ genre_code
    ),
    .before = 1
  ) %>%
  select(-genre_code)

# create kable table
# Define base palette
pink_scale <- c("black", "#e40058")

# reverse scale for salesrank
rev_pink_scale <- rev(pink_scale)

# function for kable styling
colored_cell <- function(x, palette, reverse = FALSE) {
  pal <- if (reverse) rev(palette) else palette
  cell_spec(
    round(x, 2),
    format = "html",
    color = "white",
    background = col_numeric(pal, domain = range(x, na.rm = TRUE))(x),
    extra_css = "display: inline-block; width: 100%; padding: 2px;"
  )
}
# Apply styling column by column
styled_df <- cleaned_means_df %>%
  mutate(
    pagelength = colored_cell(pagelength, pink_scale),
    jungletotalreviews = colored_cell(jungletotalreviews, pink_scale),
    jungleaveragerating = colored_cell(jungleaveragerating, pink_scale),
    seriesorder = colored_cell(seriesorder, pink_scale),
    serieslength = colored_cell(serieslength, pink_scale),
    price = colored_cell(price, pink_scale),
    junglesalesrank = colored_cell(junglesalesrank, pink_scale, reverse = TRUE)
  )

# Display kable
Performance_table <- styled_df %>%
  kbl(
    escape = FALSE,
    caption = "Mean Performance Metrics Across Book Genres",
    digits = 2
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE
  )

Top Ranking Titles, Narrators, Authors in Sales Ranking and Average Review Score

books_clean %>% filter(!is.na(junglesalesrank)) %>% 
  arrange(junglesalesrank) %>% 
  head(., 2)

## # A tibble: 2 × 29
##   filedate            country formattype asin       isbn13   isbn10 title author
##   <dttm>              <chr>   <chr>      <chr>      <chr>    <chr>  <chr> <chr> 
## 1 2021-05-09 00:00:00 aus     audio      E3J1A9E0SX 8HAAOIW… XSELU… Syne… Marga…
## 2 2021-05-09 00:00:00 aus     audio      IQ0K6SHW5H G2L4KWJ… H2OWD… Opti… Dr. C…
## # ℹ 21 more variables: narrator <chr>, publisherlabel <chr>, formats <chr>,
## #   pagelength <dbl>, datepublished <dttm>, Berrydistributor <chr>,
## #   jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>,
## #   Berrytotalreviews <dbl>, Berryaveragerating <dbl>,
## #   maryandwilliamtotalreviews <dbl>, maryandwilliamaveragerating <dbl>,
## #   price <dbl>, jungleprice <dbl>, Berryprice <dbl>, …

lit_fic %>% arrange(junglesalesrank) %>% 
  head(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 SMB1TZL1Y… Face… Annet… Debra S… Wilcox LLC            897 2024-02-06 00:00:00
## 2 UEJ53TIQG… Open… Lacey… William… Rodriguez, Ca…       1282 2023-05-02 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

scifi_fant %>% arrange(junglesalesrank) %>% 
  head(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 FOJSOZJUD… User… Charl… Lisa Ba… Young, Moore …       1696 2023-11-07 00:00:00
## 2 UEJ53TIQG… Open… Lacey… William… Rodriguez, Ca…       1282 2023-05-02 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

myst_thrill %>% arrange(junglesalesrank) %>% 
  head(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 7H9G6YYN0… Fund… Miche… Jason T… Parker-Johnson        556 2024-01-02 00:00:00
## 2 I98JWEAQM… Gras… Amand… Jessica… Robinson, New…       1200 2024-05-21 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

romance %>% arrange(junglesalesrank) %>% 
  head(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 8HAAOIWK2… Syne… Marga… Joseph … Boyd-Mason            683 2024-04-23 00:00:00
## 2 1P2PVJFWL… Mand… Arthu… Joe Sol… Burnett Group        1782 2024-01-30 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

hist_fict %>% arrange(junglesalesrank) %>% 
  head(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 SMB1TZL1Y… Face… Annet… Debra S… Wilcox LLC            897 2024-02-06 00:00:00
## 2 XKA2XED83… Assi… Allis… Bonnie … Thompson, Mas…        586 2022-04-26 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

history %>% arrange(junglesalesrank) %>% 
  head(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 Z68QR1268… Down… Amand… Patrici… Perez, Petty …        335 2018-10-16 00:00:00
## 2 A4KS2CXGJ… Impl… Dana … Randy G… Webb LLC             1262 2006-12-31 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

rela_pd %>% arrange(junglesalesrank) %>% 
  head(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 G2L4KWJSN… Opti… Dr. C… Chelsey… Warren and So…        344 2022-04-01 00:00:00
## 2 Z68QR1268… Down… Amand… Patrici… Perez, Petty …        335 2018-10-16 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

self_help %>% arrange(junglesalesrank) %>% 
  head(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 G2L4KWJSN… Opti… Dr. C… Chelsey… Warren and So…        344 2022-04-01 00:00:00
## 2 Z68QR1268… Down… Amand… Patrici… Perez, Petty …        335 2018-10-16 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

health %>% arrange(junglesalesrank) %>% 
  head(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 Z68QR1268… Down… Amand… Patrici… Perez, Petty …        335 2018-10-16 00:00:00
## 2 TFJO646GN… Dece… Charl… Randy J… Morgan-Douglas        317 2016-09-13 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

humor %>% arrange(junglesalesrank) %>% 
  head(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 A4KS2CXGJ… Impl… Dana … Randy G… Webb LLC             1262 2006-12-31 00:00:00
## 2 SVYFXMKII… Trip… Kelly… Donna S… Carr, Davidso…       1670 2020-03-03 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

# worst performing books by genre
# make cover art and display on slide
books_clean %>% filter(!is.na(junglesalesrank)) %>% 
  arrange(junglesalesrank) %>% 
  tail(., 2)

## # A tibble: 2 × 29
##   filedate            country formattype asin       isbn13   isbn10 title author
##   <dttm>              <chr>   <chr>      <chr>      <chr>    <chr>  <chr> <chr> 
## 1 2021-05-09 00:00:00 aus     audio      UUO6XFLEP5 EUQSXRT… 7HABX… Mult… Rober…
## 2 2021-05-09 00:00:00 aus     audio      IKF6P5OODD EBHF1W8… 7KIES… Seam… Natal…
## # ℹ 21 more variables: narrator <chr>, publisherlabel <chr>, formats <chr>,
## #   pagelength <dbl>, datepublished <dttm>, Berrydistributor <chr>,
## #   jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>,
## #   Berrytotalreviews <dbl>, Berryaveragerating <dbl>,
## #   maryandwilliamtotalreviews <dbl>, maryandwilliamaveragerating <dbl>,
## #   price <dbl>, jungleprice <dbl>, Berryprice <dbl>, …

lit_fic %>% filter(!is.na(junglesalesrank)) %>% 
  arrange(junglesalesrank) %>% 
  tail(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 K8WM0EGXK… Moni… Jerry… Rebecca… Adams LLC             463 2024-05-31 00:00:00
## 2 3UX6KOMA8… Stan… James… Nicole … Phillips-Mill…       1109 2024-05-20 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

scifi_fant %>% filter(!is.na(junglesalesrank)) %>% 
  arrange(junglesalesrank) %>% 
  tail(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 9GJDEP056… Orga… Sarah… Katelyn… Sims Group            829 2021-05-14 00:00:00
## 2 EUQSXRTOD… Mult… Rober… Dustin … Lawson Ltd           1217 2024-05-30 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

myst_thrill %>% filter(!is.na(junglesalesrank)) %>% 
  arrange(junglesalesrank) %>% 
  tail(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 BKW4C8XM7… User… John … Maurice… Griffin PLC           653 2023-06-06 00:00:00
## 2 K8WM0EGXK… Moni… Jerry… Rebecca… Adams LLC             463 2024-05-31 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

romance %>% filter(!is.na(junglesalesrank)) %>% 
  arrange(junglesalesrank) %>% 
  tail(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 QOHR0JFTH… Mult… Saman… Kylie J… Harrington, J…        447 2024-05-31 00:00:00
## 2 3UX6KOMA8… Stan… James… Nicole … Phillips-Mill…       1109 2024-05-20 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

hist_fict %>% filter(!is.na(junglesalesrank)) %>% 
  arrange(junglesalesrank) %>% 
  tail(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 MVOV5F9FR… Futu… Keith… William… Kidd LLC              787 2019-08-20 00:00:00
## 2 XOBOL5RAQ… Stre… Emily… Joseph … Dougherty, Ga…        650 2018-05-22 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

history %>% filter(!is.na(junglesalesrank)) %>% 
  arrange(junglesalesrank) %>% 
  tail(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 M2IOA0J8J… Trip… Baile… Carrie … Schroeder-Lop…        716 2024-01-16 00:00:00
## 2 91JN5YUCW… Busi… Felic… Kelly M… Lopez-Pena           1000 2022-07-12 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

rela_pd %>% filter(!is.na(junglesalesrank)) %>% 
  arrange(junglesalesrank) %>% 
  tail(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 AN0J797KT… Reve… Richa… Tony Wa… Lopez-Hill            660 2024-05-07 00:00:00
## 2 9QXV2I76A… Busi… Georg… Patty W… Patrick, Oroz…        373 2016-05-03 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

self_help %>% filter(!is.na(junglesalesrank)) %>% 
  arrange(junglesalesrank) %>% 
  tail(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 9QXV2I76A… Busi… Georg… Patty W… Patrick, Oroz…        373 2016-05-03 00:00:00
## 2 EBHF1W8KR… Seam… Natal… Travis … Wagner, Spear…         52 2019-06-05 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

health %>% filter(!is.na(junglesalesrank)) %>% 
  arrange(junglesalesrank) %>% 
  tail(., 2)

## # A tibble: 2 × 16
##   isbn13     title author narrator publisherlabel pagelength datepublished      
##   <chr>      <chr> <chr>  <chr>    <chr>               <dbl> <dttm>             
## 1 Q8S4JZCLS… Func… Edwar… Pamela … Shaw-Nelson           292 2021-03-02 00:00:00
## 2 L93JKHIC8… Digi… Micha… Richard… Morris and So…        465 2024-05-14 00:00:00
## # ℹ 9 more variables: jungletotalreviews <dbl>, jungleaveragerating <dbl>,
## #   junglecategories <chr>, seriesorder <dbl>, serieslength <dbl>, price <dbl>,
## #   junglesalesrank <dbl>, book_type <chr>, genre <chr>

Audiobook Data Assessment

Aria Ricci

June 24, 2025

Data Cleanup

Insights

What are the attributes of a top ranked book?