GitHub - https://github.com/rickysoo
Contact - ricky [at] rickysoo.com

Source: https://www.pexels.com

Would you like to know what the top 100 audiobooks are whenever you want? I’m a big fan of audiobooks, having been a paying subscriber to Audible.com for many years.

Here, we will retrieve the current 100 bestsellers on Audible.com and save them into a data frame. Five publicly available web pages are retrieved from the Audible.com website.

Import libraries

Required for web scraping in R.

library(xml2)
library(rvest)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()

Set number of pages and number of items per page

We are going to retrieve 20 audiobooks from each of 5 pages.

# Scrape size: how many bestseller pages to fetch, and how many audiobooks
# are read from each of them.
items <- 20
pages <- 5

Set the data parameters

There are 12 data fields to be retrieved. A matrix is used to store the data retrieved.

# Names of the fields collected for every audiobook, in column order.
cols <- c(
  'Rank', 'Title', 'Subtitle', 'Author', 'Narrator', 'Length',
  'Release', 'Language', 'Stars', 'Ratings', 'Price', 'URL'
)

# Character matrix holding one row per audiobook and one named column per
# field; every cell starts out as an empty string.
data <- matrix(
  '',
  nrow = pages * items,
  ncol = length(cols),
  dimnames = list(NULL, cols)
)

Retrieve and process 100 items from 5 web pages

Now we go to each page, extract the data, do some minimal cleaning, and put the results into the matrix.

# Text of the first node below `parent` that matches the CSS `selector`,
# with surrounding whitespace trimmed. Gives NA when nothing matches.
node_text <- function(parent, selector) {
  html_text(html_node(parent, selector), trim = TRUE)
}

# Text of every node below `parent` that matches the CSS `selector`, joined
# with ', '. Used for fields that can list more than one person.
joined_text <- function(parent, selector) {
  paste(html_text(html_nodes(parent, selector), trim = TRUE), collapse = ', ')
}

# Site root, shared by the listing pages and the per-book links.
base_url <- 'https://www.audible.com'

# Loop through the pages
for (page in seq_len(pages)) {
  page_url <- paste0(base_url, '/adblbestsellers?page=', page)
  html <- read_html(page_url)

  # Loop through the items on a page
  for (item_num in seq_len(items)) {
    # Row of the result matrix for this item; it doubles as the overall rank.
    row <- (page - 1) * items + item_num

    # The <ul> holding the descriptive fields of the item_num-th product.
    item_selector <- paste0('#product-list-a11y-skiplink-target > span > ul > div > li:nth-child(', item_num, ') > div > div.bc-col-responsive.bc-spacing-top-none.bc-col-8 > div > div.bc-col-responsive.bc-col-6 > div > div > span > ul')
    item_node <- html_node(html, item_selector)

    # The audiobook ranking
    data[row, 'Rank'] <- row

    # The audiobook title. The node is kept because its link is reused below.
    title_node <- html_node(item_node, 'li:nth-child(1) > h3 > a')
    data[row, 'Title'] <- html_text(title_node, trim = TRUE)

    # The audiobook subtitle. It's empty for some items.
    data[row, 'Subtitle'] <- node_text(item_node, 'li.bc-list-item.subtitle > span')

    # The author. There might be more than one.
    data[row, 'Author'] <- joined_text(item_node, 'li.bc-list-item.authorLabel > span > a')

    # The narrator. There might be more than one.
    data[row, 'Narrator'] <- joined_text(item_node, 'li.bc-list-item.narratorLabel > span > a')

    # The audiobook length in hours and minutes, without its "Length: " label
    runtime <- node_text(item_node, 'li.bc-list-item.runtimeLabel > span')
    data[row, 'Length'] <- gsub('Length: ', '', runtime)

    # The release date, without its label
    release <- node_text(item_node, 'li.bc-list-item.releaseDateLabel > span')
    data[row, 'Release'] <- gsub('Release date:\n\\s+', '', release)

    # The audiobook language, without its label
    language <- node_text(item_node, 'li.bc-list-item.languageLabel > span')
    data[row, 'Language'] <- gsub('Language:\n\\s+', '', language)

    # The number of stars received
    data[row, 'Stars'] <- node_text(item_node, 'li.bc-list-item.ratingsLabel > span.bc-text.bc-pub-offscreen')

    # The number of ratings received
    data[row, 'Ratings'] <- node_text(item_node, 'li.bc-list-item.ratingsLabel > span.bc-text.bc-size-small.bc-color-secondary')

    # The selling price. The price box sits outside the item's field list and
    # is addressed by a zero-based id, so it is looked up on the whole page.
    price_selector <- paste0('#buybox-regular-price-', item_num - 1, ' > span:nth-child(2)')
    data[row, 'Price'] <- node_text(html, price_selector)

    # Web page address. Keep NA when the title link is missing rather than
    # pasting NA onto the site root, which would store a bogus address.
    href <- html_attr(title_node, 'href')
    data[row, 'URL'] <- if (is.na(href)) NA_character_ else paste0(base_url, href)
  }
}

Create and view the dataframe

Convert matrix into dataframe and show the first 10 rows.

# Turn the character matrix into a data frame (one column per field) and
# preview the ten highest-ranked audiobooks.
df <- data %>% as.data.frame()
df %>% head(n = 10)
##    Rank                     Title
## 1     1           A Promised Land
## 2     2          Washington's End
## 3     3               Greenlights
## 4     4          Ready Player Two
## 5     5             Atomic Habits
## 6     6           A Killer's Wife
## 7     7             Rhythm of War
## 8     8                The Tribes
## 9     9             Can't Hurt Me
## 10   10 Caste (Oprah's Book Club)
##                                                      Subtitle
## 1                                                        <NA>
## 2                      The Final Years and Forgotten Struggle
## 3                                                        <NA>
## 4                                                     A Novel
## 5  An Easy & Proven Way to Build Good Habits & Break Bad Ones
## 6                                       Desert Plains, Book 1
## 7                              The Stormlight Archive, Book 4
## 8                                                     A Novel
## 9                          Master Your Mind and Defy the Odds
## 10                             The Origins of Our Discontents
##                 Author                      Narrator             Length
## 1         Barack Obama                  Barack Obama 29 hrs and 10 mins
## 2        Jonathan Horn                  Arthur Morey  8 hrs and 14 mins
## 3  Matthew McConaughey           Matthew McConaughey  6 hrs and 42 mins
## 4         Ernest Cline                   Wil Wheaton 13 hrs and 46 mins
## 5          James Clear                   James Clear  5 hrs and 35 mins
## 6        Victor Methos             Brittany Pressley  9 hrs and 21 mins
## 7    Brandon Sanderson  Kate Reading, Michael Kramer 57 hrs and 26 mins
## 8           Mari Howes Barrie Kreinik, Piper Goodeve 11 hrs and 48 mins
## 9        David Goggins  David Goggins, Adam Skolnick 13 hrs and 37 mins
## 10    Isabel Wilkerson                   Robin Miles 14 hrs and 26 mins
##     Release Language              Stars         Ratings  Price
## 1  11-17-20  English   5 out of 5 stars   4,676 ratings $45.50
## 2  02-11-20  English 4.5 out of 5 stars      33 ratings $17.00
## 3  10-20-20  English   5 out of 5 stars  23,359 ratings $28.00
## 4  11-24-20  English 4.5 out of 5 stars  13,169 ratings $31.50
## 5  10-16-18  English   5 out of 5 stars  49,078 ratings $28.00
## 6  03-01-20  English 4.5 out of 5 stars   3,204 ratings $25.19
## 7  11-17-20  English   5 out of 5 stars   5,161 ratings $66.49
## 8  12-10-20  English 4.5 out of 5 stars      25 ratings $34.95
## 9  11-28-18  English   5 out of 5 stars 117,784 ratings $24.95
## 10 08-04-20  English   5 out of 5 stars   5,530 ratings $31.50
##                                                                                                                                                                                  URL
## 1          https://www.audible.com/pd/A-Promised-Land-Audiobook/0525633723?ref=a_adblbests_c3_lProduct_1_1&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 2          https://www.audible.com/pd/Washingtons-End-Audiobook/1797111051?ref=a_adblbests_c3_lProduct_1_2&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 3              https://www.audible.com/pd/Greenlights-Audiobook/0593294181?ref=a_adblbests_c3_lProduct_1_3&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 4         https://www.audible.com/pd/Ready-Player-Two-Audiobook/0593396960?ref=a_adblbests_c3_lProduct_1_4&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 5            https://www.audible.com/pd/Atomic-Habits-Audiobook/1524779261?ref=a_adblbests_c3_lProduct_1_5&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 6           https://www.audible.com/pd/A-Killers-Wife-Audiobook/179975006X?ref=a_adblbests_c3_lProduct_1_6&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 7            https://www.audible.com/pd/Rhythm-of-War-Audiobook/1250759781?ref=a_adblbests_c3_lProduct_1_7&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 8               https://www.audible.com/pd/The-Tribes-Audiobook/B08K3J3294?ref=a_adblbests_c3_lProduct_1_8&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 9             https://www.audible.com/pd/Cant-Hurt-Me-Audiobook/B07KKMNZCH?ref=a_adblbests_c3_lProduct_1_9&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 10 https://www.audible.com/pd/Caste-Oprahs-Book-Club-Audiobook/0593339800?ref=a_adblbests_c3_lProduct_1_10&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
# Open the spreadsheet-style viewer only in an interactive session. In a batch
# run there may be no viewer available, and a failing call would stop the
# script before the data is saved.
if (interactive()) {
  View(df)
}

Save the raw data

# Save the scraped data, before any cleaning, to a CSV file whose name carries
# today's date as YYYYMMDD.
date_stamp <- format(Sys.time(), '%Y%m%d')
filename <- paste0('audiobooks-original-', date_stamp, '.csv')
write.csv(df, file = filename, row.names = FALSE, fileEncoding = 'UTF-8')

Clean data

Check data frame for missing values.

# Total number of missing values across the whole data frame.
df %>% is.na() %>% sum()
## [1] 41
# Report the number of missing values in each column as "<column>: <count>".
for (field in names(df)) {
  n_missing <- sum(is.na(df[[field]]))
  print(paste0(field, ': ', n_missing))
}
## [1] "Rank: 0"
## [1] "Title: 0"
## [1] "Subtitle: 40"
## [1] "Author: 0"
## [1] "Narrator: 0"
## [1] "Length: 0"
## [1] "Release: 0"
## [1] "Language: 0"
## [1] "Stars: 1"
## [1] "Ratings: 0"
## [1] "Price: 0"
## [1] "URL: 0"

Subtitle - Replace missing values with an empty string.

# Books without a subtitle get an empty string instead of NA.
df$Subtitle[is.na(df$Subtitle)] <- ''

Length - Calculate length in minutes.

# Convert runtimes such as "29 hrs and 10 mins" into total minutes.
# Either part may be absent ("45 mins", "2 hrs"), so a missing part counts as
# zero. The extracted text is converted to numeric *before* NAs are replaced,
# so the replacement value has the same type as the vector it fills.
Hours <- str_extract(df$Length, '^\\d+(?=(\\shr))') %>% as.numeric()
Minutes <- str_extract(df$Length, '\\d+(?=(\\smin))') %>% as.numeric()

# Rows where neither part could be read stay NA instead of becoming 0 minutes.
unparsed <- is.na(Hours) & is.na(Minutes)
df$Length <- replace_na(Hours, 0) * 60 + replace_na(Minutes, 0)
df$Length[unparsed] <- NA

Release - Reformat value as date.

# Parse release dates written as month-day-two-digit-year (e.g. "11-17-20").
df[['Release']] <- as.Date(df[['Release']], format = '%m-%d-%y')

Stars - Extract the number of stars. Missing values are left as NAs.

# Keep only the leading number of texts like "4.5 out of 5 stars" and store it
# as numeric; entries without a leading number become NA.
df$Stars <- df$Stars %>%
  str_extract('^[\\d\\.]+') %>%
  as.numeric()

Ratings - Extract the number of ratings. Missing values are left as NAs.

# Turn texts like "23,359 ratings" into the number 23359:
# mark unrated books as NA, keep the leading digits and commas, drop the
# thousands separators, then convert to numeric.
df$Ratings <- df$Ratings %>%
  gsub('Not rated yet', NA, .) %>%
  str_extract('^[\\d,]+') %>%
  gsub(',', '', .) %>%
  as.numeric()

Price - Reformat data as number.

# Strip the dollar sign from prices like "$45.50" and store them as numbers.
df$Price <- df$Price %>%
  gsub('\\$', '', .) %>%
  as.numeric()

Recheck data frame for missing values. Missing values in Stars and Ratings are left as NAs.

# Total number of missing values remaining after cleaning.
df %>% is.na() %>% sum()
## [1] 2
# Report the remaining missing values per column as "<column>: <count>".
for (field in names(df)) {
  n_missing <- sum(is.na(df[[field]]))
  print(paste0(field, ': ', n_missing))
}
## [1] "Rank: 0"
## [1] "Title: 0"
## [1] "Subtitle: 0"
## [1] "Author: 0"
## [1] "Narrator: 0"
## [1] "Length: 0"
## [1] "Release: 0"
## [1] "Language: 0"
## [1] "Stars: 1"
## [1] "Ratings: 1"
## [1] "Price: 0"
## [1] "URL: 0"

View the data frame.

# Show each column's type together with its first few values.
utils::str(df)
## 'data.frame':    100 obs. of  12 variables:
##  $ Rank    : chr  "1" "2" "3" "4" ...
##  $ Title   : chr  "A Promised Land" "Washington's End" "Greenlights" "Ready Player Two" ...
##  $ Subtitle: chr  "" "The Final Years and Forgotten Struggle" "" "A Novel" ...
##  $ Author  : chr  "Barack Obama" "Jonathan Horn" "Matthew McConaughey" "Ernest Cline" ...
##  $ Narrator: chr  "Barack Obama" "Arthur Morey" "Matthew McConaughey" "Wil Wheaton" ...
##  $ Length  : num  1750 494 402 826 335 ...
##  $ Release : Date, format: "2020-11-17" "2020-02-11" ...
##  $ Language: chr  "English" "English" "English" "English" ...
##  $ Stars   : num  5 4.5 5 4.5 5 4.5 5 4.5 5 5 ...
##  $ Ratings : num  4676 33 23359 13169 49078 ...
##  $ Price   : num  45.5 17 28 31.5 28 ...
##  $ URL     : chr  "https://www.audible.com/pd/A-Promised-Land-Audiobook/0525633723?ref=a_adblbests_c3_lProduct_1_1&pf_rd_p=4100380"| __truncated__ "https://www.audible.com/pd/Washingtons-End-Audiobook/1797111051?ref=a_adblbests_c3_lProduct_1_2&pf_rd_p=4100380"| __truncated__ "https://www.audible.com/pd/Greenlights-Audiobook/0593294181?ref=a_adblbests_c3_lProduct_1_3&pf_rd_p=4100380b-3e"| __truncated__ "https://www.audible.com/pd/Ready-Player-Two-Audiobook/0593396960?ref=a_adblbests_c3_lProduct_1_4&pf_rd_p=410038"| __truncated__ ...
# Per-column summary of the cleaned data.
df %>% summary()
##      Rank              Title             Subtitle            Author         
##  Length:100         Length:100         Length:100         Length:100        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    Narrator             Length          Release             Language        
##  Length:100         Min.   :  95.0   Min.   :2000-12-31   Length:100        
##  Class :character   1st Qu.: 517.0   1st Qu.:2015-11-12   Class :character  
##  Mode  :character   Median : 715.0   Median :2019-02-26   Mode  :character  
##                     Mean   : 885.3   Mean   :2016-12-27                     
##                     3rd Qu.:1006.5   3rd Qu.:2020-09-29                     
##                     Max.   :3446.0   Max.   :2020-12-15                     
##                                                                             
##      Stars          Ratings           Price           URL           
##  Min.   :4.500   Min.   :    25   Min.   :14.95   Length:100        
##  1st Qu.:4.500   1st Qu.:  3533   1st Qu.:24.98   Class :character  
##  Median :4.500   Median : 20466   Median :30.79   Mode  :character  
##  Mean   :4.707   Mean   : 36604   Mean   :32.36                     
##  3rd Qu.:5.000   3rd Qu.: 53466   3rd Qu.:34.99                     
##  Max.   :5.000   Max.   :234727   Max.   :66.49                     
##  NA's   :1       NA's   :1
# Open the spreadsheet-style viewer only in an interactive session. In a batch
# run there may be no viewer available, and a failing call would stop the
# script before the cleaned data is saved.
if (interactive()) {
  View(df)
}

Save the cleaned data

# Save the cleaned data to a CSV file whose name carries today's date as
# YYYYMMDD.
date_stamp <- format(Sys.time(), '%Y%m%d')
filename <- paste0('audiobooks-cleaned-', date_stamp, '.csv')
write.csv(df, file = filename, row.names = FALSE, fileEncoding = 'UTF-8')