GitHub - https://github.com/rickysoo
Contact - ricky [at] rickysoo.com
Source: https://www.pexels.com
Would you like to know what the top 100 audiobooks are whenever you want? I’m a big fan of audiobooks, having been a paying subscriber to Audible.com for many years.
Here, we will retrieve the current 100 bestsellers on Audible.com and save them into a data frame. Five publicly available web pages are retrieved from the Audible.com website.
Load the packages required for web scraping in R.
library(xml2)
library(rvest)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
We are going to retrieve 20 audiobooks from each of 5 pages.
# Scrape settings: 20 audiobooks are read from each of 5 bestseller pages.
items <- 20
pages <- 5
There are 12 data fields to be retrieved. A matrix is used to store the data retrieved.
# Names of the 12 fields collected for each audiobook, in column order.
cols <- c(
  'Rank', 'Title', 'Subtitle', 'Author', 'Narrator', 'Length',
  'Release', 'Language', 'Stars', 'Ratings', 'Price', 'URL'
)

# Character matrix pre-filled with empty strings: one row per audiobook and
# one named column per field.
data <- matrix(
  '',
  nrow = pages * items,
  ncol = length(cols),
  dimnames = list(NULL, cols)
)
Now we go to each page, extract the data, do some minimal cleaning, and put them into the matrix.
# Scrape each bestseller page and fill one row of `data` per audiobook.
# Reads `pages` and `items`, and writes into the character matrix `data`.

# Trimmed text of the first node under `parent` matching `selector`.
# Gives NA when nothing matches (e.g. an audiobook without a subtitle).
node_text <- function(parent, selector) {
  html_text(html_node(parent, selector), trim = TRUE)
}

# Trimmed text of every node under `parent` matching `selector`,
# joined into a single comma-separated string.
nodes_text <- function(parent, selector) {
  paste(html_text(html_nodes(parent, selector), trim = TRUE), collapse = ', ')
}

# Loop through the pages
for (page in seq_len(pages)) {
  page_url <- paste0('https://www.audible.com/adblbestsellers?page=', page)
  html <- read_html(page_url)

  # Loop through the items on a page
  for (item_num in seq_len(items)) {
    # Row of `data` for this audiobook; it doubles as the overall ranking
    row <- (page - 1) * items + item_num

    # The list of detail lines (title, author, ...) for this audiobook
    item_selector <- paste0(
      '#product-list-a11y-skiplink-target > span > ul > div > li:nth-child(',
      item_num,
      ') > div > div.bc-col-responsive.bc-spacing-top-none.bc-col-8',
      ' > div > div.bc-col-responsive.bc-col-6 > div > div > span > ul'
    )
    item_node <- html_node(html, item_selector)

    # The audiobook ranking
    data[row, 'Rank'] <- row

    # The audiobook title. The node is kept because its link is the URL.
    title_node <- html_node(item_node, 'li:nth-child(1) > h3 > a')
    data[row, 'Title'] <- html_text(title_node, trim = TRUE)

    # The audiobook subtitle. It's empty for some items.
    data[row, 'Subtitle'] <- node_text(
      item_node, 'li.bc-list-item.subtitle > span'
    )

    # The author. There might be more than one.
    data[row, 'Author'] <- nodes_text(
      item_node, 'li.bc-list-item.authorLabel > span > a'
    )

    # The narrator. There might be more than one.
    data[row, 'Narrator'] <- nodes_text(
      item_node, 'li.bc-list-item.narratorLabel > span > a'
    )

    # The audiobook length in hours and minutes, without its label.
    # Named `runtime` so that base::length() is not shadowed.
    runtime <- node_text(item_node, 'li.bc-list-item.runtimeLabel > span')
    data[row, 'Length'] <- gsub('Length: ', '', runtime)

    # The release date, without its label
    release <- node_text(item_node, 'li.bc-list-item.releaseDateLabel > span')
    data[row, 'Release'] <- gsub('Release date:\n\\s+', '', release)

    # The audiobook language, without its label
    language <- node_text(item_node, 'li.bc-list-item.languageLabel > span')
    data[row, 'Language'] <- gsub('Language:\n\\s+', '', language)

    # The number of stars received
    data[row, 'Stars'] <- node_text(
      item_node, 'li.bc-list-item.ratingsLabel > span.bc-text.bc-pub-offscreen'
    )

    # The number of ratings received
    data[row, 'Ratings'] <- node_text(
      item_node,
      'li.bc-list-item.ratingsLabel > span.bc-text.bc-size-small.bc-color-secondary'
    )

    # The selling price. It is looked up on the whole page by an id that
    # uses a zero-based index of the item within the page.
    price_selector <- paste0(
      '#buybox-regular-price-', item_num - 1, ' > span:nth-child(2)'
    )
    data[row, 'Price'] <- node_text(html, price_selector)

    # Web page address. A missing link is stored as NA instead of being
    # pasted into the bogus string "https://www.audible.comNA", so that it
    # shows up in the missing-value checks.
    href <- html_attr(title_node, 'href')
    data[row, 'URL'] <- if (is.na(href)) {
      NA_character_
    } else {
      paste0('https://www.audible.com', href)
    }
  }
}
Convert the matrix into a data frame and show the first 10 rows.
# Convert the character matrix into a data frame. stringsAsFactors = FALSE is
# spelled out so the columns stay character on R versions before 4.0 as well;
# the cleaning steps in this script operate on strings.
df <- as.data.frame(data, stringsAsFactors = FALSE)

# Preview the first 10 audiobooks
head(df, 10)
## Rank Title
## 1 1 A Promised Land
## 2 2 Washington's End
## 3 3 Greenlights
## 4 4 Ready Player Two
## 5 5 Atomic Habits
## 6 6 A Killer's Wife
## 7 7 Rhythm of War
## 8 8 The Tribes
## 9 9 Can't Hurt Me
## 10 10 Caste (Oprah's Book Club)
## Subtitle
## 1 <NA>
## 2 The Final Years and Forgotten Struggle
## 3 <NA>
## 4 A Novel
## 5 An Easy & Proven Way to Build Good Habits & Break Bad Ones
## 6 Desert Plains, Book 1
## 7 The Stormlight Archive, Book 4
## 8 A Novel
## 9 Master Your Mind and Defy the Odds
## 10 The Origins of Our Discontents
## Author Narrator Length
## 1 Barack Obama Barack Obama 29 hrs and 10 mins
## 2 Jonathan Horn Arthur Morey 8 hrs and 14 mins
## 3 Matthew McConaughey Matthew McConaughey 6 hrs and 42 mins
## 4 Ernest Cline Wil Wheaton 13 hrs and 46 mins
## 5 James Clear James Clear 5 hrs and 35 mins
## 6 Victor Methos Brittany Pressley 9 hrs and 21 mins
## 7 Brandon Sanderson Kate Reading, Michael Kramer 57 hrs and 26 mins
## 8 Mari Howes Barrie Kreinik, Piper Goodeve 11 hrs and 48 mins
## 9 David Goggins David Goggins, Adam Skolnick 13 hrs and 37 mins
## 10 Isabel Wilkerson Robin Miles 14 hrs and 26 mins
## Release Language Stars Ratings Price
## 1 11-17-20 English 5 out of 5 stars 4,676 ratings $45.50
## 2 02-11-20 English 4.5 out of 5 stars 33 ratings $17.00
## 3 10-20-20 English 5 out of 5 stars 23,359 ratings $28.00
## 4 11-24-20 English 4.5 out of 5 stars 13,169 ratings $31.50
## 5 10-16-18 English 5 out of 5 stars 49,078 ratings $28.00
## 6 03-01-20 English 4.5 out of 5 stars 3,204 ratings $25.19
## 7 11-17-20 English 5 out of 5 stars 5,161 ratings $66.49
## 8 12-10-20 English 4.5 out of 5 stars 25 ratings $34.95
## 9 11-28-18 English 5 out of 5 stars 117,784 ratings $24.95
## 10 08-04-20 English 5 out of 5 stars 5,530 ratings $31.50
## URL
## 1 https://www.audible.com/pd/A-Promised-Land-Audiobook/0525633723?ref=a_adblbests_c3_lProduct_1_1&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 2 https://www.audible.com/pd/Washingtons-End-Audiobook/1797111051?ref=a_adblbests_c3_lProduct_1_2&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 3 https://www.audible.com/pd/Greenlights-Audiobook/0593294181?ref=a_adblbests_c3_lProduct_1_3&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 4 https://www.audible.com/pd/Ready-Player-Two-Audiobook/0593396960?ref=a_adblbests_c3_lProduct_1_4&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 5 https://www.audible.com/pd/Atomic-Habits-Audiobook/1524779261?ref=a_adblbests_c3_lProduct_1_5&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 6 https://www.audible.com/pd/A-Killers-Wife-Audiobook/179975006X?ref=a_adblbests_c3_lProduct_1_6&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 7 https://www.audible.com/pd/Rhythm-of-War-Audiobook/1250759781?ref=a_adblbests_c3_lProduct_1_7&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 8 https://www.audible.com/pd/The-Tribes-Audiobook/B08K3J3294?ref=a_adblbests_c3_lProduct_1_8&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 9 https://www.audible.com/pd/Cant-Hurt-Me-Audiobook/B07KKMNZCH?ref=a_adblbests_c3_lProduct_1_9&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
## 10 https://www.audible.com/pd/Caste-Oprahs-Book-Club-Audiobook/0593339800?ref=a_adblbests_c3_lProduct_1_10&pf_rd_p=4100380b-3e9d-4594-990a-9c93d1a8dac3&pf_rd_r=FHGFT2DZGSYYEYZH51EP
# Open the raw data in the data viewer. This is only useful in an interactive
# session, so it is skipped when the script is run or knitted in batch mode.
if (interactive()) {
  View(df)
}

# Save the uncleaned data to a CSV file stamped with the current date,
# in the form audiobooks-original-YYYYMMDD.csv
filename <- paste0(
  'audiobooks-original-', format(Sys.time(), '%Y%m%d'), '.csv'
)
write.csv(df, filename, row.names = FALSE, fileEncoding = 'UTF-8')
Check data frame for missing values.
# Count the missing values across the whole data frame.
sum(is.na(df))
## [1] 41
# Count the missing values in each column, then print one
# "<column>: <count>" line per column.
na_counts <- vapply(df, function(column) sum(is.na(column)), integer(1))
for (name in names(na_counts)) {
  print(paste0(name, ': ', na_counts[[name]]))
}
## [1] "Rank: 0"
## [1] "Title: 0"
## [1] "Subtitle: 40"
## [1] "Author: 0"
## [1] "Narrator: 0"
## [1] "Length: 0"
## [1] "Release: 0"
## [1] "Language: 0"
## [1] "Stars: 1"
## [1] "Ratings: 0"
## [1] "Price: 0"
## [1] "URL: 0"
Subtitle - Replace missing values with an empty string.
# Audiobooks without a subtitle hold NA; store an empty string instead.
df$Subtitle[is.na(df$Subtitle)] <- ''
Length - Calculate length in minutes.
# Convert text such as "29 hrs and 10 mins" into a total number of minutes.
# Either part may be absent (e.g. "45 mins" or "2 hrs"), so both parts are
# converted to numbers first and a missing part is then counted as zero.
# Converting before replace_na() also keeps the replacement value the same
# type as the vector being filled.
Hours <- str_extract(df$Length, '^\\d+(?=(\\shr))') %>% as.numeric()
Minutes <- str_extract(df$Length, '\\d+(?=(\\smin))') %>% as.numeric()

# Text with neither an hour nor a minute part could not be parsed: keep it
# as NA rather than reporting a zero-length audiobook.
unparsed <- is.na(Hours) & is.na(Minutes)
df$Length <- ifelse(
  unparsed,
  NA_real_,
  replace_na(Hours, 0) * 60 + replace_na(Minutes, 0)
)
Release - Reformat value as date.
# Parse month-day-year text with a two-digit year, e.g. "11-17-20", as a Date.
df$Release <- as.Date(df$Release, format = '%m-%d-%y')
Stars - Extract the number of stars. Missing values are left as NAs.
# Keep the leading number from text such as "4.5 out of 5 stars".
# Entries without a leading number become NA.
df$Stars <- df$Stars %>%
  str_extract('^[\\d\\.]+') %>%
  as.numeric()
Ratings - Extract the number of ratings. Missing values are left as NAs.
# Audiobooks that have not been rated yet get NA as their rating count.
not_rated_as_na <- gsub('Not rated yet', NA, df$Ratings)

# Keep the leading count from text such as "4,676 ratings", drop the
# thousands separators and store the result as a number.
df$Ratings <- not_rated_as_na %>%
  str_extract('^[\\d,]+') %>%
  str_remove_all(',') %>%
  as.numeric()
Price - Reformat data as number.
# Drop the dollar sign so that the price can be stored as a number.
df$Price <- as.numeric(gsub('\\$', '', df$Price))
Recheck data frame for missing values. Missing values in Stars and Ratings are left as NAs.
# Count the missing values across the whole data frame.
sum(is.na(df))
## [1] 2
# Count the missing values in each column, then print one
# "<column>: <count>" line per column.
na_counts <- vapply(df, function(column) sum(is.na(column)), integer(1))
for (name in names(na_counts)) {
  print(paste0(name, ': ', na_counts[[name]]))
}
## [1] "Rank: 0"
## [1] "Title: 0"
## [1] "Subtitle: 0"
## [1] "Author: 0"
## [1] "Narrator: 0"
## [1] "Length: 0"
## [1] "Release: 0"
## [1] "Language: 0"
## [1] "Stars: 1"
## [1] "Ratings: 1"
## [1] "Price: 0"
## [1] "URL: 0"
View the data frame.
# Show the type of each column together with its first few values.
str(df)
## 'data.frame': 100 obs. of 12 variables:
## $ Rank : chr "1" "2" "3" "4" ...
## $ Title : chr "A Promised Land" "Washington's End" "Greenlights" "Ready Player Two" ...
## $ Subtitle: chr "" "The Final Years and Forgotten Struggle" "" "A Novel" ...
## $ Author : chr "Barack Obama" "Jonathan Horn" "Matthew McConaughey" "Ernest Cline" ...
## $ Narrator: chr "Barack Obama" "Arthur Morey" "Matthew McConaughey" "Wil Wheaton" ...
## $ Length : num 1750 494 402 826 335 ...
## $ Release : Date, format: "2020-11-17" "2020-02-11" ...
## $ Language: chr "English" "English" "English" "English" ...
## $ Stars : num 5 4.5 5 4.5 5 4.5 5 4.5 5 5 ...
## $ Ratings : num 4676 33 23359 13169 49078 ...
## $ Price : num 45.5 17 28 31.5 28 ...
## $ URL : chr "https://www.audible.com/pd/A-Promised-Land-Audiobook/0525633723?ref=a_adblbests_c3_lProduct_1_1&pf_rd_p=4100380"| __truncated__ "https://www.audible.com/pd/Washingtons-End-Audiobook/1797111051?ref=a_adblbests_c3_lProduct_1_2&pf_rd_p=4100380"| __truncated__ "https://www.audible.com/pd/Greenlights-Audiobook/0593294181?ref=a_adblbests_c3_lProduct_1_3&pf_rd_p=4100380b-3e"| __truncated__ "https://www.audible.com/pd/Ready-Player-Two-Audiobook/0593396960?ref=a_adblbests_c3_lProduct_1_4&pf_rd_p=410038"| __truncated__ ...
# Print summary statistics for every column of the data frame.
summary(df)
## Rank Title Subtitle Author
## Length:100 Length:100 Length:100 Length:100
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Narrator Length Release Language
## Length:100 Min. : 95.0 Min. :2000-12-31 Length:100
## Class :character 1st Qu.: 517.0 1st Qu.:2015-11-12 Class :character
## Mode :character Median : 715.0 Median :2019-02-26 Mode :character
## Mean : 885.3 Mean :2016-12-27
## 3rd Qu.:1006.5 3rd Qu.:2020-09-29
## Max. :3446.0 Max. :2020-12-15
##
## Stars Ratings Price URL
## Min. :4.500 Min. : 25 Min. :14.95 Length:100
## 1st Qu.:4.500 1st Qu.: 3533 1st Qu.:24.98 Class :character
## Median :4.500 Median : 20466 Median :30.79 Mode :character
## Mean :4.707 Mean : 36604 Mean :32.36
## 3rd Qu.:5.000 3rd Qu.: 53466 3rd Qu.:34.99
## Max. :5.000 Max. :234727 Max. :66.49
## NA's :1 NA's :1
# Open the cleaned data in the data viewer. This is only useful in an
# interactive session, so it is skipped when the script is run or knitted in
# batch mode.
if (interactive()) {
  View(df)
}

# Save the cleaned data to a CSV file stamped with the current date,
# in the form audiobooks-cleaned-YYYYMMDD.csv
filename <- paste0(
  'audiobooks-cleaned-', format(Sys.time(), '%Y%m%d'), '.csv'
)
write.csv(df, filename, row.names = FALSE, fileEncoding = 'UTF-8')