This post provides a short overview of the use and capabilities of a webscraper I’ve written in R. It can be used to create datasets for analysing genres of interest.

To use the scraping function, first you will need to download and source the script.

All that is needed is the url of a reading list of your choice and a stable internet connection. Towards the end of the script itself (below, or on Github) is a short set of commented-out examples you can look at. For a short demo, I suggest “excellentSpaceOpera”: it is a short list, and the function will take just a few minutes to return a result.
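A minimal session might look like this (the filename is just a placeholder for wherever you saved the script):

source("goodreads_scraper.R")   # placeholder path
excellentSpaceOpera <- goodReads.webscrape('https://www.goodreads.com/list/show/1127')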

Argument

listUrl: the url of a reading list on goodreads.com, e.g. 'https://www.goodreads.com/list/show/1127'.

Output

A dataframe with the following columns: goodreadsID, title, authors, hyper, pageCounts, proportion_text_reviews, average_rating, genreVoted, hasAward, total_ratings, and book.descriptions.

Example (book descriptions not shown here):

| goodreadsID | title | authors | hyper | pageCounts | proportion_text_reviews | average_rating | genreVoted | hasAward | total_ratings |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 51964 | Old Man’s War (Old Man’s War, #1) | John Scalzi | /book/show/51964.Old_Man_s_War | 351 | 5.71e-05 | 4.23 | Science Fiction | FALSE | 122567 |
| 234225 | Dune (Dune Chronicles, #1) | Frank Herbert | /book/show/234225.Dune | 604 | 2.47e-05 | 4.20 | Science Fiction | FALSE | 567930 |
| 17214 | Starship Troopers | Robert A. Heinlein | /book/show/17214.Starship_Troopers | 335 | 2.53e-05 | 4.00 | Science Fiction | FALSE | 158267 |
| 8855321 | Leviathan Wakes (The Expanse, #1) | James S.A. Corey | /book/show/8855321-leviathan-wakes | 561 | 7.81e-05 | 4.22 | Science Fiction | FALSE | 102368 |
| 45252 | Pandora’s Star | Peter F. Hamilton | /book/show/45252.Pandora_s_Star | 768 | 2.87e-05 | 4.24 | Science Fiction | FALSE | 34799 |
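From there, genre-level analysis is straightforward. A small sketch using dplyr, assuming the excellentSpaceOpera demo above has been run:

library(dplyr)
excellentSpaceOpera %>%
  group_by(genreVoted) %>%
  summarise(works = n(),
            mean_rating = mean(average_rating, na.rm = TRUE),
            mean_pages = mean(pageCounts, na.rm = TRUE))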

Postscript

Additionally, the webscraper in its entirety can be found below or on Github.

library(rvest)
library(dplyr)
library(scales)
library(magrittr)
library(ggplot2)

# This Rscript can roughly be divided into the following sections:
  # 1) Small supporting functions that collect specific sections from a webpage. Used by the webscraper.
  # 2) The webscraper, with the following components (for loops):
        #2a) Scrape the list to compose a list of books (and their urls). Example: 'https://www.goodreads.com/list/show/1127'
        #2b) Use the urls to visit the pages of each book on the list, collecting detailed information.
  # 3) Finally, call the webscraper function on the reading lists scraped for this project. I've taken the liberty
  #    of commenting out all but the smallest list so running a demo doesn't take hours.


# Section 1: Supporting functions
find.description<-function(book.url){
  #Keep the longest span (the full description); max() would compare strings alphabetically
  desc<-html_nodes(book.url,"#description span")%>%html_text()
  if(length(desc)==0) return(NA)
  return(desc[which.max(nchar(desc))])
}

find.page.length<-function(book.url){
  book_page<-html_nodes(book.url,'#details span+ span')
  if(length(book_page)==0){
    pageCounts<-NA
  }
  else{
    #e.g. "351 pages" -> 351
    pageCounts<-as.numeric(gsub(" pages","",html_text(book_page[1])))
  }
  return(as.numeric(pageCounts))
}

find.genre.tag<-function(book.url){
  genre<-html_nodes(book.url,'.elementList:nth-child(1) .left .bookPageGenreLink')
  if(length(genre)==0){
    genreVoted<-NA
  }
  else{
    #Strip the surrounding HTML tags, keeping only the genre name
    genreVoted<-gsub(" *<.*?> *", "", as.character(genre[1]))
  }
  return(genreVoted)
}

check.for.awards<-function(book.url){
  #TRUE if the book's info box lists any awards
  award<-html_nodes(book.url,'.clear+ .clearFloats .infoBoxRowItem')
  hasAward<-length(award)!=0
  return(hasAward)
}

find.total.reviews<-function(book.url){
  if(length(html_nodes(book.url,'.votes'))!=0){
    ratings<-html_text(html_nodes(book.url,'.votes'))
    #Strip whitespace and thousands separators, e.g. "122,567" -> 122567
    ratings<-gsub("[[:space:]]", "", ratings)
    ratings<-gsub(",", "", ratings)%>%as.numeric()
  }
  else{
    ratings<-NA
  }
  return(ratings)
}

find.text.reviews<-function(book.url){
  if(length(html_nodes(book.url,'.count'))!=0){
    reviews<-html_text(html_nodes(book.url,'.count'))
    text_reviews<-as.numeric(regmatches(reviews, regexpr("[[:digit:]]+",reviews)))
  }
  else{
    text_reviews<-NA
  }
  return(text_reviews)
}

#Section 2: The main webscraping function


goodReads.webscrape<-function(listUrl){
  
  #Section 2a: Takes the url of a reading list and scrapes it, collecting urls, Goodreads IDs, titles, and authors.
  #The urls are used by the second part of the scraper to visit each book's page and collect additional info.
  
  
  List_Main<-read_html(listUrl)
  pages<-html_nodes(List_Main,'.pagination a')%>%html_text()
  #The last pagination link is "next", so the second-to-last link holds the final page number
  listIndex<-as.numeric(pages[length(pages)-1])
  #Exception for short lists (fewer than 100 books means no pagination links)
  if(length(listIndex)==0){
    listIndex<-1
  }
  data<-data.frame()
  for(i in 1:listIndex){
    ReadList<-read_html(paste(listUrl,"?page=",i,sep=''))
    
    #Titles, authors, and hyperlinks from browsing the given list.
    title<-html_nodes(ReadList,"div.leftContainer")%>%html_nodes("td")%>%html_nodes('.bookTitle span')%>%html_text()
    authors<-html_nodes(ReadList,"div.leftContainer")%>%html_nodes("td")%>%html_nodes('.authorName span')%>%html_text()
    hyper<-html_nodes(ReadList,"div.leftContainer")%>%html_nodes("td")%>%html_nodes('a.bookTitle')%>%html_attr("href")
    
    #Extract the Goodreads book ID from the url, which can be used as a key variable. IDs are separated
    #from the title slug by '-' or '.'; the second gsub() catches the occasional exception
    goodreadsID<-gsub(".*/book/show/\\s*|-.*", "", hyper)
    goodreadsID<-gsub("\\..*","",goodreadsID)%>%as.numeric()
    
    #data.frame() (rather than cbind(), which would coerce everything to character) keeps goodreadsID numeric
    a<-data.frame(goodreadsID,title,authors,hyper,stringsAsFactors=FALSE)
    data<-rbind(data,a)
    complete<-i/listIndex
    cat("Completion (Step 1): ",format(percent(complete),digits=4,justify="left"),"\n","Works Found: ",nrow(data),"\n")
  }
  
  cat("Step 1 Complete! Found ",nrow(data)," separate works in this list.", "\n")
  
  
  # Section 2b: Scrape the page of each individual book in the list. This gets additional information: page lengths, 
  # the genre tag (with the most votes by readers), text summaries, etc. 
  # This step takes time! (Technically I could speed this up with the doParallel package and run it on multiple cores;
  # however, I would not be able to display progress updates. I decided I'd rather be able to keep close tabs
  # on how the program is doing. A commented-out sketch of a parallel variant appears after this function.)
  
  badurls<-0
  book.descriptions<-rep(NA,nrow(data))
  pageCounts<-rep(NA,nrow(data))
  genreVoted<-rep(NA,nrow(data))
  hasAward<-rep(NA,nrow(data))
  average_rating<-rep(NA,nrow(data))
  total_ratings<-rep(NA,nrow(data))
  text_reviews<-rep(NA,nrow(data))
  proportion_text_reviews<-rep(NA,nrow(data))
  
  
  for(i in 1:nrow(data)){
    #The url of a book in the target list
    url<-paste('https://www.goodreads.com',data$hyper[i],sep='')
    go<-tryCatch(read_html(url),
                 error=function(c) 'stop')
    #Skip the book rather than crash on a bad url; identical() avoids comparing an xml document with !=
    if(!identical(go,'stop')){
      #Reuse the page already fetched by tryCatch() rather than downloading it a second time
      goodReads<-go
      
      # Number of total reviews and proportion that are also text reviews
      text_reviews[i]<-find.text.reviews(goodReads)
      
      total_ratings[i]<-find.total.reviews(goodReads)
      
      proportion_text_reviews[i]<-(text_reviews[i]/total_ratings[i])
      
      #Get the summaries of each book
      book.descriptions[i]<-find.description(goodReads)
      
      
      #Average Rating
      if(length(html_nodes(goodReads,'.average'))!=0){
        average_rating[i]<-as.numeric(html_text(html_nodes(goodReads,'.average')[1]))
      } else{
        average_rating[i]<-NA
      }
      
      hasAward[i]<-check.for.awards(goodReads)
      
      genreVoted[i]<-find.genre.tag(goodReads)
      
      pageCounts[i]<-find.page.length(goodReads)
    }
    else{
      badurls<-badurls+1
    }
    complete<-(i/nrow(data))*100
    cat(paste("Completion: ",format(complete,digits=4,justify="left"),"%","       Title: ",substr(data$title[i],1,45),sep=''),'\n',
        "                       Last Captured: ",format(average_rating[i],width=3,justify="centre",nsmall=2),format(pageCounts[i],width=6,justify="centre"),format(hasAward[i],width=5,justify="centre"),format(genreVoted[i],width=15,justify="centre"),format(percent(proportion_text_reviews[i]),width=5,justify="right"),'\n')
  }
  newData<-cbind.data.frame(pageCounts,proportion_text_reviews,average_rating,genreVoted,hasAward,total_ratings,book.descriptions)
  Finished<-cbind(data,newData)
  return(Finished)
}
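
# Aside: the per-book loop in Section 2b is embarrassingly parallel, so a doParallel/foreach
# variant is possible at the cost of the live progress printout. A minimal, untested sketch
# (commented out; the core count and the two columns collected are just for illustration):
# library(foreach)
# library(doParallel)
# cl<-makeCluster(4)
# registerDoParallel(cl)
# bookInfo<-foreach(i=1:nrow(data), .combine=rbind, .packages='rvest',
#                   .export=c('find.page.length','find.genre.tag')) %dopar% {
#   page<-tryCatch(read_html(paste0('https://www.goodreads.com',data$hyper[i])),
#                  error=function(e) NULL)
#   if(is.null(page)){
#     data.frame(pageCounts=NA,genreVoted=NA)
#   } else{
#     data.frame(pageCounts=find.page.length(page),genreVoted=find.genre.tag(page))
#   }
# }
# stopCluster(cl)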

#---------------------------------------------------------------------
#-----      Section 3: Actually webscraping some lists      ----------
#---------------------------------------------------------------------
# Usage is quite simple, with only one argument.
# Argument: url of a list on goodreads.com
# Output: returns a dataframe with information scraped from each work on the list. 

# Comments: The scraper isn't perfect, and appears to have issues scraping certain works. I have a number of exceptions 
# to handle this, but it does still encounter issues.
# Sometimes this is expected: 
  # Audiobooks don't have page lengths.
  # Lesser-known works sometimes do not have a genre tag, since no one has voted on a tag.
# Also be careful about feeding very large lists into the function! I'd say ~2000 books is ideal; anything larger and you
# may run into issues (and can expect to wait some time for it to finish).

#bestEpicFantasy<-goodReads.webscrape('https://www.goodreads.com/list/show/50.The_Best_Epic_Fantasy')

#excellentSpaceOpera<-goodReads.webscrape('https://www.goodreads.com/list/show/1127')

#bestScienceFiction<-goodReads.webscrape('https://www.goodreads.com/list/show/19341')

#apocalyptic<-goodReads.webscrape('https://www.goodreads.com/list/show/47')

#travel<-goodReads.webscrape('https://www.goodreads.com/list/show/633.Favourite_Travel_Books')

#bestScience<-goodReads.webscrape('https://www.goodreads.com/list/show/692.Best_Science_Books_Non_Fiction_Only')
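
# A quick look at the results with ggplot2 (loaded at the top). A minimal sketch, assuming
# the excellentSpaceOpera demo above has been run:

#ggplot(excellentSpaceOpera,aes(x=pageCounts,y=average_rating))+
#  geom_point()+
#  labs(x="Page count",y="Average rating",title="Excellent Space Opera: rating vs. length")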