A little life

Introduction

This project is an analysis of data scraped from Goodreads about Hanya Yanagihara “A little life” - one of my favourite books among the ones I read in 2020.

Collecting the data

The first step is to scrape the ratings and reviews from Goodreads and build a clean dataframe to work with. I will be working with the best 300 reviews in English.

library(rvest)
library(tidyverse)
library(stringr)
library(purrr)
library(lubridate)

# Function to get urls of reviews
get_page_url <- function(page_number) {
  str_c("https://www.goodreads.com/book/show/22822858-a-little-life?page=", page_number, .sep = "")
}
# Functions to get username, date, rating and review of each entry
get_username <- function(url) {
  username <- read_html(url) %>%
    html_nodes("div#bookReviews") %>%
    html_nodes("div.left.bodycol") %>% 
    html_nodes("div.reviewHeader.uitext.stacked") %>% 
    html_nodes('.user') %>%
    html_text()
}
get_date <- function(url) {
  date <- read_html(url) %>%
    html_nodes("div.left.bodycol") %>% 
    html_nodes("div.reviewHeader.uitext.stacked") %>% 
    html_nodes('.reviewDate.createdAt.right') %>%
    html_text()
}
get_rating <- function(url) {
  rating <- read_html(url) %>%
    html_nodes("div.left.bodycol") %>% 
    html_nodes("div.reviewHeader.uitext.stacked") %>% 
    html_nodes('.staticStars.notranslate') %>%
    html_text()
}
get_review <- function(url) {
  review <- read_html(url) %>%
    html_nodes("div.left.bodycol") %>% 
    html_nodes("div.reviewText.stacked") %>% 
    html_nodes('.readable') %>%
    html_text()
}

# Get the elements for each url
page_urls <- map(1:10, get_page_url)
username <- map(page_urls, get_username)
date <- map(page_urls, get_date)
rating <- map(page_urls, get_rating)
review <- map(page_urls, get_review)

# collect into one dataframe and cleaning
data_all <- as.data.frame(cbind(unlist(username), unlist(date), unlist(rating), unlist(review)))
colnames(data_all) <- c("username", "date", "rating", "review")
data_all$date <- str_replace(as.character(data_all$date), pattern = ",", replacement = "")
data_all$date <- parse_date_time(data_all$date, orders = c("%Y-%m-%d", "%m/%d/%y", "%B %d, %Y"))
data_all$date <- as.Date(data_all$date)
data_all$review <- as.character(data_all$review)
data_all$rating <- factor(data_all$rating, levels = c("it was amazing", "really liked it", "liked it", "it was ok", "did not like it"))

data_all <- data_all %>%
  subset(username != "nettebuecherkiste")

(the last line of code is to remove a German review which somehow made its way into the Englisah ones, and would perturb the analysis).

EDA

I started off by building a few explorative data visualizations, such as the timeline of the reviews, the rating distribution and the lenght of the reviews - to get a feeling of what the data look like.

data_all %>%
  arrange(date, decreasing = FALSE) %>%
  mutate(ym = floor_date(date, "month")) %>%
  group_by(ym) %>%
  count() %>%
  ggplot(aes(x = ym, y = n)) +
  geom_line(color = "seagreen3") +
  geom_point(color = "seagreen3", fill = "seagreen3", alpha = 0.7, size = 3) +
  labs(x = "Date",
       y = " ",
       title = "Reviews timeline",
       subtitle = "Number of reviews pulished per month") +
  theme(plot.title = element_text(face = "bold", size = 12, family = "Courier"),
        plot.subtitle = element_text(face = "italic", size = 10, family = "Courier"),
        axis.text.y = element_text(size = 9, family = "Courier"),
        axis.text.x = element_text(size = 9, family = "Courier"),
        axis.title.x = element_text(size = 9, family = "Courier"),
        panel.grid.major.y = element_line(linetype = "dashed", size = 0.5),
        panel.grid.minor.y = element_line(linetype = "dashed", size = 0.5),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.minor.x = element_line(linetype = "dashed", size = 0.5))

data_all %>%
  group_by(rating) %>%
  count() %>%
  ggplot(aes(y = n, x = reorder(rating, desc(-rating)), fill =rating)) +
  geom_col(show.legend = FALSE) +
  labs(y = " ",
       x = " ",
       title = "Rating distribution",
       subtitle = "Total number of reviews per rating") +
  theme(plot.title = element_text(face = "bold", size = 12, family = "Courier"),
        plot.subtitle = element_text(face = "italic", size = 10, family = "Courier"),
        axis.text.y = element_text(size = 9, family = "Courier"),
        axis.text.x = element_text(size = 9, family = "Courier"),
        panel.grid.major.y = element_line(linetype = "dashed", size = 0.5),
        panel.grid.minor.y = element_line(linetype = "dashed", size = 0.5),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  scale_y_continuous(expand = c(0, 0))

library(stringr)
data_all %>%
  mutate(lng_str = str_count(review, '\\w+')) %>%
  ggplot(aes(x = lng_str)) +
  geom_histogram(fill = "cyan3", alpha = 0.8, color = "cyan3") +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_continuous(expand = c(0,0)) +
  labs (x = "Word's total numer",
        y = " ",
        title = "Reviews length",
       subtitle = "Distribution of total number of words per review") +
  theme(plot.title = element_text(face = "bold", size = 12, family = "Courier"),
        plot.subtitle = element_text(face = "italic", size = 10, family = "Courier"),
        axis.text.y = element_text(size = 9, family = "Courier"),
        axis.text.x = element_text(size = 9, family = "Courier"),
        axis.title.x = element_text(size = 9, family = "Courier"),
        panel.grid.major.y = element_line(linetype = "dashed", size = 0.5),
        panel.grid.minor.y = element_line(linetype = "dashed", size = 0.5),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.minor.x = element_line(linetype = "dashed", size = 0.5))

It would be interesting to see if the lenght of the review correlated with how much the reader liked the book. Meaning, are people who liked the book more likely to write a long review, or the other way around?

data_all %>%
  group_by(rating) %>%
  summarize(length_rev = mean(str_count(review, '\\w+'))) %>%
  ggplot(aes(y = length_rev, x = reorder(rating, desc(-rating)), fill = rating)) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(expand = c(0,0)) +
  labs(x = " ",
       y = " ",
       title = "Average review length by rating",
       subtitle = "average numer of words in reviews per rating") +
  theme(plot.title = element_text(face = "bold", size = 12, family = "Courier"),
        plot.subtitle = element_text(face = "italic", size = 10, family = "Courier"),
        axis.text.y = element_text(size = 9, family = "Courier"),
        axis.text.x = element_text(size = 9, family = "Courier"),
        panel.grid.major.y = element_line(linetype = "dashed", size = 0.5),
        panel.grid.minor.y = element_line(linetype = "dashed", size = 0.5),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank())

Text analysis

Thare are various sujects that I would like to explore with this analysis:

Most frequently used words
Characters mentions/popularity
Sentiment analysis and sentiment analysis by rating
Topic modeling

Words frequency

One measure of how important a word is, is its term frequency, or tf, meaning how frequently a word occurs in a document. There are words in a collection of documents, however, that occur many times without being particularly important (such as “the”, “is”, “of” etc). One might want to remove them by using a list of common sop words, but that is not a very sophisticated approach and could lead to both removing words that are important in our context and keeping words that are not adding value to our analysis. Another approach is to look at a term’s inverse document frequency ,or idf, which decreases the weight for commonly used words and increases the weight for words that are not used very much in a collection of documents. By multiplying the term’s idf with tf, one can calculate a term’s frequency adjusted for how rarely it is used (tf-idf).

I decided to collapse all the reviews having the same rating into a single document, so that we have a collection of 5 documents, each of them representing a different level of appreciation for the book.

it_was_amazing <- paste(filter(data_all, rating == "it was amazing")$review, collapse = " ")
really_liked_it <- paste(filter(data_all, rating == "really liked it")$review, collapse = " ")
liked_it <- paste(filter(data_all, rating == "liked it")$review, collapse = " ")
it_was_ok <- paste(filter(data_all, rating == "it was ok")$review, collapse = " ")
did_not_like_it <- paste(filter(data_all, rating == "did not like it")$review, collapse = " ")
text <- c(it_was_amazing, really_liked_it, liked_it, it_was_ok, did_not_like_it)
doc <- c("it was amazing", "really liked it", "liked it", "it was ok", "did not like it")
text_by_rating <- as.data.frame(cbind(doc, text))
text_by_rating$text <- as.character(text_by_rating$text)
text_by_rating$text  <- gsub("[^A-Za-z]"," " , text_by_rating$text)
text_by_rating$doc <- factor(text_by_rating$doc, levels = c("it was amazing", "really liked it", "liked it", "it was ok", "did not like it"))

Once I built my dataframe, I need to transform it into a tidy form, as I will e using the tidytext library to run the analysis:

library(tidytext)
text_by_rating_tidy <- text_by_rating %>%
  unnest_tokens(word, text) %>%
  count(doc, word, sort = TRUE)

I can then calculate and plot the tf-idf for teh terms in my documents, using the bind_tf_idf() function from the tidytext package.

# Calculate tf-itf
text_idf <- text_by_rating_tidy %>%
  bind_tf_idf(word, doc, n) %>%
  group_by(doc) %>%
  slice_max(order_by = tf_idf, n = 15)

# plot
ggplot(text_idf, aes(tf_idf, fct_reorder(word, tf_idf), fill = doc)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~doc, scales = "free") +
  labs(x = NULL, y = NULL,
       title = "Most frequently used words by rating",
       subtitle = "tf-idf of words in collections of reviews having the same rating") +
  theme(plot.title = element_text(face = "bold", size = 12, family = "Courier"),
        plot.subtitle = element_text(face = "italic", size = 10, family = "Courier"),
        axis.text.y = element_text(size = 8, family = "Courier"),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.minor.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())

As text documents of this type are likely filled with abbreviations or typos, the results are not easy to interprete and it is not surprising that we end up with terms such as “ok” or “thru”, which do not bring valuable information. By looking at the graphs we are tough still able to extract some insights: all documents refer to the suffering and horrors described in the book, but while it seems that people who liked it saw it as something meaningful, those who did not saw it as something gratuitus and unoriginal.

Characters mentions

We can use the text_by_rating_tidy dataframe built before to see which amongst the main characters are most popular (or received more mentions). In order to do that, we need to build a vector containing the names of the main characters, and then filter out all the other words from the initial dataframe.

characters <- c("jude", "willem", "malcolm", "jb", "andy", "harold", "julia", "luke", "richard")

text_char <- text_by_rating %>%
  unnest_tokens(word, text) %>%
  filter(word %in% characters) %>%
  group_by(word) %>%
  count()

library(forcats)
ggplot(text_char, aes(y = fct_reorder(as.factor(word), n), x = n, fill = word)) +
  geom_col(show.legend = FALSE) +
  scale_x_continuous(expand = c(0,0)) +
  labs(x = NULL, y = NULL,
       title = "Characters popularity",
       subtitle = "main characters total mentions in the whole body of text") +
  theme(plot.title = element_text(face = "bold", size = 14, family = "Courier"),
        plot.subtitle = element_text(face = "italic", size = 12, family = "Courier"),
        axis.text.y = element_text(size = 12, family = "Courier"),
        axis.text.x = element_text(size = 12, family = "Courier"),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.minor.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())

Sentiment analysis

Sentiment analysis is going to be tricky on this tyoe of texts: users are simultaneously writing about the book’s themes and plot (which will be associated with specific sentiments), and about their own feelings and opinions about those themes and the book in general. As we cannot distinguish between the two, the total sentiment score will be a mixture of the two. First of all we will check the sentiments distribution for all the reviews, without disaggregating by rating. We will use the NRC lexicon.

text_sentiment <- text_by_rating %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("nrc")) %>%
  group_by(sentiment) %>%
  count()

ggplot(text_sentiment, aes(y = sentiment, x = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_x_continuous(expand = c(0,0)) +
  labs(x = " ", y =" ",
       title = "Sentiment analysis",
       subtitle = "NRC sentiments counts over the whole body of reviews") +
  theme(plot.title = element_text(face = "bold", size = 14, family = "Courier"),
        plot.subtitle = element_text(face = "italic", size = 12, family = "Courier"),
        axis.text.y = element_text(size = 10, family = "Courier"),
        axis.text.x = element_text(size = 10, family = "Courier"),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.minor.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())

Who read the book knows that “A little life” is a very sad story, in which the characters live through unspeakable tragedies at worst and minor sadnesses and dissatisfactions at best, so I was expecting a predominance of negative emotions - that does not seem to be the case. I then wanted to see if there was a difference in sentiment distribution according to the rating, expecting a larger proportion of negative language for negative reviews.

text_sentiment_by_rating <- text_by_rating %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("nrc")) %>%
  group_by(sentiment, doc) %>%
  count()

ggplot(text_sentiment_by_rating, aes(y = sentiment, x = n, fill = doc)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~doc, scales = "free") +
  labs(x = " ", y =" ",
       title = "Sentiment analysis per rating",
       subtitle = "NRC sentiments counts per rating") +
  theme(plot.title = element_text(face = "bold", size = 14, family = "Courier"),
        plot.subtitle = element_text(face = "italic", size = 12, family = "Courier"),
        axis.text.y = element_text(size = 10, family = "Courier"),
        axis.text.x = element_text(size = 10, family = "Courier"),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.minor.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) +
  scale_x_continuous(expand = c(0,0))

It seems like the sentiment distribution is very much similar for different ratings. that is somewhat surprising and might mean that either the reviews are mostly about the book and less about what the person thought about the book (or at least that the first component is predominant), or that our analysis is flawed.

In order to test the latter hypothesis, I looked at which words were contributing the most as positive or negative.

text_by_rating_tidy %>%
  inner_join(get_sentiments("nrc")) %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 15) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  scale_x_continuous(expand = c(0,0)) +
  labs(x = " ", y = " ",
       title = "Words count by rating",
       subtitle = "Numer of words associated with a certain NRC sentiment") +
  theme(plot.title = element_text(face = "bold", size = 14, family = "Courier"),
        plot.subtitle = element_text(face = "italic", size = 12, family = "Courier"),
        axis.text.y = element_text(size = 9, family = "Courier"),
        axis.text.x = element_text(size = 9, family = "Courier"),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.minor.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())

Some of the words do not seem to be very meaningful, such as “don” or “reading”, and one option to refine the analysis would be to remove them from the dataset. We can then push the analysis even further by looking at combination of words, instead of single ones, as tokens: whenever we have a negation such as a “not” in front of a positive word, such as “like”, while the lexicon whill categorize the like as a positive contribution, the meaning is actually the opposite. We can thus tokenize by bigram, see how many words are associated with a negation and exclude them from the analysis.

# find negated words
negation_words <- c("not", "no", "never", "without")
invalid_sent <- text_by_rating %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(word1 %in% negation_words) %>%
  inner_join(get_sentiments("nrc"), by = c(word2 = "word")) %>%
  count(doc, sentiment, sort = TRUE) %>%
  rename(n_inv = n)

# Eliminate not significant word form the dataset
text_sentiment_by_rating_modif <- text_by_rating %>%
  unnest_tokens(word, text) %>%
  filter(word != "don") %>%
  filter(word != "reading") %>%
  filter(word != "reader") %>%
  filter(word != "author") %>%
  filter(word != "childhood") %>%
  inner_join(get_sentiments("nrc")) %>%
  group_by(sentiment, doc) %>%
  count()

# merge dataframes
sent_tot <- merge(text_sentiment_by_rating_modif, invalid_sent, all.x = TRUE)
sent_tot[is.na(sent_tot)] <- 0
sent_tot <- sent_tot %>%
  mutate(tot = n-n_inv)

# plot
ggplot(sent_tot, aes(y = sentiment, x = tot, fill = doc)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~doc, scales = "free") +
  labs(x = " ", y =" ",
       title = "Sentiment analysis per rating - corrected",
       subtitle = "NRC sentiments counts per rating - corrected") +
  theme(plot.title = element_text(face = "bold", size = 14, family = "Courier"),
        plot.subtitle = element_text(face = "italic", size = 12, family = "Courier"),
        axis.text.y = element_text(size = 10, family = "Courier"),
        axis.text.x = element_text(size = 10, family = "Courier"),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.minor.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) +
  scale_x_continuous(expand = c(0,0)) +
  scale_x_continuous(expand = c(0,0))

By comparing this graph with the previous one we can see that the difference between positive and negative words is decreased, as well as the total word count. The overall trend though does not seem to be much different, which supports the initial hypothesis: the sentiment score for different ratings is similar, and most of what is written is likely more associated with the book rather than the opinion of the reader.

Topic modeling

LDA (Latent Dirichlet allocation) is one of the most common algorithms for topic modeling. It is based on two main principles:

Every document is a mixture of topics, in certain proportions
Every topic is a mixture of words, with certain words bein more frequent in a topic rather than another

All of the text documents I am working with are based on the same suject, so I would not expect to find multiple topics, though we could still hypotesize some distinction. For example, there could be a distinction between the summary of the plot and what the user thought about the book, or there could be a differentiation between documents written by people who liked the book and people who didn’t.

library(tm)
library(topicmodels)

text_dtm <- text_by_rating_tidy %>%
  anti_join(stop_words) %>%
  cast_dtm(doc, word, n) %>%
  as.matrix

all_lda <- LDA(text_dtm, k = 2, control = list(seed = 1234))
all_topics <- tidy(all_lda, matrix = "beta")

all_top_terms <- all_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>% 
  ungroup() %>%
  arrange(topic, -beta)

all_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered() +
  labs(x = " ", y =" ",
       title = "Topic modeling",
       subtitle = "Word counts by topic") +
  theme(plot.title = element_text(face = "bold", size = 14, family = "Courier"),
        plot.subtitle = element_text(face = "italic", size = 12, family = "Courier"),
        axis.text.y = element_text(size = 10, family = "Courier"),
        axis.text.x = element_text(size = 10, family = "Courier"),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.minor.x = element_line(linetype = "dashed", size = 0.5),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) +
  scale_x_continuous(expand = c(0,0)) +
  scale_x_continuous(expand = c(0,0))

Both topic have pretty much the same words (7 ou of 10 of the most used words overlap), which means that topic modeling is not an appropriate type of analysis for this dataset.

Conclusions

In this project, I have analysid the best 300 goodreads english reviews of the book “A little life”, by Hanya Yanagihara. This is what we can conclude from the analysis:

The majority of the people loved or liked the book
The average review length does not seem to be correlated with how much the reviewer liked the book
The reviews seem to be mostly centered on the book’s plot and the tragic events of Jude life: this is demonstrated by the fact that Jude is the most mentioned character, as well as from the words having the highest tf-idf in the reviews
By looking at the tf-idf vs rating, we can also see how those tragic events have a negative connotation for people who did not like the ook (“victimization”, “banal”..)
There seem to be no significant difference in sentiments expressed in the reviews versus rating: this confirm point n.3, meaning the reviews seem to be prevalently focused on the plot rather than how the reader felt about the book
LDA analysis did not reveal the presence of different topics in the reviews