# Global knitr chunk options: echo the code and keep warnings and messages
# visible in the rendered document.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = TRUE,
  message = TRUE
)

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# For text mining:
library(pdftools)  # Used to extract text from PDF files
## Using poppler version 23.04.0
library(tidytext)  # Facilitates text analysis by working with words in a 'tidy' format
library(textdata)  # Contains various sentiment dictionaries
library(ggwordcloud)  # Used to create word clouds

Get the Game of Thrones text:

# Location of the Game of Thrones PDF.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the script on another computer.
got_path <- "/Users/lars/Desktop/data/got.pdf"

# Fail early with a readable message if the PDF is not where we expect it.
if (!file.exists(got_path)) {
  stop("PDF not found: ", got_path, call. = FALSE)
}

# Extract the text from the PDF as a character vector (one string per page).
got_text <- pdf_text(got_path)

Some wrangling:

# Reshape the page-level text into one row per line of text:
#   1. split every page on line breaks (gives a list column),
#   2. unnest the list column so each line becomes its own row,
#   3. strip leading and trailing whitespace from each line.
got_df <- data.frame(got_text) %>%
  mutate(text_full = str_split(got_text, pattern = "\\n")) %>%
  unnest(cols = text_full) %>%
  mutate(text_full = str_trim(text_full, side = "both"))

Get the tokens (individual words) in tidy format

# Tokenise the text: every row of the result holds a single word taken from
# the `text_full` column.
got_tokens <- unnest_tokens(got_df, output = word, input = text_full)

# Print the tokens for inspection.
got_tokens

Remove stop words:

# Drop stop words (very common words such as 'the', 'and', 'of') that carry
# no analytical value. The join key is named explicitly so dplyr does not
# have to guess it and no "Joining with ..." message is printed.
got_stop <- got_tokens %>%
  anti_join(stop_words, by = "word") %>%
  select(-got_text)  # The original page-level text column is no longer needed

Count the words

# Frequency of every remaining word, most frequent first.
count(got_stop, word, sort = TRUE)

Word cloud of GoT words

# The 100 most frequent words: count, sort by descending frequency, keep the
# first 100 rows.
got_top100 <- got_stop %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 100)

# Basic word cloud: each word is drawn as a text label.
got_cloud <- ggplot(got_top100, aes(label = word)) +
  geom_text_wordcloud() +
  theme_minimal()

# Display the word cloud.
got_cloud

# Let’s shape the word cloud as a star

# Star-shaped word cloud: word size and colour both follow the word count.
ggplot(got_top100, aes(label = word, size = n, color = n)) +
  geom_text_wordcloud_area(shape = "star") +
  scale_size_area(max_size = 12) +
  scale_color_gradientn(colors = c("darkgreen", "blue", "red")) +
  theme_minimal()

Sentiment analysis with afinn:

“afinn”: Words ranked from -5 (very negative) to +5 (very positive)

# Attach AFINN sentiment scores (-5 to +5) to the words; words that are not
# in the lexicon are dropped by the inner join. The join key is spelled out
# so the join does not rely on dplyr guessing the common column.
got_afinn <- got_stop %>%
  inner_join(get_sentiments("afinn"), by = "word")

The negative words

# Show only the negative AFINN entries (score below 0), mirroring the
# positive-word filter used in the next section.
# Note: get_sentiments() may prompt to download the lexicon (answer yes).
get_sentiments(lexicon = "afinn") %>%
  filter(value < 0)

The positive words

library(tidytext)

# Full AFINN lexicon, then only the entries with a score above 0.
afinn <- get_sentiments("afinn")
afinn_pos <- filter(afinn, value > 0)

# Preview the first positive entries.
head(afinn_pos)

Word association with NRC

# Print the NRC lexicon to see how it associates words with sentiments.
get_sentiments("nrc")

Plot sentiment scores:

# Number of words at each AFINN sentiment score.
got_afinn_hist <- count(got_afinn, value)

# Bar chart of the score distribution; bars are shaded by their score.
ggplot(got_afinn_hist, aes(x = value, y = n, fill = value)) +
  geom_col() +
  theme_bw()

# Investigate which words have a sentiment score of 2 (quite positive)

# Isolate the words that score exactly 2 on the AFINN scale.
got_afinn2 <- filter(got_afinn, value == 2)

# List each of those 2-score words once.
distinct(got_afinn2, word)

Finding the unique 2-score words

# Character vector of the distinct 2-score words.
unique(got_afinn2[["word"]])
##   [1] "smile"           "fine"            "glory"           "hope"           
##   [5] "smiled"          "care"            "strength"        "peaceful"       
##   [9] "honor"           "carefully"       "slick"           "top"            
##  [13] "gained"          "comfort"         "sweet"           "courage"        
##  [17] "daring"          "elegant"         "justice"         "heroes"         
##  [21] "fair"            "strong"          "brave"           "solid"          
##  [25] "proud"           "mercy"           "rescue"          "swift"          
##  [29] "smiling"         "true"            "noble"           "saved"          
##  [33] "gift"            "treasures"       "favorite"        "clean"          
##  [37] "rich"            "fearless"        "fortunate"       "likes"          
##  [41] "earnest"         "generous"        "chances"         "smiles"         
##  [45] "hug"             "kiss"            "approved"        "fond"           
##  [49] "honored"         "consent"         "peace"           "powerful"       
##  [53] "worthy"          "humor"           "entertaining"    "save"           
##  [57] "sincerely"       "festive"         "careful"         "stronger"       
##  [61] "bold"            "eager"           "favored"         "warmth"         
##  [65] "pardon"          "pardons"         "healthy"         "loving"         
##  [69] "chance"          "thoughtful"      "enjoy"           "privileged"     
##  [73] "positively"      "stout"           "encouragement"   "stable"         
##  [77] "smarter"         "ease"            "ambitious"       "improvement"    
##  [81] "hopeful"         "hopes"           "relieved"        "helping"        
##  [85] "cares"           "importance"      "favor"           "tender"         
##  [89] "welcomed"        "treasure"        "spirited"        "secured"        
##  [93] "courtesy"        "calm"            "resolved"        "courageous"     
##  [97] "comfortable"     "sympathy"        "reassuring"      "resolute"       
## [101] "brisk"           "appeased"        "enjoying"        "hoping"         
## [105] "intricate"       "rescued"         "glorious"        "adventures"     
## [109] "friendly"        "astonished"      "reward"          "trusted"        
## [113] "honest"          "clever"          "dear"            "favors"         
## [117] "determined"      "strengthen"      "approval"        "slicker"        
## [121] "sincere"         "jokes"           "joke"            "smartest"       
## [125] "favorites"       "hero"            "adventure"       "abilities"      
## [129] "strongest"       "courteous"       "exasperated"     "enjoys"         
## [133] "rewarded"        "cherished"       "comforting"      "robust"         
## [137] "cherish"         "sympathetic"     "surviving"       "cheered"        
## [141] "worth"           "boldly"          "acquitted"       "unstoppable"    
## [145] "cheer"           "fervent"         "applause"        "cheers"         
## [149] "proudly"         "compassionate"   "bless"           "success"        
## [153] "supported"       "kinder"          "improved"        "defender"       
## [157] "tranquil"        "helpful"         "hail"            "tops"           
## [161] "thankful"        "calmed"          "sunshine"        "opportunity"    
## [165] "inspiration"     "survived"        "gain"            "freedom"        
## [169] "growth"          "futile"          "swiftly"         "satisfied"      
## [173] "congratulations" "confident"       "pardoned"        "energetic"      
## [177] "esteemed"        "benefit"         "secure"          "accomplished"   
## [181] "support"         "rewarding"       "ability"         "jovial"         
## [185] "cheering"        "hailed"          "playful"         "confidence"     
## [189] "consents"        "bargain"         "encouraged"      "relieving"      
## [193] "accomplish"      "resolve"         "cleaner"         "prominent"      
## [197] "serene"          "defenders"       "strengthened"    "wealthy"        
## [201] "revive"
# Count the 2-score words, keep the 20 most frequent, and turn `word` into a
# factor ordered by count so the bars are drawn in frequency order.
got_afinn2_n <- got_afinn2 %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  mutate(word = fct_reorder(factor(word), n))

# Horizontal bar chart of those words.
ggplot(got_afinn2_n, aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  theme_bw()

### I asked ChatGPT how to show fewer words, and it suggested slice_max(n, n = 20) %>%

Let’s find the median and mean of the sentiment of the words

# Mean and median AFINN score across all scored words.
got_summary <- summarise(
  got_afinn,
  mean_score = mean(value),
  median_score = median(value)
)

print(got_summary)
## # A tibble: 1 × 2
##   mean_score median_score
##        <dbl>        <dbl>
## 1     -0.542           -1

The words in GoT.pdf lean negative overall: the mean AFINN score is -0.542 and the median score is -1.

NRC lexicon for sentiment analysis

# Match the words with the NRC lexicon, which categorises words into
# different emotions. A word occurs many times in the text and can belong to
# several NRC categories, so the join is many-to-many by design; declaring
# that (and the join key) makes the intent explicit and avoids dplyr's
# "unexpected many-to-many relationship" warning.
got_nrc <- got_stop %>%
  inner_join(
    get_sentiments("nrc"),
    by = "word",
    relationship = "many-to-many"
  )

Before doing the sentiment analysis, I will find out which words are excluded

# Words in the text that have no entry in the NRC lexicon and are therefore
# excluded from the NRC analysis. The join key is given explicitly.
got_exclude <- got_stop %>%
  anti_join(get_sentiments("nrc"), by = "word")

# Count the excluded words, most frequent first.
got_exclude_n <- got_exclude %>%
  count(word, sort = TRUE)

# Show the most frequent excluded words.
head(got_exclude_n)

Above are the excluded words

Now we can continue analysing the sentiment

# How many words fall into each NRC sentiment category, largest first.
got_nrc_n <- count(got_nrc, sentiment, sort = TRUE)

# Bar chart with one bar per sentiment category.
ggplot(got_nrc_n, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_col() +
  theme_bw()

Creating a bar plot of the top words per sentiment category

# Top 5 words per sentiment category.
# slice_max() replaces the superseded top_n(): the ordering column (`n`) is
# named explicitly instead of being picked implicitly, and ties are kept by
# default, as top_n() did.
got_nrc_n5 <- got_nrc %>%
  count(word, sentiment, sort = TRUE) %>%  # Occurrences of each word per sentiment
  group_by(sentiment) %>%
  slice_max(n, n = 5) %>%
  ungroup()  # Drop the grouping so later operations are not per-sentiment

# Bar plot of the top words, one panel per sentiment category.
got_nrc_gg <- ggplot(
  data = got_nrc_n5,
  aes(x = reorder(word, n), y = n, fill = sentiment)
) +
  geom_col(show.legend = FALSE) +  # The panel titles already name the sentiment
  facet_wrap(~sentiment, ncol = 2, scales = "free") +
  coord_flip() +  # Horizontal bars keep the word labels readable
  theme_minimal() +
  labs(x = "Word", y = "Count")

# Show the plot
got_nrc_gg

### I notice that the word “lord” is in many of the charts…

# Look up every NRC entry for the word "lord".
conf <- filter(get_sentiments(lexicon = "nrc"), word == "lord")

# Print the matching rows to check the sentiments listed for "lord".
conf

It was true

Answering the task

My task

Taking this script as a point of departure, apply sentiment analysis on the Game of Thrones. You will find a pdf in the data folder. What are the most common meaningful words and what emotions do you expect will dominate this volume? Are there any terms that are similarly ambiguous to the ‘confidence’ above?

My answer

Using this script, we applied sentiment analysis to Game of Thrones. The most common meaningful words likely include character names, titles (e.g., “king,” “lord”), and thematic words such as “battle” or “death.”

In terms of emotions, we expect a dominance of fear, anger, and trust, as the book revolves around political intrigue, betrayal, and loyalty.

An ambiguous term similar to “confidence” is “lord.” It appears frequently but does not inherently convey a positive or negative sentiment—it depends on context.