# Global chunk defaults for the knitted document: echo the source code and
# keep warnings and messages visible in the rendered output.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = TRUE,
  message = TRUE
)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# For text mining:
library(pdftools) # Used to extract text from PDF files
## Using poppler version 23.04.0
library(tidytext) # Facilitates text analysis by working with words in a 'tidy' format
library(textdata) # Contains various sentiment dictionaries
library(ggwordcloud) # Used to create word clouds
Get the Game of Thrones text:
# Location of the Game of Thrones PDF on disk.
got_path <- "/Users/lars/Desktop/data/got.pdf"

# Pull the raw text out of the PDF; the result is a character vector with
# one string per page.
got_text <- pdftools::pdf_text(pdf = got_path)
Wrangle the page text into one row per line:
# Reshape the page-level text into a line-level data frame:
#   1. start with one row per PDF page (column `got_text`),
#   2. split each page on line breaks into a list-column `text_full`,
#   3. unnest so that every line gets its own row,
#   4. strip leading and trailing whitespace from each line.
got_df <- data.frame(got_text = got_text) %>%
  mutate(text_full = str_split(string = got_text, pattern = "\\n")) %>%
  unnest(cols = text_full) %>%
  mutate(text_full = str_trim(string = text_full, side = "both"))