We can use pins to discover Game of Thrones (GoT) datasets on Kaggle by searching for “got”:
library(pins)

# Search the "kaggle" board for pins matching the text "got".
pin_find(text = "got", board = "kaggle")
We can then retrieve and tidy a particular GoT dataset,
library(tidyverse)

# Use the minimal ggplot2 theme for every plot drawn afterwards.
theme_set(theme_minimal())

# Retrieve the subtitles pin from the "kaggle" board, read it as CSV, and
# collapse the columns named "2" through "775" into one `dialogs` column.
got_dialogs <- pin_get("davidkyle/got-subtitles", board = "kaggle") %>%
  read_csv(col_types = readr::cols()) %>%
  unite(col = "dialogs", as.character(2:775)) %>%
  transmute(
    # Remove every space together with the non-space run that follows it,
    # which leaves only the text before the first space of `Episode`.
    episode = gsub(" [^ ]*", "", Episode),
    # Remove the "S<digits>E<digits> " code and the ".srt" extension.
    title = gsub("S[0-9]+E[0-9]+ |\\.srt", "", Episode),
    dialogs
  )
We can then use pins to share this tidy dataset in the javierluraschi/datasets GitHub repository:
# Register the GitHub repository as the board that is referred to as "github"
# below.
board_register_github(name = "github", repo = "javierluraschi/datasets")

# Publish the tidy dialogs to that board. `got_dialogs` is passed as a bare
# symbol rather than piped in.
pin(
  got_dialogs,
  board = "github",
  description = "Game of Thrones dialogs"
)
We can use xml2 to parse an HTML table of GoT characters,
# Build a table of character names from the saved markup file.
got_characters <- xml2::read_xml("got_characters.xml") %>%
  # Text of the links found in the second cell of each row under the root.
  xml2::xml_find_all("./tr/td[2]/a/text()") %>%
  as.character() %>%
  tibble(name = .) %>%
  # `first`: everything before the first space of the name.
  mutate(first = gsub(" .*", "", name)) %>%
  # `last`: everything after the last space of the name.
  mutate(last = gsub(".* ", "", name)) %>%
  # Drop entries whose first word is "The".
  filter(first != "The")
And then share this dataset with pins in GitHub as well,
# Publish the characters table to the "github" board, then print it.
pin(
  got_characters,
  board = "github",
  description = "Game of Thrones characters"
)

got_characters
We can then use both datasets to find out how many times a character is referenced per episode,
# Pair every character with every episode and count how often the character's
# first name appears, case-insensitively, in that episode's dialog text.
got_references <- got_characters %>%
  crossing(got_dialogs) %>%
  mutate(
    # The lookarounds require that the name is not preceded or followed by a
    # lowercase letter, without consuming the neighbouring character. A
    # consuming pattern such as "[^a-z]name[^a-z]" would miss a name at the
    # very start or end of the text, and would count two mentions separated
    # by a single character only once.
    references = stringr::str_count(
      tolower(dialogs),
      paste0("(?<![a-z])", tolower(first), "(?![a-z])")
    )
  ) %>%
  select(episode, first, references)
And easily share in GitHub again,
# Publish the per-episode reference counts to the "github" board, then print
# them.
pin(
  got_references,
  board = "github",
  description = "Game of Thrones character references"
)

got_references
Followed by plotting the characters most referenced across all episodes,
# Bar chart of total references per character over all episodes, restricted
# to characters with more than 50 references.
got_references %>%
  group_by(first) %>%
  summarise(references = sum(references)) %>%
  filter(references > 50) %>%
  ggplot(mapping = aes(x = first, y = references)) +
  geom_col() +
  # Rotate the x-axis labels by 90 degrees.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "GoT Characters",
    subtitle = "Most referenced characters in Game of Thrones"
  )
Or plot specific character references across episodes,
# Line chart of the references to "Arya" in each episode.
got_references %>%
  filter(first == "Arya") %>%
  # `group = 1` puts every episode in one group, so a single line is drawn.
  ggplot(mapping = aes(x = episode, y = references, group = 1)) +
  geom_line() +
  # Rotate the x-axis labels by 90 degrees.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Arya Stark",
    subtitle = "GoT character references by episode"
  )
Best of all, you can retrieve these datasets to perform your own analysis!
library(pins)

# Register the repository's raw-file URL as a board named "got".
board_register_datatxt(
  name = "got",
  url = "https://github.com/javierluraschi/datasets/raw/master/"
)

# Retrieve the character-reference counts from that board.
pin_get(name = "got-references", board = "got")